// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright 2016-2019 HabanaLabs, Ltd.
 * All Rights Reserved.
 */

#define pr_fmt(fmt)	"habanalabs: " fmt

#include "habanalabs.h"

#include <linux/pci.h>
#include <linux/sched/signal.h>
#include <linux/hwmon.h>
#include <uapi/misc/habanalabs.h>

#define HL_PLDM_PENDING_RESET_PER_SEC	(HL_PENDING_RESET_PER_SEC * 10)

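/*
 * hl_device_disabled_or_in_reset - check whether the device is usable
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Returns true if the device is disabled or if a reset is currently in
 * progress, false otherwise.
 */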
bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
{
	return hdev->disabled || atomic_read(&hdev->in_reset);
}

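/*
 * hl_device_status - get the current status of the device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Returns HL_DEVICE_STATUS_MALFUNCTION if the device is disabled,
 * HL_DEVICE_STATUS_IN_RESET if a reset is in progress and
 * HL_DEVICE_STATUS_OPERATIONAL otherwise.
 */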
enum hl_device_status hl_device_status(struct hl_device *hdev)
{
	enum hl_device_status status;

	if (hdev->disabled)
		status = HL_DEVICE_STATUS_MALFUNCTION;
	else if (atomic_read(&hdev->in_reset))
		status = HL_DEVICE_STATUS_IN_RESET;
	else
		status = HL_DEVICE_STATUS_OPERATIONAL;

	return status;
}

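/*
 * hpriv_release - release function for a file's private data
 *
 * @ref: pointer to the kref field of the private data
 *
 * Called by kref_put() when the last reference to the private data is
 * dropped. Removes the process from the device's open-files list and
 * frees all of its per-process resources.
 */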
static void hpriv_release(struct kref *ref)
{
	struct hl_fpriv *hpriv;
	struct hl_device *hdev;

	hpriv = container_of(ref, struct hl_fpriv, refcount);

	hdev = hpriv->hdev;

	put_pid(hpriv->taskpid);

	hl_debugfs_remove_file(hpriv);

	mutex_destroy(&hpriv->restore_phase_mutex);

	mutex_lock(&hdev->fpriv_list_lock);
	list_del(&hpriv->dev_node);
	hdev->compute_ctx = NULL;
	mutex_unlock(&hdev->fpriv_list_lock);

	kfree(hpriv);
}

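/*
 * hl_hpriv_get - increment the reference count of a file's private data
 *
 * @hpriv: pointer to the file's private data structure
 */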
void hl_hpriv_get(struct hl_fpriv *hpriv)
{
	kref_get(&hpriv->refcount);
}

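/*
 * hl_hpriv_put - decrement the reference count of a file's private data
 *
 * @hpriv: pointer to the file's private data structure
 *
 * When the count drops to zero, hpriv_release() is called to free the
 * private data.
 */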
void hl_hpriv_put(struct hl_fpriv *hpriv)
{
	kref_put(&hpriv->refcount, hpriv_release);
}

/*
 * hl_device_release - release function for habanalabs device
 *
 * @inode: pointer to inode structure
 * @filp: pointer to file structure
 *
 * Called when a process closes a habanalabs device
 */
static int hl_device_release(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;

	hl_cb_mgr_fini(hpriv->hdev, &hpriv->cb_mgr);
	hl_ctx_mgr_fini(hpriv->hdev, &hpriv->ctx_mgr);

	filp->private_data = NULL;

	hl_hpriv_put(hpriv);

	return 0;
}

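/*
 * hl_device_release_ctrl - release function for habanalabs control device
 *
 * @inode: pointer to inode structure
 * @filp: pointer to file structure
 *
 * Called when a process closes a habanalabs control device
 */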
static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
{
	struct hl_fpriv *hpriv = filp->private_data;
	struct hl_device *hdev;

	filp->private_data = NULL;

	hdev = hpriv->hdev;

	mutex_lock(&hdev->fpriv_list_lock);
	list_del(&hpriv->dev_node);
	mutex_unlock(&hdev->fpriv_list_lock);

	put_pid(hpriv->taskpid);

	kfree(hpriv);

	return 0;
}

/*
 * hl_mmap - mmap function for habanalabs device
 *
 * @filp: pointer to file structure
 * @vma: pointer to vm_area_struct of the process
 *
 * Called when a process does an mmap on a habanalabs device. Call the
 * device's mmap function at the end of the common code.
 */
static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct hl_fpriv *hpriv = filp->private_data;
	unsigned long vm_pgoff;

	vm_pgoff = vma->vm_pgoff;
	vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);

	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
	case HL_MMAP_TYPE_CB:
		return hl_cb_mmap(hpriv, vma);
	}

	return -EINVAL;
}

static const struct file_operations hl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open,
	.release = hl_device_release,
	.mmap = hl_mmap,
	.unlocked_ioctl = hl_ioctl,
	.compat_ioctl = hl_ioctl
};

static const struct file_operations hl_ctrl_ops = {
	.owner = THIS_MODULE,
	.open = hl_device_open_ctrl,
	.release = hl_device_release_ctrl,
	.unlocked_ioctl = hl_ioctl_control,
	.compat_ioctl = hl_ioctl_control
};

static void device_release_func(struct device *dev)
{
	kfree(dev);
}

/*
 * device_init_cdev - Initialize cdev and device for habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 * @hclass: pointer to the class object of the device
 * @minor: minor number of the specific device
 * @fops: file operations to install for this device
 * @name: name of the device as it will appear in the filesystem
 * @cdev: pointer to the char device object that will be initialized
 * @dev: pointer to the device object that will be initialized
 *
 * Initialize a cdev and a Linux device for a habanalabs device.
 */
static int device_init_cdev(struct hl_device *hdev, struct class *hclass,
				int minor, const struct file_operations *fops,
				char *name, struct cdev *cdev,
				struct device **dev)
{
	cdev_init(cdev, fops);
	cdev->owner = THIS_MODULE;

	*dev = kzalloc(sizeof(**dev), GFP_KERNEL);
	if (!*dev)
		return -ENOMEM;

	device_initialize(*dev);
	(*dev)->devt = MKDEV(hdev->major, minor);
	(*dev)->class = hclass;
	(*dev)->release = device_release_func;
	dev_set_drvdata(*dev, hdev);
	dev_set_name(*dev, "%s", name);

	return 0;
}

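/*
 * device_cdev_sysfs_add - add the cdev and sysfs objects to the system
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Add the char devices of both the main and the control devices to the
 * system, then initialize sysfs. On any failure, everything that was
 * already added is removed.
 */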
static int device_cdev_sysfs_add(struct hl_device *hdev)
{
	int rc;

	rc = cdev_device_add(&hdev->cdev, hdev->dev);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a char device to the system\n");
		return rc;
	}

	rc = cdev_device_add(&hdev->cdev_ctrl, hdev->dev_ctrl);
	if (rc) {
		dev_err(hdev->dev,
			"failed to add a control char device to the system\n");
		goto delete_cdev_device;
	}

	/* hl_sysfs_init() must be done after adding the device to the system */
	rc = hl_sysfs_init(hdev);
	if (rc) {
		dev_err(hdev->dev, "failed to initialize sysfs\n");
		goto delete_ctrl_cdev_device;
	}

	hdev->cdev_sysfs_created = true;

	return 0;

delete_ctrl_cdev_device:
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
delete_cdev_device:
	cdev_device_del(&hdev->cdev, hdev->dev);
	return rc;
}

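/*
 * device_cdev_sysfs_del - remove the cdev and sysfs objects from the system
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Counterpart of device_cdev_sysfs_add(). The device references are
 * dropped in any case, so the release callback can free the objects.
 */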
static void device_cdev_sysfs_del(struct hl_device *hdev)
{
	if (!hdev->cdev_sysfs_created)
		goto put_devices;

	hl_sysfs_fini(hdev);
	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
	cdev_device_del(&hdev->cdev, hdev->dev);

put_devices:
	put_device(hdev->dev);
	put_device(hdev->dev_ctrl);
}

/*
 * device_early_init - do some early initialization for the habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Install the relevant function pointers and call the early_init function,
 * if such a function exists
 */
static int device_early_init(struct hl_device *hdev)
{
	int i, rc;
	char workq_name[32];

	switch (hdev->asic_type) {
	case ASIC_GOYA:
		goya_set_asic_funcs(hdev);
		strlcpy(hdev->asic_name, "GOYA", sizeof(hdev->asic_name));
		break;
	case ASIC_GAUDI:
		gaudi_set_asic_funcs(hdev);
		strlcpy(hdev->asic_name, "GAUDI", sizeof(hdev->asic_name));
		break;
	default:
		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
			hdev->asic_type);
		return -EINVAL;
	}

	rc = hdev->asic_funcs->early_init(hdev);
	if (rc)
		return rc;

	rc = hl_asid_init(hdev);
	if (rc)
		goto early_fini;

	if (hdev->asic_prop.completion_queues_count) {
		hdev->cq_wq = kcalloc(hdev->asic_prop.completion_queues_count,
				sizeof(*hdev->cq_wq),
				GFP_ATOMIC);
		if (!hdev->cq_wq) {
			rc = -ENOMEM;
			goto asid_fini;
		}
	}

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
		snprintf(workq_name, sizeof(workq_name), "hl-free-jobs-%u",
				(u32) i);
		hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
		if (!hdev->cq_wq[i]) {
			dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
			rc = -ENOMEM;
			goto free_cq_wq;
		}
	}

	hdev->eq_wq = alloc_workqueue("hl-events", WQ_UNBOUND, 0);
	if (!hdev->eq_wq) {
		dev_err(hdev->dev, "Failed to allocate EQ workqueue\n");
		rc = -ENOMEM;
		goto free_cq_wq;
	}

	hdev->hl_chip_info = kzalloc(sizeof(struct hwmon_chip_info),
					GFP_KERNEL);
	if (!hdev->hl_chip_info) {
		rc = -ENOMEM;
		goto free_eq_wq;
	}

	hdev->idle_busy_ts_arr = kmalloc_array(HL_IDLE_BUSY_TS_ARR_SIZE,
					sizeof(struct hl_device_idle_busy_ts),
					GFP_KERNEL | __GFP_ZERO);
	if (!hdev->idle_busy_ts_arr) {
		rc = -ENOMEM;
		goto free_chip_info;
	}

	rc = hl_mmu_if_set_funcs(hdev);
	if (rc)
		goto free_idle_busy_ts_arr;

	hl_cb_mgr_init(&hdev->kernel_cb_mgr);

	mutex_init(&hdev->send_cpu_message_lock);
	mutex_init(&hdev->debug_lock);
	mutex_init(&hdev->mmu_cache_lock);
	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
	spin_lock_init(&hdev->hw_queues_mirror_lock);
	INIT_LIST_HEAD(&hdev->fpriv_list);
	mutex_init(&hdev->fpriv_list_lock);
	atomic_set(&hdev->in_reset, 0);

	return 0;

free_idle_busy_ts_arr:
	kfree(hdev->idle_busy_ts_arr);
free_chip_info:
	kfree(hdev->hl_chip_info);
free_eq_wq:
	destroy_workqueue(hdev->eq_wq);
free_cq_wq:
	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		if (hdev->cq_wq[i])
			destroy_workqueue(hdev->cq_wq[i]);
	kfree(hdev->cq_wq);
asid_fini:
	hl_asid_fini(hdev);
early_fini:
	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);

	return rc;
}

/*
 * device_early_fini - finalize all that was done in device_early_init
 *
 * @hdev: pointer to habanalabs device structure
 */
static void device_early_fini(struct hl_device *hdev)
{
	int i;

	mutex_destroy(&hdev->mmu_cache_lock);
	mutex_destroy(&hdev->debug_lock);
	mutex_destroy(&hdev->send_cpu_message_lock);

	mutex_destroy(&hdev->fpriv_list_lock);

	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);

	kfree(hdev->idle_busy_ts_arr);
	kfree(hdev->hl_chip_info);

	destroy_workqueue(hdev->eq_wq);

	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
		destroy_workqueue(hdev->cq_wq[i]);
	kfree(hdev->cq_wq);

	hl_asid_fini(hdev);

	if (hdev->asic_funcs->early_fini)
		hdev->asic_funcs->early_fini(hdev);
}

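/*
 * set_freq_to_low_job - periodically drop the device frequency to low
 *
 * @work: pointer to the delayed work structure
 *
 * Delayed work that sets the device frequency to low if no compute
 * context is currently open, and then reschedules itself.
 */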
static void set_freq_to_low_job(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_freq.work);

	mutex_lock(&hdev->fpriv_list_lock);

	if (!hdev->compute_ctx)
		hl_device_set_frequency(hdev, PLL_LOW);

	mutex_unlock(&hdev->fpriv_list_lock);

	schedule_delayed_work(&hdev->work_freq,
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
}

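/*
 * hl_device_heartbeat - check that the device F/W is alive
 *
 * @work: pointer to the delayed work structure
 *
 * Delayed work that sends a heartbeat message to the device F/W. If the
 * F/W fails to answer, a hard reset is initiated; otherwise the work
 * reschedules itself.
 */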
static void hl_device_heartbeat(struct work_struct *work)
{
	struct hl_device *hdev = container_of(work, struct hl_device,
						work_heartbeat.work);

	if (hl_device_disabled_or_in_reset(hdev))
		goto reschedule;

	if (!hdev->asic_funcs->send_heartbeat(hdev))
		goto reschedule;

	dev_err(hdev->dev, "Device heartbeat failed!\n");
	hl_device_reset(hdev, true, false);

	return;

reschedule:
	schedule_delayed_work(&hdev->work_heartbeat,
			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
}

/*
 * device_late_init - do late initialization for the habanalabs device
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Perform initialization steps that either need the device H/W queues to be
 * active or must happen after the rest of the initialization is finished
 */
static int device_late_init(struct hl_device *hdev)
{
	int rc;

	if (hdev->asic_funcs->late_init) {
		rc = hdev->asic_funcs->late_init(hdev);
		if (rc) {
			dev_err(hdev->dev,
				"failed late initialization for the H/W\n");
			return rc;
		}
	}

	hdev->high_pll = hdev->asic_prop.high_pll;

	/* force setting to low frequency */
	hdev->curr_pll_profile = PLL_LOW;

	if (hdev->pm_mng_profile == PM_AUTO)
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
	else
		hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);

	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
	schedule_delayed_work(&hdev->work_freq,
			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));

	if (hdev->heartbeat) {
		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
		schedule_delayed_work(&hdev->work_heartbeat,
				usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
	}

	hdev->late_init_done = true;

	return 0;
}

/*
 * device_late_fini - finalize all that was done in device_late_init
 *
 * @hdev: pointer to habanalabs device structure
 */
static void device_late_fini(struct hl_device *hdev)
{
	if (!hdev->late_init_done)
		return;

	cancel_delayed_work_sync(&hdev->work_freq);
	if (hdev->heartbeat)
		cancel_delayed_work_sync(&hdev->work_heartbeat);

	if (hdev->asic_funcs->late_fini)
		hdev->asic_funcs->late_fini(hdev);

	hdev->late_init_done = false;
}

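/*
 * hl_device_utilization - compute the device utilization over a period
 *
 * @hdev: pointer to habanalabs device structure
 * @period_ms: the sampling period, in milliseconds
 *
 * Walk the idle/busy timestamps array backwards from the latest sample and
 * accumulate the busy time that overlaps the requested period. Returns the
 * utilization as a percentage.
 */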
u32 hl_device_utilization(struct hl_device *hdev, u32 period_ms)
{
	struct hl_device_idle_busy_ts *ts;
	ktime_t zero_ktime, curr = ktime_get();
	u32 overlap_cnt = 0, last_index = hdev->idle_busy_ts_idx;
	s64 period_us, last_start_us, last_end_us, last_busy_time_us,
		total_busy_time_us = 0, total_busy_time_ms;

	zero_ktime = ktime_set(0, 0);
	period_us = period_ms * USEC_PER_MSEC;
	ts = &hdev->idle_busy_ts_arr[last_index];

	/* Check the case where the device is currently idle */
	if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime) &&
			!ktime_compare(ts->idle_to_busy_ts, zero_ktime)) {

		last_index--;
		/* Handle wraparound in case idle_busy_ts_idx was 0 */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];
	}

	while (overlap_cnt < HL_IDLE_BUSY_TS_ARR_SIZE) {
		/* Check if we are in the last sample case, i.e. if the sample
		 * began before the sampling period. This could be a real
		 * sample or 0, so we need to handle both cases
		 */
		last_start_us = ktime_to_us(
				ktime_sub(curr, ts->idle_to_busy_ts));

		if (last_start_us > period_us) {

			/* First check two cases:
			 * 1. If the device is currently busy
			 * 2. If the device was idle during the whole sampling
			 *    period
			 */

			if (!ktime_compare(ts->busy_to_idle_ts, zero_ktime)) {
				/* Check if the device is currently busy */
				if (ktime_compare(ts->idle_to_busy_ts,
						zero_ktime))
					return 100;

				/* We either didn't have any activity or we
				 * reached an entry which is 0. Either way,
				 * exit and return what was accumulated so far
				 */
				break;
			}

			/* If the sample has finished, check it is relevant */
			last_end_us = ktime_to_us(
					ktime_sub(curr, ts->busy_to_idle_ts));

			if (last_end_us > period_us)
				break;

			/* It is relevant, so add it, but with adjustment */
			last_busy_time_us = ktime_to_us(
					ktime_sub(ts->busy_to_idle_ts,
					ts->idle_to_busy_ts));
			total_busy_time_us += last_busy_time_us -
					(last_start_us - period_us);
			break;
		}

		/* Check if the sample is finished or still open */
		if (ktime_compare(ts->busy_to_idle_ts, zero_ktime))
			last_busy_time_us = ktime_to_us(
					ktime_sub(ts->busy_to_idle_ts,
					ts->idle_to_busy_ts));
		else
			last_busy_time_us = ktime_to_us(
					ktime_sub(curr, ts->idle_to_busy_ts));

		total_busy_time_us += last_busy_time_us;

		last_index--;
		/* Handle wraparound in case idle_busy_ts_idx was 0 */
		if (last_index > HL_IDLE_BUSY_TS_ARR_SIZE)
			last_index = HL_IDLE_BUSY_TS_ARR_SIZE - 1;

		ts = &hdev->idle_busy_ts_arr[last_index];

		overlap_cnt++;
	}

	total_busy_time_ms = DIV_ROUND_UP_ULL(total_busy_time_us,
						USEC_PER_MSEC);

	return DIV_ROUND_UP_ULL(total_busy_time_ms * 100, period_ms);
}

/*
 * hl_device_set_frequency - set the frequency of the device
 *
 * @hdev: pointer to habanalabs device structure
 * @freq: the new frequency value
 *
 * Change the frequency if needed. This function has no protection against
 * concurrency, therefore it is assumed that the calling function has protected
 * itself against the case of calling this function from multiple threads with
 * different values
 *
 * Returns 0 if no change was done, otherwise returns 1
 */
int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
{
	if ((hdev->pm_mng_profile == PM_MANUAL) ||
			(hdev->curr_pll_profile == freq))
		return 0;

	dev_dbg(hdev->dev, "Changing device frequency to %s\n",
		freq == PLL_HIGH ? "high" : "low");

	hdev->asic_funcs->set_pll_profile(hdev, freq);

	hdev->curr_pll_profile = freq;

	return 1;
}

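/*
 * hl_device_set_debug_mode - enter/exit device debug mode
 *
 * @hdev: pointer to habanalabs device structure
 * @enable: true to enter debug mode, false to exit it
 *
 * Entering debug mode disables clock gating. Exiting it halts the
 * coresight infrastructure and re-enables clock gating, unless a hard
 * reset is pending. Returns 0 on success or -EFAULT if the device is
 * already in the requested mode.
 */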
int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
{
	int rc = 0;

	mutex_lock(&hdev->debug_lock);

	if (!enable) {
		if (!hdev->in_debug) {
			dev_err(hdev->dev,
				"Failed to disable debug mode because device was not in debug mode\n");
			rc = -EFAULT;
			goto out;
		}

		if (!hdev->hard_reset_pending)
			hdev->asic_funcs->halt_coresight(hdev);

		hdev->in_debug = 0;

		if (!hdev->hard_reset_pending)
			hdev->asic_funcs->set_clock_gating(hdev);

		goto out;
	}

	if (hdev->in_debug) {
		dev_err(hdev->dev,
			"Failed to enable debug mode because device is already in debug mode\n");
		rc = -EFAULT;
		goto out;
	}

	hdev->asic_funcs->disable_clock_gating(hdev);
	hdev->in_debug = 1;

out:
	mutex_unlock(&hdev->debug_lock);

	return rc;
}

/*
 * hl_device_suspend - initiate device suspend
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Puts the H/W in the suspend state (all ASICs).
 * Returns 0 for success or an error on failure.
 * Called at driver suspend.
 */
int hl_device_suspend(struct hl_device *hdev)
{
	int rc;

	pci_save_state(hdev->pdev);

	/* Block future CS/VM/JOB completion operations */
	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
	if (rc) {
		dev_err(hdev->dev, "Can't suspend while in reset\n");
		return -EIO;
	}

	/* This blocks all other operations that are not blocked by in_reset */
	hdev->disabled = true;

	/*
	 * Flush anyone that is inside the critical section of enqueue
	 * jobs to the H/W
	 */
	hdev->asic_funcs->hw_queues_lock(hdev);
	hdev->asic_funcs->hw_queues_unlock(hdev);

	/* Flush processes that are sending message to CPU */
	mutex_lock(&hdev->send_cpu_message_lock);
	mutex_unlock(&hdev->send_cpu_message_lock);

	rc = hdev->asic_funcs->suspend(hdev);
	if (rc)
		dev_err(hdev->dev,
			"Failed to disable PCI access of device CPU\n");

	/* Shut down the device */
	pci_disable_device(hdev->pdev);
	pci_set_power_state(hdev->pdev, PCI_D3hot);

	return 0;
}

/*
 * hl_device_resume - initiate device resume
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Bring the H/W back to operating state (all ASICs).
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int hl_device_resume(struct hl_device *hdev)
{
	int rc;

	pci_set_power_state(hdev->pdev, PCI_D0);
	pci_restore_state(hdev->pdev);
	rc = pci_enable_device_mem(hdev->pdev);
	if (rc) {
		dev_err(hdev->dev,
			"Failed to enable PCI device in resume\n");
		return rc;
	}

	pci_set_master(hdev->pdev);

	rc = hdev->asic_funcs->resume(hdev);
	if (rc) {
		dev_err(hdev->dev, "Failed to resume device after suspend\n");
		goto disable_device;
	}

	hdev->disabled = false;
	atomic_set(&hdev->in_reset, 0);

	rc = hl_device_reset(hdev, true, false);
	if (rc) {
		dev_err(hdev->dev, "Failed to reset device during resume\n");
		goto disable_device;
	}

	return 0;

disable_device:
	pci_clear_master(hdev->pdev);
	pci_disable_device(hdev->pdev);

	return rc;
}

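/*
 * device_kill_open_processes - kill all processes that hold the device open
 *
 * @hdev: pointer to habanalabs device structure
 *
 * Send SIGKILL to every user process that still holds an open file
 * descriptor of the device, then wait for the driver's per-process cleanup
 * to finish. Returns 0 if the open-files list is empty at the end,
 * -EBUSY otherwise.
 */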
static int device_kill_open_processes(struct hl_device *hdev)
{
	u16 pending_total, pending_cnt;
	struct hl_fpriv *hpriv;
	struct task_struct *task = NULL;

	if (hdev->pldm)
		pending_total = HL_PLDM_PENDING_RESET_PER_SEC;
	else
		pending_total = HL_PENDING_RESET_PER_SEC;

	/* Give time for users to close the FD, and for processes that are
	 * inside hl_device_open to finish
	 */
	if (!list_empty(&hdev->fpriv_list))
		ssleep(1);

	mutex_lock(&hdev->fpriv_list_lock);

	/* This section must be protected because we are dereferencing
	 * pointers that are freed if the process exits
	 */
	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
		if (task) {
			dev_info(hdev->dev, "Killing user process pid=%d\n",
				task_pid_nr(task));
			send_sig(SIGKILL, task, 1);
			usleep_range(1000, 10000);

			put_task_struct(task);
		}
	}

	mutex_unlock(&hdev->fpriv_list_lock);

	/* We killed the open users, but because the driver cleans up after the
	 * user contexts are closed (e.g. mmu mappings), we need to wait again
	 * to make sure the cleaning phase is finished before continuing with
	 * the reset
	 */

	pending_cnt = pending_total;

	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
		dev_info(hdev->dev,
			"Waiting for all unmap operations to finish before hard reset\n");

		pending_cnt--;

		ssleep(1);
	}

	return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
}

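/*
 * device_hard_reset_pending - hard reset work function
 *
 * @work: pointer to the work structure
 *
 * Executed from the dedicated hard-reset context. Performs the hard reset
 * and frees the work structure that scheduled it.
 */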
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) static void device_hard_reset_pending(struct work_struct *work)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) struct hl_device_reset_work *device_reset_work =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) container_of(work, struct hl_device_reset_work, reset_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) struct hl_device *hdev = device_reset_work->hdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) hl_device_reset(hdev, true, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) kfree(device_reset_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) * hl_device_reset - reset the device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) * @hdev: pointer to habanalabs device structure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) * @hard_reset: true to do a hard reset of all engines, false to reset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) *              only the compute/DMA engines
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) * @from_hard_reset_thread: true if the caller is the dedicated hard-reset thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) * Block future CS and wait for pending CS to be enqueued
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) * Call ASIC H/W fini
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) * Flush all completions
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) * Re-initialize all internal data structures
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) * Call ASIC H/W init, late_init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) * Test queues
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) * Enable device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) *
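^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * Illustrative sketch of a caller (hypothetical - not an actual call
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) * site in this file):
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) *	if (heartbeat_failed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) *		hl_device_reset(hdev, true, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) * where heartbeat_failed stands for any fatal-error indication
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) *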
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846) * Returns 0 for success or an error on failure.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) int hl_device_reset(struct hl_device *hdev, bool hard_reset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) bool from_hard_reset_thread)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) int i, rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853) if (!hdev->init_done) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) "Can't reset before initialization is done\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) if ((!hard_reset) && (!hdev->supports_soft_reset)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) hard_reset = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) * Prevent concurrency in this function - only one reset should be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) * done at any given time. We need to perform this only if we didn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867) * arrive from the dedicated hard-reset thread
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) if (!from_hard_reset_thread) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) /* Block future CS/VM/JOB completion operations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) if (hard_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) /* Disable PCI access from device F/W so it won't send
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) * us additional interrupts. We disable MSI/MSI-X in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) * the halt_engines function and we can't have the F/W
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) * sending us interrupts after that. We need to disable
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880) * the access here because if the device is marked
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) * disabled, the message won't be sent. Also, in case
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) * of a heartbeat failure, the device CPU is marked as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) * disabled, so this message won't be sent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) if (hl_fw_send_pci_access_msg(hdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886) CPUCP_PACKET_DISABLE_PCI_ACCESS))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) dev_warn(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) "Failed to disable PCI access by F/W\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) /* This also blocks future CS/VM/JOB completion operations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) hdev->disabled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) /* Flush anyone that is inside the critical section of enqueue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895) * jobs to the H/W
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) */
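^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) /* (Taking and then immediately releasing the lock acts as a barrier:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) * once we own it, any thread that entered the enqueue path before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) * hdev->disabled was set is guaranteed to have left it)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) */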
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) hdev->asic_funcs->hw_queues_lock(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898) hdev->asic_funcs->hw_queues_unlock(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) /* Flush anyone that is inside device open */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) mutex_lock(&hdev->fpriv_list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) mutex_unlock(&hdev->fpriv_list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) dev_err(hdev->dev, "Going to RESET device!\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907) again:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) if ((hard_reset) && (!from_hard_reset_thread)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) struct hl_device_reset_work *device_reset_work;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) hdev->hard_reset_pending = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) device_reset_work = kzalloc(sizeof(*device_reset_work),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) GFP_ATOMIC);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) if (!device_reset_work) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916) rc = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) * Because the reset function can't run from interrupt context or
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) * from the heartbeat work, we need to call the reset function
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) * from a dedicated work item
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) INIT_WORK(&device_reset_work->reset_work,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) device_hard_reset_pending);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) device_reset_work->hdev = hdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) schedule_work(&device_reset_work->reset_work);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) if (hard_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) device_late_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) * Now that the heartbeat thread is closed, flush processes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) * that are sending messages to the device CPU
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) mutex_lock(&hdev->send_cpu_message_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) mutex_unlock(&hdev->send_cpu_message_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) * Halt the engines and disable interrupts so we won't get any more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) * completions from H/W and we won't have any accesses from the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) * H/W to the host machine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) hdev->asic_funcs->halt_engines(hdev, hard_reset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) /* Go over all the queues, release all CS and their jobs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) hl_cs_rollback_all(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) if (hard_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955) /* Kill processes here, after CS rollback. This is because the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) * process can't really exit until all its CSs are done, and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) * completing them is exactly what CS rollback does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) rc = device_kill_open_processes(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) dev_crit(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) "Failed to kill all open processes, stopping hard reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) /* Flush the Event queue workers to make sure no other thread is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) * reading or writing to registers during the reset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) flush_workqueue(hdev->eq_wq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) /* Reset the H/W. It will be in idle state after this returns */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) hdev->asic_funcs->hw_fini(hdev, hard_reset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) if (hard_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) /* Release kernel context */
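^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) /* (hl_ctx_put() returns 1 when the last reference was dropped and
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) * the context was actually released)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) */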
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) if (hl_ctx_put(hdev->kernel_ctx) == 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978) hdev->kernel_ctx = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) hl_vm_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) hl_mmu_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) hl_eq_reset(hdev, &hdev->event_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) /* Re-initialize PI,CI to 0 in all queues (hw queue, cq) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) hl_hw_queue_reset(hdev, hard_reset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) hl_cq_reset(hdev, &hdev->completion_queue[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988)
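^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) /* Reset the idle/busy timestamps book-keeping */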
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) hdev->idle_busy_ts_idx = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) hdev->idle_busy_ts_arr[0].busy_to_idle_ts = ktime_set(0, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) hdev->idle_busy_ts_arr[0].idle_to_busy_ts = ktime_set(0, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) if (hdev->cs_active_cnt)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994) dev_crit(hdev->dev, "CS active cnt %d is not 0 during reset\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) hdev->cs_active_cnt);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) mutex_lock(&hdev->fpriv_list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999) /* Make sure the context switch phase will run again */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) if (hdev->compute_ctx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) hdev->compute_ctx->thread_ctx_switch_wait_token = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) mutex_unlock(&hdev->fpriv_list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) /* Finished tear-down, starting to re-initialize */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) if (hard_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) hdev->device_cpu_disabled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) hdev->hard_reset_pending = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) if (hdev->kernel_ctx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014) dev_crit(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) "kernel ctx was alive during hard reset, something is terribly wrong\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) rc = -EBUSY;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) rc = hl_mmu_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) "Failed to initialize MMU S/W after hard reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) /* Allocate the kernel context */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) if (!hdev->kernel_ctx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031) rc = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) hl_mmu_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) hdev->compute_ctx = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) "failed to init kernel ctx in hard reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) kfree(hdev->kernel_ctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043) hdev->kernel_ctx = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) hl_mmu_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) /* Enable the device now, because part of the initialization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) * requires communication with the device firmware to get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) * information that is needed for the initialization itself
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) hdev->disabled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055) rc = hdev->asic_funcs->hw_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) "failed to initialize the H/W after reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) /* Check that the communication with the device is working */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) rc = hdev->asic_funcs->test_queues(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066) "Failed to detect if device is alive after reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) if (hard_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) rc = device_late_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074) "Failed late init after hard reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) rc = hl_vm_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) "Failed to init memory module after hard reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) hl_set_max_power(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) rc = hdev->asic_funcs->soft_reset_late_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) "Failed late init after soft reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) goto out_err;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094)
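^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) /* The reset flow is done - allow new resets to be triggered */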
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) atomic_set(&hdev->in_reset, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097) if (hard_reset)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) hdev->hard_reset_cnt++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) hdev->soft_reset_cnt++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) dev_warn(hdev->dev, "Successfully finished resetting the device\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) out_err:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) hdev->disabled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109) if (hard_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) "Failed to reset! Device is NOT usable\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112) hdev->hard_reset_cnt++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) "Failed to do soft-reset, trying hard reset\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) hdev->soft_reset_cnt++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) hard_reset = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) goto again;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) atomic_set(&hdev->in_reset, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) * hl_device_init - main initialization function for habanalabs device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129) * @hdev: pointer to habanalabs device structure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) * Allocate an id for the device, do early initialization and then call the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132) * ASIC specific initialization functions. Finally, create the cdev and the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) * Linux device to expose it to the user
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) int hl_device_init(struct hl_device *hdev, struct class *hclass)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137) int i, rc, cq_cnt, cq_ready_cnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) char *name;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) bool add_cdev_sysfs_on_err = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140)
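^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) /* Char device ids are allocated in pairs (compute + control), so the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) * user-visible device index is hdev->id / 2
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) */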
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) name = kasprintf(GFP_KERNEL, "hl%d", hdev->id / 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) if (!name) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) rc = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147) /* Initialize cdev and device structures */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) rc = device_init_cdev(hdev, hclass, hdev->id, &hl_ops, name,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149) &hdev->cdev, &hdev->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) kfree(name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156) name = kasprintf(GFP_KERNEL, "hl_controlD%d", hdev->id / 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) if (!name) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) rc = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159) goto free_dev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) /* Initialize cdev and device structures for control device */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163) rc = device_init_cdev(hdev, hclass, hdev->id_control, &hl_ctrl_ops,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) name, &hdev->cdev_ctrl, &hdev->dev_ctrl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) kfree(name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) goto free_dev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) /* Initialize ASIC function pointers and perform early init */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) rc = device_early_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174) goto free_dev_ctrl;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) * Start calling ASIC initialization. First S/W then H/W and finally
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * late init
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) rc = hdev->asic_funcs->sw_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) goto early_fini;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) * Initialize the H/W queues. Must be done before hw_init, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) * hw_init writes the addresses of the kernel queues to the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) * registers of the device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) rc = hl_hw_queues_create(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) dev_err(hdev->dev, "failed to initialize kernel queues\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) goto sw_fini;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195) cq_cnt = hdev->asic_prop.completion_queues_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) * Initialize the completion queues. Must be done before hw_init,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) * because hw_init passes the addresses of the completion queues
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) * as arguments to request_irq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) if (cq_cnt) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203) hdev->completion_queue = kcalloc(cq_cnt,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) sizeof(*hdev->completion_queue),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) if (!hdev->completion_queue) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) "failed to allocate completion queues\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) rc = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) goto hw_queues_destroy;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) for (i = 0, cq_ready_cnt = 0 ; i < cq_cnt ; i++, cq_ready_cnt++) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) rc = hl_cq_init(hdev, &hdev->completion_queue[i],
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217) hdev->asic_funcs->get_queue_id_for_cq(hdev, i));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) "failed to initialize completion queue\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) goto cq_fini;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) hdev->completion_queue[i].cq_idx = i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1226) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1227) * Initialize the event queue. Must be done before hw_init,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1228) * because hw_init passes the address of the event queue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1229) * as an argument to request_irq
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1230) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1231) rc = hl_eq_init(hdev, &hdev->event_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1232) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1233) dev_err(hdev->dev, "failed to initialize event queue\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1234) goto cq_fini;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1235) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1236)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1237) /* MMU S/W must be initialized before kernel context is created */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1238) rc = hl_mmu_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1239) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1240) dev_err(hdev->dev, "Failed to initialize MMU S/W structures\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1241) goto eq_fini;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1242) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1243)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1244) /* Allocate the kernel context */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1245) hdev->kernel_ctx = kzalloc(sizeof(*hdev->kernel_ctx), GFP_KERNEL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1246) if (!hdev->kernel_ctx) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1247) rc = -ENOMEM;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1248) goto mmu_fini;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1250)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1251) hdev->compute_ctx = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1252)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1253) rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1254) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1255) dev_err(hdev->dev, "failed to initialize kernel context\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1256) kfree(hdev->kernel_ctx);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1257) goto mmu_fini;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1258) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1259)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1260) rc = hl_cb_pool_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1261) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1262) dev_err(hdev->dev, "failed to initialize CB pool\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1263) goto release_ctx;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1264) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1265)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1266) hl_debugfs_add_device(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1267)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1268) if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1269) dev_info(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1270) "H/W state is dirty, must reset before initializing\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1271) hdev->asic_funcs->halt_engines(hdev, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1272) hdev->asic_funcs->hw_fini(hdev, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1273) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1274)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1275) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1276) * From this point, in case of an error, add char devices and create
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1277) * sysfs nodes as part of the error flow, to allow debugging.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1278) */
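^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) /* (Hence the "rc = 0" before each goto out_disabled below - probe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280) * still returns success, so the disabled device stays registered
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) * and can be debugged)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) */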
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1279) add_cdev_sysfs_on_err = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1280)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1281) /* Enable the device now, because part of the initialization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1282) * requires communication with the device firmware to get
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1283) * information that is needed for the initialization itself
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1284) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1285) hdev->disabled = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1286)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1287) rc = hdev->asic_funcs->hw_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1288) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1289) dev_err(hdev->dev, "failed to initialize the H/W\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1290) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1291) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1292) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1293)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1294) /* Check that the communication with the device is working */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1295) rc = hdev->asic_funcs->test_queues(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1296) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1297) dev_err(hdev->dev, "Failed to detect if device is alive\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1298) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1299) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1300) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1301)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1302) rc = device_late_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1303) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1304) dev_err(hdev->dev, "Failed late initialization\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1305) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1306) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1307) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1308)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1309) dev_info(hdev->dev, "Found %s device with %lluGB DRAM\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1310) hdev->asic_name,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1311) hdev->asic_prop.dram_size / 1024 / 1024 / 1024);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1312)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1313) rc = hl_vm_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1314) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1315) dev_err(hdev->dev, "Failed to initialize memory module\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1316) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1317) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1318) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1319)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1320) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1321) * Expose devices and sysfs nodes to user.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1322) * From here there is no need to add char devices and create sysfs nodes
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1323) * in case of an error.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1324) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1325) add_cdev_sysfs_on_err = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1326) rc = device_cdev_sysfs_add(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1327) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1328) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1329) "Failed to add char devices and sysfs nodes\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1330) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1331) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1332) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1333)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1334) /* Need to call this again because the max power might change,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1335) * depending on the card type, for certain ASICs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1336) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1337) hl_set_max_power(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1338)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1339) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1340) * hl_hwmon_init() must be called after device_late_init(), because only
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1341) * there we get the information from the device about which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1342) * hwmon-related sensors the device supports.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1343) * Furthermore, it must be done after adding the device to the system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1344) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1345) rc = hl_hwmon_init(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1346) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1347) dev_err(hdev->dev, "Failed to initialize hwmon\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1348) rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1349) goto out_disabled;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1350) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1351)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1352) dev_notice(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1353) "Successfully added device to habanalabs driver\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1354)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1355) hdev->init_done = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1356)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1357) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1358)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1359) release_ctx:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1360) if (hl_ctx_put(hdev->kernel_ctx) != 1)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1361) dev_err(hdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1362) "kernel ctx is still alive on initialization failure\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1363) mmu_fini:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1364) hl_mmu_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1365) eq_fini:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1366) hl_eq_fini(hdev, &hdev->event_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1367) cq_fini:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1368) for (i = 0 ; i < cq_ready_cnt ; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1369) hl_cq_fini(hdev, &hdev->completion_queue[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1370) kfree(hdev->completion_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1371) hw_queues_destroy:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1372) hl_hw_queues_destroy(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1373) sw_fini:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1374) hdev->asic_funcs->sw_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1375) early_fini:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1376) device_early_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1377) free_dev_ctrl:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1378) put_device(hdev->dev_ctrl);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1379) free_dev:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1380) put_device(hdev->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1381) out_disabled:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1382) hdev->disabled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1383) if (add_cdev_sysfs_on_err)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1384) device_cdev_sysfs_add(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1385) if (hdev->pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1386) dev_err(&hdev->pdev->dev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1387) "Failed to initialize hl%d. Device is NOT usable!\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1388) hdev->id / 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1389) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1390) pr_err("Failed to initialize hl%d. Device is NOT usable!\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1391) hdev->id / 2);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1392)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1393) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1394) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1395)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1396) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1397) * hl_device_fini - main tear-down function for habanalabs device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1398) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1399) * @hdev: pointer to habanalabs device structure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1400) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1401) * Destroy the device, call ASIC fini functions and release the id
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1402) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1403) void hl_device_fini(struct hl_device *hdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1404) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1405) int i, rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1406) ktime_t timeout;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1407)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1408) dev_info(hdev->dev, "Removing device\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1409)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1410) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1411) * This function is competing with the reset function, so try to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1412) * take the reset atomic, and if we are already in the middle of a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1413) * reset, wait until the reset function is finished. The reset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1414) * function is designed to always finish. However, in Gaudi, because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1415) * of all the network ports, a hard reset could take 10-30 seconds
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1416) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1417)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1418) timeout = ktime_add_us(ktime_get(),
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1419) HL_HARD_RESET_MAX_TIMEOUT * 1000 * 1000);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1420) rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1421) while (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1422) usleep_range(50, 200);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1423) rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1424) if (ktime_compare(ktime_get(), timeout) > 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1425) WARN(1, "Failed to remove device because reset function did not finish\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1426) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1427) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1428) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1429)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1430) /* Disable PCI access from device F/W so it won't send us additional
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1431) * interrupts. We disable MSI/MSI-X in the halt_engines function and we
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1432) * can't have the F/W sending us interrupts after that. We need to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1433) * disable the access here because if the device is marked disabled,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1434) * the message won't be sent. Also, in case of a heartbeat failure, the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1435) * device CPU is marked as disabled, so this message won't be sent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1436) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1437) hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1438)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1439) /* Mark device as disabled */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1440) hdev->disabled = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1441)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1442) /* Flush anyone that is inside the critical section of enqueue
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1443) * jobs to the H/W
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1444) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1445) hdev->asic_funcs->hw_queues_lock(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1446) hdev->asic_funcs->hw_queues_unlock(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1448) /* Flush anyone that is inside device open */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1449) mutex_lock(&hdev->fpriv_list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1450) mutex_unlock(&hdev->fpriv_list_lock);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1451)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1452) hdev->hard_reset_pending = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1453)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1454) hl_hwmon_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1456) device_late_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1457)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1458) hl_debugfs_remove_device(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1460) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1461) * Halt the engines and disable interrupts so we won't get any more
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1462) * completions from H/W and we won't have any accesses from the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1463) * H/W to the host machine
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1464) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1465) hdev->asic_funcs->halt_engines(hdev, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1466)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1467) /* Go over all the queues, release all CS and their jobs */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1468) hl_cs_rollback_all(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1470) /* Kill processes here, after CS rollback. This is because the process
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1471) * can't really exit until all its CSs are done, and completing them
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1472) * is exactly what CS rollback does
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1473) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1474) rc = device_kill_open_processes(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1475) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1476) dev_crit(hdev->dev, "Failed to kill all open processes\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1477)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1478) hl_cb_pool_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1479)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1480) /* Reset the H/W. It will be in idle state after this returns */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1481) hdev->asic_funcs->hw_fini(hdev, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1482)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1483) /* Release kernel context */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1484) if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1485) dev_err(hdev->dev, "kernel ctx is still alive\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1486)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1487) hl_vm_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1488)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1489) hl_mmu_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1491) hl_eq_fini(hdev, &hdev->event_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1492)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1493) for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1494) hl_cq_fini(hdev, &hdev->completion_queue[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1495) kfree(hdev->completion_queue);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1496)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1497) hl_hw_queues_destroy(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1498)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1499) /* Call ASIC S/W finalize function */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1500) hdev->asic_funcs->sw_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1502) device_early_fini(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1503)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1504) /* Hide devices and sysfs nodes from user */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1505) device_cdev_sysfs_del(hdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1506)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1507) pr_info("removed device successfully\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1508) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1509)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1510) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1511) * MMIO register access helper functions.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1512) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1513)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1514) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1515) * hl_rreg - Read an MMIO register
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1516) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1517) * @hdev: pointer to habanalabs device structure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1518) * @reg: MMIO register offset (in bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1519) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1520) * Returns the value of the MMIO register we are asked to read
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1521) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1522) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1523) inline u32 hl_rreg(struct hl_device *hdev, u32 reg)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1524) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1525) return readl(hdev->rmmio + reg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1526) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1527)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1528) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1529) * hl_wreg - Write to an MMIO register
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1530) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1531) * @hdev: pointer to habanalabs device structure
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1532) * @reg: MMIO register offset (in bytes)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1533) * @val: 32-bit value
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1534) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1535) * Writes the 32-bit value into the MMIO register
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1536) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1537) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1538) inline void hl_wreg(struct hl_device *hdev, u32 reg, u32 val)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1539) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1540) writel(val, hdev->rmmio + reg);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1541) }
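^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1542)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1543) /* Illustrative usage (sketch): these helpers normally sit behind the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1544) * driver's RREG32()/WREG32() register-access macros, e.g.:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1545) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1546) *	u32 sts = hl_rreg(hdev, status_reg_offset);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1547) *	hl_wreg(hdev, reset_reg_offset, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1548) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1549) * where status_reg_offset and reset_reg_offset are hypothetical
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1550) * placeholders for real ASIC register offsets
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1551) */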