^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1) // SPDX-License-Identifier: GPL-2.0-or-later
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 2) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 3) * PCI Error Recovery Driver for RPA-compliant PPC64 platform.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 4) * Copyright IBM Corp. 2004 2005
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 5) * Copyright Linas Vepstas <linas@linas.org> 2004, 2005
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 6) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 7) * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 8) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 9) #include <linux/delay.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 10) #include <linux/interrupt.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 11) #include <linux/irq.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 12) #include <linux/module.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 13) #include <linux/pci.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 14) #include <linux/pci_hotplug.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 15) #include <asm/eeh.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 16) #include <asm/eeh_event.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 17) #include <asm/ppc-pci.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 18) #include <asm/pci-bridge.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 19) #include <asm/prom.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 20) #include <asm/rtas.h>
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 21)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 22) struct eeh_rmv_data {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 23) struct list_head removed_vf_list;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 24) int removed_dev_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 25) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 26)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 27) static int eeh_result_priority(enum pci_ers_result result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 28) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 29) switch (result) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 30) case PCI_ERS_RESULT_NONE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 31) return 1;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 32) case PCI_ERS_RESULT_NO_AER_DRIVER:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 33) return 2;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 34) case PCI_ERS_RESULT_RECOVERED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 35) return 3;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 36) case PCI_ERS_RESULT_CAN_RECOVER:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 37) return 4;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 38) case PCI_ERS_RESULT_DISCONNECT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 39) return 5;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 40) case PCI_ERS_RESULT_NEED_RESET:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 41) return 6;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 42) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 43) WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 44) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 45) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 46) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 47)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 48) static const char *pci_ers_result_name(enum pci_ers_result result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 49) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 50) switch (result) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 51) case PCI_ERS_RESULT_NONE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 52) return "none";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 53) case PCI_ERS_RESULT_CAN_RECOVER:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 54) return "can recover";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 55) case PCI_ERS_RESULT_NEED_RESET:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 56) return "need reset";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 57) case PCI_ERS_RESULT_DISCONNECT:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 58) return "disconnect";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 59) case PCI_ERS_RESULT_RECOVERED:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 60) return "recovered";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 61) case PCI_ERS_RESULT_NO_AER_DRIVER:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 62) return "no AER driver";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 63) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 64) WARN_ONCE(1, "Unknown result type: %d\n", (int)result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 65) return "unknown";
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 66) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 67) };
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 68)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 69) static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 70) enum pci_ers_result new)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 71) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 72) if (eeh_result_priority(new) > eeh_result_priority(old))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 73) return new;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 74) return old;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 75) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 76)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 77) static bool eeh_dev_removed(struct eeh_dev *edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 78) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 79) return !edev || (edev->mode & EEH_DEV_REMOVED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 80) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 81)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 82) static bool eeh_edev_actionable(struct eeh_dev *edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 83) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 84) if (!edev->pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 85) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 86) if (edev->pdev->error_state == pci_channel_io_perm_failure)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 87) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 88) if (eeh_dev_removed(edev))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 89) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 90) if (eeh_pe_passed(edev->pe))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 91) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 92)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 93) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 94) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 95)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 96) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 97) * eeh_pcid_get - Get the PCI device driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 98) * @pdev: PCI device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 99) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 100) * The function is used to retrieve the PCI device driver for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 101) * the indicated PCI device. Besides, we will increase the reference
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 102) * of the PCI device driver to prevent that being unloaded on
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 103) * the fly. Otherwise, kernel crash would be seen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 104) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 105) static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 106) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 107) if (!pdev || !pdev->driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 108) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 110) if (!try_module_get(pdev->driver->driver.owner))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 111) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 113) return pdev->driver;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 114) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 115)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 116) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 117) * eeh_pcid_put - Dereference on the PCI device driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 118) * @pdev: PCI device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 119) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 120) * The function is called to do dereference on the PCI device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 121) * driver of the indicated PCI device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 122) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 123) static inline void eeh_pcid_put(struct pci_dev *pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 124) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 125) if (!pdev || !pdev->driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 126) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 127)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 128) module_put(pdev->driver->driver.owner);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 129) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 130)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 131) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 132) * eeh_disable_irq - Disable interrupt for the recovering device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 133) * @dev: PCI device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 134) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 135) * This routine must be called when reporting temporary or permanent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 136) * error to the particular PCI device to disable interrupt of that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 137) * device. If the device has enabled MSI or MSI-X interrupt, we needn't
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 138) * do real work because EEH should freeze DMA transfers for those PCI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 139) * devices encountering EEH errors, which includes MSI or MSI-X.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 140) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 141) static void eeh_disable_irq(struct eeh_dev *edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 142) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 143) /* Don't disable MSI and MSI-X interrupts. They are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 144) * effectively disabled by the DMA Stopped state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 145) * when an EEH error occurs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 146) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 147) if (edev->pdev->msi_enabled || edev->pdev->msix_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 148) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 150) if (!irq_has_action(edev->pdev->irq))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 151) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 152)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 153) edev->mode |= EEH_DEV_IRQ_DISABLED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 154) disable_irq_nosync(edev->pdev->irq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 155) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 157) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 158) * eeh_enable_irq - Enable interrupt for the recovering device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 159) * @dev: PCI device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 160) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 161) * This routine must be called to enable interrupt while failed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 162) * device could be resumed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 163) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 164) static void eeh_enable_irq(struct eeh_dev *edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 165) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 166) if ((edev->mode) & EEH_DEV_IRQ_DISABLED) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 167) edev->mode &= ~EEH_DEV_IRQ_DISABLED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 168) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 169) * FIXME !!!!!
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 170) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 171) * This is just ass backwards. This maze has
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 172) * unbalanced irq_enable/disable calls. So instead of
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 173) * finding the root cause it works around the warning
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 174) * in the irq_enable code by conditionally calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 175) * into it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 176) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 177) * That's just wrong.The warning in the core code is
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 178) * there to tell people to fix their asymmetries in
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 179) * their own code, not by abusing the core information
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 180) * to avoid it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 181) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 182) * I so wish that the assymetry would be the other way
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 183) * round and a few more irq_disable calls render that
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 184) * shit unusable forever.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 185) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 186) * tglx
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 187) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 188) if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq)))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 189) enable_irq(edev->pdev->irq);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 190) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 191) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 192)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 193) static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 194) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 195) struct pci_dev *pdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 196)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 197) if (!edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 198) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 199)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 200) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 201) * We cannot access the config space on some adapters.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 202) * Otherwise, it will cause fenced PHB. We don't save
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 203) * the content in their config space and will restore
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 204) * from the initial config space saved when the EEH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 205) * device is created.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 206) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 207) if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 208) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 209)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 210) pdev = eeh_dev_to_pci_dev(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 211) if (!pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 212) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 213)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 214) pci_save_state(pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 215) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 216)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 217) static void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 218) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 219) struct eeh_pe *pe;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 220) struct eeh_dev *edev, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 221)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 222) eeh_for_each_pe(root, pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 223) eeh_pe_for_each_dev(pe, edev, tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 224) if (eeh_edev_actionable(edev))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 225) edev->pdev->error_state = s;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 226) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 227)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 228) static void eeh_set_irq_state(struct eeh_pe *root, bool enable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 229) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 230) struct eeh_pe *pe;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 231) struct eeh_dev *edev, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 232)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 233) eeh_for_each_pe(root, pe) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 234) eeh_pe_for_each_dev(pe, edev, tmp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 235) if (!eeh_edev_actionable(edev))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 236) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 237)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 238) if (!eeh_pcid_get(edev->pdev))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 239) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 240)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 241) if (enable)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 242) eeh_enable_irq(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 243) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 244) eeh_disable_irq(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 245)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 246) eeh_pcid_put(edev->pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 247) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 248) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 249) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 250)
/*
 * Per-device report callback used by eeh_pe_report(): receives the EEH
 * device, its PCI device and the bound PCI driver, and returns the
 * driver's recovery verdict.
 */
typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *edev,
					     struct pci_dev *pdev,
					     struct pci_driver *driver);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 254) static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 255) enum pci_ers_result *result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 256) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 257) struct pci_dev *pdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 258) struct pci_driver *driver;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 259) enum pci_ers_result new_result;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 260)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 261) pci_lock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 262) pdev = edev->pdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 263) if (pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 264) get_device(&pdev->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 265) pci_unlock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 266) if (!pdev) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 267) eeh_edev_info(edev, "no device");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 268) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 269) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 270) device_lock(&pdev->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 271) if (eeh_edev_actionable(edev)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 272) driver = eeh_pcid_get(pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 273)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 274) if (!driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 275) eeh_edev_info(edev, "no driver");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 276) else if (!driver->err_handler)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 277) eeh_edev_info(edev, "driver not EEH aware");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 278) else if (edev->mode & EEH_DEV_NO_HANDLER)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 279) eeh_edev_info(edev, "driver bound too late");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 280) else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 281) new_result = fn(edev, pdev, driver);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 282) eeh_edev_info(edev, "%s driver reports: '%s'",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 283) driver->name,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 284) pci_ers_result_name(new_result));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 285) if (result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 286) *result = pci_ers_merge_result(*result,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 287) new_result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 288) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 289) if (driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 290) eeh_pcid_put(pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 291) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 292) eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 293) !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 294) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 295) device_unlock(&pdev->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 296) if (edev->pdev != pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 297) eeh_edev_warn(edev, "Device changed during processing!\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 298) put_device(&pdev->dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 299) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 300)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 301) static void eeh_pe_report(const char *name, struct eeh_pe *root,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 302) eeh_report_fn fn, enum pci_ers_result *result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 303) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 304) struct eeh_pe *pe;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 305) struct eeh_dev *edev, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 306)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 307) pr_info("EEH: Beginning: '%s'\n", name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 308) eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 309) eeh_pe_report_edev(edev, fn, result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 310) if (result)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 311) pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 312) name, pci_ers_result_name(*result));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 313) else
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 314) pr_info("EEH: Finished:'%s'", name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 315) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 316)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 317) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 318) * eeh_report_error - Report pci error to each device driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 319) * @edev: eeh device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 320) * @driver: device's PCI driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 321) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 322) * Report an EEH error to each device driver.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 323) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 324) static enum pci_ers_result eeh_report_error(struct eeh_dev *edev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 325) struct pci_dev *pdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 326) struct pci_driver *driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 327) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 328) enum pci_ers_result rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 329)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 330) if (!driver->err_handler->error_detected)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 331) return PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 332)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 333) eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 334) driver->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 335) rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 336)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 337) edev->in_error = true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 338) pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 339) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 340) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 341)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 342) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 343) * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 344) * @edev: eeh device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 345) * @driver: device's PCI driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 346) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 347) * Tells each device driver that IO ports, MMIO and config space I/O
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 348) * are now enabled.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 349) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 350) static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 351) struct pci_dev *pdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 352) struct pci_driver *driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 353) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 354) if (!driver->err_handler->mmio_enabled)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 355) return PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 356) eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 357) return driver->err_handler->mmio_enabled(pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 358) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 359)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 360) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 361) * eeh_report_reset - Tell device that slot has been reset
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 362) * @edev: eeh device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 363) * @driver: device's PCI driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 364) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 365) * This routine must be called while EEH tries to reset particular
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 366) * PCI device so that the associated PCI device driver could take
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 367) * some actions, usually to save data the driver needs so that the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 368) * driver can work again while the device is recovered.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 369) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 370) static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 371) struct pci_dev *pdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 372) struct pci_driver *driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 373) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 374) if (!driver->err_handler->slot_reset || !edev->in_error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 375) return PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 376) eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 377) return driver->err_handler->slot_reset(pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 378) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 379)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 380) static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 381) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 382) struct pci_dev *pdev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 383)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 384) if (!edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 385) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 386)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 387) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 388) * The content in the config space isn't saved because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 389) * the blocked config space on some adapters. We have
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 390) * to restore the initial saved config space when the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 391) * EEH device is created.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 392) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 393) if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 394) if (list_is_last(&edev->entry, &edev->pe->edevs))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 395) eeh_pe_restore_bars(edev->pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 396)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 397) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 398) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 399)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 400) pdev = eeh_dev_to_pci_dev(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 401) if (!pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 402) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 403)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 404) pci_restore_state(pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 405) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 406)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 407) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 408) * eeh_report_resume - Tell device to resume normal operations
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 409) * @edev: eeh device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 410) * @driver: device's PCI driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 411) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 412) * This routine must be called to notify the device driver that it
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 413) * could resume so that the device driver can do some initialization
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 414) * to make the recovered device work again.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 415) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 416) static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 417) struct pci_dev *pdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 418) struct pci_driver *driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 419) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 420) if (!driver->err_handler->resume || !edev->in_error)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 421) return PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 422)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 423) eeh_edev_info(edev, "Invoking %s->resume()", driver->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 424) driver->err_handler->resume(pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 425)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 426) pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 427) #ifdef CONFIG_PCI_IOV
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 428) if (eeh_ops->notify_resume)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 429) eeh_ops->notify_resume(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 430) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 431) return PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 432) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 433)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 434) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 435) * eeh_report_failure - Tell device driver that device is dead.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 436) * @edev: eeh device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 437) * @driver: device's PCI driver
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 438) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 439) * This informs the device driver that the device is permanently
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 440) * dead, and that no further recovery attempts will be made on it.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 441) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 442) static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 443) struct pci_dev *pdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 444) struct pci_driver *driver)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 445) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 446) enum pci_ers_result rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 447)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 448) if (!driver->err_handler->error_detected)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 449) return PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 450)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 451) eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 452) driver->name);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 453) rc = driver->err_handler->error_detected(pdev,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 454) pci_channel_io_perm_failure);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 455)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 456) pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 457) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 458) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 459)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 460) static void *eeh_add_virt_device(struct eeh_dev *edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 461) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 462) struct pci_driver *driver;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 463) struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 464)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 465) if (!(edev->physfn)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 466) eeh_edev_warn(edev, "Not for VF\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 467) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 468) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 469)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 470) driver = eeh_pcid_get(dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 471) if (driver) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 472) if (driver->err_handler) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 473) eeh_pcid_put(dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 474) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 475) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 476) eeh_pcid_put(dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 477) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 478)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 479) #ifdef CONFIG_PCI_IOV
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 480) pci_iov_add_virtfn(edev->physfn, edev->vf_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 481) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 482) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 483) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 484)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 485) static void eeh_rmv_device(struct eeh_dev *edev, void *userdata)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 486) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 487) struct pci_driver *driver;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 488) struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 489) struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 490)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 491) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 492) * Actually, we should remove the PCI bridges as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 493) * However, that's lots of complexity to do that,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 494) * particularly some of devices under the bridge might
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 495) * support EEH. So we just care about PCI devices for
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 496) * simplicity here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 497) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 498) if (!eeh_edev_actionable(edev) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 499) (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 500) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 501)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 502) if (rmv_data) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 503) driver = eeh_pcid_get(dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 504) if (driver) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 505) if (driver->err_handler &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 506) driver->err_handler->error_detected &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 507) driver->err_handler->slot_reset) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 508) eeh_pcid_put(dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 509) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 510) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 511) eeh_pcid_put(dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 512) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 513) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 514)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 515) /* Remove it from PCI subsystem */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 516) pr_info("EEH: Removing %s without EEH sensitive driver\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 517) pci_name(dev));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 518) edev->mode |= EEH_DEV_DISCONNECTED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 519) if (rmv_data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 520) rmv_data->removed_dev_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 521)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 522) if (edev->physfn) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 523) #ifdef CONFIG_PCI_IOV
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 524) pci_iov_remove_virtfn(edev->physfn, edev->vf_index);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 525) edev->pdev = NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 526) #endif
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 527) if (rmv_data)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 528) list_add(&edev->rmv_entry, &rmv_data->removed_vf_list);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 529) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 530) pci_lock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 531) pci_stop_and_remove_bus_device(dev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 532) pci_unlock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 533) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 534) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 535)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 536) static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 537) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 538) struct eeh_dev *edev, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 539)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 540) eeh_pe_for_each_dev(pe, edev, tmp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 541) if (!(edev->mode & EEH_DEV_DISCONNECTED))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 542) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 543)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 544) edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 545) eeh_pe_tree_remove(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 546) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 547)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 548) return NULL;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 549) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 550)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 551) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 552) * Explicitly clear PE's frozen state for PowerNV where
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 553) * we have frozen PE until BAR restore is completed. It's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 554) * harmless to clear it for pSeries. To be consistent with
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 555) * PE reset (for 3 times), we try to clear the frozen state
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 556) * for 3 times as well.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 557) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 558) static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 559) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 560) struct eeh_pe *pe;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 561) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 562)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 563) eeh_for_each_pe(root, pe) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 564) if (include_passed || !eeh_pe_passed(pe)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 565) for (i = 0; i < 3; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 566) if (!eeh_unfreeze_pe(pe))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 567) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 568) if (i >= 3)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 569) return -EIO;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 570) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 571) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 572) eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 573) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 574) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 575)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 576) int eeh_pe_reset_and_recover(struct eeh_pe *pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 577) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 578) int ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 579)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 580) /* Bail if the PE is being recovered */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 581) if (pe->state & EEH_PE_RECOVERING)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 582) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 583)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 584) /* Put the PE into recovery mode */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 585) eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 586)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 587) /* Save states */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 588) eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 589)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 590) /* Issue reset */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 591) ret = eeh_pe_reset_full(pe, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 592) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 593) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 594) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 595) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 596)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 597) /* Unfreeze the PE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 598) ret = eeh_clear_pe_frozen_state(pe, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 599) if (ret) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 600) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 601) return ret;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 602) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 603)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 604) /* Restore device state */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 605) eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 606)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 607) /* Clear recovery mode */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 608) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 609)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 610) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 611) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 612)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 613) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 614) * eeh_reset_device - Perform actual reset of a pci slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 615) * @driver_eeh_aware: Does the device's driver provide EEH support?
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 616) * @pe: EEH PE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 617) * @bus: PCI bus corresponding to the isolcated slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 618) * @rmv_data: Optional, list to record removed devices
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 619) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 620) * This routine must be called to do reset on the indicated PE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 621) * During the reset, udev might be invoked because those affected
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 622) * PCI devices will be removed and then added.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 623) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 624) static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 625) struct eeh_rmv_data *rmv_data,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 626) bool driver_eeh_aware)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 627) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 628) time64_t tstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 629) int cnt, rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 630) struct eeh_dev *edev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 631) struct eeh_pe *tmp_pe;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 632) bool any_passed = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 633)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 634) eeh_for_each_pe(pe, tmp_pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 635) any_passed |= eeh_pe_passed(tmp_pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 636)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 637) /* pcibios will clear the counter; save the value */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 638) cnt = pe->freeze_count;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 639) tstamp = pe->tstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 640)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 641) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 642) * We don't remove the corresponding PE instances because
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 643) * we need the information afterwords. The attached EEH
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 644) * devices are expected to be attached soon when calling
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 645) * into pci_hp_add_devices().
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 646) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 647) eeh_pe_state_mark(pe, EEH_PE_KEEP);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 648) if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 649) eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 650) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 651) pci_lock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 652) pci_hp_remove_devices(bus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 653) pci_unlock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 654) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 655)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 656) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 657) * Reset the pci controller. (Asserts RST#; resets config space).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 658) * Reconfigure bridges and devices. Don't try to bring the system
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 659) * up if the reset failed for some reason.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 660) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 661) * During the reset, it's very dangerous to have uncontrolled PCI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 662) * config accesses. So we prefer to block them. However, controlled
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 663) * PCI config accesses initiated from EEH itself are allowed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 664) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 665) rc = eeh_pe_reset_full(pe, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 666) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 667) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 668)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 669) pci_lock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 670)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 671) /* Restore PE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 672) eeh_ops->configure_bridge(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 673) eeh_pe_restore_bars(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 674)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 675) /* Clear frozen state */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 676) rc = eeh_clear_pe_frozen_state(pe, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 677) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 678) pci_unlock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 679) return rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 680) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 681)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 682) /* Give the system 5 seconds to finish running the user-space
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 683) * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 684) * this is a hack, but if we don't do this, and try to bring
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 685) * the device up before the scripts have taken it down,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 686) * potentially weird things happen.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 687) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 688) if (!driver_eeh_aware || rmv_data->removed_dev_count) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 689) pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 690) (driver_eeh_aware ? "partial" : "complete"));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 691) ssleep(5);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 692)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 693) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 694) * The EEH device is still connected with its parent
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 695) * PE. We should disconnect it so the binding can be
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 696) * rebuilt when adding PCI devices.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 697) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 698) edev = list_first_entry(&pe->edevs, struct eeh_dev, entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 699) eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 700) if (pe->type & EEH_PE_VF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 701) eeh_add_virt_device(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 702) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 703) if (!driver_eeh_aware)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 704) eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 705) pci_hp_add_devices(bus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 706) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 707) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 708) eeh_pe_state_clear(pe, EEH_PE_KEEP, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 709)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 710) pe->tstamp = tstamp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 711) pe->freeze_count = cnt;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 712)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 713) pci_unlock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 714) return 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 715) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 716)
/*
 * Upper bound, in seconds, on how long to wait for a PCI device
 * to come back online.
 */
#define MAX_WAIT_FOR_RECOVERY 300
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 721)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 722)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 723) /* Walks the PE tree after processing an event to remove any stale PEs.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 724) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 725) * NB: This needs to be recursive to ensure the leaf PEs get removed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 726) * before their parents do. Although this is possible to do recursively
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 727) * we don't since this is easier to read and we need to garantee
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 728) * the leaf nodes will be handled first.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 729) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 730) static void eeh_pe_cleanup(struct eeh_pe *pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 731) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 732) struct eeh_pe *child_pe, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 733)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 734) list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 735) eeh_pe_cleanup(child_pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 736)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 737) if (pe->state & EEH_PE_KEEP)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 738) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 739)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 740) if (!(pe->state & EEH_PE_INVALID))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 741) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 742)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 743) if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 744) list_del(&pe->child);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 745) kfree(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 746) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 747) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 748)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 749) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 750) * eeh_check_slot_presence - Check if a device is still present in a slot
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 751) * @pdev: pci_dev to check
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 752) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 753) * This function may return a false positive if we can't determine the slot's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 754) * presence state. This might happen for for PCIe slots if the PE containing
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 755) * the upstream bridge is also frozen, or the bridge is part of the same PE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 756) * as the device.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 757) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 758) * This shouldn't happen often, but you might see it if you hotplug a PCIe
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 759) * switch.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 760) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 761) static bool eeh_slot_presence_check(struct pci_dev *pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 762) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 763) const struct hotplug_slot_ops *ops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 764) struct pci_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 765) u8 state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 766) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 767)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 768) if (!pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 769) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 770)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 771) if (pdev->error_state == pci_channel_io_perm_failure)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 772) return false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 773)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 774) slot = pdev->slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 775) if (!slot || !slot->hotplug)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 776) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 777)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 778) ops = slot->hotplug->ops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 779) if (!ops || !ops->get_adapter_status)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 780) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 781)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 782) /* set the attention indicator while we've got the slot ops */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 783) if (ops->set_attention_status)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 784) ops->set_attention_status(slot->hotplug, 1);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 785)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 786) rc = ops->get_adapter_status(slot->hotplug, &state);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 787) if (rc)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 788) return true;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 789)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 790) return !!state;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 791) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 792)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 793) static void eeh_clear_slot_attention(struct pci_dev *pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 794) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 795) const struct hotplug_slot_ops *ops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 796) struct pci_slot *slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 797)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 798) if (!pdev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 799) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 800)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 801) if (pdev->error_state == pci_channel_io_perm_failure)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 802) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 803)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 804) slot = pdev->slot;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 805) if (!slot || !slot->hotplug)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 806) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 807)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 808) ops = slot->hotplug->ops;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 809) if (!ops || !ops->set_attention_status)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 810) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 811)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 812) ops->set_attention_status(slot->hotplug, 0);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 813) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 814)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 815) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 816) * eeh_handle_normal_event - Handle EEH events on a specific PE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 817) * @pe: EEH PE - which should not be used after we return, as it may
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 818) * have been invalidated.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 819) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 820) * Attempts to recover the given PE. If recovery fails or the PE has failed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 821) * too many times, remove the PE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 822) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 823) * While PHB detects address or data parity errors on particular PCI
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 824) * slot, the associated PE will be frozen. Besides, DMA's occurring
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 825) * to wild addresses (which usually happen due to bugs in device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 826) * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 827) * #PERR or other misc PCI-related errors also can trigger EEH errors.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 828) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 829) * Recovery process consists of unplugging the device driver (which
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 830) * generated hotplug events to userspace), then issuing a PCI #RST to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 831) * the device, then reconfiguring the PCI config space for all bridges
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 832) * & devices under this slot, and then finally restarting the device
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 833) * drivers (which cause a second set of hotplug events to go out to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 834) * userspace).
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 835) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 836) void eeh_handle_normal_event(struct eeh_pe *pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 837) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 838) struct pci_bus *bus;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 839) struct eeh_dev *edev, *tmp;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 840) struct eeh_pe *tmp_pe;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 841) int rc = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 842) enum pci_ers_result result = PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 843) struct eeh_rmv_data rmv_data =
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 844) {LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 845) int devices = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 846)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 847) bus = eeh_pe_bus_get(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 848) if (!bus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 849) pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 850) __func__, pe->phb->global_number, pe->addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 851) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 852) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 853)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 854) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 855) * When devices are hot-removed we might get an EEH due to
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 856) * a driver attempting to touch the MMIO space of a removed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 857) * device. In this case we don't have a device to recover
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 858) * so suppress the event if we can't find any present devices.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 859) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 860) * The hotplug driver should take care of tearing down the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 861) * device itself.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 862) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 863) eeh_for_each_pe(pe, tmp_pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 864) eeh_pe_for_each_dev(tmp_pe, edev, tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 865) if (eeh_slot_presence_check(edev->pdev))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 866) devices++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 867)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 868) if (!devices) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 869) pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 870) pe->phb->global_number, pe->addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 871) goto out; /* nothing to recover */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 872) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 873)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 874) /* Log the event */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 875) if (pe->type & EEH_PE_PHB) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 876) pr_err("EEH: Recovering PHB#%x, location: %s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 877) pe->phb->global_number, eeh_pe_loc_get(pe));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 878) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 879) struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 880)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 881) pr_err("EEH: Recovering PHB#%x-PE#%x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 882) pe->phb->global_number, pe->addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 883) pr_err("EEH: PE location: %s, PHB location: %s\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 884) eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 885) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 886)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 887) #ifdef CONFIG_STACKTRACE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 888) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 889) * Print the saved stack trace now that we've verified there's
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 890) * something to recover.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 891) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 892) if (pe->trace_entries) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 893) void **ptrs = (void **) pe->stack_trace;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 894) int i;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 895)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 896) pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 897) pe->phb->global_number, pe->addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 898)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 899) /* FIXME: Use the same format as dump_stack() */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 900) pr_err("EEH: Call Trace:\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 901) for (i = 0; i < pe->trace_entries; i++)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 902) pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 903)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 904) pe->trace_entries = 0;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 905) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 906) #endif /* CONFIG_STACKTRACE */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 907)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 908) eeh_pe_update_time_stamp(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 909) pe->freeze_count++;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 910) if (pe->freeze_count > eeh_max_freezes) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 911) pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 912) pe->phb->global_number, pe->addr,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 913) pe->freeze_count);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 914) result = PCI_ERS_RESULT_DISCONNECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 915) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 916)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 917) eeh_for_each_pe(pe, tmp_pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 918) eeh_pe_for_each_dev(tmp_pe, edev, tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 919) edev->mode &= ~EEH_DEV_NO_HANDLER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 920)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 921) /* Walk the various device drivers attached to this slot through
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 922) * a reset sequence, giving each an opportunity to do what it needs
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 923) * to accomplish the reset. Each child gets a report of the
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 924) * status ... if any child can't handle the reset, then the entire
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 925) * slot is dlpar removed and added.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 926) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 927) * When the PHB is fenced, we have to issue a reset to recover from
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 928) * the error. Override the result if necessary to have partially
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 929) * hotplug for this case.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 930) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 931) if (result != PCI_ERS_RESULT_DISCONNECT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 932) pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 933) pe->freeze_count, eeh_max_freezes);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 934) pr_info("EEH: Notify device drivers to shutdown\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 935) eeh_set_channel_state(pe, pci_channel_io_frozen);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 936) eeh_set_irq_state(pe, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 937) eeh_pe_report("error_detected(IO frozen)", pe,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 938) eeh_report_error, &result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 939) if ((pe->type & EEH_PE_PHB) &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 940) result != PCI_ERS_RESULT_NONE &&
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 941) result != PCI_ERS_RESULT_NEED_RESET)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 942) result = PCI_ERS_RESULT_NEED_RESET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 943) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 944)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 945) /* Get the current PCI slot state. This can take a long time,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 946) * sometimes over 300 seconds for certain systems.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 947) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 948) if (result != PCI_ERS_RESULT_DISCONNECT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 949) rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 950) if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 951) pr_warn("EEH: Permanent failure\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 952) result = PCI_ERS_RESULT_DISCONNECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 953) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 954) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 955)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 956) /* Since rtas may enable MMIO when posting the error log,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 957) * don't post the error log until after all dev drivers
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 958) * have been informed.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 959) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 960) if (result != PCI_ERS_RESULT_DISCONNECT) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 961) pr_info("EEH: Collect temporary log\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 962) eeh_slot_error_detail(pe, EEH_LOG_TEMP);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 963) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 964)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 965) /* If all device drivers were EEH-unaware, then shut
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 966) * down all of the device drivers, and hope they
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 967) * go down willingly, without panicing the system.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 968) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 969) if (result == PCI_ERS_RESULT_NONE) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 970) pr_info("EEH: Reset with hotplug activity\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 971) rc = eeh_reset_device(pe, bus, NULL, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 972) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 973) pr_warn("%s: Unable to reset, err=%d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 974) __func__, rc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 975) result = PCI_ERS_RESULT_DISCONNECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 976) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 977) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 978)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 979) /* If all devices reported they can proceed, then re-enable MMIO */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 980) if (result == PCI_ERS_RESULT_CAN_RECOVER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 981) pr_info("EEH: Enable I/O for affected devices\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 982) rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 983)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 984) if (rc < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 985) result = PCI_ERS_RESULT_DISCONNECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 986) } else if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 987) result = PCI_ERS_RESULT_NEED_RESET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 988) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 989) pr_info("EEH: Notify device drivers to resume I/O\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 990) eeh_pe_report("mmio_enabled", pe,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 991) eeh_report_mmio_enabled, &result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 992) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 993) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 994)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 995) /* If all devices reported they can proceed, then re-enable DMA */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 996) if (result == PCI_ERS_RESULT_CAN_RECOVER) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 997) pr_info("EEH: Enabled DMA for affected devices\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 998) rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 999)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1000) if (rc < 0) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1001) result = PCI_ERS_RESULT_DISCONNECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1002) } else if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1003) result = PCI_ERS_RESULT_NEED_RESET;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1004) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1005) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1006) * We didn't do PE reset for the case. The PE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1007) * is still in frozen state. Clear it before
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1008) * resuming the PE.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1009) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1010) eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1011) result = PCI_ERS_RESULT_RECOVERED;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1012) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1013) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1014)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1015) /* If any device called out for a reset, then reset the slot */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1016) if (result == PCI_ERS_RESULT_NEED_RESET) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1017) pr_info("EEH: Reset without hotplug activity\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1018) rc = eeh_reset_device(pe, bus, &rmv_data, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1019) if (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1020) pr_warn("%s: Cannot reset, err=%d\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1021) __func__, rc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1022) result = PCI_ERS_RESULT_DISCONNECT;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1023) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1024) result = PCI_ERS_RESULT_NONE;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1025) eeh_set_channel_state(pe, pci_channel_io_normal);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1026) eeh_set_irq_state(pe, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1027) eeh_pe_report("slot_reset", pe, eeh_report_reset,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1028) &result);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1029) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1030) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1031)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1032) if ((result == PCI_ERS_RESULT_RECOVERED) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1033) (result == PCI_ERS_RESULT_NONE)) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1034) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1035) * For those hot removed VFs, we should add back them after PF
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1036) * get recovered properly.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1037) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1038) list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1039) rmv_entry) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1040) eeh_add_virt_device(edev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1041) list_del(&edev->rmv_entry);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1042) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1043)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1044) /* Tell all device drivers that they can resume operations */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1045) pr_info("EEH: Notify device driver to resume\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1046) eeh_set_channel_state(pe, pci_channel_io_normal);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1047) eeh_set_irq_state(pe, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1048) eeh_pe_report("resume", pe, eeh_report_resume, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1049) eeh_for_each_pe(pe, tmp_pe) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1050) eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1051) edev->mode &= ~EEH_DEV_NO_HANDLER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1052) edev->in_error = false;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1053) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1054) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1055)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1056) pr_info("EEH: Recovery successful.\n");
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1057) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1058) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1059) * About 90% of all real-life EEH failures in the field
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1060) * are due to poorly seated PCI cards. Only 10% or so are
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1061) * due to actual, failed cards.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1062) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1063) pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1064) "Please try reseating or replacing it\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1065) pe->phb->global_number, pe->addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1066)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1067) eeh_slot_error_detail(pe, EEH_LOG_PERM);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1068)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1069) /* Notify all devices that they're about to go down. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1070) eeh_set_channel_state(pe, pci_channel_io_perm_failure);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1071) eeh_set_irq_state(pe, false);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1072) eeh_pe_report("error_detected(permanent failure)", pe,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1073) eeh_report_failure, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1074)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1075) /* Mark the PE to be removed permanently */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1076) eeh_pe_state_mark(pe, EEH_PE_REMOVED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1077)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1078) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1079) * Shut down the device drivers for good. We mark
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1080) * all removed devices correctly to avoid access
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1081) * the their PCI config any more.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1082) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1083) if (pe->type & EEH_PE_VF) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1084) eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1085) eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1086) } else {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1087) eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1088) eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1089)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1090) pci_lock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1091) pci_hp_remove_devices(bus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1092) pci_unlock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1093) /* The passed PE should no longer be used */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1094) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1095) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1096) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1097)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1098) out:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1099) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1100) * Clean up any PEs without devices. While marked as EEH_PE_RECOVERYING
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1101) * we don't want to modify the PE tree structure so we do it here.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1102) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1103) eeh_pe_cleanup(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1104)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1105) /* clear the slot attention LED for all recovered devices */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1106) eeh_for_each_pe(pe, tmp_pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1107) eeh_pe_for_each_dev(tmp_pe, edev, tmp)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1108) eeh_clear_slot_attention(edev->pdev);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1109)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1110) eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1111) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1112)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1113) /**
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1114) * eeh_handle_special_event - Handle EEH events without a specific failing PE
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1115) *
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1116) * Called when an EEH event is detected but can't be narrowed down to a
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1117) * specific PE. Iterates through possible failures and handles them as
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1118) * necessary.
 *
 * Repeatedly calls eeh_ops->next_error() and dispatches on its return code:
 * frozen PEs and fenced PHBs are marked and handed to
 * eeh_handle_normal_event(); for a dead PHB or a dead IOC the drivers get a
 * permanent-failure report and pci_hp_remove_devices() is called on the bus
 * of every PHB PE that is EEH_PE_ISOLATED and not EEH_PE_RECOVERING.
 * The function returns when next_error() yields EEH_NEXT_ERR_NONE or an
 * unrecognised value, or once a dead IOC has been processed.
 *
 * Takes no arguments and returns nothing.
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1119) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1120) void eeh_handle_special_event(void)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1121) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1122) struct eeh_pe *pe, *phb_pe, *tmp_pe;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1123) struct eeh_dev *edev, *tmp_edev;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1124) struct pci_bus *bus;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1125) struct pci_controller *hose;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1126) unsigned long flags;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1127) int rc;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1128)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1129)
/* One reported error is processed per loop iteration. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1130) do {
/* pe is passed by address; presumably next_error() stores the affected PE there - confirm per backend. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1131) rc = eeh_ops->next_error(&pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1132)
/* Step 1: under the serialize lock, purge queued events and mark PE state. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1133) switch (rc) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1134) case EEH_NEXT_ERR_DEAD_IOC:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1135) /* Mark all PHBs in dead state */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1136) eeh_serialize_lock(&flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1137)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1138) /* Purge all events */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1139) eeh_remove_event(NULL, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1140)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1141) list_for_each_entry(hose, &hose_list, list_node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1142) phb_pe = eeh_phb_pe_get(hose);
/* Skip hoses for which eeh_phb_pe_get() returned no PE. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1143) if (!phb_pe) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1144)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1145) eeh_pe_mark_isolated(phb_pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1146) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1147)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1148) eeh_serialize_unlock(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1149)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1150) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1151) case EEH_NEXT_ERR_FROZEN_PE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1152) case EEH_NEXT_ERR_FENCED_PHB:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1153) case EEH_NEXT_ERR_DEAD_PHB:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1154) /* Mark the PE in fenced state */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1155) eeh_serialize_lock(&flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1156)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1157) /* Purge all events of the PHB */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1158) eeh_remove_event(pe, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1159)
/*
 * A dead PHB is not marked as recovering: it never goes through
 * eeh_handle_normal_event() and is handled by the removal branch below,
 * which skips PHB PEs that carry EEH_PE_RECOVERING.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1160) if (rc != EEH_NEXT_ERR_DEAD_PHB)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1161) eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1162) eeh_pe_mark_isolated(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1163)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1164) eeh_serialize_unlock(flags);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1165)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1166) break;
/* No further errors pending: done. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1167) case EEH_NEXT_ERR_NONE:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1168) return;
/* Unknown return code: warn and stop processing. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1169) default:
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1170) pr_warn("%s: Invalid value %d from next_error()\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1171) __func__, rc);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1172) return;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1173) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1174)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1175) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1176) * For fenced PHB and frozen PE, it's handled as normal
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1177) * event. We have to remove the affected PHBs for dead
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1178) * PHB and IOC
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1179) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1180) if (rc == EEH_NEXT_ERR_FROZEN_PE ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1181) rc == EEH_NEXT_ERR_FENCED_PHB) {
/*
 * NOTE(review): EEH_PE_RECOVERING was already marked for these two
 * codes inside the locked section above; this second mark looks
 * redundant - confirm before removing.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1182) eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1183) eeh_handle_normal_event(pe);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1184) } else {
/*
 * Reached for EEH_NEXT_ERR_DEAD_PHB and EEH_NEXT_ERR_DEAD_IOC.
 * NOTE(review): the EEH_NEXT_ERR_DEAD_IOC case above never touches pe,
 * so every use of pe in this branch relies on next_error() having set
 * it for a dead IOC - not visible here, confirm.
 */
/* Clear EEH_DEV_NO_HANDLER on every device of pe and its child PEs. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1185) eeh_for_each_pe(pe, tmp_pe)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1186) eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1187) edev->mode &= ~EEH_DEV_NO_HANDLER;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1188)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1189) /* Notify all devices to be down */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1190) eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1191) eeh_set_channel_state(pe, pci_channel_io_perm_failure);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1192) eeh_pe_report(
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1193) "error_detected(permanent failure)", pe,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1194) eeh_report_failure, NULL);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1195)
/* Device removal for all hoses happens with the PCI rescan/remove lock held. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1196) pci_lock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1197) list_for_each_entry(hose, &hose_list, list_node) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1198) phb_pe = eeh_phb_pe_get(hose);
/* Only PHB PEs that are isolated and not being recovered are removed. */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1199) if (!phb_pe ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1200) !(phb_pe->state & EEH_PE_ISOLATED) ||
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1201) (phb_pe->state & EEH_PE_RECOVERING))
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1202) continue;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1203)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1204) bus = eeh_pe_bus_get(phb_pe);
/*
 * NOTE(review): the lookup that failed was for phb_pe, yet the message
 * prints the PHB/PE numbers of pe - confirm whether phb_pe was intended.
 * The break also abandons the remaining hoses in the list.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1205) if (!bus) {
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1206) pr_err("%s: Cannot find PCI bus for "
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1207) "PHB#%x-PE#%x\n",
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1208) __func__,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1209) pe->phb->global_number,
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1210) pe->addr);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1211) break;
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1212) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1213) pci_hp_remove_devices(bus);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1214) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1215) pci_unlock_rescan_remove();
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1216) }
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1217)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1218) /*
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1219) * If we have detected dead IOC, we needn't proceed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1220) * any more since all PHBs would have been removed
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1221) */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1222) if (rc == EEH_NEXT_ERR_DEAD_IOC)
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1223) break;
/*
 * EEH_NEXT_ERR_NONE already returns from inside the switch, so this
 * condition holds whenever it is evaluated; the loop is left only via
 * the return statements or the dead-IOC break above.
 */
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1224) } while (rc != EEH_NEXT_ERR_NONE);
^8f3ce5b39 (kx 2023-10-28 12:00:06 +0300 1225) }