1
0
linux/debian/patches/patchset-zen/sauce/0003-ZEN-PCI-Add-Intel-remapped-NVMe-device-support.patch

627 lines
19 KiB
Diff
Raw Permalink Normal View History

From 567908cc05dc5e02b1b9c26620bce3791559f9d4 Mon Sep 17 00:00:00 2001
From: Daniel Drake <drake@endlessm.com>
Date: Tue, 4 Jun 2019 14:51:21 +0800
Subject: ZEN: PCI: Add Intel remapped NVMe device support
Contains:
- PCI: Add Intel remapped NVMe device support
Consumer products that are configured by default to run the Intel SATA AHCI
controller in "RAID" or "Intel RST Premium With Intel Optane System
Acceleration" mode are becoming increasingly prevalent.
Unde this mode, NVMe devices are remapped into the SATA device and become
hidden from the PCI bus, which means that Linux users cannot access their
storage devices unless they go into the firmware setup menu to revert back
to AHCI mode - assuming such option is available. Lack of support for this
mode is also causing complications for vendors who distribute Linux.
Add support for the remapped NVMe mode by creating a virtual PCI bus,
where the AHCI and NVMe devices are presented separately, allowing the
ahci and nvme drivers to bind in the normal way.
Unfortunately the NVMe device configuration space is inaccesible under
this scheme, so we provide a fake one, and hope that no DeviceID-based
quirks are needed. The interrupt is shared between the AHCI and NVMe
devices.
Allow pci_real_dma_dev() to traverse back to the real DMA device from
the PCI devices created on our virtual bus, in case the iommu driver
will be involved with data transfers here.
The existing ahci driver is modified to not claim devices where remapped
NVMe devices are present, allowing this new driver to step in.
The details of the remapping scheme came from patches previously
posted by Dan Williams and the resulting discussion.
https://phabricator.endlessm.com/T24358
https://phabricator.endlessm.com/T29119
Signed-off-by: Daniel Drake <drake@endlessm.com>
- PCI: Fix order of remapped NVMe devices
---
arch/x86/include/asm/pci.h | 6 +
arch/x86/pci/common.c | 7 +-
drivers/ata/ahci.c | 23 +-
drivers/pci/controller/Makefile | 6 +
drivers/pci/controller/intel-nvme-remap.c | 462 ++++++++++++++++++++++
5 files changed, 488 insertions(+), 16 deletions(-)
create mode 100644 drivers/pci/controller/intel-nvme-remap.c
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -26,6 +26,7 @@ struct pci_sysdata {
#if IS_ENABLED(CONFIG_VMD)
struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */
#endif
+ struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */
};
extern int pci_routeirq;
@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus
#define is_vmd(bus) false
#endif /* CONFIG_VMD */
+static inline bool is_nvme_remap(struct pci_bus *bus)
+{
+ return to_pci_sysdata(bus)->nvme_remap_dev != NULL;
+}
+
/* Can be used to override the logic in pci_scan_bus for skipping
already-configured bus numbers - to be used for buggy BIOSes
or architectures with incomplete PCI setup by the loader */
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void)
return 0;
}
-#if IS_ENABLED(CONFIG_VMD)
struct pci_dev *pci_real_dma_dev(struct pci_dev *dev)
{
+#if IS_ENABLED(CONFIG_VMD)
if (is_vmd(dev->bus))
return to_pci_sysdata(dev->bus)->vmd_dev;
+#endif
+
+ if (is_nvme_remap(dev->bus))
+ return to_pci_sysdata(dev->bus)->nvme_remap_dev;
return dev;
}
-#endif
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -1618,7 +1618,7 @@ static irqreturn_t ahci_thunderx_irq_han
}
#endif
-static void ahci_remap_check(struct pci_dev *pdev, int bar,
+static int ahci_remap_check(struct pci_dev *pdev, int bar,
struct ahci_host_priv *hpriv)
{
int i;
@@ -1631,7 +1631,7 @@ static void ahci_remap_check(struct pci_
pci_resource_len(pdev, bar) < SZ_512K ||
bar != AHCI_PCI_BAR_STANDARD ||
!(readl(hpriv->mmio + AHCI_VSCAP) & 1))
- return;
+ return 0;
cap = readq(hpriv->mmio + AHCI_REMAP_CAP);
for (i = 0; i < AHCI_MAX_REMAP; i++) {
@@ -1646,18 +1646,11 @@ static void ahci_remap_check(struct pci_
}
if (!hpriv->remapped_nvme)
- return;
+ return 0;
- dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n",
- hpriv->remapped_nvme);
- dev_warn(&pdev->dev,
- "Switch your BIOS from RAID to AHCI mode to use them.\n");
-
- /*
- * Don't rely on the msi-x capability in the remap case,
- * share the legacy interrupt across ahci and remapped devices.
- */
- hpriv->flags |= AHCI_HFLAG_NO_MSI;
+ /* Abort probe, allowing intel-nvme-remap to step in when available */
+ dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n");
+ return -ENODEV;
}
static int ahci_get_irq_vector(struct ata_host *host, int port)
@@ -1896,7 +1889,9 @@ static int ahci_init_one(struct pci_dev
hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar];
/* detect remapped nvme devices */
- ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+ rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv);
+ if (rc)
+ return rc;
sysfs_add_file_to_group(&pdev->dev.kobj,
&dev_attr_remapped_nvme.attr,
--- a/drivers/pci/controller/Makefile
+++ b/drivers/pci/controller/Makefile
@@ -1,4 +1,10 @@
# SPDX-License-Identifier: GPL-2.0
+ifdef CONFIG_X86_64
+ifdef CONFIG_SATA_AHCI
+obj-y += intel-nvme-remap.o
+endif
+endif
+
obj-$(CONFIG_PCIE_CADENCE) += cadence/
obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o
obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o
--- /dev/null
+++ b/drivers/pci/controller/intel-nvme-remap.c
@@ -0,0 +1,462 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Intel remapped NVMe device support.
+ *
+ * Copyright (c) 2019 Endless Mobile, Inc.
+ * Author: Daniel Drake <drake@endlessm.com>
+ *
+ * Some products ship by default with the SATA controller in "RAID" or
+ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this
+ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe
+ * devices disappear from the PCI bus, and instead their I/O memory becomes
+ * available within the AHCI device BARs.
+ *
+ * This scheme is understood to be a way of avoiding usage of the standard
+ * Windows NVMe driver under that OS, instead mandating usage of Intel's
+ * driver instead, which has better power management, and presumably offers
+ * some RAID/disk-caching solutions too.
+ *
+ * Here in this driver, we support the remapped NVMe mode by claiming the
+ * AHCI device and creating a fake PCIe root port. On the new bus, the
+ * original AHCI device is exposed with only minor tweaks. Then, fake PCI
+ * devices corresponding to the remapped NVMe devices are created. The usual
+ * ahci and nvme drivers are then expected to bind to these devices and
+ * operate as normal.
+ *
+ * The PCI configuration space for the NVMe devices is completely
+ * unavailable, so we fake a minimal one and hope for the best.
+ *
+ * Interrupts are shared between the AHCI and NVMe devices. For simplicity,
+ * we only support the legacy interrupt here, although MSI support
+ * could potentially be added later.
+ */
+
+#define MODULE_NAME "intel-nvme-remap"
+
+#include <linux/ahci-remap.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#define AHCI_PCI_BAR_STANDARD 5
+
+struct nvme_remap_dev {
+ struct pci_dev *dev; /* AHCI device */
+ struct pci_bus *bus; /* our fake PCI bus */
+ struct pci_sysdata sysdata;
+ int irq_base; /* our fake interrupts */
+
+ /*
+ * When we detect an all-ones write to a BAR register, this flag
+ * is set, so that we return the BAR size on the next read (a
+ * standard PCI behaviour).
+ * This includes the assumption that an all-ones BAR write is
+ * immediately followed by a read of the same register.
+ */
+ bool bar_sizing;
+
+ /*
+ * Resources copied from the AHCI device, to be regarded as
+ * resources on our fake bus.
+ */
+ struct resource ahci_resources[PCI_NUM_RESOURCES];
+
+ /* Resources corresponding to the NVMe devices. */
+ struct resource remapped_dev_mem[AHCI_MAX_REMAP];
+
+ /* Number of remapped NVMe devices found. */
+ int num_remapped_devices;
+};
+
+static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus)
+{
+ return container_of(bus->sysdata, struct nvme_remap_dev, sysdata);
+}
+
+
+/******** PCI configuration space **********/
+
+/*
+ * Helper macros for tweaking returned contents of PCI configuration space.
+ *
+ * value contains len bytes of data read from reg.
+ * If fixup_reg is included in that range, fix up the contents of that
+ * register to fixed_value.
+ */
+#define NR_FIX8(fixup_reg, fixed_value) do { \
+ if (reg <= fixup_reg && fixup_reg < reg + len) \
+ ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \
+ } while (0)
+
+#define NR_FIX16(fixup_reg, fixed_value) do { \
+ NR_FIX8(fixup_reg, fixed_value); \
+ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+ } while (0)
+
+#define NR_FIX24(fixup_reg, fixed_value) do { \
+ NR_FIX8(fixup_reg, fixed_value); \
+ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \
+ NR_FIX8(fixup_reg + 2, fixed_value >> 16); \
+ } while (0)
+
+#define NR_FIX32(fixup_reg, fixed_value) do { \
+ NR_FIX16(fixup_reg, (u16) fixed_value); \
+ NR_FIX16(fixup_reg + 2, fixed_value >> 16); \
+ } while (0)
+
+/*
+ * Read PCI config space of the slot 0 (AHCI) device.
+ * We pass through the read request to the underlying device, but
+ * tweak the results in some cases.
+ */
+static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg,
+ int len, u32 *value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+ struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+ int ret;
+
+ ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn,
+ reg, len, value);
+ if (ret)
+ return ret;
+
+ /*
+ * Adjust the device class, to prevent this driver from attempting to
+ * additionally probe the device we're simulating here.
+ */
+ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI);
+
+ /*
+ * Unset interrupt pin, otherwise ACPI tries to find routing
+ * info for our virtual IRQ, fails, and complains.
+ */
+ NR_FIX8(PCI_INTERRUPT_PIN, 0);
+
+ /*
+ * Truncate the AHCI BAR to not include the region that covers the
+ * hidden devices. This will cause the ahci driver to successfully
+ * probe th new device (instead of handing it over to this driver).
+ */
+ if (nrdev->bar_sizing) {
+ NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1));
+ nrdev->bar_sizing = false;
+ }
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/*
+ * Read PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we provide a minimal,
+ * fake config space instead.
+ */
+static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port,
+ int reg, int len, u32 *value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+ struct resource *remapped_mem;
+
+ if (port > nrdev->num_remapped_devices)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ *value = 0;
+ remapped_mem = &nrdev->remapped_dev_mem[port - 1];
+
+ /* Set a Vendor ID, otherwise Linux assumes no device is present */
+ NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL);
+
+ /* Always appear on & bus mastering */
+ NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER);
+
+ /* Set class so that nvme driver probes us */
+ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS);
+
+ if (nrdev->bar_sizing) {
+ NR_FIX32(PCI_BASE_ADDRESS_0,
+ ~(resource_size(remapped_mem) - 1));
+ nrdev->bar_sizing = false;
+ } else {
+ resource_size_t mem_start = remapped_mem->start;
+
+ mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64;
+ NR_FIX32(PCI_BASE_ADDRESS_0, mem_start);
+ mem_start >>= 32;
+ NR_FIX32(PCI_BASE_ADDRESS_1, mem_start);
+ }
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+/* Read PCI configuration space. */
+static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn,
+ int reg, int len, u32 *value)
+{
+ if (PCI_SLOT(devfn) == 0)
+ return nvme_remap_pci_read_slot0(bus, reg, len, value);
+ else
+ return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn),
+ reg, len, value);
+}
+
+/*
+ * Write PCI config space of the slot 0 (AHCI) device.
+ * Apart from the special case of BAR sizing, we disable all writes.
+ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master)
+ * that would affect the operation of the NVMe devices.
+ */
+static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg,
+ int len, u32 value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+ struct pci_bus *ahci_dev_bus = nrdev->dev->bus;
+
+ if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) {
+ /*
+ * Writing all-ones to a BAR means that the size of the
+ * memory region is being checked. Flag this so that we can
+ * reply with an appropriate size on the next read.
+ */
+ if (value == ~0)
+ nrdev->bar_sizing = true;
+
+ return ahci_dev_bus->ops->write(ahci_dev_bus,
+ nrdev->dev->devfn,
+ reg, len, value);
+ }
+
+ return PCIBIOS_SET_FAILED;
+}
+
+/*
+ * Write PCI config space of a remapped device.
+ * Since the original PCI config space is inaccessible, we reject all
+ * writes, except for the special case of BAR probing.
+ */
+static int nvme_remap_pci_write_remapped(struct pci_bus *bus,
+ unsigned int port,
+ int reg, int len, u32 value)
+{
+ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus);
+
+ if (port > nrdev->num_remapped_devices)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ /*
+ * Writing all-ones to a BAR means that the size of the memory
+ * region is being checked. Flag this so that we can reply with
+ * an appropriate size on the next read.
+ */
+ if (value == ~0 && reg >= PCI_BASE_ADDRESS_0
+ && reg <= PCI_BASE_ADDRESS_5) {
+ nrdev->bar_sizing = true;
+ return PCIBIOS_SUCCESSFUL;
+ }
+
+ return PCIBIOS_SET_FAILED;
+}
+
+/* Write PCI configuration space. */
+static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn,
+ int reg, int len, u32 value)
+{
+ if (PCI_SLOT(devfn) == 0)
+ return nvme_remap_pci_write_slot0(bus, reg, len, value);
+ else
+ return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn),
+ reg, len, value);
+}
+
+static struct pci_ops nvme_remap_pci_ops = {
+ .read = nvme_remap_pci_read,
+ .write = nvme_remap_pci_write,
+};
+
+
+/******** Initialization & exit **********/
+
+/*
+ * Find a PCI domain ID to use for our fake bus.
+ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits).
+ */
+static int find_free_domain(void)
+{
+ int domain = 0xffff;
+ struct pci_bus *bus = NULL;
+
+ while ((bus = pci_find_next_bus(bus)) != NULL)
+ domain = max_t(int, domain, pci_domain_nr(bus));
+
+ return domain + 1;
+}
+
+static int find_remapped_devices(struct nvme_remap_dev *nrdev,
+ struct list_head *resources)
+{
+ void __iomem *mmio;
+ int i, count = 0;
+ u32 cap;
+
+ mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD,
+ pci_resource_len(nrdev->dev,
+ AHCI_PCI_BAR_STANDARD));
+ if (!mmio)
+ return -ENODEV;
+
+ /* Check if this device might have remapped nvme devices. */
+ if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K ||
+ !(readl(mmio + AHCI_VSCAP) & 1))
+ return -ENODEV;
+
+ cap = readq(mmio + AHCI_REMAP_CAP);
+ for (i = AHCI_MAX_REMAP-1; i >= 0; i--) {
+ struct resource *remapped_mem;
+
+ if ((cap & (1 << i)) == 0)
+ continue;
+ if (readl(mmio + ahci_remap_dcc(i))
+ != PCI_CLASS_STORAGE_EXPRESS)
+ continue;
+
+ /* We've found a remapped device */
+ remapped_mem = &nrdev->remapped_dev_mem[count++];
+ remapped_mem->start =
+ pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD)
+ + ahci_remap_base(i);
+ remapped_mem->end = remapped_mem->start
+ + AHCI_REMAP_N_SIZE - 1;
+ remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED;
+ pci_add_resource(resources, remapped_mem);
+ }
+
+ pcim_iounmap(nrdev->dev, mmio);
+
+ if (count == 0)
+ return -ENODEV;
+
+ nrdev->num_remapped_devices = count;
+ dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n",
+ nrdev->num_remapped_devices);
+ return 0;
+}
+
+static void nvme_remap_remove_root_bus(void *data)
+{
+ struct pci_bus *bus = data;
+
+ pci_stop_root_bus(bus);
+ pci_remove_root_bus(bus);
+}
+
+static int nvme_remap_probe(struct pci_dev *dev,
+ const struct pci_device_id *id)
+{
+ struct nvme_remap_dev *nrdev;
+ LIST_HEAD(resources);
+ int i;
+ int ret;
+ struct pci_dev *child;
+
+ nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL);
+ nrdev->sysdata.domain = find_free_domain();
+ nrdev->sysdata.nvme_remap_dev = dev;
+ nrdev->dev = dev;
+ pci_set_drvdata(dev, nrdev);
+
+ ret = pcim_enable_device(dev);
+ if (ret < 0)
+ return ret;
+
+ pci_set_master(dev);
+
+ ret = find_remapped_devices(nrdev, &resources);
+ if (ret)
+ return ret;
+
+ /* Add resources from the original AHCI device */
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ struct resource *res = &dev->resource[i];
+
+ if (res->start) {
+ struct resource *nr_res = &nrdev->ahci_resources[i];
+
+ nr_res->start = res->start;
+ nr_res->end = res->end;
+ nr_res->flags = res->flags;
+ pci_add_resource(&resources, nr_res);
+ }
+ }
+
+ /* Create virtual interrupts */
+ nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0,
+ nrdev->num_remapped_devices + 1,
+ 0);
+ if (nrdev->irq_base < 0)
+ return nrdev->irq_base;
+
+ /* Create and populate PCI bus */
+ nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops,
+ &nrdev->sysdata, &resources);
+ if (!nrdev->bus)
+ return -ENODEV;
+
+ if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus,
+ nrdev->bus))
+ return -ENOMEM;
+
+ /* We don't support sharing MSI interrupts between these devices */
+ nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
+
+ pci_scan_child_bus(nrdev->bus);
+
+ list_for_each_entry(child, &nrdev->bus->devices, bus_list) {
+ /*
+ * Prevent PCI core from trying to move memory BARs around.
+ * The hidden NVMe devices are at fixed locations.
+ */
+ for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+ struct resource *res = &child->resource[i];
+
+ if (res->flags & IORESOURCE_MEM)
+ res->flags |= IORESOURCE_PCI_FIXED;
+ }
+
+ /* Share the legacy IRQ between all devices */
+ child->irq = dev->irq;
+ }
+
+ pci_assign_unassigned_bus_resources(nrdev->bus);
+ pci_bus_add_devices(nrdev->bus);
+
+ return 0;
+}
+
+static const struct pci_device_id nvme_remap_ids[] = {
+ /*
+ * Match all Intel RAID controllers.
+ *
+ * There's overlap here with the set of devices detected by the ahci
+ * driver, but ahci will only successfully probe when there
+ * *aren't* any remapped NVMe devices, and this driver will only
+ * successfully probe when there *are* remapped NVMe devices that
+ * need handling.
+ */
+ {
+ PCI_VDEVICE(INTEL, PCI_ANY_ID),
+ .class = PCI_CLASS_STORAGE_RAID << 8,
+ .class_mask = 0xffffff00,
+ },
+ {0,}
+};
+MODULE_DEVICE_TABLE(pci, nvme_remap_ids);
+
+static struct pci_driver nvme_remap_drv = {
+ .name = MODULE_NAME,
+ .id_table = nvme_remap_ids,
+ .probe = nvme_remap_probe,
+};
+module_pci_driver(nvme_remap_drv);
+
+MODULE_AUTHOR("Daniel Drake <drake@endlessm.com>");
+MODULE_LICENSE("GPL v2");