[PATCH] ocxl: Mmio invalidation support

Christophe Lombard clombard at linux.vnet.ibm.com
Sat Nov 14 02:33:33 AEDT 2020


TLBI/SLBI snooping is not used with OpenCAPI 4.0/5.0 because of
performance problems: the PAU would have to process all incoming
TLBI/SLBI commands, which would cause them to back up on the PowerBus.

When the Address Translation Mode requires TLB and SLB Invalidate
operations to be initiated using MMIO registers, a set of registers like
the following is used:
• XTS MMIO ATSD0 LPARID register
• XTS MMIO ATSD0 AVA register
• XTS MMIO ATSD0 launch register, write access initiates a shoot down
• XTS MMIO ATSD0 status register

The MMIO based mechanism also blocks the NPU/PAU from snooping TLBIE
commands from the PowerBus.

The Shootdown commands (ATSD) will be generated using MMIO registers
in the NPU/PAU and sent to the device.

Signed-off-by: Christophe Lombard <clombard at linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/pnv-ocxl.h   |   2 +
 arch/powerpc/platforms/powernv/ocxl.c |  19 +++
 drivers/misc/ocxl/link.c              | 180 ++++++++++++++++++++++----
 drivers/misc/ocxl/ocxl_internal.h     |  46 ++++++-
 drivers/misc/ocxl/trace.h             | 125 ++++++++++++++++++
 5 files changed, 348 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h
index d37ededca3ee..4a23abcc347b 100644
--- a/arch/powerpc/include/asm/pnv-ocxl.h
+++ b/arch/powerpc/include/asm/pnv-ocxl.h
@@ -28,4 +28,6 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p
 void pnv_ocxl_spa_release(void *platform_data);
 int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle);
 
+extern int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
+			     uint64_t lpcr);
 #endif /* _ASM_PNV_OCXL_H */
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
index ecdad219d704..100546ea635f 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -483,3 +483,22 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
 	return rc;
 }
 EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
+
+/* Ask OPAL to map the device, identified by its bus number and devfn,
+ * to the given LPAR. Returns 0 on success and -EINVAL (after logging
+ * the OPAL return code) on failure.
+ */
+int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
+		      uint64_t lpcr)
+{
+	struct pnv_phb *phb = pci_bus_to_host(dev->bus)->private_data;
+	u32 bdfn = (dev->bus->number << 8) | dev->devfn;
+	int rc = opal_npu_map_lpar(phb->opal_id, bdfn, lparid, lpcr);
+
+	if (!rc)
+		return 0;
+
+	dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc);
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar);
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index fd73d3bc0eb6..9b5b77d40734 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -4,6 +4,8 @@
 #include <linux/mutex.h>
 #include <linux/mm_types.h>
 #include <linux/mmu_context.h>
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
 #include <asm/copro.h>
 #include <asm/pnv-ocxl.h>
 #include <asm/xive.h>
@@ -33,6 +35,31 @@
 
 #define SPA_PE_VALID		0x80000000
 
+struct spa;
+
+/*
+ * An opencapi link can be used by several PCI functions. We have
+ * one link per device slot.
+ *
+ * A linked list of opencapi links should suffice, as there's a
+ * limited number of opencapi slots on a system and lookup is only
+ * done when the device is probed
+ */
+struct ocxl_link {
+	struct list_head list;
+	struct kref ref;
+	int domain;
+	int bus;
+	int dev;
+	u64 mmio_atsd; /* ATSD physical address, read from "ibm,mmio-atsd" */
+	void __iomem *base;    /* ioremap()ed ATSD registers; shootdowns are skipped when NULL */
+	spinlock_t atsd_lock; /* serializes shootdowns issued through 'base' */
+	atomic_t irq_available;
+	struct spa *spa;
+	void *platform_data;
+};
+static struct list_head links_list = LIST_HEAD_INIT(links_list);
+static DEFINE_MUTEX(links_list_lock);
 
 struct pe_data {
 	struct mm_struct *mm;
@@ -41,6 +68,8 @@ struct pe_data {
 	/* opaque pointer to be passed to the above callback */
 	void *xsl_err_data;
 	struct rcu_head rcu;
+	struct ocxl_link *link;
+	struct mmu_notifier mmu_notifier;
 };
 
 struct spa {
@@ -69,27 +98,6 @@ struct spa {
 	} xsl_fault;
 };
 
-/*
- * A opencapi link can be used be by several PCI functions. We have
- * one link per device slot.
- *
- * A linked list of opencapi links should suffice, as there's a
- * limited number of opencapi slots on a system and lookup is only
- * done when the device is probed
- */
-struct ocxl_link {
-	struct list_head list;
-	struct kref ref;
-	int domain;
-	int bus;
-	int dev;
-	atomic_t irq_available;
-	struct spa *spa;
-	void *platform_data;
-};
-static struct list_head links_list = LIST_HEAD_INIT(links_list);
-static DEFINE_MUTEX(links_list_lock);
-
 enum xsl_response {
 	CONTINUE,
 	ADDRESS_ERROR,
@@ -126,6 +134,8 @@ static void ack_irq(struct spa *spa, enum xsl_response r)
 	}
 }
 
+static const struct mmu_notifier_ops ocxl_mmu_notifier_ops;
+
 static void xsl_fault_handler_bh(struct work_struct *fault_work)
 {
 	vm_fault_t flt = 0;
@@ -376,6 +386,7 @@ static void free_spa(struct ocxl_link *link)
 
 static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
 {
+	struct pci_controller *hose = pci_bus_to_host(dev->bus);
 	struct ocxl_link *link;
 	int rc;
 
@@ -403,6 +414,22 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l
 	if (rc)
 		goto err_xsl_irq;
 
+	/* Since OpenCAPI 5.0, Address Translation Mode requires TLB
+	 * and SLB Invalidate operations to be initiated using MMIO
+	 * registers. The lock serializing them must be ready first.
+	 */
+	spin_lock_init(&link->atsd_lock);
+	if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
+				       0, &link->mmio_atsd))
+		dev_info(&dev->dev, "No available ATSD found\n");
+	if (link->mmio_atsd) {
+		link->base = ioremap(link->mmio_atsd, 24);
+		if (!link->base)
+			dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", link->mmio_atsd);
+		else
+			pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0);
+	}
+
 	*out_link = link;
 	return 0;
 
@@ -464,12 +491,101 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle)
 {
 	struct ocxl_link *link = (struct ocxl_link *) link_handle;
+	void __iomem *base = link->base;
 
+	/* The link is shared between PCI functions: unmap the ATSD
+	 * registers only when the last reference goes away. */
 	mutex_lock(&links_list_lock);
-	kref_put(&link->ref, release_xsl);
+	if (kref_put(&link->ref, release_xsl) && base)
+		iounmap(base);
 	mutex_unlock(&links_list_lock);
 }
 EXPORT_SYMBOL_GPL(ocxl_link_release);
 
+/* Launch one ATSD (TLB shootdown) through the link's MMIO registers and
+ * wait for it to complete. addr == 0 invalidates every entry of pid.
+ */
+static void tlb_invalidate(struct ocxl_link *link,
+			   unsigned long pid,
+			   unsigned long addr)
+{
+	unsigned long timeout;
+	uint64_t val;
+
+	if (!link->base)
+		return;
+
+	spin_lock(&link->atsd_lock);
+	/* start the clock only once the lock is held */
+	timeout = jiffies + (HZ * OCXL_ATSD_TIMEOUT);
+
+	val = XTS_ATSD_LNCH_R | XTS_ATSD_LNCH_PRS;
+	if (addr) {
+		/* load the Abbreviated Virtual Address register */
+		uint64_t ava = SETFIELD(XTS_ATSD_AVA_AVA, 0ull, addr >> (63-51));
+
+		out_be64(link->base + XTS_ATSD_AVA, ava);
+		eieio();
+		trace_ocxl_mmu_notifier_mmio_atsd_ava(ava, pid);
+		/* invalidate just the TLB entry of the target VA */
+		val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b00);
+		val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b00);
+	} else {
+		/* invalidate everything cached for the matching PID */
+		val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b10);
+		val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b01);
+		val |= XTS_ATSD_LNCH_OCAPI_SINGLETON;
+	}
+	/* The caller steps through ranges by PAGE_SIZE, so the Actual Page
+	 * Size must match it: 0b000 for 4KB pages, 0b101 for 64KB pages.
+	 */
+	val = SETFIELD(XTS_ATSD_LNCH_AP, val, PAGE_SHIFT == 12 ? 0b000 : 0b101);
+	val = SETFIELD(XTS_ATSD_LNCH_PID, val, pid);
+	/* Write access to the launch register initiates the shoot down */
+	out_be64(link->base + XTS_ATSD_LNCH, val);
+	trace_ocxl_mmu_notifier_mmio_atsd_lnch(val, addr, pid);
+
+	/* Poll the ATSD status register (top bit set while pending) to
+	 * determine when the TLB Invalidate has been completed.
+	 */
+	val = in_be64(link->base + XTS_ATSD_STAT);
+	trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid);
+	while (val >> 63) {
+		if (time_after_eq(jiffies, timeout)) {
+			pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n",
+			       __func__, val, pid);
+			spin_unlock(&link->atsd_lock);
+			return;
+		}
+		cpu_relax();
+		val = in_be64(link->base + XTS_ATSD_STAT);
+	}
+	spin_unlock(&link->atsd_lock);
+	trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid);
+}
+
+/* MMU notifier callback: issue one shootdown per PAGE_SIZE step over
+ * the [start, end) range reported by the notifier.
+ */
+static void invalidate_range_end(struct mmu_notifier *mn,
+				 const struct mmu_notifier_range *range)
+{
+	struct pe_data *pe = container_of(mn, struct pe_data, mmu_notifier);
+	unsigned long pidr = mn->mm->context.id;
+	unsigned long va = range->start;
+
+	trace_ocxl_mmu_notifier_range(range->start, range->end, pidr);
+	for (; va < range->end; va += PAGE_SIZE)
+		tlb_invalidate(pe->link, pidr, va);
+}
+
+static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
+	/* invalidate_range_end() is called when all pages in the range
+	 * have been unmapped and freed by the VM; the handler then issues
+	 * an ATSD shootdown for each PAGE_SIZE step of that range.
+	 */
+	.invalidate_range_end = invalidate_range_end,
+};
+
 static u64 calculate_cfg_state(bool kernel)
 {
 	u64 state;
@@ -517,7 +633,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
 		goto unlock;
 	}
 
-	pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
+	pe_data = kzalloc(sizeof(*pe_data), GFP_KERNEL);
 	if (!pe_data) {
 		rc = -ENOMEM;
 		goto unlock;
@@ -526,9 +642,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
 	pe_data->mm = mm;
 	pe_data->xsl_err_cb = xsl_err_cb;
 	pe_data->xsl_err_data = xsl_err_data;
+	pe_data->link = link;
+	pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
 
 	memset(pe, 0, sizeof(struct ocxl_process_element));
 	pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
+	pe->pasid = cpu_to_be32(pasid << (31 - 19));
+	pe->bdf = cpu_to_be32(1 << (31 - 15));
 	pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
 	pe->pid = cpu_to_be32(pidr);
 	pe->tid = cpu_to_be32(tidr);
@@ -540,8 +660,17 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
 	 * by the nest MMU. If we have a kernel context, TLBIs are
 	 * already global.
 	 */
-	if (mm)
+	if (mm) {
 		mm_context_add_copro(mm);
+		if (link->base) {
+			/* Use MMIO registers for the TLB and SLB
+			 * Invalidate operations.
+			 */
+			trace_init_mmu_notifier(pasid, mm->context.id);
+			mmu_notifier_register(&pe_data->mmu_notifier, mm);
+		}
+	}
+
 	/*
 	 * Barrier is to make sure PE is visible in the SPA before it
 	 * is used by the device. It also helps with the global TLBI
@@ -672,6 +801,11 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
 		WARN(1, "Couldn't find pe data when removing PE\n");
 	} else {
 		if (pe_data->mm) {
+			if (link->base) {
+				trace_release_mmu_notifier(pasid, pe_data->mm->context.id);
+				mmu_notifier_unregister(&pe_data->mmu_notifier, pe_data->mm);
+				tlb_invalidate(link, pe_data->mm->context.id, 0ull);
+			}
 			mm_context_remove_copro(pe_data->mm);
 			mmdrop(pe_data->mm);
 		}
diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
index 0bad0a123af6..35d8be3cd270 100644
--- a/drivers/misc/ocxl/ocxl_internal.h
+++ b/drivers/misc/ocxl/ocxl_internal.h
@@ -8,6 +8,48 @@
 #include <linux/list.h>
 #include <misc/ocxl.h>
 
+/* Find left shift from first set bit in mask */
+#define MASK_TO_LSH(m)		(__builtin_ffsl(m) - 1)
+
+/* Set the field selected by mask m in v to val
+ * NOTE: v isn't modified, the combined result is returned
+ */
+#define SETFIELD(m, v, val)				\
+	(((v) & ~(m)) |	((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
+
+#define OCXL_ATSD_TIMEOUT		1	/* in seconds (multiplied by HZ) */
+
+/* 5.9.3.3 TLB Management Instructions - PowerISA tags workbook */
+#define XTS_ATSD_LNCH		0x00
+#define   XTS_ATSD_LNCH_R	PPC_BIT(0)		/* Radix Invalidate */
+#define   XTS_ATSD_LNCH_RIC	PPC_BITMASK(1,2)	/* Radix Invalidation Control
+							 * 0b00 Just invalidate TLB.
+							 * 0b01 Invalidate just Page Walk Cache.
+							 * 0b10 Invalidate TLB, Page Walk Cache, and any
+							 * caching of Partition and Process Table Entries.
+							 */
+#define   XTS_ATSD_LNCH_LP	PPC_BITMASK(3, 10)	/* Number and Page Size of translations to be invalidated (HPT only ?) */
+#define   XTS_ATSD_LNCH_IS	PPC_BITMASK(11, 12)	/* Invalidation Criteria
+							 * 0b00 Invalidate just the target VA.
+							 * 0b01 Invalidate matching PID.
+							 */
+#define   XTS_ATSD_LNCH_PRS	PPC_BIT(13)		/* 0b1: Process Scope, 0b0: Partition Scope */
+#define   XTS_ATSD_LNCH_B	PPC_BIT(14)		/* Invalidation Flag */
+#define   XTS_ATSD_LNCH_AP	PPC_BITMASK(15, 17)	/* Actual Page Size to be invalidated
+							 * 000 4KB
+							 * 101 64KB
+							 * 001 2MB
+							 * 010 1GB
+							 */
+#define   XTS_ATSD_LNCH_L	PPC_BIT(18)		/* Defines the large page select (L=0b0 for 4KB pages, L=0b1 for large pages) */
+#define   XTS_ATSD_LNCH_PID	PPC_BITMASK(19, 38)	/* Process ID */
+#define   XTS_ATSD_LNCH_F	PPC_BIT(39)		/* NoFlush – Assumed to be 0b0 */
+#define   XTS_ATSD_LNCH_OCAPI_SLBI	PPC_BIT(40)
+#define   XTS_ATSD_LNCH_OCAPI_SINGLETON	PPC_BIT(41)
+#define XTS_ATSD_AVA		0x08
+#define   XTS_ATSD_AVA_AVA	PPC_BITMASK(0, 51) /* bits 0:51, instead of 35 */
+#define XTS_ATSD_STAT		0x10
+
 #define MAX_IRQ_PER_LINK	2000
 #define MAX_IRQ_PER_CONTEXT	MAX_IRQ_PER_LINK
 
@@ -84,7 +126,9 @@ struct ocxl_context {
 
 struct ocxl_process_element {
 	__be64 config_state;
-	__be32 reserved1[11];
+	__be32 pasid;
+	__be32 bdf;
+	__be32 reserved1[9];
 	__be32 lpid;
 	__be32 tid;
 	__be32 pid;
diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h
index 17e21cb2addd..6171069d071a 100644
--- a/drivers/misc/ocxl/trace.h
+++ b/drivers/misc/ocxl/trace.h
@@ -8,6 +8,131 @@
 
 #include <linux/tracepoint.h>
 
+
+TRACE_EVENT(ocxl_mmu_notifier_range,
+	TP_PROTO(unsigned long start, unsigned long end, unsigned long pidr),
+	TP_ARGS(start, end, pidr),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, start)
+		__field(unsigned long, end)
+		__field(unsigned long, pidr)
+	),
+
+	TP_fast_assign(
+		__entry->start = start;
+		__entry->end = end;
+		__entry->pidr = pidr;
+	),
+
+	TP_printk("start=0x%lx end=0x%lx pidr=0x%lx",
+		__entry->start,
+		__entry->end,
+		__entry->pidr
+	)
+);
+
+TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_ava,
+	TP_PROTO(u64 val, unsigned long pidr),
+	TP_ARGS(val, pidr),
+
+	TP_STRUCT__entry(
+		__field(u64, val)
+		__field(unsigned long, pidr)
+	),
+
+	TP_fast_assign(
+		__entry->val = val;
+		__entry->pidr = pidr;
+	),
+
+	TP_printk("ATSD AVA: 0x%llx pidr=0x%lx",
+		__entry->val, __entry->pidr
+	)
+);
+
+TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_lnch,
+	TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
+	TP_ARGS(val, addr, pidr),
+
+	TP_STRUCT__entry(
+		__field(u64, val)
+		__field(unsigned long, addr)
+		__field(unsigned long, pidr)
+	),
+
+	TP_fast_assign(
+		__entry->val = val;
+		__entry->addr = addr;
+		__entry->pidr = pidr;
+	),
+
+	TP_printk("ATSD LNCH: 0x%llx addr=0x%lx pidr=0x%lx",
+		__entry->val, __entry->addr, __entry->pidr
+	)
+);
+
+TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_stat,
+	TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
+	TP_ARGS(val, addr, pidr),
+
+	TP_STRUCT__entry(
+		__field(u64, val)
+		__field(unsigned long, addr)
+		__field(unsigned long, pidr)
+	),
+
+	TP_fast_assign(
+		__entry->val = val;
+		__entry->addr = addr;
+		__entry->pidr = pidr;
+	),
+
+	TP_printk("ATSD STAT: 0x%llx addr=0x%lx pidr=0x%lx",
+		__entry->val, __entry->addr, __entry->pidr
+	)
+);
+
+TRACE_EVENT(init_mmu_notifier,
+	TP_PROTO(int pasid, unsigned long pidr),
+	TP_ARGS(pasid, pidr),
+
+	TP_STRUCT__entry(
+		__field(int, pasid)
+		__field(unsigned long, pidr)
+	),
+
+	TP_fast_assign(
+		__entry->pasid = pasid;
+		__entry->pidr = pidr;
+	),
+
+	TP_printk("pasid=%d, pidr=0x%lx",
+		__entry->pasid,
+		__entry->pidr
+	)
+);
+
+TRACE_EVENT(release_mmu_notifier,
+	TP_PROTO(int pasid, unsigned long pidr),
+	TP_ARGS(pasid, pidr),
+
+	TP_STRUCT__entry(
+		__field(int, pasid)
+		__field(unsigned long, pidr)
+	),
+
+	TP_fast_assign(
+		__entry->pasid = pasid;
+		__entry->pidr = pidr;
+	),
+
+	TP_printk("pasid=%d, pidr=0x%lx",
+		__entry->pasid,
+		__entry->pidr
+	)
+);
+
 DECLARE_EVENT_CLASS(ocxl_context,
 	TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr),
 	TP_ARGS(pid, spa, pasid, pidr, tidr),
-- 
2.28.0



More information about the Linuxppc-dev mailing list