[PATCH] ocxl: MMIO invalidation support
Christophe Lombard
clombard at linux.vnet.ibm.com
Sat Nov 14 02:33:33 AEDT 2020
TLBI/SLBI snooping is not used with OpenCAPI 4.0/5.0 because of performance
problems: the PAU would have to process all incoming TLBI/SLBI commands,
which would cause them to back up on the PowerBus.
When the Address Translation Mode requires TLB and SLB Invalidate
operations to be initiated using MMIO registers, a set of registers like
the following is used:
• XTS MMIO ATSD0 LPARID register
• XTS MMIO ATSD0 AVA register
• XTS MMIO ATSD0 launch register, write access initiates a shoot down
• XTS MMIO ATSD0 status register
The MMIO based mechanism also blocks the NPU/PAU from snooping TLBIE
commands from the PowerBus.
The Shootdown commands (ATSD) will be generated using MMIO registers
in the NPU/PAU and sent to the device.
Signed-off-by: Christophe Lombard <clombard at linux.vnet.ibm.com>
---
arch/powerpc/include/asm/pnv-ocxl.h | 2 +
arch/powerpc/platforms/powernv/ocxl.c | 19 +++
drivers/misc/ocxl/link.c | 180 ++++++++++++++++++++++----
drivers/misc/ocxl/ocxl_internal.h | 46 ++++++-
drivers/misc/ocxl/trace.h | 125 ++++++++++++++++++
5 files changed, 348 insertions(+), 24 deletions(-)
diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h
index d37ededca3ee..4a23abcc347b 100644
--- a/arch/powerpc/include/asm/pnv-ocxl.h
+++ b/arch/powerpc/include/asm/pnv-ocxl.h
@@ -28,4 +28,6 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p
void pnv_ocxl_spa_release(void *platform_data);
int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle);
+extern int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
+ uint64_t lpcr);
#endif /* _ASM_PNV_OCXL_H */
diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
index ecdad219d704..100546ea635f 100644
--- a/arch/powerpc/platforms/powernv/ocxl.c
+++ b/arch/powerpc/platforms/powernv/ocxl.c
@@ -483,3 +483,22 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
return rc;
}
EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
+
+int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid, /* map the device to an LPAR through opal_npu_map_lpar() */
+ uint64_t lpcr) /* returns 0 on success, -EINVAL when the OPAL call fails */
+{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
+ struct pnv_phb *phb = hose->private_data;
+ u32 bdfn;
+ int rc;
+
+ bdfn = (dev->bus->number << 8) | dev->devfn; /* bus number in bits 15:8, devfn in bits 7:0 */
+ rc = opal_npu_map_lpar(phb->opal_id, bdfn, lparid, lpcr);
+ if (rc) {
+ dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc);
+ return -EINVAL; /* the OPAL return code is logged above but not propagated */
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar);
diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
index fd73d3bc0eb6..9b5b77d40734 100644
--- a/drivers/misc/ocxl/link.c
+++ b/drivers/misc/ocxl/link.c
@@ -4,6 +4,8 @@
#include <linux/mutex.h>
#include <linux/mm_types.h>
#include <linux/mmu_context.h>
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
#include <asm/copro.h>
#include <asm/pnv-ocxl.h>
#include <asm/xive.h>
@@ -33,6 +35,31 @@
#define SPA_PE_VALID 0x80000000
+struct spa; /* forward declaration: struct ocxl_link only holds a pointer to it */
+
+/*
+ * An opencapi link can be used by several PCI functions. We have
+ * one link per device slot.
+ *
+ * A linked list of opencapi links should suffice, as there's a
+ * limited number of opencapi slots on a system and lookup is only
+ * done when the device is probed
+ */
+struct ocxl_link {
+ struct list_head list;
+ struct kref ref; /* dropped with kref_put() in ocxl_link_release() */
+ int domain;
+ int bus;
+ int dev;
+ u64 mmio_atsd; /* ATSD physical address, read from the "ibm,mmio-atsd" property */
+ void __iomem *base; /* ATSD register virtual address (ioremap() of mmio_atsd) */
+ spinlock_t atsd_lock; /* serializes ATSD shootdowns in tlb_invalidate() */
+ atomic_t irq_available;
+ struct spa *spa;
+ void *platform_data;
+};
+static struct list_head links_list = LIST_HEAD_INIT(links_list); /* list of opencapi links */
+static DEFINE_MUTEX(links_list_lock); /* held around kref_put() in ocxl_link_release() */
struct pe_data {
struct mm_struct *mm;
@@ -41,6 +68,8 @@ struct pe_data {
/* opaque pointer to be passed to the above callback */
void *xsl_err_data;
struct rcu_head rcu;
+ struct ocxl_link *link;
+ struct mmu_notifier mmu_notifier;
};
struct spa {
@@ -69,27 +98,6 @@ struct spa {
} xsl_fault;
};
-/*
- * A opencapi link can be used be by several PCI functions. We have
- * one link per device slot.
- *
- * A linked list of opencapi links should suffice, as there's a
- * limited number of opencapi slots on a system and lookup is only
- * done when the device is probed
- */
-struct ocxl_link {
- struct list_head list;
- struct kref ref;
- int domain;
- int bus;
- int dev;
- atomic_t irq_available;
- struct spa *spa;
- void *platform_data;
-};
-static struct list_head links_list = LIST_HEAD_INIT(links_list);
-static DEFINE_MUTEX(links_list_lock);
-
enum xsl_response {
CONTINUE,
ADDRESS_ERROR,
@@ -126,6 +134,8 @@ static void ack_irq(struct spa *spa, enum xsl_response r)
}
}
+static const struct mmu_notifier_ops ocxl_mmu_notifier_ops;
+
static void xsl_fault_handler_bh(struct work_struct *fault_work)
{
vm_fault_t flt = 0;
@@ -376,6 +386,7 @@ static void free_spa(struct ocxl_link *link)
static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
{
+ struct pci_controller *hose = pci_bus_to_host(dev->bus);
struct ocxl_link *link;
int rc;
@@ -403,6 +414,22 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l
if (rc)
goto err_xsl_irq;
+ /* Since OpenCAPI 5.0, Address Translation Mode requires TLB
+ * and SLB Invalidate operations to be initiated using MMIO
+ * registers
+ */
+ if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
+ 0, &link->mmio_atsd)) {
+ dev_info(&dev->dev, "No available ATSD found\n");
+ }
+ if (link->mmio_atsd) {
+ link->base = ioremap(link->mmio_atsd, 24);
+ if (!link->base)
+ dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", link->mmio_atsd);
+ else
+ pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0);
+ }
+
*out_link = link;
return 0;
@@ -464,12 +491,101 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle)
{
struct ocxl_link *link = (struct ocxl_link *) link_handle;
+ if (link->base) { /* NOTE(review): runs on every release, before the kref is dropped and outside atsd_lock */
+ iounmap(link->base); /* NOTE(review): a link can be shared by several PCI functions; this may unmap ATSD registers still in use - confirm */
+ link->base = NULL; /* later tlb_invalidate() calls on this link return early */
+ }
mutex_lock(&links_list_lock);
kref_put(&link->ref, release_xsl);
mutex_unlock(&links_list_lock);
}
EXPORT_SYMBOL_GPL(ocxl_link_release);
+static void tlb_invalidate(struct ocxl_link *link, /* issue one ATSD shootdown through the link's MMIO registers */
+ unsigned long pid, /* written to the PID field of the launch register */
+ unsigned long addr) /* address to invalidate; 0 selects the whole-PID form below */
+{
+ unsigned long timeout = jiffies + (HZ * OCXL_ATSD_TIMEOUT); /* deadline for the status poll; computed before the lock is taken */
+ uint64_t val;
+ int pend;
+
+ if (!link->base) /* ATSD registers not mapped: nothing to do */
+ return;
+
+ spin_lock(&link->atsd_lock);
+ if (addr) {
+ /* load Abbreviated Virtual Address register with
+ * the necessary value
+ */
+ val = SETFIELD(XTS_ATSD_AVA_AVA, 0ull, addr >> (63-51));
+ out_be64(link->base + XTS_ATSD_AVA, val);
+ eieio();
+ trace_ocxl_mmu_notifier_mmio_atsd_ava(val, pid);
+ }
+
+ /* Build the launch register value; the write access to
+ * that register initiates the TLB Invalidate command
+ */
+ val = XTS_ATSD_LNCH_R;
+ if (addr) {
+ val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b00); /* just invalidate TLB */
+ val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b00); /* just the target VA */
+ } else {
+ val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b10); /* TLB, Page Walk Cache and cached table entries */
+ val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b01); /* everything matching the PID */
+ val |= XTS_ATSD_LNCH_OCAPI_SINGLETON;
+ }
+ val |= XTS_ATSD_LNCH_PRS; /* process scope */
+ val = SETFIELD(XTS_ATSD_LNCH_AP, val, 0b101); /* actual page size: 64KB */
+ val = SETFIELD(XTS_ATSD_LNCH_PID, val, pid);
+ out_be64(link->base + XTS_ATSD_LNCH, val);
+ trace_ocxl_mmu_notifier_mmio_atsd_lnch(val, addr, pid);
+
+ /* Poll the ATSD status register to determine when the
+ * TLB Invalidate has been completed.
+ */
+ val = in_be64(link->base + XTS_ATSD_STAT);
+ pend = val >> 63; /* top bit of the status value is used as the "pending" flag */
+ trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid);
+
+ while (pend) {
+ if (time_after_eq(jiffies, timeout)) {
+ pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n",
+ __func__, val, pid);
+ spin_unlock(&link->atsd_lock);
+ return; /* gives up; the timeout is logged but not reported to the caller */
+ }
+ cpu_relax();
+ val = in_be64(link->base + XTS_ATSD_STAT);
+ pend = val >> 63;
+ }
+ spin_unlock(&link->atsd_lock);
+ trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid); /* final status value, traced after the lock is released */
+}
+
+static void invalidate_range_end(struct mmu_notifier *mn, /* MMU notifier callback: shoots down the notified range page by page */
+ const struct mmu_notifier_range *range)
+{
+ struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier); /* the notifier is embedded in pe_data */
+ struct ocxl_link *link = pe_data->link;
+ struct mm_struct *mm = mn->mm;
+ unsigned long addr, pid, page_size = PAGE_SIZE;
+
+ pid = mm->context.id;
+ trace_ocxl_mmu_notifier_range(range->start, range->end, pid);
+
+ for (addr = range->start; addr < range->end; addr += page_size) /* one ATSD per PAGE_SIZE step */
+ tlb_invalidate(link, pid, addr); /* NOTE(review): addr == 0 takes tlb_invalidate()'s whole-PID path */
+}
+
+static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = { /* installed in pe_data->mmu_notifier.ops by ocxl_link_add_pe() */
+ /* invalidate_range_end() is called when all pages in the
+ * range have been unmapped and the pages have been freed by
+ * the VM
+ */
+ .invalidate_range_end = invalidate_range_end,
+};
+
static u64 calculate_cfg_state(bool kernel)
{
u64 state;
@@ -517,7 +633,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
goto unlock;
}
- pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
+ pe_data = kzalloc(sizeof(*pe_data), GFP_KERNEL);
if (!pe_data) {
rc = -ENOMEM;
goto unlock;
@@ -526,9 +642,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
pe_data->mm = mm;
pe_data->xsl_err_cb = xsl_err_cb;
pe_data->xsl_err_data = xsl_err_data;
+ pe_data->link = link;
+ pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
memset(pe, 0, sizeof(struct ocxl_process_element));
pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
+ pe->pasid = cpu_to_be32(pasid << (31 - 19));
+ pe->bdf = cpu_to_be32(1 << (31 - 15));
pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
pe->pid = cpu_to_be32(pidr);
pe->tid = cpu_to_be32(tidr);
@@ -540,8 +660,17 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
* by the nest MMU. If we have a kernel context, TLBIs are
* already global.
*/
- if (mm)
+ if (mm) {
mm_context_add_copro(mm);
+ if (link->base) {
+ /* Use MMIO registers for the TLB and SLB
+ * Invalidate operations.
+ */
+ trace_init_mmu_notifier(pasid, mm->context.id);
+ mmu_notifier_register(&pe_data->mmu_notifier, mm);
+ }
+ }
+
/*
* Barrier is to make sure PE is visible in the SPA before it
* is used by the device. It also helps with the global TLBI
@@ -672,6 +801,11 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
WARN(1, "Couldn't find pe data when removing PE\n");
} else {
if (pe_data->mm) {
+ if (link->base) {
+ trace_release_mmu_notifier(pasid, pe_data->mm->context.id);
+ mmu_notifier_unregister(&pe_data->mmu_notifier, pe_data->mm);
+ tlb_invalidate(link, pe_data->mm->context.id, 0ull);
+ }
mm_context_remove_copro(pe_data->mm);
mmdrop(pe_data->mm);
}
diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
index 0bad0a123af6..35d8be3cd270 100644
--- a/drivers/misc/ocxl/ocxl_internal.h
+++ b/drivers/misc/ocxl/ocxl_internal.h
@@ -8,6 +8,48 @@
#include <linux/list.h>
#include <misc/ocxl.h>
+/* Shift count (0-based) of the lowest set bit in mask m */
+#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1)
+
+/* Set the field selected by mask m in v to val
+ * NOTE: v isn't modified, the combined result is returned
+ */
+#define SETFIELD(m, v, val) \
+ (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
+
+#define OCXL_ATSD_TIMEOUT 1 /* in seconds: multiplied by HZ to build the polling deadline */
+
+/* 5.9.3.3 TLB Management Instructions - PowerISA tags workbook */
+#define XTS_ATSD_LNCH 0x00 /* launch register, offset from the ATSD base */
+#define XTS_ATSD_LNCH_R PPC_BIT(0) /* Radix Invalidate */
+#define XTS_ATSD_LNCH_RIC PPC_BITMASK(1,2) /* Radix Invalidation Control
+ * 0b00 Just invalidate TLB.
+ * 0b01 Invalidate just Page Walk Cache.
+ * 0b10 Invalidate TLB, Page Walk Cache, and any
+ * caching of Partition and Process Table Entries.
+ */
+#define XTS_ATSD_LNCH_LP PPC_BITMASK(3, 10) /* Number and Page Size of translations to be invalidated (HPT only ?) */
+#define XTS_ATSD_LNCH_IS PPC_BITMASK(11, 12) /* Invalidation Criteria
+ * 0b00 Invalidate just the target VA.
+ * 0b01 Invalidate matching PID.
+ */
+#define XTS_ATSD_LNCH_PRS PPC_BIT(13) /* 0b1: Process Scope, 0b0: Partition Scope */
+#define XTS_ATSD_LNCH_B PPC_BIT(14) /* Invalidation Flag */
+#define XTS_ATSD_LNCH_AP PPC_BITMASK(15, 17) /* Actual Page Size to be invalidated
+ * 000 4KB
+ * 101 64KB
+ * 001 2MB
+ * 010 1GB
+ */
+#define XTS_ATSD_LNCH_L PPC_BIT(18) /* Defines the large page select (L=0b0 for 4KB pages, L=0b1 for large pages) */
+#define XTS_ATSD_LNCH_PID PPC_BITMASK(19, 38) /* Process ID */
+#define XTS_ATSD_LNCH_F PPC_BIT(39) /* NoFlush – Assumed to be 0b0 */
+#define XTS_ATSD_LNCH_OCAPI_SLBI PPC_BIT(40)
+#define XTS_ATSD_LNCH_OCAPI_SINGLETON PPC_BIT(41)
+#define XTS_ATSD_AVA 0x08 /* Abbreviated Virtual Address register, offset from the ATSD base */
+#define XTS_ATSD_AVA_AVA PPC_BITMASK(0, 51) /* instead of 35 */
+#define XTS_ATSD_STAT 0x10 /* status register, offset from the ATSD base */
+
#define MAX_IRQ_PER_LINK 2000
#define MAX_IRQ_PER_CONTEXT MAX_IRQ_PER_LINK
@@ -84,7 +126,9 @@ struct ocxl_context {
struct ocxl_process_element {
__be64 config_state;
- __be32 reserved1[11];
+ __be32 pasid;
+ __be32 bdf;
+ __be32 reserved1[9];
__be32 lpid;
__be32 tid;
__be32 pid;
diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h
index 17e21cb2addd..6171069d071a 100644
--- a/drivers/misc/ocxl/trace.h
+++ b/drivers/misc/ocxl/trace.h
@@ -8,6 +8,131 @@
#include <linux/tracepoint.h>
+
+TRACE_EVENT(ocxl_mmu_notifier_range, /* emitted by invalidate_range_end() with the notified range and the PID */
+ TP_PROTO(unsigned long start, unsigned long end, unsigned long pidr),
+ TP_ARGS(start, end, pidr),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, start)
+ __field(unsigned long, end)
+ __field(unsigned long, pidr)
+ ),
+
+ TP_fast_assign(
+ __entry->start = start;
+ __entry->end = end;
+ __entry->pidr = pidr;
+ ),
+
+ TP_printk("start=0x%lx end=0x%lx pidr=0x%lx",
+ __entry->start,
+ __entry->end,
+ __entry->pidr
+ )
+);
+
+TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_ava, /* emitted by tlb_invalidate() with the value written to the ATSD AVA register */
+ TP_PROTO(u64 val, unsigned long pidr),
+ TP_ARGS(val, pidr),
+
+ TP_STRUCT__entry(
+ __field(u64, val)
+ __field(unsigned long, pidr)
+ ),
+
+ TP_fast_assign(
+ __entry->val = val;
+ __entry->pidr = pidr;
+ ),
+
+ TP_printk("ATSD AVA: 0x%llx pidr=0x%lx",
+ __entry->val, __entry->pidr
+ )
+);
+
+TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_lnch, /* emitted by tlb_invalidate() with the value written to the ATSD launch register */
+ TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
+ TP_ARGS(val, addr, pidr),
+
+ TP_STRUCT__entry(
+ __field(u64, val)
+ __field(unsigned long, addr)
+ __field(unsigned long, pidr)
+ ),
+
+ TP_fast_assign(
+ __entry->val = val;
+ __entry->addr = addr;
+ __entry->pidr = pidr;
+ ),
+
+ TP_printk("ATSD LNCH: 0x%llx addr=0x%lx pidr=0x%lx",
+ __entry->val, __entry->addr, __entry->pidr
+ )
+);
+
+TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_stat, /* emitted by tlb_invalidate() with a value read from the ATSD status register */
+ TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
+ TP_ARGS(val, addr, pidr),
+
+ TP_STRUCT__entry(
+ __field(u64, val)
+ __field(unsigned long, addr)
+ __field(unsigned long, pidr)
+ ),
+
+ TP_fast_assign(
+ __entry->val = val;
+ __entry->addr = addr;
+ __entry->pidr = pidr;
+ ),
+
+ TP_printk("ATSD STAT: 0x%llx addr=0x%lx pidr=0x%lx",
+ __entry->val, __entry->addr, __entry->pidr
+ )
+);
+
+TRACE_EVENT(init_mmu_notifier, /* emitted by ocxl_link_add_pe() just before mmu_notifier_register() */
+ TP_PROTO(int pasid, unsigned long pidr),
+ TP_ARGS(pasid, pidr),
+
+ TP_STRUCT__entry(
+ __field(int, pasid)
+ __field(unsigned long, pidr)
+ ),
+
+ TP_fast_assign(
+ __entry->pasid = pasid;
+ __entry->pidr = pidr;
+ ),
+
+ TP_printk("pasid=%d, pidr=0x%lx",
+ __entry->pasid,
+ __entry->pidr
+ )
+);
+
+TRACE_EVENT(release_mmu_notifier, /* emitted by ocxl_link_remove_pe() just before mmu_notifier_unregister() */
+ TP_PROTO(int pasid, unsigned long pidr),
+ TP_ARGS(pasid, pidr),
+
+ TP_STRUCT__entry(
+ __field(int, pasid)
+ __field(unsigned long, pidr)
+ ),
+
+ TP_fast_assign(
+ __entry->pasid = pasid;
+ __entry->pidr = pidr;
+ ),
+
+ TP_printk("pasid=%d, pidr=0x%lx",
+ __entry->pasid,
+ __entry->pidr
+ )
+);
+
DECLARE_EVENT_CLASS(ocxl_context,
TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr),
TP_ARGS(pid, spa, pasid, pidr, tidr),
--
2.28.0
More information about the Linuxppc-dev
mailing list