[PATCH] ocxl: Mmio invalidation support
Frederic Barrat
fbarrat at linux.ibm.com
Tue Nov 17 04:51:32 AEDT 2020
On 13/11/2020 16:33, Christophe Lombard wrote:
> OpenCAPI 4.0/5.0 with TLBI/SLBI Snooping, is not used due to performance
> problems caused by the PAU having to process all incoming TLBI/SLBI
> commands which will cause them to back up on the PowerBus.
>
> When the Address Translation Mode requires TLB and SLB Invalidate
> operations to be initiated using MMIO registers, a set of registers like
> the following is used:
> • XTS MMIO ATSD0 LPARID register
> • XTS MMIO ATSD0 AVA register
> • XTS MMIO ATSD0 launch register, write access initiates a shoot down
> • XTS MMIO ATSD0 status register
>
> The MMIO based mechanism also blocks the NPU/PAU from snooping TLBIE
> commands from the PowerBus.
>
> The Shootdown commands (ATSD) will be generated using MMIO registers
> in the NPU/PAU and sent to the device.
>
> Signed-off-by: Christophe Lombard <clombard at linux.vnet.ibm.com>
> ---
> arch/powerpc/include/asm/pnv-ocxl.h | 2 +
> arch/powerpc/platforms/powernv/ocxl.c | 19 +++
> drivers/misc/ocxl/link.c | 180 ++++++++++++++++++++++----
> drivers/misc/ocxl/ocxl_internal.h | 46 ++++++-
> drivers/misc/ocxl/trace.h | 125 ++++++++++++++++++
> 5 files changed, 348 insertions(+), 24 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/pnv-ocxl.h b/arch/powerpc/include/asm/pnv-ocxl.h
> index d37ededca3ee..4a23abcc347b 100644
> --- a/arch/powerpc/include/asm/pnv-ocxl.h
> +++ b/arch/powerpc/include/asm/pnv-ocxl.h
> @@ -28,4 +28,6 @@ int pnv_ocxl_spa_setup(struct pci_dev *dev, void *spa_mem, int PE_mask, void **p
> void pnv_ocxl_spa_release(void *platform_data);
> int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle);
>
> +extern int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
> + uint64_t lpcr);
"extern" is useless
> #endif /* _ASM_PNV_OCXL_H */
> diff --git a/arch/powerpc/platforms/powernv/ocxl.c b/arch/powerpc/platforms/powernv/ocxl.c
> index ecdad219d704..100546ea635f 100644
> --- a/arch/powerpc/platforms/powernv/ocxl.c
> +++ b/arch/powerpc/platforms/powernv/ocxl.c
> @@ -483,3 +483,22 @@ int pnv_ocxl_spa_remove_pe_from_cache(void *platform_data, int pe_handle)
> return rc;
> }
> EXPORT_SYMBOL_GPL(pnv_ocxl_spa_remove_pe_from_cache);
> +
> +int pnv_ocxl_map_lpar(struct pci_dev *dev, uint64_t lparid,
> + uint64_t lpcr)
> +{
> + struct pci_controller *hose = pci_bus_to_host(dev->bus);
> + struct pnv_phb *phb = hose->private_data;
> + u32 bdfn;
> + int rc;
> +
> + bdfn = (dev->bus->number << 8) | dev->devfn;
I was told a bit too late that pci_dev_id() exists, so we should
probably use from now on.
> + rc = opal_npu_map_lpar(phb->opal_id, bdfn, lparid, lpcr);
> + if (rc) {
> + dev_err(&dev->dev, "Error mapping device to LPAR: %d\n", rc);
> + return -EINVAL;
> + }
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(pnv_ocxl_map_lpar);
> diff --git a/drivers/misc/ocxl/link.c b/drivers/misc/ocxl/link.c
> index fd73d3bc0eb6..9b5b77d40734 100644
> --- a/drivers/misc/ocxl/link.c
> +++ b/drivers/misc/ocxl/link.c
Overall, there are many changes in that file and it would help the
review if it could be broken up in a set of smaller patches.
> @@ -4,6 +4,8 @@
> #include <linux/mutex.h>
> #include <linux/mm_types.h>
> #include <linux/mmu_context.h>
> +#include <linux/mm.h>
> +#include <linux/mmu_notifier.h>
> #include <asm/copro.h>
> #include <asm/pnv-ocxl.h>
> #include <asm/xive.h>
> @@ -33,6 +35,31 @@
>
> #define SPA_PE_VALID 0x80000000
>
> +struct spa;
> +
> +/*
> + * A opencapi link can be used be by several PCI functions. We have
> + * one link per device slot.
> + *
> + * A linked list of opencapi links should suffice, as there's a
> + * limited number of opencapi slots on a system and lookup is only
> + * done when the device is probed
> + */
> +struct ocxl_link {
> + struct list_head list;
> + struct kref ref;
> + int domain;
> + int bus;
> + int dev;
> + u64 mmio_atsd; /* ATSD physical address */
> + void __iomem *base; /* ATSD register virtual address */
> + spinlock_t atsd_lock; // to serialize shootdowns
> + atomic_t irq_available;
> + struct spa *spa;
> + void *platform_data;
> +};
> +static struct list_head links_list = LIST_HEAD_INIT(links_list);
> +static DEFINE_MUTEX(links_list_lock);
>
> struct pe_data {
> struct mm_struct *mm;
> @@ -41,6 +68,8 @@ struct pe_data {
> /* opaque pointer to be passed to the above callback */
> void *xsl_err_data;
> struct rcu_head rcu;
> + struct ocxl_link *link;
> + struct mmu_notifier mmu_notifier;
> };
>
> struct spa {
> @@ -69,27 +98,6 @@ struct spa {
> } xsl_fault;
> };
>
> -/*
> - * A opencapi link can be used be by several PCI functions. We have
> - * one link per device slot.
> - *
> - * A linked list of opencapi links should suffice, as there's a
> - * limited number of opencapi slots on a system and lookup is only
> - * done when the device is probed
> - */
> -struct ocxl_link {
> - struct list_head list;
> - struct kref ref;
> - int domain;
> - int bus;
> - int dev;
> - atomic_t irq_available;
> - struct spa *spa;
> - void *platform_data;
> -};
> -static struct list_head links_list = LIST_HEAD_INIT(links_list);
> -static DEFINE_MUTEX(links_list_lock);
> -
> enum xsl_response {
> CONTINUE,
> ADDRESS_ERROR,
> @@ -126,6 +134,8 @@ static void ack_irq(struct spa *spa, enum xsl_response r)
> }
> }
>
> +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops;
> +
> static void xsl_fault_handler_bh(struct work_struct *fault_work)
> {
> vm_fault_t flt = 0;
> @@ -376,6 +386,7 @@ static void free_spa(struct ocxl_link *link)
>
> static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_link)
> {
> + struct pci_controller *hose = pci_bus_to_host(dev->bus);
> struct ocxl_link *link;
> int rc;
>
> @@ -403,6 +414,22 @@ static int alloc_link(struct pci_dev *dev, int PE_mask, struct ocxl_link **out_l
> if (rc)
> goto err_xsl_irq;
>
> + /* Since OpenCAPI 5.0, Address Translation Mode requires TLB
> + * and SLB Invalidate operations to be initiated using MMIO
> + * registers
> + */
> + if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd",
> + 0, &link->mmio_atsd)) {
> + dev_info(&dev->dev, "No available ATSD found\n");
> + }
> + if (link->mmio_atsd) {
> + link->base = ioremap(link->mmio_atsd, 24);
> + if (!link->base)
> + dev_warn(&dev->dev, "ioremap failed - mmio_atsd: %#llx\n", link->mmio_atsd);
> + else
> + pnv_ocxl_map_lpar(dev, mfspr(SPRN_LPID), 0);
> + }
To stay coherent with how we've done things like interrupts, the
device tree parsing and ioremap() for the shootdown page should be
done in a platform-specific file (powernv)
Also, could we get a more descriptive name than link->base?
> +
> *out_link = link;
> return 0;
>
> @@ -464,12 +491,101 @@ void ocxl_link_release(struct pci_dev *dev, void *link_handle)
> {
> struct ocxl_link *link = (struct ocxl_link *) link_handle;
>
> + if (link->base) {
> + iounmap(link->base);
> + link->base = NULL;
> + }
What's allocated in alloc_link(), i.e. the shootdown page, should be
released in release_xsl() and not in ocxl_link_release() (it's too
early in case of several functions).
> mutex_lock(&links_list_lock);
> kref_put(&link->ref, release_xsl);
> mutex_unlock(&links_list_lock);
> }
> EXPORT_SYMBOL_GPL(ocxl_link_release);
>
> +static void tlb_invalidate(struct ocxl_link *link,
> + unsigned long pid,
> + unsigned long addr)
> +{
> + unsigned long timeout = jiffies + (HZ * OCXL_ATSD_TIMEOUT);
> + uint64_t val;
> + int pend;
> +
> + if (!link->base)
> + return;
> +
> + spin_lock(&link->atsd_lock);
Looks like atsd_lock is not initialized.
> + if (addr) {
> + /* load Abbreviated Virtual Address register with
> + * the necessary value
> + */
> + val = SETFIELD(XTS_ATSD_AVA_AVA, 0ull, addr >> (63-51));
> + out_be64(link->base + XTS_ATSD_AVA, val);
> + eieio();
That eieio() needs a comment.
> + trace_ocxl_mmu_notifier_mmio_atsd_ava(val, pid);
> + }
> +
> + /* Write access initiates a shoot down to initiate the
> + * TLB Invalidate command
> + */
> + val = XTS_ATSD_LNCH_R;
All that code should be in a platform-specific file.
> + if (addr) {
> + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b00);
> + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b00);
> + } else {
> + val = SETFIELD(XTS_ATSD_LNCH_RIC, val, 0b10);
> + val = SETFIELD(XTS_ATSD_LNCH_IS, val, 0b01);
> + val |= XTS_ATSD_LNCH_OCAPI_SINGLETON;
> + }
> + val |= XTS_ATSD_LNCH_PRS;
> + val = SETFIELD(XTS_ATSD_LNCH_AP, val, 0b101);
> + val = SETFIELD(XTS_ATSD_LNCH_PID, val, pid);
> + out_be64(link->base + XTS_ATSD_LNCH, val);
> + trace_ocxl_mmu_notifier_mmio_atsd_lnch(val, addr, pid);
> +
> + /* Poll the ATSD status register to determine when the
> + * TLB Invalidate has been completed.
> + */
> + val = in_be64(link->base + XTS_ATSD_STAT);
> + pend = val >> 63;
> + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid);
> +
> + while (pend) {
> + if (time_after_eq(jiffies, timeout)) {
> + pr_err("%s - Timeout while reading XTS MMIO ATSD status register (val=%#llx, pidr=0x%lx)\n",
> + __func__, val, pid);
> + spin_unlock(&link->atsd_lock);
> + return;
> + }
> + cpu_relax();
> + val = in_be64(link->base + XTS_ATSD_STAT);
> + pend = val >> 63;
> + }
> + spin_unlock(&link->atsd_lock);
> + trace_ocxl_mmu_notifier_mmio_atsd_stat(val, addr, pid);
> +}
> +
> +static void invalidate_range_end(struct mmu_notifier *mn,
> + const struct mmu_notifier_range *range)
> +{
> + struct pe_data *pe_data = container_of(mn, struct pe_data, mmu_notifier);
> + struct ocxl_link *link = pe_data->link;
> + struct mm_struct *mm = mn->mm;
> + unsigned long addr, pid, page_size = PAGE_SIZE;
> +
> + pid = mm->context.id;
> + trace_ocxl_mmu_notifier_range(range->start, range->end, pid);
> +
> + for (addr = range->start; addr < range->end; addr += page_size)
> + tlb_invalidate(link, pid, addr);
> +}
> +
> +static const struct mmu_notifier_ops ocxl_mmu_notifier_ops = {
> + /* invalidate_range_end() is called when all pages in the
> + * range have been unmapped and the pages have been freed by
> + * the VM
> + */
> + .invalidate_range_end = invalidate_range_end,
> +};
> +
See comments in mmu_notifier.h. Looks like we shouldn't be using
invalidate_range_end(), it's too late
Did we test abrupt death of the host process? Do we need to also use
.release()?
> static u64 calculate_cfg_state(bool kernel)
> {
> u64 state;
> @@ -517,7 +633,7 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
> goto unlock;
> }
>
> - pe_data = kmalloc(sizeof(*pe_data), GFP_KERNEL);
> + pe_data = kzalloc(sizeof(*pe_data), GFP_KERNEL);
> if (!pe_data) {
> rc = -ENOMEM;
> goto unlock;
> @@ -526,9 +642,13 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
> pe_data->mm = mm;
> pe_data->xsl_err_cb = xsl_err_cb;
> pe_data->xsl_err_data = xsl_err_data;
> + pe_data->link = link;
> + pe_data->mmu_notifier.ops = &ocxl_mmu_notifier_ops;
>
> memset(pe, 0, sizeof(struct ocxl_process_element));
> pe->config_state = cpu_to_be64(calculate_cfg_state(pidr == 0));
> + pe->pasid = cpu_to_be32(pasid << (31 - 19));
> + pe->bdf = cpu_to_be32(1 << (31 - 15));
why do we hard-code bdf 1?
> pe->lpid = cpu_to_be32(mfspr(SPRN_LPID));
> pe->pid = cpu_to_be32(pidr);
> pe->tid = cpu_to_be32(tidr);
> @@ -540,8 +660,17 @@ int ocxl_link_add_pe(void *link_handle, int pasid, u32 pidr, u32 tidr,
> * by the nest MMU. If we have a kernel context, TLBIs are
> * already global.
> */
> - if (mm)
> + if (mm) {
> mm_context_add_copro(mm);
> + if (link->base) {
> + /* Use MMIO registers for the TLB and SLB
> + * Invalidate operations.
> + */
> + trace_init_mmu_notifier(pasid, mm->context.id);
> + mmu_notifier_register(&pe_data->mmu_notifier, mm);
> + }
> + }
> +
> /*
> * Barrier is to make sure PE is visible in the SPA before it
> * is used by the device. It also helps with the global TLBI
> @@ -672,6 +801,11 @@ int ocxl_link_remove_pe(void *link_handle, int pasid)
> WARN(1, "Couldn't find pe data when removing PE\n");
> } else {
> if (pe_data->mm) {
> + if (link->base) {
> + trace_release_mmu_notifier(pasid, pe_data->mm->context.id);
> + mmu_notifier_unregister(&pe_data->mmu_notifier, pe_data->mm);
> + tlb_invalidate(link, pe_data->mm->context.id, 0ull);
> + }
> mm_context_remove_copro(pe_data->mm);
> mmdrop(pe_data->mm);
> }
> diff --git a/drivers/misc/ocxl/ocxl_internal.h b/drivers/misc/ocxl/ocxl_internal.h
> index 0bad0a123af6..35d8be3cd270 100644
> --- a/drivers/misc/ocxl/ocxl_internal.h
> +++ b/drivers/misc/ocxl/ocxl_internal.h
> @@ -8,6 +8,48 @@
> #include <linux/list.h>
> #include <misc/ocxl.h>
>
> +/* Find left shift from first set bit in mask */
> +#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1)
> +
> +/* Set field fname of oval to fval
> + * NOTE: oval isn't modified, the combined result is returned
> + */
> +#define SETFIELD(m, v, val) \
> + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m)))
> +
> +#define OCXL_ATSD_TIMEOUT 1
> +
> +/* 5.9.3.3 TLB Management Instructions - PowerISA tags workbook */
> +#define XTS_ATSD_LNCH 0x00
> +#define XTS_ATSD_LNCH_R PPC_BIT(0) /* Radix Invalidate */
> +#define XTS_ATSD_LNCH_RIC PPC_BITMASK(1,2) /* Radix Invalidation Control
> + * 0b00 Just invalidate TLB.
> + * 0b01 Invalidate just Page Walk Cache.
> + * 0b10 Invalidate TLB, Page Walk Cache, and any
> + * caching of Partition and Process Table Entries.
> + */
> +#define XTS_ATSD_LNCH_LP PPC_BITMASK(3, 10) /* Number and Page Size of translations to be invalidated (HPT only ?) */
> +#define XTS_ATSD_LNCH_IS PPC_BITMASK(11, 12) /* Invalidation Criteria
> + * 0b00 Invalidate just the target VA.
> + * 0b01 Invalidate matching PID.
> + */
> +#define XTS_ATSD_LNCH_PRS PPC_BIT(13) /* 0b1: Process Scope, 0b0: Partition Scope */
> +#define XTS_ATSD_LNCH_B PPC_BIT(14) /* Invalidation Flag */
> +#define XTS_ATSD_LNCH_AP PPC_BITMASK(15, 17) /* Actual Page Size to be invalidated
> + * 000 4KB
> + * 101 64KB
> + * 001 2MB
> + * 010 1GB
> + */
> +#define XTS_ATSD_LNCH_L PPC_BIT(18) /* Defines the large page select (L=0b0 for 4KB pages, L=0b1 for large pages) */
> +#define XTS_ATSD_LNCH_PID PPC_BITMASK(19, 38) /* Process ID */
> +#define XTS_ATSD_LNCH_F PPC_BIT(39) /* NoFlush – Assumed to be 0b0 */
> +#define XTS_ATSD_LNCH_OCAPI_SLBI PPC_BIT(40)
> +#define XTS_ATSD_LNCH_OCAPI_SINGLETON PPC_BIT(41)
> +#define XTS_ATSD_AVA 0x08
> +#define XTS_ATSD_AVA_AVA PPC_BITMASK(0, 51) /* au lieu de 35*/
> +#define XTS_ATSD_STAT 0x10
> +
> #define MAX_IRQ_PER_LINK 2000
> #define MAX_IRQ_PER_CONTEXT MAX_IRQ_PER_LINK
>
> @@ -84,7 +126,9 @@ struct ocxl_context {
>
> struct ocxl_process_element {
> __be64 config_state;
> - __be32 reserved1[11];
> + __be32 pasid;
> + __be32 bdf;
> + __be32 reserved1[9];
> __be32 lpid;
> __be32 tid;
> __be32 pid;
> diff --git a/drivers/misc/ocxl/trace.h b/drivers/misc/ocxl/trace.h
> index 17e21cb2addd..6171069d071a 100644
> --- a/drivers/misc/ocxl/trace.h
> +++ b/drivers/misc/ocxl/trace.h
I haven't looked at this yet. It should probably be in a separate patch.
Fred
> @@ -8,6 +8,131 @@
>
> #include <linux/tracepoint.h>
>
> +
> +TRACE_EVENT(ocxl_mmu_notifier_range,
> + TP_PROTO(unsigned long start, unsigned long end, unsigned long pidr),
> + TP_ARGS(start, end, pidr),
> +
> + TP_STRUCT__entry(
> + __field(unsigned long, start)
> + __field(unsigned long, end)
> + __field(unsigned long, pidr)
> + ),
> +
> + TP_fast_assign(
> + __entry->start = start;
> + __entry->end = end;
> + __entry->pidr = pidr;
> + ),
> +
> + TP_printk("start=0x%lx end=0x%lx pidr=0x%lx",
> + __entry->start,
> + __entry->end,
> + __entry->pidr
> + )
> +);
> +
> +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_ava,
> + TP_PROTO(u64 val, unsigned long pidr),
> + TP_ARGS(val, pidr),
> +
> + TP_STRUCT__entry(
> + __field(u64, val)
> + __field(unsigned long, pidr)
> + ),
> +
> + TP_fast_assign(
> + __entry->val = val;
> + __entry->pidr = pidr;
> + ),
> +
> + TP_printk("ATSD AVA: 0x%llx pidr=0x%lx",
> + __entry->val, __entry->pidr
> + )
> +);
> +
> +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_lnch,
> + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
> + TP_ARGS(val, addr, pidr),
> +
> + TP_STRUCT__entry(
> + __field(u64, val)
> + __field(unsigned long, addr)
> + __field(unsigned long, pidr)
> + ),
> +
> + TP_fast_assign(
> + __entry->val = val;
> + __entry->addr = addr;
> + __entry->pidr = pidr;
> + ),
> +
> + TP_printk("ATSD LNCH: 0x%llx addr=0x%lx pidr=0x%lx",
> + __entry->val, __entry->addr, __entry->pidr
> + )
> +);
> +
> +TRACE_EVENT(ocxl_mmu_notifier_mmio_atsd_stat,
> + TP_PROTO(u64 val, unsigned long addr, unsigned long pidr),
> + TP_ARGS(val, addr, pidr),
> +
> + TP_STRUCT__entry(
> + __field(u64, val)
> + __field(unsigned long, addr)
> + __field(unsigned long, pidr)
> + ),
> +
> + TP_fast_assign(
> + __entry->val = val;
> + __entry->addr = addr;
> + __entry->pidr = pidr;
> + ),
> +
> + TP_printk("ATSD STAT: 0x%llx addr=0x%lx pidr=0x%lx",
> + __entry->val, __entry->addr, __entry->pidr
> + )
> +);
> +
> +TRACE_EVENT(init_mmu_notifier,
> + TP_PROTO(int pasid, unsigned long pidr),
> + TP_ARGS(pasid, pidr),
> +
> + TP_STRUCT__entry(
> + __field(int, pasid)
> + __field(unsigned long, pidr)
> + ),
> +
> + TP_fast_assign(
> + __entry->pasid = pasid;
> + __entry->pidr = pidr;
> + ),
> +
> + TP_printk("pasid=%d, pidr=0x%lx",
> + __entry->pasid,
> + __entry->pidr
> + )
> +);
> +
> +TRACE_EVENT(release_mmu_notifier,
> + TP_PROTO(int pasid, unsigned long pidr),
> + TP_ARGS(pasid, pidr),
> +
> + TP_STRUCT__entry(
> + __field(int, pasid)
> + __field(unsigned long, pidr)
> + ),
> +
> + TP_fast_assign(
> + __entry->pasid = pasid;
> + __entry->pidr = pidr;
> + ),
> +
> + TP_printk("pasid=%d, pidr=0x%lx",
> + __entry->pasid,
> + __entry->pidr
> + )
> +);
> +
> DECLARE_EVENT_CLASS(ocxl_context,
> TP_PROTO(pid_t pid, void *spa, int pasid, u32 pidr, u32 tidr),
> TP_ARGS(pid, spa, pasid, pidr, tidr),
>
More information about the Linuxppc-dev
mailing list