[PATCH kernel v3 2/3] powerpc/powernv/ioda2: Allocate TCE table levels on demand for default DMA window
alistair at popple.id.au
alistair at popple.id.au
Mon Jul 8 17:01:16 AEST 2019
It seems this is mostly just enabling already existing code, used by KVM for
on-demand TCE level allocation, on baremetal as well. Given that, I suppose the
implementation of the on-demand allocation itself is already used and
therefore somewhat tested by KVM. I took a look at pnv_tce(), which does the
on-demand allocation, and the changes here seem like they should work with
that, so:
Reviewed-by: Alistair Popple <alistair at popple.id.au>
On Thursday, 30 May 2019 5:03:54 PM AEST Alexey Kardashevskiy wrote:
> We allocate only the first level of multilevel TCE tables for KVM
> already (alloc_userspace_copy==true), and the rest is allocated on
> demand.
> This is not enabled though for baremetal.
>
> This removes the KVM limitation (implicit, via the alloc_userspace_copy
> parameter) and always allocates just the first level. The on-demand
> allocation of missing levels is already implemented.
>
> As from now on a DMA map might happen with interrupts disabled, this
> allocates TCEs with GFP_ATOMIC.
>
> To save time when creating a new clean table, this skips non-allocated
> indirect TCE entries in pnv_tce_free just like we already do in
> the VFIO IOMMU TCE driver.
>
> This changes the default level number from 1 to 2 to reduce the amount
> of memory required for the default 32bit DMA window at boot time.
> The default window size is up to 2GB, which requires 4MB of TCEs; this is
> unlikely to be used entirely, or at all, as most devices these days are
> 64bit capable, so by switching to 2 levels by default we save 4032KB of
> RAM per device.
>
> While at it, add __GFP_NOWARN to alloc_pages_node() as userspace
> can trigger this path via VFIO, see the failure, and try creating a table
> again with different parameters, which might succeed.
>
> Signed-off-by: Alexey Kardashevskiy <aik at ozlabs.ru>
> ---
> Changes:
> v2:
> * added __GFP_NOWARN to alloc_pages_node
> ---
> arch/powerpc/platforms/powernv/pci.h | 2 +-
> arch/powerpc/platforms/powernv/pci-ioda-tce.c | 20 +++++++++----------
> 2 files changed, 11 insertions(+), 11 deletions(-)
>
> diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
> index 1a51e7bfc541..a55dabc8d057 100644
> --- a/arch/powerpc/platforms/powernv/pci.h
> +++ b/arch/powerpc/platforms/powernv/pci.h
> @@ -225,7 +225,7 @@ extern struct iommu_table_group *pnv_npu_compound_attach(
> struct pnv_ioda_pe *pe);
>
> /* pci-ioda-tce.c */
> -#define POWERNV_IOMMU_DEFAULT_LEVELS 1
> +#define POWERNV_IOMMU_DEFAULT_LEVELS 2
> #define POWERNV_IOMMU_MAX_LEVELS 5
>
> extern int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
> diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> index e28f03e1eb5e..c75ec37bf0cd 100644
> --- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> +++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
> @@ -36,7 +36,8 @@ static __be64 *pnv_alloc_tce_level(int nid, unsigned int shift)
> struct page *tce_mem = NULL;
> __be64 *addr;
>
> - tce_mem = alloc_pages_node(nid, GFP_KERNEL, shift - PAGE_SHIFT);
> + tce_mem = alloc_pages_node(nid, GFP_ATOMIC | __GFP_NOWARN,
> + shift - PAGE_SHIFT);
> if (!tce_mem) {
> pr_err("Failed to allocate a TCE memory, level shift=%d\n",
> shift);
> @@ -161,6 +162,9 @@ void pnv_tce_free(struct iommu_table *tbl, long index, long npages)
>
> if (ptce)
> *ptce = cpu_to_be64(0);
> + else
> + /* Skip the rest of the level */
> + i |= tbl->it_level_size - 1;
> }
> }
>
> @@ -260,7 +264,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
> unsigned int table_shift = max_t(unsigned int, entries_shift + 3,
> PAGE_SHIFT);
> const unsigned long tce_table_size = 1UL << table_shift;
> - unsigned int tmplevels = levels;
>
> if (!levels || (levels > POWERNV_IOMMU_MAX_LEVELS))
> return -EINVAL;
> @@ -268,9 +271,6 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
> if (!is_power_of_2(window_size))
> return -EINVAL;
>
> - if (alloc_userspace_copy && (window_size > (1ULL << 32)))
> - tmplevels = 1;
> -
> /* Adjust direct table size from window_size and levels */
> entries_shift = (entries_shift + levels - 1) / levels;
> level_shift = entries_shift + 3;
> @@ -281,7 +281,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>
> /* Allocate TCE table */
> addr = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
> - tmplevels, tce_table_size, &offset, &total_allocated);
> + 1, tce_table_size, &offset, &total_allocated);
>
> /* addr==NULL means that the first level allocation failed */
> if (!addr)
> @@ -292,18 +292,18 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
> * we did not allocate as much as we wanted,
> * release partially allocated table.
> */
> - if (tmplevels == levels && offset < tce_table_size)
> + if (levels == 1 && offset < tce_table_size)
> goto free_tces_exit;
>
> /* Allocate userspace view of the TCE table */
> if (alloc_userspace_copy) {
> offset = 0;
> uas = pnv_pci_ioda2_table_do_alloc_pages(nid, level_shift,
> - tmplevels, tce_table_size, &offset,
> + 1, tce_table_size, &offset,
> &total_allocated_uas);
> if (!uas)
> goto free_tces_exit;
> - if (tmplevels == levels && (offset < tce_table_size ||
> + if (levels == 1 && (offset < tce_table_size ||
> total_allocated_uas != total_allocated))
> goto free_uas_exit;
> }
> @@ -318,7 +318,7 @@ long pnv_pci_ioda2_table_alloc_pages(int nid, __u64 bus_offset,
>
> pr_debug("Created TCE table: ws=%08llx ts=%lx @%08llx base=%lx uas=%p levels=%d/%d\n",
> window_size, tce_table_size, bus_offset, tbl->it_base,
> - tbl->it_userspace, tmplevels, levels);
> + tbl->it_userspace, 1, levels);
>
> return 0;
More information about the Linuxppc-dev
mailing list