[PATCH] powerpc/pseries/iommu: memory notifier incorrectly adds TCEs for pmemory
Donet Tom
donettom at linux.ibm.com
Wed Feb 5 23:43:39 AEDT 2025
On 1/31/25 00:08, Gaurav Batra wrote:
> iommu_mem_notifier() is invoked when RAM is dynamically added/removed. This
> notifier is responsible for adding/removing TCEs to/from the Dynamic DMA
> Window (DDW) when TCEs are pre-mapped. TCEs are pre-mapped only for RAM and
> not for persistent memory (pmemory). For DMA buffers in pmemory, TCEs are
> dynamically mapped when the device driver requests it.
>
> The issue is that the 'daxctl' command is capable of adding pmemory as
> "System RAM" after LPAR boot. The command to do so is:
>
> daxctl reconfigure-device --mode=system-ram dax0.0 --force
>
> This dynamically adds the pmemory range to LPAR RAM, eventually invoking
> iommu_mem_notifier(). The address range of pmemory is well beyond the
> maximum RAM that the LPAR can have, which means this range is beyond the
> DDW created for the device at device initialization time.
>
> As a result, when iommu_mem_notifier() pre-maps TCEs for the pmemory
> range, the PHYP HCALL returns H_PARAMETER. This causes the daxctl command
> to fail to add pmemory as RAM.
>
> The solution is to not pre-map TCEs for pmemory.
>
> Signed-off-by: Gaurav Batra <gbatra at linux.ibm.com>
> ---
> arch/powerpc/include/asm/mmzone.h | 1 +
> arch/powerpc/mm/numa.c | 2 +-
> arch/powerpc/platforms/pseries/iommu.c | 29 ++++++++++++++------------
> 3 files changed, 18 insertions(+), 14 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
> index d99863cd6cde..049152f8d597 100644
> --- a/arch/powerpc/include/asm/mmzone.h
> +++ b/arch/powerpc/include/asm/mmzone.h
> @@ -29,6 +29,7 @@ extern cpumask_var_t node_to_cpumask_map[];
> #ifdef CONFIG_MEMORY_HOTPLUG
> extern unsigned long max_pfn;
> u64 memory_hotplug_max(void);
> +u64 hot_add_drconf_memory_max(void);
> #else
> #define memory_hotplug_max() memblock_end_of_DRAM()
> #endif
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 3c1da08304d0..603a0f652ba6 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1336,7 +1336,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
> return nid;
> }
>
> -static u64 hot_add_drconf_memory_max(void)
> +u64 hot_add_drconf_memory_max(void)
> {
> struct device_node *memory = NULL;
> struct device_node *dn = NULL;
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 29f1a0cc59cd..abd9529a8f41 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1284,17 +1284,13 @@ static LIST_HEAD(failed_ddw_pdn_list);
>
> static phys_addr_t ddw_memory_hotplug_max(void)
> {
> - resource_size_t max_addr = memory_hotplug_max();
> - struct device_node *memory;
> + resource_size_t max_addr;
>
> - for_each_node_by_type(memory, "memory") {
> - struct resource res;
> -
> - if (of_address_to_resource(memory, 0, &res))
> - continue;
> -
> - max_addr = max_t(resource_size_t, max_addr, res.end + 1);
> - }
> +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
> + max_addr = hot_add_drconf_memory_max();
> +#else
> + max_addr = memblock_end_of_DRAM();
> +#endif
>
> return max_addr;
> }
> @@ -1600,7 +1596,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>
> if (direct_mapping) {
> /* DDW maps the whole partition, so enable direct DMA mapping */
> - ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> + ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT,
> win64->value, tce_setrange_multi_pSeriesLP_walk);
> if (ret) {
> dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
> @@ -2346,11 +2342,17 @@ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
> struct memory_notify *arg = data;
> int ret = 0;
>
> + /* This notifier can get called when onlining persistent memory as well.
> + * TCEs are not pre-mapped for persistent memory. Persistent memory will
> + * always be above ddw_memory_hotplug_max()
> + */
> +
> switch (action) {
> case MEM_GOING_ONLINE:
> spin_lock(&dma_win_list_lock);
> list_for_each_entry(window, &dma_win_list, list) {
> - if (window->direct) {
> + if (window->direct && (arg->start_pfn << PAGE_SHIFT) <
> + ddw_memory_hotplug_max()) {
Hi Gaurav,
Since pmem_start will be greater than ddw_memory_hotplug_max(), and
we have not created a DDW beyond ddw_memory_hotplug_max(), we are not
adding TCEs for this range, right?
I have tested this patch on my system, and daxctl reconfigure-device is
able to reconfigure PMEM to system RAM.
~# daxctl reconfigure-device --mode=system-ram dax1.0 --force
[
{
"chardev":"dax1.0",
"size":5362417664,
"target_node":4,
"align":65536,
"mode":"system-ram",
"online_memblocks":4,
"total_memblocks":4,
"movable":true
}
]
reconfigured 1 device
~#
~# lsmem
RANGE SIZE STATE REMOVABLE BLOCK
0x0000000000000000-0x000000697fffffff 422G online yes 0-421
0x0000040380000000-0x000004047fffffff 4G online yes 4110-4113
Memory block size: 1G
Total online memory: 426G
Total offline memory: 0B
root at ltcden14-lp2:~#
Thanks
Donet
> ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
> arg->nr_pages, window->prop);
> }
> @@ -2362,7 +2364,8 @@ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
> case MEM_OFFLINE:
> spin_lock(&dma_win_list_lock);
> list_for_each_entry(window, &dma_win_list, list) {
> - if (window->direct) {
> + if (window->direct && (arg->start_pfn << PAGE_SHIFT) <
> + ddw_memory_hotplug_max()) {
> ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
> arg->nr_pages, window->prop);
> }
>
> base-commit: 95ec54a420b8f445e04a7ca0ea8deb72c51fe1d3
More information about the Linuxppc-dev
mailing list