[PATCH] powerpc/pseries/iommu: memory notifier incorrectly adds TCEs for pmemory

Donet Tom donettom at linux.ibm.com
Wed Feb 5 23:43:39 AEDT 2025


On 1/31/25 00:08, Gaurav Batra wrote:
> iommu_mem_notifier() is invoked when RAM is dynamically added/removed. This
> notifier call is responsible for adding/removing TCEs from the Dynamic DMA
> Window (DDW) when TCEs are pre-mapped. TCEs are pre-mapped only for RAM and not
> for persistent memory (pmemory). For DMA buffers in pmemory, TCEs are
> dynamically mapped when the device driver instructs to do so.
>
> The issue is that the 'daxctl' command is capable of adding pmemory as
> "System RAM" after LPAR boot. The command to do so is:
>
> daxctl reconfigure-device --mode=system-ram dax0.0 --force
>
> This will dynamically add the pmemory range to LPAR RAM, eventually invoking
> iommu_mem_notifier(). The address range of pmemory is way beyond the max
> RAM that the LPAR can have, which means this range is beyond the DDW
> created for the device at device initialization time.
>
> As a result, when iommu_mem_notifier() tries to pre-map TCEs for the pmemory
> range, the PHYP HCALL returns H_PARAMETER. This causes the daxctl command
> to fail to add pmemory as RAM.
>
> The solution is to not pre-map TCEs for pmemory.
>
> Signed-off-by: Gaurav Batra <gbatra at linux.ibm.com>
> ---
>   arch/powerpc/include/asm/mmzone.h      |  1 +
>   arch/powerpc/mm/numa.c                 |  2 +-
>   arch/powerpc/platforms/pseries/iommu.c | 29 ++++++++++++++------------
>   3 files changed, 18 insertions(+), 14 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/mmzone.h b/arch/powerpc/include/asm/mmzone.h
> index d99863cd6cde..049152f8d597 100644
> --- a/arch/powerpc/include/asm/mmzone.h
> +++ b/arch/powerpc/include/asm/mmzone.h
> @@ -29,6 +29,7 @@ extern cpumask_var_t node_to_cpumask_map[];
>   #ifdef CONFIG_MEMORY_HOTPLUG
>   extern unsigned long max_pfn;
>   u64 memory_hotplug_max(void);
> +u64 hot_add_drconf_memory_max(void);
>   #else
>   #define memory_hotplug_max() memblock_end_of_DRAM()
>   #endif
> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
> index 3c1da08304d0..603a0f652ba6 100644
> --- a/arch/powerpc/mm/numa.c
> +++ b/arch/powerpc/mm/numa.c
> @@ -1336,7 +1336,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
>   	return nid;
>   }
>   
> -static u64 hot_add_drconf_memory_max(void)
> +u64 hot_add_drconf_memory_max(void)
>   {
>   	struct device_node *memory = NULL;
>   	struct device_node *dn = NULL;
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 29f1a0cc59cd..abd9529a8f41 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1284,17 +1284,13 @@ static LIST_HEAD(failed_ddw_pdn_list);
>   
>   static phys_addr_t ddw_memory_hotplug_max(void)
>   {
> -	resource_size_t max_addr = memory_hotplug_max();
> -	struct device_node *memory;
> +	resource_size_t max_addr;
>   
> -	for_each_node_by_type(memory, "memory") {
> -		struct resource res;
> -
> -		if (of_address_to_resource(memory, 0, &res))
> -			continue;
> -
> -		max_addr = max_t(resource_size_t, max_addr, res.end + 1);
> -	}
> +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
> +	max_addr = hot_add_drconf_memory_max();
> +#else
> +	max_addr = memblock_end_of_DRAM();
> +#endif
>   
>   	return max_addr;
>   }
> @@ -1600,7 +1596,7 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
>   
>   	if (direct_mapping) {
>   		/* DDW maps the whole partition, so enable direct DMA mapping */
> -		ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT,
> +		ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> PAGE_SHIFT,
>   					    win64->value, tce_setrange_multi_pSeriesLP_walk);
>   		if (ret) {
>   			dev_info(&dev->dev, "failed to map DMA window for %pOF: %d\n",
> @@ -2346,11 +2342,17 @@ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
>   	struct memory_notify *arg = data;
>   	int ret = 0;
>   
> +	/* This notifier can get called when onlining persistent memory as well.
> +	 * TCEs are not pre-mapped for persistent memory. Persistent memory will
> +	 * always be above ddw_memory_hotplug_max()
> +	 */
> +
>   	switch (action) {
>   	case MEM_GOING_ONLINE:
>   		spin_lock(&dma_win_list_lock);
>   		list_for_each_entry(window, &dma_win_list, list) {
> -			if (window->direct) {
> +			if (window->direct && (arg->start_pfn << PAGE_SHIFT) <
> +				ddw_memory_hotplug_max()) {
Hi Gaurav,

Since pmem_start will be greater than ddw_memory_hotplug_max(), and no
DDW has been created beyond ddw_memory_hotplug_max(), we are not
adding TCEs for this range, right?

I have tested this patch on my system, and daxctl reconfigure-device is 
able to reconfigure PMEM to system RAM.

~# daxctl reconfigure-device --mode=system-ram dax1.0 --force
[
   {
     "chardev":"dax1.0",
     "size":5362417664,
     "target_node":4,
     "align":65536,
     "mode":"system-ram",
     "online_memblocks":4,
     "total_memblocks":4,
     "movable":true
   }
]
reconfigured 1 device
~#
~# lsmem
RANGE                                  SIZE  STATE REMOVABLE BLOCK
0x0000000000000000-0x000000697fffffff  422G online       yes 0-421
0x0000040380000000-0x000004047fffffff    4G online       yes 4110-4113

Memory block size:         1G
Total online memory:     426G
Total offline memory:      0B
root at ltcden14-lp2:~#

Thanks
Donet
>   				ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
>   						arg->nr_pages, window->prop);
>   			}
> @@ -2362,7 +2364,8 @@ static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action,
>   	case MEM_OFFLINE:
>   		spin_lock(&dma_win_list_lock);
>   		list_for_each_entry(window, &dma_win_list, list) {
> -			if (window->direct) {
> +			if (window->direct && (arg->start_pfn << PAGE_SHIFT) <
> +				ddw_memory_hotplug_max()) {
>   				ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
>   						arg->nr_pages, window->prop);
>   			}
>
> base-commit: 95ec54a420b8f445e04a7ca0ea8deb72c51fe1d3


More information about the Linuxppc-dev mailing list