[PATCH] powerpc/pseries/iommu: memory notifier incorrectly adds TCEs for pmemory

Gaurav Batra gbatra at linux.ibm.com
Thu Feb 6 00:58:02 AEDT 2025


On 2/5/25 6:43 AM, Donet Tom wrote:
>
> On 1/31/25 00:08, Gaurav Batra wrote:
>> iommu_mem_notifier() is invoked when RAM is dynamically 
>> added/removed. This
>> notifier call is responsible to add/remove TCEs from the Dynamic DMA 
>> Window
>> (DDW) when TCEs are pre-mapped. TCEs are pre-mapped only for RAM and not
>> for persistent memory (pmemory). For DMA buffers in pmemory, TCEs are
>> dynamically mapped when the device driver instructs to do so.
>>
>> The issue is 'daxctl' command is capable of adding pmemory as "System 
>> RAM"
>> after LPAR boot. The command to do so is -
>>
>> daxctl reconfigure-device --mode=system-ram dax0.0 --force
>>
>> This will dynamically add pmemory range to LPAR RAM eventually invoking
>> iommu_mem_notifier(). The address range of pmemory is way beyond the Max
>> RAM that the LPAR can have. Which means, this range is beyond the DDW
>> created for the device, at device initialization time.
>>
>> As a result when TCEs are pre-mapped for the pmemory range, by
>> iommu_mem_notifier(), PHYP HCALL returns H_PARAMETER. This failed the
>> command, daxctl, to add pmemory as RAM.
>>
>> The solution is to not pre-map TCEs for pmemory.
>>
>> Signed-off-by: Gaurav Batra <gbatra at linux.ibm.com>
>> ---
>>   arch/powerpc/include/asm/mmzone.h      |  1 +
>>   arch/powerpc/mm/numa.c                 |  2 +-
>>   arch/powerpc/platforms/pseries/iommu.c | 29 ++++++++++++++------------
>>   3 files changed, 18 insertions(+), 14 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/mmzone.h 
>> b/arch/powerpc/include/asm/mmzone.h
>> index d99863cd6cde..049152f8d597 100644
>> --- a/arch/powerpc/include/asm/mmzone.h
>> +++ b/arch/powerpc/include/asm/mmzone.h
>> @@ -29,6 +29,7 @@ extern cpumask_var_t node_to_cpumask_map[];
>>   #ifdef CONFIG_MEMORY_HOTPLUG
>>   extern unsigned long max_pfn;
>>   u64 memory_hotplug_max(void);
>> +u64 hot_add_drconf_memory_max(void);
>>   #else
>>   #define memory_hotplug_max() memblock_end_of_DRAM()
>>   #endif
>> diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
>> index 3c1da08304d0..603a0f652ba6 100644
>> --- a/arch/powerpc/mm/numa.c
>> +++ b/arch/powerpc/mm/numa.c
>> @@ -1336,7 +1336,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr)
>>       return nid;
>>   }
>>   -static u64 hot_add_drconf_memory_max(void)
>> +u64 hot_add_drconf_memory_max(void)
>>   {
>>       struct device_node *memory = NULL;
>>       struct device_node *dn = NULL;
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c 
>> b/arch/powerpc/platforms/pseries/iommu.c
>> index 29f1a0cc59cd..abd9529a8f41 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -1284,17 +1284,13 @@ static LIST_HEAD(failed_ddw_pdn_list);
>>     static phys_addr_t ddw_memory_hotplug_max(void)
>>   {
>> -    resource_size_t max_addr = memory_hotplug_max();
>> -    struct device_node *memory;
>> +    resource_size_t max_addr;
>>   -    for_each_node_by_type(memory, "memory") {
>> -        struct resource res;
>> -
>> -        if (of_address_to_resource(memory, 0, &res))
>> -            continue;
>> -
>> -        max_addr = max_t(resource_size_t, max_addr, res.end + 1);
>> -    }
>> +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
>> +    max_addr = hot_add_drconf_memory_max();
>> +#else
>> +    max_addr = memblock_end_of_DRAM();
>> +#endif
>>         return max_addr;
>>   }
>> @@ -1600,7 +1596,7 @@ static bool enable_ddw(struct pci_dev *dev, 
>> struct device_node *pdn)
>>         if (direct_mapping) {
>>           /* DDW maps the whole partition, so enable direct DMA 
>> mapping */
>> -        ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> 
>> PAGE_SHIFT,
>> +        ret = walk_system_ram_range(0, ddw_memory_hotplug_max() >> 
>> PAGE_SHIFT,
>>                           win64->value, 
>> tce_setrange_multi_pSeriesLP_walk);
>>           if (ret) {
>>               dev_info(&dev->dev, "failed to map DMA window for %pOF: 
>> %d\n",
>> @@ -2346,11 +2342,17 @@ static int iommu_mem_notifier(struct 
>> notifier_block *nb, unsigned long action,
>>       struct memory_notify *arg = data;
>>       int ret = 0;
>>   +    /* This notifier can get called when onlining persistent 
>> memory as well.
>> +     * TCEs are not pre-mapped for persistent memory. Persistent 
>> memory will
>> +     * always be above ddw_memory_hotplug_max()
>> +     */
>> +
>>       switch (action) {
>>       case MEM_GOING_ONLINE:
>>           spin_lock(&dma_win_list_lock);
>>           list_for_each_entry(window, &dma_win_list, list) {
>> -            if (window->direct) {
>> +            if (window->direct && (arg->start_pfn << PAGE_SHIFT) <
>> +                ddw_memory_hotplug_max()) {
> Hi Gaurav,
>
> Since the pmem_start will be greater than ddw_memory_hotplug_max(), 
> and we have not created DDW beyond ddw_memory_hotplug_max(), we are 
> not adding TCE for this range, right?
>
That is correct


> I have tested this patch on my system, and daxctl reconfigure-device 
> is able to reconfigure PMEM to system RAM.
>
> ~# daxctl reconfigure-device --mode=system-ram dax1.0 --force
> [
>   {
>     "chardev":"dax1.0",
>     "size":5362417664,
>     "target_node":4,
>     "align":65536,
>     "mode":"system-ram",
>     "online_memblocks":4,
>     "total_memblocks":4,
>     "movable":true
>   }
> ]
> reconfigured 1 device
> ~#
> ~# lsmem
> RANGE                                  SIZE  STATE REMOVABLE BLOCK
> 0x0000000000000000-0x000000697fffffff  422G online       yes 0-421
> 0x0000040380000000-0x000004047fffffff    4G online       yes 4110-4113
>
> Memory block size:         1G
> Total online memory:     426G
> Total offline memory:      0B
> root at ltcden14-lp2:~#
>
> Thanks
> Donet
>>                   ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn,
>>                           arg->nr_pages, window->prop);
>>               }
>> @@ -2362,7 +2364,8 @@ static int iommu_mem_notifier(struct 
>> notifier_block *nb, unsigned long action,
>>       case MEM_OFFLINE:
>>           spin_lock(&dma_win_list_lock);
>>           list_for_each_entry(window, &dma_win_list, list) {
>> -            if (window->direct) {
>> +            if (window->direct && (arg->start_pfn << PAGE_SHIFT) <
>> +                ddw_memory_hotplug_max()) {
>>                   ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn,
>>                           arg->nr_pages, window->prop);
>>               }
>>
>> base-commit: 95ec54a420b8f445e04a7ca0ea8deb72c51fe1d3


More information about the Linuxppc-dev mailing list