[PATCH] pseries/iommu: Tweak ddw behavior in presence of pmem

Alexey Kardashevskiy aik at ozlabs.ru
Wed Mar 11 17:47:08 AEDT 2020



On 11/03/2020 17:09, Aneesh Kumar K.V wrote:
> Vaibhav Jain <vaibhav at linux.ibm.com> writes:
> 
>> Recently we discovered an issue on pseries guests that prevents pci
>> devices from accessing pmem memory via DMA. Performing such an
>> operation will cause PHB to freeze the corresponding partition
>> endpoint and in some scenarios will shutdown the disk that hosts the
>> rootfs.
>>
>> A fix for this is in works until then this patch proposes to disable
>> DDW if pmem nodes are present in the device-tree. This would force all
>> DMA from I/O adapters through default 2-GB window and prevent direct
>> access of pmem memory which is located beyond 4-TB guest physical
>> address.
>>
>> Since this change can have performance penalty for cases where its
>> known that i/o adapters wont be performing DMA to pmem, the patch
>> adds new args to the 'disable_ddw' kernel commanline flag with
>> following possible values:
>>
>> 'default' : Enable DDW only if no Pmem nodes present in device-tree
>> 'Yes' : Disable DDW always
>> 'No'  : Force enable DDW even if Pmem nodes present in device-tree
> 
> This is wrong, if we find pmem attached to LPAR we should not enable ddw
> at all. Enabling ddw results in us using direct dma ops and that cause
> crashes with vpmem address as you explained earlier.


I suggest using existing device::dma_mask/coherent_dma_mask and/or
device::bus_dma_limit - we could actually set these up in enable_ddw()
(where we know the window size) and then the rest of the code would use
it naturally and switch between "direct" or IOMMU DMA.

The immediate problem with this is that the generic
dma_set_mask()/dma_set_coherent_mask() do not ask the platform about the
maximum mask supported, it is either what the device requested (often it
is simply 64bit) or we fall back to 32bit, we could probably fix that.
Thanks,



> 
> 
>>
>> Signed-off-by: Vaibhav Jain <vaibhav at linux.ibm.com>
>> ---
>>  .../admin-guide/kernel-parameters.txt         | 10 ++-
>>  arch/powerpc/platforms/pseries/iommu.c        | 67 +++++++++++++++++--
>>  2 files changed, 70 insertions(+), 7 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index c07815d230bc..58e09f7a2cb9 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -901,9 +901,13 @@
>>  			The feature only exists starting from
>>  			Arch Perfmon v4 (Skylake and newer).
>>  
>> -	disable_ddw	[PPC/PSERIES]
>> -			Disable Dynamic DMA Window support. Use this if
>> -			to workaround buggy firmware.
>> +	disable_ddw=	[PPC/PSERIES]
>> +			Controls weather Dynamic DMA Window support is enabled.
>> +			Use this if to workaround buggy firmware. Following
>> +			values are supported:
>> +                on      Disable ddw always
>> +                off     Enable ddw always
>> +		default Enable ddw if no Persistent memory present (default)
>>  
>>  	disable_ipv6=	[IPV6]
>>  			See Documentation/networking/ipv6.txt.
>> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
>> index 2e0a8eab5588..97498cc25c9f 100644
>> --- a/arch/powerpc/platforms/pseries/iommu.c
>> +++ b/arch/powerpc/platforms/pseries/iommu.c
>> @@ -12,6 +12,7 @@
>>  
>>  #include <linux/init.h>
>>  #include <linux/types.h>
>> +#include <linux/string.h>
>>  #include <linux/slab.h>
>>  #include <linux/mm.h>
>>  #include <linux/memblock.h>
>> @@ -755,12 +756,39 @@ static void pci_dma_dev_setup_pSeries(struct pci_dev *dev)
>>  		       pci_name(dev));
>>  }
>>  
>> +/*
>> + * Following values for the variable are handled
>> + * '-1': Force enable ddw even if Persistent memory is present
>> + * '0' : Enable ddw if no Persistent memory present (default)
>> + * '1' : Disable ddw always
>> + */
>>  static int __read_mostly disable_ddw;
>>  
>> -static int __init disable_ddw_setup(char *str)
>> +static int __init disable_ddw_setup(char *param)
>>  {
>> -	disable_ddw = 1;
>> -	printk(KERN_INFO "ppc iommu: disabling ddw.\n");
>> +	bool val;
>> +	int res;
>> +
>> +	/* Maintain old behaviour that disables DDW when flag given */
>> +	if (!param) {
>> +		disable_ddw = 1;
>> +		return 0;
>> +	}
>> +
>> +	res = strtobool(param, &val);
>> +
>> +	if (!res) {
>> +		if (val) {
>> +			disable_ddw = 1;
>> +			pr_info("ppc iommu: disabling ddw.\n");
>> +		} else if (!val) {
>> +			/* Force enable of DDW even if pmem is available */
>> +			disable_ddw = -1;
>> +			pr_info("ppc iommu: will force enable ddw.\n");
>> +		}
>> +	} else if (strcmp(param, "default") == 0) {
>> +		disable_ddw = 0;
>> +	}
>>  
>>  	return 0;
>>  }
>> @@ -1313,6 +1341,37 @@ static struct notifier_block iommu_reconfig_nb = {
>>  	.notifier_call = iommu_reconfig_notifier,
>>  };
>>  
>> +/* Check if DDW can be supported for this lpar */
>> +int ddw_supported(void)
>> +{
>> +	struct device_node *dn;
>> +
>> +	if (disable_ddw == -1) /* force enable ddw */
>> +		goto out;
>> +
>> +	if (disable_ddw == 1)
>> +		return 0;
>> +
>> +	/*
>> +	 * Due to DMA window limitations currently DDW is not supported
>> +	 * for persistent memory. This is due 1 TiB size of direct mapped
>> +	 * DMA window size limitation enforce by phyp. Since pmem memory
>> +	 * will be mapped at phy address > 4TiB, we cannot accmodate pmem
>> +	 * in the DDW window and DMA's to/from the pmem memory will result in
>> +	 * PHBs getting frozen triggering EEH. Hence for the the time being
>> +	 * disable DDW in presence of a 'ibm,pmemory' node.
>> +	 */
> 
> 
> That comment is not complete. It is not a phyp limitations it is a linux
> kernel implementation detail where we have one offset for direct map
> dma.
> 
>> +	dn = of_find_compatible_node(NULL, NULL, "ibm,pmemory");
>> +	if (dn) {
>> +		pr_info("IOMMU: Disabling DDW as pmem memory available\n");
>> +		of_node_put(dn);
>> +		return 0;
>> +	}
>> + out:
>> +	pr_info("IOMMU: Enabling DDW support\n");
>> +	return 1;
>> +}
>> +
>>  /* These are called very early. */
>>  void iommu_init_early_pSeries(void)
>>  {
>> @@ -1322,7 +1381,7 @@ void iommu_init_early_pSeries(void)
>>  	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>>  		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
>>  		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
>> -		if (!disable_ddw)
>> +		if (ddw_supported())
>>  			pseries_pci_controller_ops.iommu_bypass_supported =
>>  				iommu_bypass_supported_pSeriesLP;
>>  	} else {
>> -- 
>> 2.24.1
> 
> I guess a much simpler patch is below.
> 
> diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
> index 2e0a8eab5588..99f72162dd85 100644
> --- a/arch/powerpc/platforms/pseries/iommu.c
> +++ b/arch/powerpc/platforms/pseries/iommu.c
> @@ -1313,6 +1313,31 @@ static struct notifier_block iommu_reconfig_nb = {
>  	.notifier_call = iommu_reconfig_notifier,
>  };
>  
> +static bool pmem_attached(void)
> +{
> +	struct device_node *dn;
> +	/*
> +	 * Depending on different partitionable endpoints, dynamic dma window
> +	 * implementation in Linux restrict the max addressable memory that
> +	 * can be DMA mapped using dynamic dma window feature. This limit
> +	 * depends on the I/O map page size and max TCE entries enabled
> +	 * by Phyp. With 64K page size with some PE this limit is 1TB.
> +	 * When persistent memory is bound above this address, we need to
> +	 * disable ddw feature for this PE.
> +	 *
> +	 * Since kernel won't know the pmem bind-address early during boot,
> +	 * force disable dynamic dma window feature for all PEs if pmem is
> +	 * attached to the LPAR.
> +	 */
> +	dn = of_find_compatible_node(NULL, NULL, "ibm,pmemory");
> +	if (dn) {
> +		pr_info("IOMMU: Disabling DDW as pmem memory available\n");
> +		of_node_put(dn);
> +		return true;
> +	}
> +	return false;
> +}
> +
>  /* These are called very early. */
>  void iommu_init_early_pSeries(void)
>  {
> @@ -1322,7 +1347,7 @@ void iommu_init_early_pSeries(void)
>  	if (firmware_has_feature(FW_FEATURE_LPAR)) {
>  		pseries_pci_controller_ops.dma_bus_setup = pci_dma_bus_setup_pSeriesLP;
>  		pseries_pci_controller_ops.dma_dev_setup = pci_dma_dev_setup_pSeriesLP;
> -		if (!disable_ddw)
> +		if (!(disable_ddw || pmem_attached()))
>  			pseries_pci_controller_ops.iommu_bypass_supported =
>  				iommu_bypass_supported_pSeriesLP;
>  	} else {
> 

-- 
Alexey


More information about the Linuxppc-dev mailing list