[PATCH 2/2] powerpc/powernv: ocxl move TL definition

Tue Oct 15 02:32:59 AEDT 2019

On 14/10/2019 12:21, Frederic Barrat wrote:
> 
> 
> Le 09/10/2019 à 17:11, christophe lombard a écrit :
>> Specifies the templates in the Transaction Layer that the OpenCAPI 
>> device/host
>> support when transmitting/receiving DL/DLX frames to or from the OpenCAPI
>> device/host.
>> Update, rename and create new few platform-specific calls which can be 
>> used by
>> drivers.
>>
>> No functional change.
>>
>> Signed-off-by: Christophe Lombard <clombard at linux.vnet.ibm.com>
>> ---
>>   arch/powerpc/include/asm/pnv-ocxl.h   |   5 +-
>>   arch/powerpc/platforms/powernv/ocxl.c | 103 ++++++++++++++++++++++++--
>>   drivers/misc/ocxl/config.c            |  89 +---------------------
>>   3 files changed, 99 insertions(+), 98 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/pnv-ocxl.h 
>> b/arch/powerpc/include/asm/pnv-ocxl.h
>> index 8e516e339e6c..b8c68878b4ba 100644
>> --- a/arch/powerpc/include/asm/pnv-ocxl.h
>> +++ b/arch/powerpc/include/asm/pnv-ocxl.h
>> @@ -13,10 +13,7 @@ extern int pnv_ocxl_get_actag(struct pci_dev *dev, 
>> u16 *base, u16 *enabled,
>>               u16 *supported);
>>   extern int pnv_ocxl_get_pasid_count(struct pci_dev *dev, int *count);
>>
>> -extern int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long *cap,
>> -            char *rate_buf, int rate_buf_size);
>> -extern int pnv_ocxl_set_tl_conf(struct pci_dev *dev, long cap,
>> -            uint64_t rate_buf_phys, int rate_buf_size);
>> +extern int pnv_ocxl_set_TL(struct pci_dev *dev, int tl_dvsec);
>>
>>   extern int pnv_ocxl_platform_setup(struct pci_dev *dev,
>>                      int PE_mask, int *hwirq,
>> diff --git a/arch/powerpc/platforms/powernv/ocxl.c 
>> b/arch/powerpc/platforms/powernv/ocxl.c
>> index 4d26cba12b63..351324cffc2b 100644
>> --- a/arch/powerpc/platforms/powernv/ocxl.c
>> +++ b/arch/powerpc/platforms/powernv/ocxl.c
>> @@ -369,8 +369,8 @@ static void set_templ_rate(unsigned int templ, 
>> unsigned int rate, char *buf)
>>       buf[idx] |= rate << shift;
>>   }
>>
>> -int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long *cap,
>> -            char *rate_buf, int rate_buf_size)
>> +static int get_tl_cap(struct pci_dev *dev, long *cap,
>> +              char *rate_buf, int rate_buf_size)
>>   {
>>       if (rate_buf_size != PNV_OCXL_TL_RATE_BUF_SIZE)
>>           return -EINVAL;
>> @@ -390,10 +390,9 @@ int pnv_ocxl_get_tl_cap(struct pci_dev *dev, long 
>> *cap,
>>       *cap = PNV_OCXL_TL_P9_RECV_CAP;
>>       return 0;
>>   }
>> -EXPORT_SYMBOL_GPL(pnv_ocxl_get_tl_cap);
>>
>> -int pnv_ocxl_set_tl_conf(struct pci_dev *dev, long cap,
>> -            uint64_t rate_buf_phys, int rate_buf_size)
>> +static int set_tl_conf(struct pci_dev *dev, long cap,
>> +               uint64_t rate_buf_phys, int rate_buf_size)
>>   {
>>       struct pci_controller *hose = pci_bus_to_host(dev->bus);
>>       struct pnv_phb *phb = hose->private_data;
>> @@ -410,7 +409,99 @@ int pnv_ocxl_set_tl_conf(struct pci_dev *dev, 
>> long cap,
>>       }
>>       return 0;
>>   }
>> -EXPORT_SYMBOL_GPL(pnv_ocxl_set_tl_conf);
>> +
>> +int pnv_ocxl_set_TL(struct pci_dev *dev, int tl_dvsec)
>> +{
>> +    u32 val;
>> +    __be32 *be32ptr;
>> +    u8 timers;
>> +    int i, rc;
>> +    long recv_cap;
>> +    char *recv_rate;
>> +
>> +    recv_rate = kzalloc(PNV_OCXL_TL_RATE_BUF_SIZE, GFP_KERNEL);
>> +    if (!recv_rate)
>> +        return -ENOMEM;
>> +    /*
>> +     * The spec defines 64 templates for messages in the
>> +     * Transaction Layer (TL).
>> +     *
>> +     * The host and device each support a subset, so we need to
>> +     * configure the transmitters on each side to send only
>> +     * templates the receiver understands, at a rate the receiver
>> +     * can process.  Per the spec, template 0 must be supported by
>> +     * everybody. That's the template which has been used by the
>> +     * host and device so far.
>> +     *
>> +     * The sending rate limit must be set before the template is
>> +     * enabled.
>> +     */
>> +
>> +    /*
>> +     * Device -> host
>> +     */
>> +    rc = get_tl_cap(dev, &recv_cap, recv_rate,
>> +            PNV_OCXL_TL_RATE_BUF_SIZE);
>> +    if (rc)
>> +        goto out;
>> +
>> +    for (i = 0; i < PNV_OCXL_TL_RATE_BUF_SIZE; i += 4) {
>> +        be32ptr = (__be32 *) &recv_rate[i];
>> +        pci_write_config_dword(dev,
>> +                tl_dvsec + OCXL_DVSEC_TL_SEND_RATE + i,
>> +                be32_to_cpu(*be32ptr));
>> +    }
>> +    val = recv_cap >> 32;
>> +    pci_write_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_SEND_CAP, val);
>> +    val = recv_cap & GENMASK(31, 0);
>> +    pci_write_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_SEND_CAP + 
>> 4, val);
>> +
>> +    /*
>> +     * Host -> device
>> +     */
>> +    for (i = 0; i < PNV_OCXL_TL_RATE_BUF_SIZE; i += 4) {
>> +        pci_read_config_dword(dev,
>> +                tl_dvsec + OCXL_DVSEC_TL_RECV_RATE + i,
>> +                &val);
>> +        be32ptr = (__be32 *) &recv_rate[i];
>> +        *be32ptr = cpu_to_be32(val);
>> +    }
>> +    pci_read_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_RECV_CAP, &val);
>> +    recv_cap = (long) val << 32;
>> +    pci_read_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_RECV_CAP + 4, 
>> &val);
>> +    recv_cap |= val;
>> +
>> +    rc = set_tl_conf(dev, recv_cap, __pa(recv_rate),
>> +             PNV_OCXL_TL_RATE_BUF_SIZE);
>> +    if (rc)
>> +        goto out;
>> +
>> +    /*
>> +     * Opencapi commands needing to be retried are classified per
>> +     * the TL in 2 groups: short and long commands.
>> +     *
>> +     * The short back off timer it not used for now. It will be
>> +     * for opencapi 4.0.
>> +     *
>> +     * The long back off timer is typically used when an AFU hits
>> +     * a page fault but the NPU is already processing one. So the
>> +     * AFU needs to wait before it can resubmit. Having a value
>> +     * too low doesn't break anything, but can generate extra
>> +     * traffic on the link.
>> +     * We set it to 1.6 us for now. It's shorter than, but in the
>> +     * same order of magnitude as the time spent to process a page
>> +     * fault.
>> +     */
>> +    timers = 0x2 << 4; /* long timer = 1.6 us */
>> +    pci_write_config_byte(dev, tl_dvsec + OCXL_DVSEC_TL_BACKOFF_TIMERS,
>> +            timers);
>> +
> 
> 
> How does it work in the virtualized case? We would also need to do those 
> config space reads and writes. I'm guessing it's all handled in the host 
> behind a hcall, as we don't really want to have the guest mess with the 
> link configuration?
> 

A specific option (H_CONFIG_ADAPTER_SET_TL) through the hcall
H_OCXL_CONFIG_ADAPTER allows the guest to call pnv_ocxl_set_TL(), just
as the ocxl driver running on the host does.
All the new pnv_* APIs have been created to configure and handle the
capi device both for the ocxl driver (running on the host) and for the
guest, through a new vfio driver. This new vfio driver will be in charge
of calling the right API, according to the hcall options.

>    Fred
> 
> 
>> +    rc = 0;
>> +out:
>> +    kfree(recv_rate);
>> +    return rc;
>> +}
>> +EXPORT_SYMBOL_GPL(pnv_ocxl_set_TL);
>>
>>   static int get_xsl_irq(struct pci_dev *dev, int *hwirq)
>>   {
>> diff --git a/drivers/misc/ocxl/config.c b/drivers/misc/ocxl/config.c
>> index c8e19bfb5ef9..7ca0f6744125 100644
>> --- a/drivers/misc/ocxl/config.c
>> +++ b/drivers/misc/ocxl/config.c
>> @@ -709,100 +709,13 @@ EXPORT_SYMBOL_GPL(ocxl_config_set_afu_state);
>>
>>   int ocxl_config_set_TL(struct pci_dev *dev, int tl_dvsec)
>>   {
>> -    u32 val;
>> -    __be32 *be32ptr;
>> -    u8 timers;
>> -    int i, rc;
>> -    long recv_cap;
>> -    char *recv_rate;
>> -
>>       /*
>>        * Skip on function != 0, as the TL can only be defined on 0
>>        */
>>       if (PCI_FUNC(dev->devfn) != 0)
>>           return 0;
>>
>> -    recv_rate = kzalloc(PNV_OCXL_TL_RATE_BUF_SIZE, GFP_KERNEL);
>> -    if (!recv_rate)
>> -        return -ENOMEM;
>> -    /*
>> -     * The spec defines 64 templates for messages in the
>> -     * Transaction Layer (TL).
>> -     *
>> -     * The host and device each support a subset, so we need to
>> -     * configure the transmitters on each side to send only
>> -     * templates the receiver understands, at a rate the receiver
>> -     * can process.  Per the spec, template 0 must be supported by
>> -     * everybody. That's the template which has been used by the
>> -     * host and device so far.
>> -     *
>> -     * The sending rate limit must be set before the template is
>> -     * enabled.
>> -     */
>> -
>> -    /*
>> -     * Device -> host
>> -     */
>> -    rc = pnv_ocxl_get_tl_cap(dev, &recv_cap, recv_rate,
>> -                PNV_OCXL_TL_RATE_BUF_SIZE);
>> -    if (rc)
>> -        goto out;
>> -
>> -    for (i = 0; i < PNV_OCXL_TL_RATE_BUF_SIZE; i += 4) {
>> -        be32ptr = (__be32 *) &recv_rate[i];
>> -        pci_write_config_dword(dev,
>> -                tl_dvsec + OCXL_DVSEC_TL_SEND_RATE + i,
>> -                be32_to_cpu(*be32ptr));
>> -    }
>> -    val = recv_cap >> 32;
>> -    pci_write_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_SEND_CAP, val);
>> -    val = recv_cap & GENMASK(31, 0);
>> -    pci_write_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_SEND_CAP + 
>> 4, val);
>> -
>> -    /*
>> -     * Host -> device
>> -     */
>> -    for (i = 0; i < PNV_OCXL_TL_RATE_BUF_SIZE; i += 4) {
>> -        pci_read_config_dword(dev,
>> -                tl_dvsec + OCXL_DVSEC_TL_RECV_RATE + i,
>> -                &val);
>> -        be32ptr = (__be32 *) &recv_rate[i];
>> -        *be32ptr = cpu_to_be32(val);
>> -    }
>> -    pci_read_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_RECV_CAP, &val);
>> -    recv_cap = (long) val << 32;
>> -    pci_read_config_dword(dev, tl_dvsec + OCXL_DVSEC_TL_RECV_CAP + 4, 
>> &val);
>> -    recv_cap |= val;
>> -
>> -    rc = pnv_ocxl_set_tl_conf(dev, recv_cap, __pa(recv_rate),
>> -                PNV_OCXL_TL_RATE_BUF_SIZE);
>> -    if (rc)
>> -        goto out;
>> -
>> -    /*
>> -     * Opencapi commands needing to be retried are classified per
>> -     * the TL in 2 groups: short and long commands.
>> -     *
>> -     * The short back off timer it not used for now. It will be
>> -     * for opencapi 4.0.
>> -     *
>> -     * The long back off timer is typically used when an AFU hits
>> -     * a page fault but the NPU is already processing one. So the
>> -     * AFU needs to wait before it can resubmit. Having a value
>> -     * too low doesn't break anything, but can generate extra
>> -     * traffic on the link.
>> -     * We set it to 1.6 us for now. It's shorter than, but in the
>> -     * same order of magnitude as the time spent to process a page
>> -     * fault.
>> -     */
>> -    timers = 0x2 << 4; /* long timer = 1.6 us */
>> -    pci_write_config_byte(dev, tl_dvsec + OCXL_DVSEC_TL_BACKOFF_TIMERS,
>> -            timers);
>> -
>> -    rc = 0;
>> -out:
>> -    kfree(recv_rate);
>> -    return rc;
>> +    return pnv_ocxl_set_TL(dev, tl_dvsec);
>>   }
>>   EXPORT_SYMBOL_GPL(ocxl_config_set_TL);
>>