[Skiboot] [PATCH v2] npu2: Use phys-map to get MMIO BARs

Fri Jun 16 16:35:42 AEST 2017

Hi Andrew,

Couple of minor comments below. Didn't find any typos though, guess I can leave
that to others...

On Thu, 15 Jun 2017 04:23:04 PM Andrew Donnellan wrote:
> Commit bdea201a4c4b ("hw/npu2.c: Use phys-map to get GPU memory BARs")
> added use of phys-map for setting GPU memory BARs.
> 
> Move the MMIO BARs over to using phys-map as well.
> 
> Cc: Alistair Popple <alistair at popple.id.au>
> Signed-off-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>
> ---
> 
> On top of http://patchwork.ozlabs.org/patch/776138/
> 
> ---
>  hw/npu2.c      | 90 ++++++++++++++++++++++++++--------------------------------
>  include/npu2.h |  4 +++
>  2 files changed, 44 insertions(+), 50 deletions(-)
> 
> diff --git a/hw/npu2.c b/hw/npu2.c
> index 8786cc1e..6019c644 100644
> --- a/hw/npu2.c
> +++ b/hw/npu2.c
> @@ -1101,55 +1101,45 @@ static const struct phb_ops npu_ops = {
>  
>  static void assign_mmio_bars(uint64_t gcid, uint32_t scom, uint64_t reg[2], uint64_t mm_win[2])
>  {
> -	uint64_t mem_start;
>  	uint32_t i;
>  	struct npu2_bar *bar;
>  	struct npu2_bar npu2_bars[] = {
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0,   NPU2_PHY_BAR), .size = 0x1000000,
> +		/*
> +		 * NPU_REGS must be first in this list, at least on DD1.
> +		 * On DD2, stack 0 will be used for NPU_REGS, stack 1/2 for NPU_PHY.
> +		 */
> +		{ .type = NPU_REGS, .index = 0,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_PHY_BAR),
>  		  .flags = NPU2_BAR_FLAG_ENABLED },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0,   NPU2_PHY_BAR), .size =  0x200000,
> +		{ .type = NPU_PHY, .index = 0,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_PHY_BAR),
>  		  .flags = NPU2_BAR_FLAG_ENABLED },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0,   NPU2_PHY_BAR), .size =  0x200000,
> +		{ .type = NPU_PHY, .index = 1,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_PHY_BAR),
>  		  .flags = NPU2_BAR_FLAG_ENABLED },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0,  NPU2_NTL0_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0,  NPU2_NTL1_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0,  NPU2_NTL0_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0,  NPU2_NTL1_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0,  NPU2_NTL0_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0,  NPU2_NTL1_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR), .size =   0x20000 },
> -		{ .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR), .size =   0x20000 },
> +		{ .type = NPU_NTL, .index = 0,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL0_BAR) },
> +		{ .type = NPU_NTL, .index = 1,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_NTL1_BAR) },
> +		{ .type = NPU_NTL, .index = 2,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL0_BAR) },
> +		{ .type = NPU_NTL, .index = 3,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_NTL1_BAR) },
> +		{ .type = NPU_NTL, .index = 4,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL0_BAR) },
> +		{ .type = NPU_NTL, .index = 5,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_NTL1_BAR) },
> +		{ .type = NPU_GENID, .index = 0,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0, 0, NPU2_GENID_BAR) },
> +		{ .type = NPU_GENID, .index = 1,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_1, 0, NPU2_GENID_BAR) },
> +		{ .type = NPU_GENID, .index = 2,
> +		  .reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_2, 0, NPU2_GENID_BAR) },
>  	};
>  
> -	mem_start = 0x6030200000000;
> -	mem_start |= gcid << PPC_BITLSHIFT(21);
> -
> -	/*
> -	 * We're going to assign the BARs in reversed order according
> -	 * to their sizes, just like the order we have in npu_bars[].
> -	 * In that way, all BARs will be aligned perfectly without
> -	 * wasting resources. Also, the Linux kernel won't change
> -	 * anything though it attempts to reassign the BARs that
> -	 * it can see, which are NTL and GENID BARs.
> -	 *
> -	 * GLOBAL MMIO (16MB)
> -	 *        PHY0 (2MB)
> -	 *        PHB1 (2MB)
> -	 *        NTL0 (128KB)
> -	 *        NTL1 (128KB)
> -	 *        NTL2 (128KB)
> -	 *        NTL3 (128KB)
> -	 *        NTL4 (128KB)
> -	 *        NTL5 (128KB)
> -	 *      GENID0 (128KB)
> -	 *      GENID1 (128KB)
> -	 *      GENID2 (128KB)
> -	 */
>  	for (i = 0; i < ARRAY_SIZE(npu2_bars); i++) {
>  		bar = &npu2_bars[i];
> -		bar->base = mem_start;
> -		mem_start += bar->size;
> +		phys_map_get(gcid, bar->type, bar->index, &bar->base, &bar->size);
>  		npu2_write_bar(NULL, bar, gcid, scom);
>  	}
>  
> @@ -1442,6 +1432,7 @@ static void npu2_populate_devices(struct npu2 *p,
>  	struct npu2_dev *dev;
>  	struct dt_node *npu2_dn, *link;
>  	uint32_t npu_phandle, index = 0;
> +	int stack;
>  
>  	/*
>  	 * Get the npu node which has the links which we expand here
> @@ -1474,23 +1465,22 @@ static void npu2_populate_devices(struct npu2 *p,
>  		dev->pl_xscom_base = dt_prop_get_u64(link, "ibm,npu-phy");
>  		dev->lane_mask = dt_prop_get_u32(link, "ibm,npu-lane-mask");
>  
> -		/* Populate BARs. BAR0/1 is the NTL bar. We initialise
> -		 * it from the HW. */
> +		/* Populate BARs. BAR0/1 is the NTL bar. */
> +		stack = NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev);
>  		npu2_bar = &dev->bars[0].npu2_bar;
> -		if (NPU2DEV_BRICK(dev) == 0)
> -			/* Leave the block as 0 - the read/write bar
> -			 * functions fill it in */
> -			npu2_bar->reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev), 0, NPU2_NTL0_BAR);
> -		else
> -			npu2_bar->reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev), 0, NPU2_NTL1_BAR);
> +		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2DEV_BRICK(dev) == 0 ?
> +						NPU2_NTL0_BAR : NPU2_NTL1_BAR);
> +		phys_map_get(p->chip_id, NPU_NTL,
> +			     NPU2DEV_BRICK(dev) + (2 * NPU2DEV_STACK(dev)),

It would be clearer to use dev->index here. Given you add fields to struct
npu2_bar to store type and index we should probably just initialise them as well
and reference them in the phys_map_get() call.

> +			     &npu2_bar->base, &npu2_bar->size);
>  
> -		npu2_read_bar(p, npu2_bar);
>  		dev->bars[0].flags = PCI_CFG_BAR_TYPE_MEM | PCI_CFG_BAR_MEM64;
>  
>  		/* BAR2/3 is the GENID bar. */
>  		npu2_bar = &dev->bars[1].npu2_bar;
> -		npu2_bar->reg = NPU2_REG_OFFSET(NPU2_STACK_STCK_0 + NPU2DEV_STACK(dev), 0, NPU2_GENID_BAR);
> -		npu2_read_bar(p, npu2_bar);
> +		npu2_bar->reg = NPU2_REG_OFFSET(stack, 0, NPU2_GENID_BAR);
> +	        phys_map_get(p->chip_id, NPU_GENID, NPU2DEV_STACK(dev),

NPU2_STACK(dev) is correct here but again you should initialise it in
npu2_bar->index and npu2_bar->type.

Thanks,

Alistair

> +			     &npu2_bar->base, &npu2_bar->size);
>
>  		/* The GENID is a single physical BAR that we split
>  		 * for each emulated device */
> diff --git a/include/npu2.h b/include/npu2.h
> index 6476c729..88ebe9e1 100644
> --- a/include/npu2.h
> +++ b/include/npu2.h
> @@ -17,6 +17,8 @@
>  #ifndef __NPU2_H
>  #define __NPU2_H
>  
> +#include <phys-map.h>
> +
>  /* Debugging options */
>  #define NPU2DBG(p, fmt, a...)	prlog(PR_DEBUG, "NPU%d: " fmt, \
>  				      (p)->phb.opal_id, ##a)
> @@ -45,6 +47,8 @@
>   * emulated PCIe BARs. The is a subtle difference between the two as
>   * not all BARs are exposed outside of skiboot. */
>  struct npu2_bar {
> +	enum phys_map_type	type;
> +	int			index;
>  #define NPU2_BAR_FLAG_ENABLED	0x0010
>  
>  /* Generation ID's are a single space in the hardware but we split
>