[Skiboot] [PATCH v2 09/15] platforms/astbmc/witherspoon: Rework NPU presence detection
Alexey Kardashevskiy
aik at ozlabs.ru
Mon Jan 21 16:30:42 AEDT 2019
On 11/01/2019 12:09, Andrew Donnellan wrote:
> Rework NPU presence detection in preparation for supporting both NVLink and
> OpenCAPI devices operating simultaneously on the same NPU.
>
> If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is connected
> to GPU#1,
What is "NVLink GPU"? A peer NVLink of GPU#0 connected to GPU#1?
What is the supported hardware configuration? In other words, is there a
spec describing it such as a nice drawing from
Witherspoon_Design_Workbook_v1.7_19June2018.pdf, page 40 with the wiring
diagram?
The patch could be much simpler:
diff --git a/platforms/astbmc/witherspoon.c b/platforms/astbmc/witherspoon.c
index fe13899..b592a67 100644
--- a/platforms/astbmc/witherspoon.c
+++ b/platforms/astbmc/witherspoon.c
@@ -233,6 +233,7 @@ static void witherspoon_npu2_device_detect(struct
npu2 *npu)
int rc;
bool gpu0_present, gpu1_present;
+ enum npu2_dev_type gpu0_type = NPU2_DEV_TYPE_UNKNOWN;
if (witherspoon_type != WITHERSPOON_TYPE_REDBUD) {
prlog(PR_DEBUG, "PLAT: Setting all NPU links to NVLink,
OpenCAPI only supported on Redbud\n");
@@ -305,12 +306,14 @@ static void witherspoon_npu2_device_detect(struct
npu2 *npu)
set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);
/* We current don't support using the second link */
set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN);
+ gpu0_type = npu2_dev_type_opencapi;
} else {
prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is NVLink\n",
chip->id);
set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK);
set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK);
set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK);
+ gpu0_type = NPU2_DEV_TYPE_NVLINK;
}
}
@@ -324,7 +327,10 @@ static void witherspoon_npu2_device_detect(struct
npu2 *npu)
} else {
prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is NVLink\n",
chip->id);
- set_link_details(npu, 3, 3, NPU2_DEV_TYPE_NVLINK);
+ if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI)
+ prlog(PR_WARNING, "PLAT: Chip %d GPU#1
will operate at reduced performance due to presence of OpenCAPI device.
For optimal performance, swap device locations\n", chip->id);
+ else
+ set_link_details(npu, 3, 3,
NPU2_DEV_TYPE_NVLINK);
set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK);
set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK);
}
Otherwise it is harder to spot the exact change the patch is doing which
is skipping one call to set_link_details().
> the GPU will only receive 2 links rather than the usual 3.
>
> The reason for this is that without the OpenCAPI card, the GPU would be
> use links 3-5, connected to NPU bricks 3-5, which needs both stacks 1 and 2
> to be in NVLink mode.
>
> However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1, we
> need to use NPU bricks 2-3, which means stack 1 must be set in OpenCAPI
> mode. As such, the GPU will be restricted to using links 4 and 5.
>
> Signed-off-by: Andrew Donnellan <andrew.donnellan at au1.ibm.com>
> Reviewed-by: Frederic Barrat <fbarrat at linux.ibm.com>
> ---
> platforms/astbmc/witherspoon.c | 62 ++++++++++++++++++++++++++---------
> 1 file changed, 47 insertions(+), 15 deletions(-)
>
> diff --git a/platforms/astbmc/witherspoon.c b/platforms/astbmc/witherspoon.c
> index fe138991696f..c41f0c5b1971 100644
> --- a/platforms/astbmc/witherspoon.c
> +++ b/platforms/astbmc/witherspoon.c
> @@ -233,6 +233,8 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu)
> int rc;
>
> bool gpu0_present, gpu1_present;
> + enum npu2_dev_type gpu0_type = NPU2_DEV_TYPE_UNKNOWN;
> + enum npu2_dev_type gpu1_type = NPU2_DEV_TYPE_UNKNOWN;
>
> if (witherspoon_type != WITHERSPOON_TYPE_REDBUD) {
> prlog(PR_DEBUG, "PLAT: Setting all NPU links to NVLink, OpenCAPI only supported on Redbud\n");
> @@ -298,19 +300,11 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu)
> if (state & (1 << 0)) {
> prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is OpenCAPI\n",
> chip->id);
> - /*
> - * On witherspoon, bricks 2 and 3 are connected to
> - * the lanes matching links 0 and 1 in OpenCAPI mode.
> - */
> - set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);
> - /* We current don't support using the second link */
> - set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN);
> + gpu0_type = NPU2_DEV_TYPE_OPENCAPI;
> } else {
> prlog(PR_DEBUG, "PLAT: Chip %d GPU#0 is NVLink\n",
> chip->id);
> - set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK);
> - set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK);
> - set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK);
> + gpu0_type = NPU2_DEV_TYPE_NVLINK;
> }
> }
>
> @@ -318,16 +312,54 @@ static void witherspoon_npu2_device_detect(struct npu2 *npu)
> if (state & (1 << 1)) {
> prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is OpenCAPI\n",
> chip->id);
> - set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI);
> - /* We current don't support using the second link */
> - set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN);
> + gpu1_type = NPU2_DEV_TYPE_OPENCAPI;
> } else {
> prlog(PR_DEBUG, "PLAT: Chip %d GPU#1 is NVLink\n",
> chip->id);
> + gpu1_type = NPU2_DEV_TYPE_NVLINK;
> + }
> + }
> +
> + if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) {
> + set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);
> + /* We currently don't support using the second link */
> + set_link_details(npu, 0, 2, NPU2_DEV_TYPE_UNKNOWN);
> + }
> +
> + if (gpu0_type == NPU2_DEV_TYPE_NVLINK) {
> + set_link_details(npu, 0, 0, NPU2_DEV_TYPE_NVLINK);
> + set_link_details(npu, 1, 1, NPU2_DEV_TYPE_NVLINK);
> + set_link_details(npu, 2, 2, NPU2_DEV_TYPE_NVLINK);
> + }
> +
> + if (gpu1_type == NPU2_DEV_TYPE_OPENCAPI) {
> + set_link_details(npu, 4, 4, NPU2_DEV_TYPE_OPENCAPI);
> + /* We currently don't support using the second link */
> + set_link_details(npu, 5, 5, NPU2_DEV_TYPE_UNKNOWN);
> + }
> +
> + /*
> + * If an OpenCAPI card is connected to GPU#0, and an NVLink GPU is
> + * connected to GPU#1, the GPU will only receive 2 links rather than the
> + * usual 3.
> + *
> + * The reason for this is that without the OpenCAPI card, the GPU would
> + * be use links 3-5, connected to NPU bricks 3-5, which needs both
> + * stacks 1 and 2 to be in NVLink mode.
> + *
> + * However, with an OpenCAPI card in the GPU#0 slot that uses links 0-1,
> + * we need to use NPU bricks 2-3, which means stack 1 must be set in
> + * OpenCAPI mode. As such, the GPU will be restricted to using links 4
> + * and 5.
Out of curiosity - what exactly does set the OpenCAPI mode for a stack?
set_link_details(npu, 1, 3, NPU2_DEV_TYPE_OPENCAPI);?
> + */
> + if (gpu1_type == NPU2_DEV_TYPE_NVLINK) {
> + if (gpu0_type == NPU2_DEV_TYPE_OPENCAPI) {
> + prlog(PR_WARNING, "PLAT: Chip %d GPU#1 will operate at reduced performance due to presence of OpenCAPI device. For optimal performance, swap device locations\n", chip->id);
> + } else {
> set_link_details(npu, 3, 3, NPU2_DEV_TYPE_NVLINK);
> - set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK);
> - set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK);
> }
> + set_link_details(npu, 4, 4, NPU2_DEV_TYPE_NVLINK);
> + set_link_details(npu, 5, 5, NPU2_DEV_TYPE_NVLINK);
btw after this rework you could add this, right?
diff --git a/platforms/astbmc/witherspoon.c b/platforms/astbmc/witherspoon.c
index fe13899..7ce525c 100644
--- a/platforms/astbmc/witherspoon.c
+++ b/platforms/astbmc/witherspoon.c
@@ -219,6 +219,7 @@ static void set_link_details(struct npu2 *npu,
uint32_t link_index,
link_index);
return;
}
+ assert(dev->brick_index != NPU2_DEV_TYPE_UNKNOWN);
dev->brick_index = brick_index;
dev->type = type;
}
> }
>
> return;
>
--
Alexey
More information about the Skiboot
mailing list