[PATCH 3/4] tools/perf: Fix perf numa bench usage of affinity for machines with #CPUs > 1K

Athira Rajeev atrajeev at linux.vnet.ibm.com
Wed Apr 6 17:15:28 AEST 2022



> On 05-Apr-2022, at 11:22 PM, Ian Rogers <irogers at google.com> wrote:
> 
> On Fri, Apr 1, 2022 at 11:59 AM Athira Rajeev
> <atrajeev at linux.vnet.ibm.com> wrote:
>> 
>> The perf bench numa testcase fails on systems with more than 1K CPUs.
>> 
>> Testcase: perf bench numa mem -p 1 -t 3 -P 512 -s 100 -zZ0qcm --thp  1
>> Snippet of the failure:
>> <<>>
>> perf: bench/numa.c:302: bind_to_node: Assertion `!(ret)' failed.
>> Aborted (core dumped)
>> <<>>
>> 
>> The bind_to_node function uses "sched_getaffinity" to save the
>> original cpumask, and this call fails with EINVAL (invalid argument).
>> This happens because the default mask size in glibc is 1024 bits,
>> which cannot represent all the CPUs on such systems. To overcome
>> this 1024-CPU mask size limitation of cpu_set_t, change the mask
>> size using the CPU_*_S macros, i.e., use CPU_ALLOC to allocate the
>> cpumask and CPU_ALLOC_SIZE to get its size. Apart from fixing this
>> for "orig_mask", apply the same logic to "mask" as well, which is
>> passed to setaffinity, so that the mask size is large enough to
>> represent the number of possible CPUs in the system.
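>> 
>> For example, the dynamically sized pattern looks like this (an
>> illustrative sketch only; "save_affinity" is a made-up helper name,
>> not a function added by this patch):
>> 
>> <<>>
>> #define _GNU_SOURCE
>> #include <sched.h>
>> #include <numa.h>
>> 
>> /* Save this thread's affinity in a mask sized for all possible CPUs. */
>> static cpu_set_t *save_affinity(void)
>> {
>> 	int nrcpus = numa_num_possible_cpus();
>> 	size_t size = CPU_ALLOC_SIZE(nrcpus);	/* bytes for nrcpus bits */
>> 	cpu_set_t *mask = CPU_ALLOC(nrcpus);	/* heap mask, not 1024 bits */
>> 
>> 	if (!mask)
>> 		return NULL;
>> 	CPU_ZERO_S(size, mask);
>> 	/* Pass the allocated size, not sizeof(cpu_set_t). */
>> 	if (sched_getaffinity(0, size, mask)) {
>> 		CPU_FREE(mask);
>> 		return NULL;
>> 	}
>> 	/* sched_setaffinity() takes the same size; caller must CPU_FREE(). */
>> 	return mask;
>> }
>> <<>>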
>> 
>> sched_getaffinity is used in one more place in perf numa bench: the
>> "bind_to_cpu" function. Apply the same logic there as well. Though no
>> failure is currently reported from there, it is ideal to make
>> getaffinity work on system configurations with more CPUs than the
>> default mask size supported by glibc.
>> 
>> Also fix "sched_setaffinity" to use a mask size which is large enough
>> to represent the number of possible CPUs in the system.
>> 
>> Fix all places where "bind_cpumask", which is part of "struct
>> thread_data", is used, so that bind_cpumask works in all
>> configurations.
>> 
>> Reported-by: Disha Goel <disgoel at linux.vnet.ibm.com>
>> Signed-off-by: Athira Rajeev <atrajeev at linux.vnet.ibm.com>
>> ---
>> tools/perf/bench/numa.c | 109 +++++++++++++++++++++++++++++-----------
>> 1 file changed, 81 insertions(+), 28 deletions(-)
>> 
>> diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
>> index f2640179ada9..333896907e45 100644
>> --- a/tools/perf/bench/numa.c
>> +++ b/tools/perf/bench/numa.c
>> @@ -54,7 +54,7 @@
>> 
>> struct thread_data {
>>        int                     curr_cpu;
>> -       cpu_set_t               bind_cpumask;
>> +       cpu_set_t               *bind_cpumask;
>>        int                     bind_node;
>>        u8                      *process_data;
>>        int                     process_nr;
>> @@ -266,46 +266,75 @@ static bool node_has_cpus(int node)
>>        return ret;
>> }
>> 
>> -static cpu_set_t bind_to_cpu(int target_cpu)
>> +static cpu_set_t *bind_to_cpu(int target_cpu)
>> {
>> -       cpu_set_t orig_mask, mask;
>> +       int nrcpus = numa_num_possible_cpus();
>> +       cpu_set_t *orig_mask, *mask;
>> +       size_t size;
>>        int ret;
>> 
>> -       ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
>> -       BUG_ON(ret);
>> +       orig_mask = CPU_ALLOC(nrcpus);
>> +       BUG_ON(!orig_mask);
>> +       size = CPU_ALLOC_SIZE(nrcpus);
>> +       CPU_ZERO_S(size, orig_mask);
>> +
>> +       ret = sched_getaffinity(0, size, orig_mask);
>> +       if (ret) {
>> +               CPU_FREE(orig_mask);
>> +               BUG_ON(ret);
>> +       }
>> 
>> -       CPU_ZERO(&mask);
>> +       mask = CPU_ALLOC(nrcpus);
>> +       BUG_ON(!mask);
>> +       CPU_ZERO_S(size, mask);
>> 
>>        if (target_cpu == -1) {
>>                int cpu;
>> 
>>                for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
>> -                       CPU_SET(cpu, &mask);
>> +                       CPU_SET_S(cpu, size, mask);
>>        } else {
>>                BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
>> -               CPU_SET(target_cpu, &mask);
>> +               CPU_SET_S(target_cpu, size, mask);
>>        }
>> 
>> -       ret = sched_setaffinity(0, sizeof(mask), &mask);
>> -       BUG_ON(ret);
>> +       ret = sched_setaffinity(0, size, mask);
>> +       if (ret) {
>> +               CPU_FREE(mask);
>> +               BUG_ON(ret);
>> +       }
>> +
>> +       CPU_FREE(mask);
> 
> This all looks good. A nit: here it could be a little shorter as:
> ret = sched_setaffinity(0, size, mask);
> CPU_FREE(mask);
> BUG_ON(ret);
> 
> Thanks,
> Ian
> 

Thanks for the review, Ian.
Sure, I will address this change in V2.
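That is, free the mask unconditionally after the sched_setaffinity
call and only then check the return value, as in your snippet above:

<<>>
	ret = sched_setaffinity(0, size, mask);
	CPU_FREE(mask);
	BUG_ON(ret);
<<>>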

Thanks
Athira
>> 
>>        return orig_mask;
>> }
>> 
>> -static cpu_set_t bind_to_node(int target_node)
>> +static cpu_set_t *bind_to_node(int target_node)
>> {
>> -       cpu_set_t orig_mask, mask;
>> +       int nrcpus = numa_num_possible_cpus();
>> +       cpu_set_t *orig_mask, *mask;
>> +       size_t size;
>>        int cpu;
>>        int ret;
>> 
>> -       ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
>> -       BUG_ON(ret);
>> +       orig_mask = CPU_ALLOC(nrcpus);
>> +       BUG_ON(!orig_mask);
>> +       size = CPU_ALLOC_SIZE(nrcpus);
>> +       CPU_ZERO_S(size, orig_mask);
>> +
>> +       ret = sched_getaffinity(0, size, orig_mask);
>> +       if (ret) {
>> +               CPU_FREE(orig_mask);
>> +               BUG_ON(ret);
>> +       }
>> 
>> -       CPU_ZERO(&mask);
>> +       mask = CPU_ALLOC(nrcpus);
>> +       BUG_ON(!mask);
>> +       CPU_ZERO_S(size, mask);
>> 
>>        if (target_node == NUMA_NO_NODE) {
>>                for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
>> -                       CPU_SET(cpu, &mask);
>> +                       CPU_SET_S(cpu, size, mask);
>>        } else {
>>                struct bitmask *cpumask = numa_allocate_cpumask();
>> 
>> @@ -313,24 +342,33 @@ static cpu_set_t bind_to_node(int target_node)
>>                if (!numa_node_to_cpus(target_node, cpumask)) {
>>                        for (cpu = 0; cpu < (int)cpumask->size; cpu++) {
>>                                if (numa_bitmask_isbitset(cpumask, cpu))
>> -                                       CPU_SET(cpu, &mask);
>> +                                       CPU_SET_S(cpu, size, mask);
>>                        }
>>                }
>>                numa_free_cpumask(cpumask);
>>        }
>> 
>> -       ret = sched_setaffinity(0, sizeof(mask), &mask);
>> -       BUG_ON(ret);
>> +       ret = sched_setaffinity(0, size, mask);
>> +       if (ret) {
>> +               CPU_FREE(mask);
>> +               BUG_ON(ret);
>> +       }
>> +
>> +       CPU_FREE(mask);
>> 
>>        return orig_mask;
>> }
>> 
>> -static void bind_to_cpumask(cpu_set_t mask)
>> +static void bind_to_cpumask(cpu_set_t *mask)
>> {
>>        int ret;
>> +       size_t size = CPU_ALLOC_SIZE(numa_num_possible_cpus());
>> 
>> -       ret = sched_setaffinity(0, sizeof(mask), &mask);
>> -       BUG_ON(ret);
>> +       ret = sched_setaffinity(0, size, mask);
>> +       if (ret) {
>> +               CPU_FREE(mask);
>> +               BUG_ON(ret);
>> +       }
>> }
>> 
>> static void mempol_restore(void)
>> @@ -376,7 +414,7 @@ do {                                                        \
>> static u8 *alloc_data(ssize_t bytes0, int map_flags,
>>                      int init_zero, int init_cpu0, int thp, int init_random)
>> {
>> -       cpu_set_t orig_mask;
>> +       cpu_set_t *orig_mask;
>>        ssize_t bytes;
>>        u8 *buf;
>>        int ret;
>> @@ -434,6 +472,7 @@ static u8 *alloc_data(ssize_t bytes0, int map_flags,
>>        /* Restore affinity: */
>>        if (init_cpu0) {
>>                bind_to_cpumask(orig_mask);
>> +               CPU_FREE(orig_mask);
>>                mempol_restore();
>>        }
>> 
>> @@ -589,6 +628,7 @@ static int parse_setup_cpu_list(void)
>>                BUG_ON(bind_cpu_0 > bind_cpu_1);
>> 
>>                for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
>> +                       size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus);
>>                        int i;
>> 
>>                        for (i = 0; i < mul; i++) {
>> @@ -608,10 +648,12 @@ static int parse_setup_cpu_list(void)
>>                                        tprintf("%2d", bind_cpu);
>>                                }
>> 
>> -                               CPU_ZERO(&td->bind_cpumask);
>> +                               td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
>> +                               BUG_ON(!td->bind_cpumask);
>> +                               CPU_ZERO_S(size, td->bind_cpumask);
>>                                for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
>>                                        BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
>> -                                       CPU_SET(cpu, &td->bind_cpumask);
>> +                                       CPU_SET_S(cpu, size, td->bind_cpumask);
>>                                }
>>                                t++;
>>                        }
>> @@ -1241,7 +1283,7 @@ static void *worker_thread(void *__tdata)
>>                 * by migrating to CPU#0:
>>                 */
>>                if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
>> -                       cpu_set_t orig_mask;
>> +                       cpu_set_t *orig_mask;
>>                        int target_cpu;
>>                        int this_cpu;
>> 
>> @@ -1265,6 +1307,7 @@ static void *worker_thread(void *__tdata)
>>                                printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
>> 
>>                        bind_to_cpumask(orig_mask);
>> +                       CPU_FREE(orig_mask);
>>                }
>> 
>>                if (details >= 3) {
>> @@ -1398,21 +1441,31 @@ static void init_thread_data(void)
>> 
>>        for (t = 0; t < g->p.nr_tasks; t++) {
>>                struct thread_data *td = g->threads + t;
>> +               size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus);
>>                int cpu;
>> 
>>                /* Allow all nodes by default: */
>>                td->bind_node = NUMA_NO_NODE;
>> 
>>                /* Allow all CPUs by default: */
>> -               CPU_ZERO(&td->bind_cpumask);
>> +               td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
>> +               BUG_ON(!td->bind_cpumask);
>> +               CPU_ZERO_S(cpuset_size, td->bind_cpumask);
>>                for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
>> -                       CPU_SET(cpu, &td->bind_cpumask);
>> +                       CPU_SET_S(cpu, cpuset_size, td->bind_cpumask);
>>        }
>> }
>> 
>> static void deinit_thread_data(void)
>> {
>>        ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
>> +       int t;
>> +
>> +       /* Free the bind_cpumask allocated for thread_data */
>> +       for (t = 0; t < g->p.nr_tasks; t++) {
>> +               struct thread_data *td = g->threads + t;
>> +               CPU_FREE(td->bind_cpumask);
>> +       }
>> 
>>        free_data(g->threads, size);
>> }
>> --
>> 2.35.1


