[PATCH] ppc64_cpu: utilize cpu/present info to cope with dynamic sysfs
Pingfan Liu
kernelfans at gmail.com
Thu Aug 2 23:39:52 AEST 2018
At present, ppc64_cpu assumes statically contiguous cpu ids, i.e. ids
running from 0 up to threads_in_system. This has not been a problem so
far, since the kernel keeps the ids contiguous. But because kexec-tools
needs the CPU_ADD/_REMOVE udev event messages instead of
CPU_ONLINE/_OFFLINE, the kernel will resort to the
register_cpu/unregister_cpu API to deliver them. With that change,
unplugging a core leaves a hole in cpu_present_mask, which breaks the
contiguity. To address this, this patch uses cpu/present to build a
bitmap and iterates over that bitmap, so holes in the id space are
handled. This way, ppc64_cpu works with both old and new kernels.
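For reference, the idea boils down to the following standalone sketch
(illustrative only, not the code added by this patch; parse_list() and
NBITS are made-up names, and 64-bit unsigned long is assumed, as in the
patch): read the range list the kernel exports in cpu/present, mark each
listed id in a bitmap, then walk the bitmap instead of assuming every id
below threads_in_system exists.

/* Illustrative sketch only -- not the patch code.  parse_list() and NBITS
 * are made-up names; 64-bit unsigned long is assumed, as in the patch. */
#include <stdio.h>
#include <string.h>

#define NBITS 2048			/* worst-case number of cpu ids */

static unsigned long mask[NBITS / 64];

static void setb(int n) { mask[n / 64] |= 1UL << (n % 64); }
static int  getb(int n) { return (mask[n / 64] >> (n % 64)) & 1UL; }

/* Parse a sysfs cpulist such as "0-7,16-23" and return the highest id. */
static int parse_list(const char *s)
{
	int a, b, n, max = -1;

	while (*s) {
		n = sscanf(s, "%d-%d", &a, &b);
		if (n < 1)
			break;
		if (n == 1)
			b = a;		/* single id, not a range */
		if (a < 0 || b >= NBITS)
			break;		/* keep the sketch simple */
		for (; a <= b; a++) {
			setb(a);
			if (a > max)
				max = a;
		}
		s = strchr(s, ',');
		if (!s)
			break;
		s++;
	}
	return max;
}

int main(void)
{
	/* e.g. the contents of cpu/present after one core was unplugged */
	int cpu, max = parse_list("0-7,16-23");

	for (cpu = 0; cpu <= max; cpu++)	/* holes are simply skipped */
		if (getb(cpu))
			printf("cpu%d is present\n", cpu);
	return 0;
}

In the patch itself this is done by parse_cpu_mask() and the
for_each_cpu() helper added below.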
Notes about the kexec-tools issue (tested with Fedora 28):
Some user space tools such as kexec-tools rely on the add/remove udev
events to rebuild the dtb automatically. If the dtb is not rebuilt
correctly, the 2nd kernel may hang because the boot-cpu-hwid info is
missing from the dtb.
The steps to trigger the bug (assuming 8 threads per core):
drmgr -c cpu -r -q 1
systemctl restart kdump.service
drmgr -c cpu -a -q 1
taskset -c 11 sh -c "echo c > /proc/sysrq-trigger"
Then, failure info:
[ 205.299528] SysRq : Trigger a crash
[ 205.299551] Unable to handle kernel paging request for data at address 0x00000000
[ 205.299558] Faulting instruction address: 0xc0000000006001a0
[ 205.299564] Oops: Kernel access of bad area, sig: 11 [#1]
[ 205.299569] SMP NR_CPUS=2048 NUMA pSeries
[-- cut --]
[ 205.301829] Sending IPI to other CPUs
[ 205.302846] IPI complete
I'm in purgatory
--> hangs here
Cc: Tyrel Datwyler <tyreld at linux.vnet.ibm.com>
Cc: Benjamin Herrenschmidt <benh at kernel.crashing.org>
Cc: Michael Ellerman <mpe at ellerman.id.au>
Cc: linuxppc-dev at lists.ozlabs.org
Signed-off-by: Pingfan Liu <kernelfans at gmail.com>
---
src/ppc64_cpu.c | 205 ++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 176 insertions(+), 29 deletions(-)
diff --git a/src/ppc64_cpu.c b/src/ppc64_cpu.c
index 34654b4..cd5997d 100644
--- a/src/ppc64_cpu.c
+++ b/src/ppc64_cpu.c
@@ -23,6 +23,7 @@
#include <unistd.h>
#include <string.h>
#include <dirent.h>
+#include <malloc.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
@@ -49,7 +50,8 @@
#define PPC64_CPU_VERSION "1.2"
-#define SYSFS_CPUDIR "/sys/devices/system/cpu/cpu%d"
+#define SYSFS_CPUDIR "/sys/devices/system/cpu"
+#define SYSFS_PERCPUDIR "/sys/devices/system/cpu/cpu%d"
#define SYSFS_SUBCORES "/sys/devices/system/cpu/subcores_per_core"
#define DSCR_DEFAULT_PATH "/sys/devices/system/cpu/dscr_default"
#define INTSERV_PATH "/proc/device-tree/cpus/%s/ibm,ppc-interrupt-server#s"
@@ -75,17 +77,161 @@ struct cpu_freq {
static int threads_per_cpu = 0;
static int cpus_in_system = 0;
-static int threads_in_system = 0;
static int do_info(void);
+/* assumes a 64-bit system: unsigned long is 64 bits wide */
+#define BITS_PER_LONG 64
+#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG))
+#define BIT_WORD(nr) ((nr) / BITS_PER_LONG)
+
+static unsigned long *cpu_present_mask;
+static unsigned int max_cpu_id = (unsigned int)-1; /* one past the highest present cpu id */
+
+/* Return the next cpu id set in @addr strictly after @nr, or -1 if none. */
+static unsigned int cpumask_next(int nr, unsigned long *addr)
+{
+ unsigned int bit_num, i, j;
+ unsigned long *p;
+
+ p = addr + BIT_WORD(nr + 1); /* word holding the first candidate bit */
+ for (i = nr+1; i < max_cpu_id; ) {
+ for (j = i % BITS_PER_LONG; j < BITS_PER_LONG; j++) {
+ if ((*p >> j) & 0x1) {
+ bit_num = BIT_WORD(i)*BITS_PER_LONG + j;
+ return bit_num;
+ }
+ }
+ p++;
+ i = ((i >> 6) + 1) << 6;
+ }
+ return -1;
+}
+
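+/* Iterate over every cpu id set in @mask; stops once cpumask_next()
+ * runs past max_cpu_id.
+ */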
+#define for_each_cpu(cpu, mask) \
+ for ((cpu) = -1; \
+ (cpu) = cpumask_next((cpu), (mask)), \
+ (cpu) < max_cpu_id;)
+
+static inline int test_bit(int nr, const unsigned long *addr)
+{
+ return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
+}
+
+static inline void set_bit(int nr, unsigned long *addr)
+{
+ unsigned long mask = BIT_MASK(nr);
+ unsigned long *p = addr + BIT_WORD(nr);
+
+ *p |= mask;
+}
+
+static void set_bitmap(int start, int end, unsigned long *addr)
+{
+ int i;
+
+ for (i = start; i <= end; i++)
+ set_bit(i, addr);
+}
+
+/* Return the first thread id of the next present core after the core
+ * containing @nr, or -1 if there is none.
+ */
+static unsigned int cpumask_next_hthread(int nr, const unsigned long *mask)
+{
+ int i, start;
+
+ start = (nr/threads_per_cpu +1)*threads_per_cpu;
+ for (i = start; i < max_cpu_id; i += threads_per_cpu) {
+ if (test_bit(i, mask))
+ return i;
+ }
+ return -1;
+}
+
+/* Parse a sysfs cpulist (e.g. "0-7,16-23") into a bitmap.
+ * @bitmap: allocated internally
+ * @max_idx: set to one past the highest cpu logical id
+ * Return the number of cpu ids found in the list.
+ */
+static int parse_cpu_mask(char *buf, int bz, unsigned long **bitmap,
+ unsigned int *max_idx)
+{
+ int a, b, i, bm_sz;
+ bool range = false;
+ char *s, *p;
+#define TMP_BUF_SIZE 32
+ char tbuf[TMP_BUF_SIZE];
+
+ a = b = i = 0;
+ /* get the max id in order to alloc bitmap */
+
+ for (p = buf + bz, s = p - 1; s >= buf; s--) {
+ if (*s == '-' || *s == ',')
+ break;
+ }
+ memset(tbuf, '\0', TMP_BUF_SIZE);
+ memcpy(tbuf, s+1, p-s-1);
+ sscanf(tbuf, "%d", &b);
+ if (max_idx)
+ *max_idx = b + 1; /* the iteration bound is treated as exclusive */
+ /* round up to whole longs so word-wise reads stay in bounds */
+ bm_sz = (b / BITS_PER_LONG + 1) * sizeof(unsigned long);
+ *bitmap = memalign(sizeof(unsigned long), bm_sz);
+ memset(*bitmap, 0, bm_sz);
+
+ /* set the bitmap */
+
+ range = false;
+ for (s = p = buf; p - buf < bz; p++) {
+ if (*p == '-')
+ range = true;
+ if (*p == ',' || *p == '\n') {
+ memset(tbuf, '\0', TMP_BUF_SIZE);
+ memcpy(tbuf, s, p-s);
+ if (range) {
+ sscanf(tbuf, "%d-%d", &a, &b);
+ set_bitmap(a, b, *bitmap);
+ i += (b -a) +1;
+ } else {
+ sscanf(tbuf, "%d", &a);
+ set_bitmap(a, a, *bitmap);
+ i++;
+ }
+ range = false;
+ if (*p == ',' )
+ s = p + 1;
+ else
+ break;
+ }
+ }
+ return i;
+}
+
+static int get_cpu_present_mask(void)
+{
+ char path[SYSFS_PATH_MAX];
+ char buf[256] = {0};
+ int fd, sz, ret = 0;
+
+ sprintf(path, SYSFS_CPUDIR"/%s", "present");
+ fd = open(path, O_RDONLY);
+ if (fd < 0) {
+ printf("cannot open %s\n", path);
+ return -1;
+ }
+ sz = read(fd, buf, sizeof(buf) - 1);
+ close(fd);
+ if (sz > 0)
+ parse_cpu_mask(buf, sz, &cpu_present_mask, &max_cpu_id);
+ else {
+ ret = -1;
+ printf("cannot parse %s\n", path);
+ }
+ }
+ return ret;
+}
+
static int test_sysattr(char *attribute, int perms)
{
char path[SYSFS_PATH_MAX];
int i;
- for (i = 0; i < threads_in_system; i++) {
- sprintf(path, SYSFS_CPUDIR"/%s", i, attribute);
+ for_each_cpu(i, cpu_present_mask) {
+ sprintf(path, SYSFS_PERCPUDIR"/%s", i, attribute);
if (access(path, F_OK))
continue;
@@ -160,7 +306,7 @@ static int cpu_online(int thread)
char path[SYSFS_PATH_MAX];
int rc, online;
- sprintf(path, SYSFS_CPUDIR"/online", thread);
+ sprintf(path, SYSFS_PERCPUDIR"/online", thread);
rc = get_attribute(path, "%d", &online);
/* This attribute does not exist in kernels without hotplug enabled */
@@ -180,13 +326,13 @@ static int get_system_attribute(char *attribute, const char *fmt, int *value,
int i, rc;
int system_attribute = -1;
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
int cpu_attribute;
if (!cpu_online(i))
continue;
- sprintf(path, SYSFS_CPUDIR"/%s", i, attribute);
+ sprintf(path, SYSFS_PERCPUDIR"/%s", i, attribute);
rc = get_attribute(path, fmt, &cpu_attribute);
if (rc)
return rc;
@@ -208,8 +354,8 @@ static int set_system_attribute(char *attribute, const char *fmt, int state)
char path[SYSFS_PATH_MAX];
int i, rc;
- for (i = 0; i < threads_in_system; i++) {
- sprintf(path, SYSFS_CPUDIR"/%s", i, attribute);
+ for_each_cpu(i, cpu_present_mask) {
+ sprintf(path, SYSFS_PERCPUDIR"/%s", i, attribute);
rc = set_attribute(path, fmt, state);
/* When a CPU is offline some sysfs files are removed from the CPU
* directory, for example smt_snooze_delay and dscr. The absence of the
@@ -360,14 +506,13 @@ static int get_cpu_info(void)
}
closedir(d);
- threads_in_system = cpus_in_system * threads_per_cpu;
subcores = num_subcores();
if (is_subcore_capable() && subcores > 0) {
threads_per_cpu /= subcores;
cpus_in_system *= subcores;
}
- return 0;
+ return get_cpu_present_mask();
}
static int is_smt_capable(void)
@@ -376,8 +521,8 @@ static int is_smt_capable(void)
char path[SYSFS_PATH_MAX];
int i;
- for (i = 0; i < threads_in_system; i++) {
- sprintf(path, SYSFS_CPUDIR"/smt_snooze_delay", i);
+ for_each_cpu(i, cpu_present_mask) {
+ sprintf(path, SYSFS_PERCPUDIR"/smt_snooze_delay", i);
if (stat(path, &sb))
continue;
return 1;
@@ -431,7 +576,7 @@ static int set_one_smt_state(int thread, int online_threads)
int i, rc = 0;
for (i = 0; i < threads_per_cpu; i++) {
- snprintf(path, SYSFS_PATH_MAX, SYSFS_CPUDIR"/%s", thread + i,
+ snprintf(path, SYSFS_PATH_MAX, SYSFS_PERCPUDIR"/%s", thread + i,
"online");
if (i < online_threads)
rc = online_thread(path);
@@ -452,7 +597,8 @@ static int set_one_smt_state(int thread, int online_threads)
static int set_smt_state(int smt_state)
{
- int i, j, rc;
+ unsigned int i;
+ int j, rc;
int ssd, update_ssd = 1;
int inconsistent = 0;
int error = 0;
@@ -465,8 +611,9 @@ static int set_smt_state(int smt_state)
rc = get_smt_snooze_delay(&ssd, &inconsistent);
if (rc)
update_ssd = 0;
+ if (smt_state)
- for (i = 0; i < threads_in_system; i += threads_per_cpu) {
+ for (i = 0; i < max_cpu_id; ) {
/* Online means any thread on this core running, so check all
* threads in the core, not just the first. */
for (j = 0; j < threads_per_cpu; j++) {
@@ -481,6 +628,7 @@ static int set_smt_state(int smt_state)
error = 1;
break;
}
+ i = cpumask_next_hthread(i, cpu_present_mask);
}
if (update_ssd)
@@ -501,9 +649,8 @@ static int is_dscr_capable(void)
if (dscr_default_exists())
return 1;
-
- for (i = 0; i < threads_in_system; i++) {
- sprintf(path, SYSFS_CPUDIR"/dscr", i);
+ for_each_cpu(i, cpu_present_mask) {
+ sprintf(path, SYSFS_PERCPUDIR"/dscr", i);
if (stat(path, &sb))
continue;
return 1;
@@ -863,7 +1010,7 @@ static int setup_counters(struct cpu_freq *cpu_freqs)
/* Record how long the event ran for */
attr.read_format |= PERF_FORMAT_TOTAL_TIME_RUNNING;
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
if (!cpu_online(i)) {
cpu_freqs[i].offline = 1;
continue;
@@ -890,7 +1037,7 @@ static void start_counters(struct cpu_freq *cpu_freqs)
{
int i;
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
if (cpu_freqs[i].offline)
continue;
@@ -902,7 +1049,7 @@ static void stop_counters(struct cpu_freq *cpu_freqs)
{
int i;
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
if (cpu_freqs[i].offline)
continue;
@@ -920,7 +1067,7 @@ static void read_counters(struct cpu_freq *cpu_freqs)
int i;
struct read_format vals;
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
size_t res;
if (cpu_freqs[i].offline)
@@ -945,7 +1092,7 @@ static void check_threads(struct cpu_freq *cpu_freqs)
{
int i;
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
if (cpu_freqs[i].offline)
continue;
@@ -1051,7 +1198,7 @@ static void report_system_power_mode(void)
static void setrlimit_open_files(void)
{
struct rlimit old_rlim, new_rlim;
- int new = threads_in_system + 8;
+ int new = max_cpu_id + 8;
getrlimit(RLIMIT_NOFILE, &old_rlim);
@@ -1077,7 +1224,7 @@ static int do_cpu_frequency(int sleep_time)
setrlimit_open_files();
- cpu_freqs = calloc(threads_in_system, sizeof(*cpu_freqs));
+ cpu_freqs = calloc(max_cpu_id, sizeof(*cpu_freqs));
if (!cpu_freqs)
return -ENOMEM;
@@ -1088,7 +1235,7 @@ static int do_cpu_frequency(int sleep_time)
}
/* Start a soak thread on each CPU */
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
if (cpu_freqs[i].offline)
continue;
@@ -1111,7 +1258,7 @@ static int do_cpu_frequency(int sleep_time)
check_threads(cpu_freqs);
read_counters(cpu_freqs);
- for (i = 0; i < threads_in_system; i++) {
+ for_each_cpu(i, cpu_present_mask) {
double frequency;
if (cpu_freqs[i].offline)
@@ -1163,7 +1310,7 @@ static int set_all_threads_off(int cpu, int smt_state)
int rc = 0;
for (i = cpu + smt_state - 1; i >= cpu; i--) {
- snprintf(path, SYSFS_PATH_MAX, SYSFS_CPUDIR"/%s", i, "online");
+ snprintf(path, SYSFS_PATH_MAX, SYSFS_PERCPUDIR"/%s", i, "online");
rc = offline_thread(path);
if (rc == -1)
printf("Unable to take cpu%d offline", i);
--
2.7.4