[Skiboot] [PATCH] Rate limit OPAL_MSG_OCC to only one outstanding message to host
Stewart Smith
stewart at linux.vnet.ibm.com
Fri Sep 25 10:59:08 AEST 2015
When there are a lot of OCC events (or many CPU cores), skiboot could
queue many OCC messages for the host. If the host was not calling
opal_get_msg often enough to drain them, skiboot would malloc()
additional messages until it ran out of skiboot heap, and things
didn't end up being much fun.
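Certain hardware exercisers seem to steal so much time that Linux is
unable to call opal_get_msg, so these messages queue up and we get
"opalmsg: No available node in the free list, allocating" warnings
followed by tonnes of backtraces of failing memory allocations.
Fix this by allowing only one OPAL_MSG_OCC to be outstanding at a time:
the throttle poll skips queueing while a message is outstanding, and a
consumed callback clears the flag once the host has fetched the message.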
---
hw/occ.c | 19 +++++++++++++++++--
1 file changed, 17 insertions(+), 2 deletions(-)
diff --git a/hw/occ.c b/hw/occ.c
index 68b7032..79140cc 100644
--- a/hw/occ.c
+++ b/hw/occ.c
@@ -312,6 +312,14 @@ static bool cpu_pstates_prepare_core(struct proc_chip *chip, struct cpu_thread *
return true;
}
+static bool occ_opal_msg_outstanding = false;
+static void occ_msg_consumed(void *data __unused)
+{
+ lock(&occ_lock);
+ occ_opal_msg_outstanding = false;
+ unlock(&occ_lock);
+}
+
static void occ_throttle_poll(void *data __unused)
{
struct proc_chip *chip;
@@ -345,6 +353,8 @@ static void occ_throttle_poll(void *data __unused)
occ_reset = false;
}
} else {
+ if (occ_opal_msg_outstanding)
+ goto done;
for_each_chip(chip) {
occ_data = chip_occ_data(chip);
if ((occ_data->valid == 1) &&
@@ -353,13 +363,18 @@ static void occ_throttle_poll(void *data __unused)
occ_msg.type = OCC_THROTTLE;
occ_msg.chip = chip->id;
occ_msg.throttle_status = occ_data->throttle;
- rc = _opal_queue_msg(OPAL_MSG_OCC, NULL, NULL,
+ rc = _opal_queue_msg(OPAL_MSG_OCC, NULL,
+ occ_msg_consumed,
3, (uint64_t *)&occ_msg);
- if (!rc)
+ if (!rc) {
chip->throttle = occ_data->throttle;
+ occ_opal_msg_outstanding = true;
+ break;
+ }
}
}
}
+done:
unlock(&occ_lock);
}
--
2.1.4
More information about the Skiboot
mailing list