In a large VPC environment we want to log memory error occurrences
together with the guest name and error type. There are a few use cases:
- if a VM crashes on an AR MCE, inform the user about the reason and
resolve the case
- if a VM hangs, notify the user to reboot and resume processing
- if a VM continues to run, let the user know; he/she may be able to
correlate it to a VM-internal outage
- Rowhammer attacks - isolate/determine the attacker, possibly
migrating it off the hypervisor
- In general, track memory errors on a hypervisor over time to determine
trends
Monitoring our fleet, we have come across quite a few of these and have
been able to take action where before there were no clues to the causes.
When a memory error occurs we get a log entry in the qemu log:
Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
addr of type BUS_MCEERR_AR injected
With an enterprise logging environment we can then take further actions.
Signed-off-by: Mario Smarduch <address@hidden>
---
target/i386/kvm.c | 27 ++++++++++++++++++++++-----
util/qemu-error.c | 24 ++++++++++++++++++++++++
2 files changed, 46 insertions(+), 5 deletions(-)
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 92069099ab..79ebccc684 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -555,9 +555,9 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr
paddr, int code)
(MCM_ADDR_PHYS << 6) | 0xc, flags);
}
-static void hardware_memory_error(void)
+static void hardware_memory_error(void *addr)
{
- fprintf(stderr, "Hardware memory error!\n");
+ error_report("QEMU got Hardware memory error at addr %p", addr);
exit(1);
}
@@ -581,15 +581,32 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int
code, void *addr)
kvm_physical_memory_addr_from_host(c->kvm_state, addr,
&paddr)) {
kvm_hwpoison_page_add(ram_addr);
kvm_mce_inject(cpu, paddr, code);
+ /*
+ * Use different logging severity based on error type.
+ * If mcelog is running qemu va addr will help debug via
mcelog.
+ */
+ if (code == BUS_MCEERR_AR) {
+ error_report("Guest MCE Memory Error at qemu addr %p and "
+ "guest %lx addr of type %s injected", addr, paddr,
+ "BUS_MCEERR_AR");
+ } else {
+ warn_report("Guest MCE Memory Error at qemu addr %p and "
+ "guest %lx addr of type %s injected", addr,
+ paddr, "BUS_MCEERR_AO");
+ }
+
return;
}
- fprintf(stderr, "Hardware memory error for memory used by "
- "QEMU itself instead of guest system!\n");
+ if (code == BUS_MCEERR_AO) {
+ warn_report("Hardware memory error at addr %p of type %s "
+ "for memory used by QEMU itself instead of guest system!",
+ addr, "BUS_MCEERR_AO");
+ }
}
if (code == BUS_MCEERR_AR) {
- hardware_memory_error();
+ hardware_memory_error(addr);
}
/* Hope we are lucky for AO MCE */
diff --git a/util/qemu-error.c b/util/qemu-error.c
index f373f3b3b0..2ebafd4405 100644
--- a/util/qemu-error.c
+++ b/util/qemu-error.c
@@ -11,6 +11,8 @@
*/
#include "qemu/osdep.h"
+#include "qemu/option.h"
+#include "qemu/config-file.h"
#include "monitor/monitor.h"
#include "qemu/error-report.h"
@@ -35,11 +37,31 @@ int error_printf(const char *fmt, ...)
return ret;
}
+static const char *error_get_guestname(void)
+{
+ QemuOpts *opts = qemu_opts_find(qemu_find_opts("name"), NULL);
+ return qemu_opt_get(opts, "guest");
+}
+
+/*
+ * Print guest name associated with error, to aid debugging errors from
+ * multiple guests in centralized logging environment.
+ */
+static void error_print_guestname(void)
+{
+ const char *name;
+ name = error_get_guestname();
+ if (name != NULL && !cur_mon) {
+ error_printf("Guest [%s] ", name);
+ }
+}
+
int error_printf_unless_qmp(const char *fmt, ...)
{
va_list ap;
int ret;
+ error_print_guestname();
va_start(ap, fmt);
ret = error_vprintf_unless_qmp(fmt, ap);
va_end(ap);
@@ -274,6 +296,7 @@ void error_report(const char *fmt, ...)
{
va_list ap;
+ error_print_guestname();
va_start(ap, fmt);
vreport(REPORT_TYPE_ERROR, fmt, ap);
va_end(ap);
@@ -289,6 +312,7 @@ void warn_report(const char *fmt, ...)
{
va_list ap;
+ error_print_guestname();
va_start(ap, fmt);
vreport(REPORT_TYPE_WARNING, fmt, ap);
va_end(ap);
--
2.17.1