In a large VPC environment we want to log memory error occurrences together with the guest name and error type. There are a few use cases:
- if a VM crashes on an AR MCE, inform the user about the reason and resolve the case - if a VM hangs, notify the user to reboot and resume processing - if a VM continues to run, let the user know; he/she may be able to correlate it with a VM-internal outage - Rowhammer attacks - isolate/determine the attacker, possibly migrating it off the hypervisor - In general, track memory errors on a hypervisor over time to determine trends. Monitoring our fleet, we have come across quite a few of these and have been able to take action where before there were no clues to the causes. When a memory error occurs we get a log entry in the QEMU log: Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64: Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000 addr of type BUS_MCEERR_AR injected With an enterprise logging environment we can take further actions. Signed-off-by: Mario Smarduch <msmard...@digitalocean.com> --- target/i386/kvm.c | 27 ++++++++++++++++++++++----- util/qemu-error.c | 24 ++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/target/i386/kvm.c b/target/i386/kvm.c index 92069099ab..79ebccc684 100644 --- a/target/i386/kvm.c +++ b/target/i386/kvm.c @@ -555,9 +555,9 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr paddr, int code) (MCM_ADDR_PHYS << 6) | 0xc, flags); } -static void hardware_memory_error(void) +static void hardware_memory_error(void *addr) { - fprintf(stderr, "Hardware memory error!\n"); + error_report("QEMU got Hardware memory error at addr %p", addr); exit(1); } @@ -581,15 +581,32 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr) kvm_physical_memory_addr_from_host(c->kvm_state, addr, &paddr)) { kvm_hwpoison_page_add(ram_addr); kvm_mce_inject(cpu, paddr, code); + /* + * Use different logging severity based on error type. + * If mcelog is running qemu va addr will help debug via mcelog. 
+ */ + if (code == BUS_MCEERR_AR) { + error_report("Guest MCE Memory Error at qemu addr %p and " + "guest %lx addr of type %s injected", addr, paddr, + "BUS_MCEERR_AR"); + } else { + warn_report("Guest MCE Memory Error at qemu addr %p and " + "guest %lx addr of type %s injected", addr, + paddr, "BUS_MCEERR_AO"); + } + return; } - fprintf(stderr, "Hardware memory error for memory used by " - "QEMU itself instead of guest system!\n"); + if (code == BUS_MCEERR_AO) { + warn_report("Hardware memory error at addr %p of type %s " + "for memory used by QEMU itself instead of guest system!", + addr, "BUS_MCEERR_AO"); + } } if (code == BUS_MCEERR_AR) { - hardware_memory_error(); + hardware_memory_error(addr); } /* Hope we are lucky for AO MCE */ diff --git a/util/qemu-error.c b/util/qemu-error.c index f373f3b3b0..2ebafd4405 100644 --- a/util/qemu-error.c +++ b/util/qemu-error.c @@ -11,6 +11,8 @@ */ #include "qemu/osdep.h" +#include "qemu/option.h" +#include "qemu/config-file.h" #include "monitor/monitor.h" #include "qemu/error-report.h" @@ -35,11 +37,31 @@ int error_printf(const char *fmt, ...) return ret; } +static const char *error_get_guestname(void) +{ + QemuOpts *opts = qemu_opts_find(qemu_find_opts("name"), NULL); + return qemu_opt_get(opts, "guest"); +} + +/* + * Print guest name associated with error, to aid debugging errors from + * multiple guests in centralized logging environment. + */ +static void error_print_guestname(void) +{ + const char *name; + name = error_get_guestname(); + if (name != NULL && !cur_mon) { + error_printf("Guest [%s] ", name); + } +} + int error_printf_unless_qmp(const char *fmt, ...) { va_list ap; int ret; + error_print_guestname(); va_start(ap, fmt); ret = error_vprintf_unless_qmp(fmt, ap); va_end(ap); @@ -274,6 +296,7 @@ void error_report(const char *fmt, ...) { va_list ap; + error_print_guestname(); va_start(ap, fmt); vreport(REPORT_TYPE_ERROR, fmt, ap); va_end(ap); @@ -289,6 +312,7 @@ void warn_report(const char *fmt, ...) 
{ va_list ap; + error_print_guestname(); va_start(ap, fmt); vreport(REPORT_TYPE_WARNING, fmt, ap); va_end(ap); -- 2.17.1