qemu-trivial
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs


From: Mario Smarduch
Subject: Re: [PATCH] target/i386: log guest name and memory error type AO, AR for MCEs
Date: Mon, 7 Oct 2019 10:52:21 -0700
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Thunderbird/52.2.1


On 10/07/2019 03:27 AM, Philippe Mathieu-Daudé wrote:
> Hi Mario,
> 
> On 10/5/19 1:53 AM, Mario Smarduch wrote:
>> In a large VPC environment we want to log memory error occurrences
>> and log them with guest name and type - there are a few use cases:
>>
>>
>> - if VM crashes on AR mce inform the user about the reason and
>>    resolve the case
>> - if VM hangs notify the user to reboot and resume processing
>> - if VM continues to run let the user know, he/she may be able to
>>    correlate to vm internal outage
>> - Rowhammer attacks - isolate/determine the attacker, possibly
>>    migrating it off the hypervisor
>> - In general track memory errors on a hypervisor over time to determine
>>    trends
>>
>> Monitoring our fleet, we have come across quite a few of these and have
>> been able to take action where before there were no clues to the causes.
>>
>> When memory error occurs we get a log entry in qemu log:
>>
>> Guest [Droplet-12345678] 2019-08-02T05:00:11.940270Z qemu-system-x86_64:
>> Guest MCE Memory Error at qemu addr 0x7f3c7622f000 and guest 78e42f000
>> addr of type BUS_MCEERR_AR injected
>>
>> With an enterprise logging environment we can take further actions.
>>
>> Signed-off-by: Mario Smarduch <address@hidden>
>> ---
>>   target/i386/kvm.c | 27 ++++++++++++++++++++++-----
>>   util/qemu-error.c | 24 ++++++++++++++++++++++++
>>   2 files changed, 46 insertions(+), 5 deletions(-)
>>
>> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
>> index 92069099ab..79ebccc684 100644
>> --- a/target/i386/kvm.c
>> +++ b/target/i386/kvm.c
>> @@ -555,9 +555,9 @@ static void kvm_mce_inject(X86CPU *cpu, hwaddr
>> paddr, int code)
>>                          (MCM_ADDR_PHYS << 6) | 0xc, flags);
>>   }
>>
>> -static void hardware_memory_error(void)
>> +static void hardware_memory_error(void *addr)
> 
> Maybe rename addr -> host_addr.
Yep, that makes it clearer.

> 
>>   {
>> -    fprintf(stderr, "Hardware memory error!\n");
>> +    error_report("QEMU got Hardware memory error at addr %p", addr);
>>       exit(1);
>>   }
>>
>> @@ -581,15 +581,32 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int
>> code, void *addr)
>>               kvm_physical_memory_addr_from_host(c->kvm_state, addr,
>> &paddr)) {
>>               kvm_hwpoison_page_add(ram_addr);
>>               kvm_mce_inject(cpu, paddr, code);
>> +            /*
>> +             * Use different logging severity based on error type.
>> +             * If mcelog is running qemu va addr will help debug via
>> mcelog.
>> +             */
>> +            if (code == BUS_MCEERR_AR) {
>> +                error_report("Guest MCE Memory Error at qemu addr %p
>> and "
>> +                    "guest %lx addr of type %s injected", addr, paddr,
> 
> "qemu addr" is not clear IMO, 'addr' is in the host (and is virtual...
> how does this help you?).

Our mcelog entries are logged globally, as are qemu memory errors. If
log entries from mcelog go missing, we can use the VA from qemu to
more or less figure out that it's the same memory, provided we have some
prior relationship (based on timestamps of entries). In some cases every
bit helps.

> 
> For the guest paddr you should use "0x%"HWADDR_PRIx format.

Yes, I missed that; it should be similar to x86_cpu_dump_state(). A bare
pointer looks bad in qemu.

> 
>> +                     "BUS_MCEERR_AR");
>> +            } else {
>> +                 warn_report("Guest MCE Memory Error at qemu addr %p
>> and "
>> +                     "guest %lx addr of type %s injected", addr,
>> +                     paddr, "BUS_MCEERR_AO");
>> +            }
>> +
>>               return;
>>           }
>>
>> -        fprintf(stderr, "Hardware memory error for memory used by "
>> -                "QEMU itself instead of guest system!\n");
>> +        if (code == BUS_MCEERR_AO) {
>> +            warn_report("Hardware memory error at addr %p of type %s "
>> +                "for memory used by QEMU itself instead of guest
>> system!",
>> +                addr, "BUS_MCEERR_AO");
>> +        }
>>       }
>>
>>       if (code == BUS_MCEERR_AR) {
>> -        hardware_memory_error();
>> +        hardware_memory_error(addr);
>>       }
>>
>>       /* Hope we are lucky for AO MCE */
>> diff --git a/util/qemu-error.c b/util/qemu-error.c
>> index f373f3b3b0..2ebafd4405 100644
>> --- a/util/qemu-error.c
>> +++ b/util/qemu-error.c
>> @@ -11,6 +11,8 @@
>>    */
>>
>>   #include "qemu/osdep.h"
>> +#include "qemu/option.h"
>> +#include "qemu/config-file.h"
>>   #include "monitor/monitor.h"
>>   #include "qemu/error-report.h"
>>
>> @@ -35,11 +37,31 @@ int error_printf(const char *fmt, ...)
>>       return ret;
>>   }
>>
>> +static const char *error_get_guestname(void)
>> +{
>> +    QemuOpts *opts = qemu_opts_find(qemu_find_opts("name"), NULL);
>> +    return qemu_opt_get(opts, "guest");
>> +}
>> +
>> +/*
>> + * Print guest name associated with error, to aid debugging errors from
>> + * multiple guests in centralized logging environment.
>> + */
>> +static void error_print_guestname(void)
>> +{
>> +    const char *name;
>> +    name = error_get_guestname();
>> +    if (name != NULL && !cur_mon) {
>> +        error_printf("Guest [%s] ", name);
>> +    }
>> +}
>> +
>>   int error_printf_unless_qmp(const char *fmt, ...)
>>   {
>>       va_list ap;
>>       int ret;
>>
>> +    error_print_guestname();
>>       va_start(ap, fmt);
>>       ret = error_vprintf_unless_qmp(fmt, ap);
>>       va_end(ap);
>> @@ -274,6 +296,7 @@ void error_report(const char *fmt, ...)
>>   {
>>       va_list ap;
>>
>> +    error_print_guestname();
>>       va_start(ap, fmt);
>>       vreport(REPORT_TYPE_ERROR, fmt, ap);
>>       va_end(ap);
>> @@ -289,6 +312,7 @@ void warn_report(const char *fmt, ...)
>>   {
>>       va_list ap;
>>
>> +    error_print_guestname();
>>       va_start(ap, fmt);
>>       vreport(REPORT_TYPE_WARNING, fmt, ap);
>>       va_end(ap);
>> -- 
>> 2.17.1
>>



reply via email to

[Prev in Thread] Current Thread [Next in Thread]