qemu-ppc
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v17 6/7] migration: Include migration support for machine che


From: Ganesh
Subject: Re: [PATCH v17 6/7] migration: Include migration support for machine check handling
Date: Fri, 6 Dec 2019 11:15:58 +0530
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.3.1


On 12/6/19 6:52 AM, David Gibson wrote:
On Thu, Dec 05, 2019 at 10:39:29AM +0530, Ganesh wrote:
On 11/19/19 8:15 AM, David Gibson wrote:
On Thu, Oct 24, 2019 at 01:13:06PM +0530, Ganesh Goudar wrote:
From: Aravinda Prasad <address@hidden>

This patch includes migration support for machine check
handling. Especially this patch blocks VM migration
requests until the machine check error handling is
complete as these errors are specific to the source
hardware and is irrelevant on the target hardware.

[Do not set FWNMI cap in post_load, now its done in .apply hook]
Signed-off-by: Ganesh Goudar <address@hidden>
Signed-off-by: Aravinda Prasad <address@hidden>
---
   hw/ppc/spapr.c         | 41 +++++++++++++++++++++++++++++++++++++++++
   hw/ppc/spapr_events.c  | 16 +++++++++++++++-
   hw/ppc/spapr_rtas.c    |  2 ++
   include/hw/ppc/spapr.h |  2 ++
   4 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 346ec5ba6c..e0d0f95ec0 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -46,6 +46,7 @@
   #include "migration/qemu-file-types.h"
   #include "migration/global_state.h"
   #include "migration/register.h"
+#include "migration/blocker.h"
   #include "mmu-hash64.h"
   #include "mmu-book3s-v3.h"
   #include "cpu-models.h"
@@ -1751,6 +1752,8 @@ static void spapr_machine_reset(MachineState *machine)
       /* Signal all vCPUs waiting on this condition */
       qemu_cond_broadcast(&spapr->mc_delivery_cond);
+
+    migrate_del_blocker(spapr->fwnmi_migration_blocker);
   }
   static void spapr_create_nvram(SpaprMachineState *spapr)
@@ -2041,6 +2044,43 @@ static const VMStateDescription vmstate_spapr_dtb = {
       },
   };
+static bool spapr_fwnmi_needed(void *opaque)
+{
+    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+    return spapr->guest_machine_check_addr != -1;
+}
+
+static int spapr_fwnmi_pre_save(void *opaque)
+{
+    SpaprMachineState *spapr = (SpaprMachineState *)opaque;
+
+    /*
+     * With -only-migratable QEMU option, we cannot block migration.
+     * Hence check if machine check handling is in progress and print
+     * a warning message.
+     */
IIUC the logic below this could also occur in the case where the fwnmi
event occurs after a migration has started, but before it completes,
not just with -only-migratable.  Is that correct?
Yes
Ok, please update the comment accordingly.
sure

+    if (spapr->mc_status != -1) {
+        warn_report("A machine check is being handled during migration. The"
+                "handler may run and log hardware error on the destination");
+    }
+
+    return 0;
+}
+
+static const VMStateDescription vmstate_spapr_machine_check = {
+    .name = "spapr_machine_check",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = spapr_fwnmi_needed,
+    .pre_save = spapr_fwnmi_pre_save,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT64(guest_machine_check_addr, SpaprMachineState),
+        VMSTATE_INT32(mc_status, SpaprMachineState),
+        VMSTATE_END_OF_LIST()
+    },
+};
+
   static const VMStateDescription vmstate_spapr = {
       .name = "spapr",
       .version_id = 3,
@@ -2075,6 +2115,7 @@ static const VMStateDescription vmstate_spapr = {
           &vmstate_spapr_cap_large_decr,
           &vmstate_spapr_cap_ccf_assist,
           &vmstate_spapr_cap_fwnmi,
+        &vmstate_spapr_machine_check,
           NULL
       }
   };
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index db44e09154..30d9371c88 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -43,6 +43,7 @@
   #include "qemu/main-loop.h"
   #include "hw/ppc/spapr_ovec.h"
   #include <libfdt.h>
+#include "migration/blocker.h"
   #define RTAS_LOG_VERSION_MASK                   0xff000000
   #define   RTAS_LOG_VERSION_6                    0x06000000
@@ -842,6 +843,8 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
   {
       SpaprMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
       CPUState *cs = CPU(cpu);
+    int ret;
+    Error *local_err = NULL;
       if (spapr->guest_machine_check_addr == -1) {
           /*
@@ -871,8 +874,19 @@ void spapr_mce_req_event(PowerPCCPU *cpu, bool recovered)
               return;
           }
       }
-    spapr->mc_status = cpu->vcpu_id;
+    ret = migrate_add_blocker(spapr->fwnmi_migration_blocker, &local_err);
+    if (ret == -EBUSY) {
+        /*
+         * We don't want to abort so we let the migration to continue.
+         * In a rare case, the machine check handler will run on the target.
+         * Though this is not preferable, it is better than aborting
+         * the migration or killing the VM.
+         */
+        warn_report_err(local_err);
I suspect the error message in local_err won't be particularly
meaningful on its own.  Perhaps you need to add a prefix to clarify
that the problem is you've received a fwnmi after migration has
commenced?
ok
+    }
+
+    spapr->mc_status = cpu->vcpu_id;
       spapr_mce_dispatch_elog(cpu, recovered);
   }
diff --git a/hw/ppc/spapr_rtas.c b/hw/ppc/spapr_rtas.c
index 0328b1f341..c78d96ee7e 100644
--- a/hw/ppc/spapr_rtas.c
+++ b/hw/ppc/spapr_rtas.c
@@ -50,6 +50,7 @@
   #include "hw/ppc/fdt.h"
   #include "target/ppc/mmu-hash64.h"
   #include "target/ppc/mmu-book3s-v3.h"
+#include "migration/blocker.h"
   static void rtas_display_character(PowerPCCPU *cpu, SpaprMachineState *spapr,
                                      uint32_t token, uint32_t nargs,
@@ -446,6 +447,7 @@ static void rtas_ibm_nmi_interlock(PowerPCCPU *cpu,
        */
       spapr->mc_status = -1;
       qemu_cond_signal(&spapr->mc_delivery_cond);
+    migrate_del_blocker(spapr->fwnmi_migration_blocker);
Oh... damn.  I suggested using a static fwnmi_migration_blocker
instance, but I just realized there's a problem with it.

If we do receive multiple fwnmi events on different cpus at roughly
the same time, this will break: I think we'll try to add the same
migration blocker instance multiple times, which won't be good.  Even
if that doesn't do anything *too* bad, then we'll unblock the
migration on the first interlock, rather than waiting for all pending
fwnmi events to complete.
Ok, not sure how to handle this, ill look into it.
Well, you can do it by reverting to the old approach of dynamically
creating the Error object before adding it as blocker (and freeing it
when the blocker is removed).  It's a bit ugly, but it will suffice.
Sure, Thanks

       rtas_st(rets, 0, RTAS_OUT_SUCCESS);
   }
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 86f0fc8fdd..290abd6328 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -215,6 +215,8 @@ struct SpaprMachineState {
       unsigned gpu_numa_id;
       SpaprTpmProxy *tpm_proxy;
+
+    Error *fwnmi_migration_blocker;
   };
   #define H_SUCCESS         0




reply via email to

[Prev in Thread] Current Thread [Next in Thread]