[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PATCH COLO-Frame v6 15/31] COLO failover: Implement COLO p
From: |
zhanghailiang |
Subject: |
[Qemu-devel] [PATCH COLO-Frame v6 15/31] COLO failover: Implement COLO primary/secondary vm failover work |
Date: |
Thu, 18 Jun 2015 16:58:39 +0800 |
If an error occurs, we give users (administrators) time to
get involved in the failover verdict: they can decide
which side should take over the work by using the 'colo_lost_heartbeat' command.
Note: The default verdict is that the primary VM takes over the work while the secondary VM exits.
So if users choose the secondary VM to take over the work, please make sure that
the Primary VM is dead, or there will be a 'split-brain' problem.
Signed-off-by: zhanghailiang <address@hidden>
Signed-off-by: Li Zhijian <address@hidden>
Signed-off-by: Lai Jiangshan <address@hidden>
---
include/migration/migration-colo.h | 4 +
include/migration/migration-failover.h | 2 +
migration/colo-failover.c | 12 ++-
migration/colo.c | 132 ++++++++++++++++++++++++++++++++-
trace-events | 1 +
5 files changed, 147 insertions(+), 4 deletions(-)
diff --git a/include/migration/migration-colo.h
b/include/migration/migration-colo.h
index f9c09f3..ea10374 100644
--- a/include/migration/migration-colo.h
+++ b/include/migration/migration-colo.h
@@ -43,4 +43,8 @@ int get_colo_mode(void);
int create_and_init_ram_cache(void);
void colo_flush_ram_cache(void);
void release_ram_cache(void);
+
+/* failover */
+void colo_do_failover(MigrationState *s);
+
#endif
diff --git a/include/migration/migration-failover.h
b/include/migration/migration-failover.h
index a8767fc..5e59b1d 100644
--- a/include/migration/migration-failover.h
+++ b/include/migration/migration-failover.h
@@ -16,5 +16,7 @@
#include "qemu-common.h"
void failover_request_set(void);
+void failover_request_clear(void);
+bool failover_request_is_set(void);
#endif
diff --git a/migration/colo-failover.c b/migration/colo-failover.c
index 0f2f81f..d8f740c 100644
--- a/migration/colo-failover.c
+++ b/migration/colo-failover.c
@@ -23,7 +23,7 @@ static void colo_failover_bh(void *opaque)
{
qemu_bh_delete(failover_bh);
failover_bh = NULL;
- /*TODO: Do failover work */
+ colo_do_failover(NULL);
}
void failover_request_set(void)
@@ -33,6 +33,16 @@ void failover_request_set(void)
qemu_bh_schedule(failover_bh);
}
+void failover_request_clear(void)
+{
+ failover_request = false; /* clear the flag that failover_request_is_set() reports */
+}
+
+bool failover_request_is_set(void)
+{
+ return failover_request; /* current value of the failover request flag */
+}
+
void qmp_colo_lost_heartbeat(Error **errp)
{
if (get_colo_mode() == COLO_MODE_UNKNOWN) {
diff --git a/migration/colo.c b/migration/colo.c
index cc3d321..3ecaec8 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -73,6 +73,67 @@ bool loadvm_in_colo_state(void)
return colo != NULL;
}
+static bool colo_runstate_is_stopped(void)
+{ /* "stopped" means: run state is RUN_STATE_COLO, or the VM is not running */
+ return runstate_check(RUN_STATE_COLO) || !runstate_is_running();
+}
+
+/*
+ * There are two ways to enter this function:
+ * 1. From the COLO checkpoint incoming thread; in this case
+ * it should be protected by the iothread lock.
+ * 2. From a user command; because hmp/qmp commands
+ * run in the main loop, taking the iothread lock there would
+ * cause a deadlock.
+ */
+static void secondary_vm_do_failover(void)
+{
+ colo = NULL; /* subsequent "colo != NULL" checks now fail */
+
+ if (!autostart) {
+ error_report("\"-S\" qemu option will be ignored in secondary side");
+ /* restore the run state expected after a normal migration finishes */
+ autostart = true;
+ }
+
+ /* On the Secondary VM, re-enter the incoming migration coroutine, if any */
+ if (migration_incoming_co) {
+ qemu_coroutine_enter(migration_incoming_co, NULL);
+ }
+}
+
+static void primary_vm_do_failover(void)
+{
+ MigrationState *s = migrate_get_current();
+
+ if (!colo_runstate_is_stopped()) { /* force a stop first if still running */
+ vm_stop_force_state(RUN_STATE_COLO);
+ }
+
+ if (s->state != MIGRATION_STATUS_FAILED) { /* a FAILED status is left untouched */
+ migrate_set_state(s, MIGRATION_STATUS_COLO,
MIGRATION_STATUS_COMPLETED);
+ }
+
+ vm_start(); /* the Primary VM takes over the work: start the guest again */
+}
+
+static bool failover_completed; /* set to true at the end of colo_do_failover() */
+void colo_do_failover(MigrationState *s) /* NOTE(review): 's' is not used in this body */
+{
+ /* Make sure the VM is stopped during failover */
+ if (!colo_runstate_is_stopped()) {
+ vm_stop_force_state(RUN_STATE_COLO);
+ }
+
+ trace_colo_do_failover();
+ if (get_colo_mode() == COLO_MODE_SECONDARY) { /* dispatch on which side we are */
+ secondary_vm_do_failover();
+ } else {
+ primary_vm_do_failover();
+ }
+ failover_completed = true; /* NOTE(review): waiters spin on this plain bool; an atomic or event looks safer - confirm */
+}
+
/* colo checkpoint control helper */
static int colo_ctl_put(QEMUFile *f, uint64_t request)
{
@@ -144,11 +205,23 @@ static int colo_do_checkpoint_transaction(MigrationState
*s, QEMUFile *control)
goto out;
}
+ if (failover_request_is_set()) {
+ ret = -1;
+ goto out;
+ }
/* suspend and save vm state to colo buffer */
qemu_mutex_lock_iothread();
vm_stop_force_state(RUN_STATE_COLO);
qemu_mutex_unlock_iothread();
trace_colo_vm_state_change("run", "stop");
+ /*
+ * failover request bh could be called after
+ * vm_stop_force_state so we check failover_request_is_set() again.
+ */
+ if (failover_request_is_set()) {
+ ret = -1;
+ goto out;
+ }
/* Disable block migration */
s->params.blk = 0;
@@ -209,7 +282,7 @@ static void *colo_thread(void *opaque)
{
MigrationState *s = opaque;
QEMUFile *colo_control = NULL;
- int ret;
+ int i, ret;
colo_control = qemu_fopen_socket(qemu_get_fd(s->file), "rb");
if (!colo_control) {
@@ -239,6 +312,11 @@ static void *colo_thread(void *opaque)
trace_colo_vm_state_change("stop", "run");
while (s->state == MIGRATION_STATUS_COLO) {
+ if (failover_request_is_set()) {
+ error_report("failover request");
+ goto out;
+ }
+
/* start a colo checkpoint */
if (colo_do_checkpoint_transaction(s, colo_control)) {
goto out;
@@ -246,7 +324,26 @@ static void *colo_thread(void *opaque)
}
out:
- migrate_set_state(s, MIGRATION_STATUS_COLO, MIGRATION_STATUS_COMPLETED);
+ error_report("colo: some error happens in colo_thread");
+ /* Give users time (2s) to get involved in this verdict */
+ for (i = 0; i < 10; i++) {
+ if (failover_request_is_set()) {
+ error_report("Primary VM will take over work");
+ break;
+ }
+ usleep(200*1000);
+ }
+ qemu_mutex_lock_iothread();
+ if (!failover_request_is_set()) {
+ error_report("Primary VM will take over work in default");
+ failover_request_set();
+ }
+ qemu_mutex_unlock_iothread();
+
+ while (!failover_completed) {
+ ;
+ }
+ failover_request_clear();
qsb_free(colo_buffer);
colo_buffer = NULL;
@@ -317,7 +414,7 @@ void *colo_process_incoming_checkpoints(void *opaque)
QEMUFile *f = colo_in->file;
int fd = qemu_get_fd(f);
QEMUFile *ctl = NULL, *fb = NULL;
- int ret;
+ int i, ret;
uint64_t total_size;
colo = qemu_coroutine_self();
@@ -362,6 +459,10 @@ void *colo_process_incoming_checkpoints(void *opaque)
continue;
}
}
+ if (failover_request_is_set()) {
+ error_report("failover request");
+ goto out;
+ }
/* suspend guest */
qemu_mutex_lock_iothread();
@@ -431,6 +532,31 @@ void *colo_process_incoming_checkpoints(void *opaque)
}
out:
+ error_report("Detect some error or get a failover request");
+ /* Give users time (2s) to get involved in this verdict */
+ for (i = 0; i < 10; i++) {
+ if (failover_request_is_set()) {
+ error_report("Secondary VM will take over work");
+ break;
+ }
+ usleep(200*1000);
+ }
+ /* check flag again*/
+ if (!failover_request_is_set()) {
+ /*
+ * We assume that Primary VM is still alive according to heartbeat,
+ * just kill Secondary VM
+ */
+ error_report("SVM is going to exit in default!");
+ exit(1);
+ } else {
+ /* if we went here, means Primary VM may dead, we are doing failover */
+ while (!failover_completed) {
+ ;
+ }
+ failover_request_clear();
+ }
+
colo = NULL;
if (fb) {
diff --git a/trace-events b/trace-events
index af05a12..e262c8a 100644
--- a/trace-events
+++ b/trace-events
@@ -1472,6 +1472,7 @@ colo_info_load(const char *msg) "%s"
# migration/colo.c
colo_vm_state_change(const char *old, const char *new) "Change '%s' => '%s'"
colo_receive_message(const char *msg) "Receive '%s'"
+colo_do_failover(void) ""
# kvm-all.c
kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
--
1.7.12.4
- [Qemu-devel] [PATCH COLO-Frame v6 04/31] migration: Integrate COLO checkpoint process into migration, (continued)
- [Qemu-devel] [PATCH COLO-Frame v6 04/31] migration: Integrate COLO checkpoint process into migration, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 10/31] COLO RAM: Load PVM's dirty page into SVM's RAM cache temporarily, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 06/31] COLO: Implement colo checkpoint protocol, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 05/31] migration: Integrate COLO checkpoint process into loadvm, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 09/31] COLO: Save VM state to slave when do checkpoint, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 12/31] arch_init: Start to trace dirty pages of SVM, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 13/31] COLO RAM: Flush cached RAM into SVM's memory, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 07/31] COLO: Add a new RunState RUN_STATE_COLO, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 08/31] QEMUSizedBuffer: Introduce two help functions for qsb, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 11/31] COLO VMstate: Load VM state into qsb before restore it, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 15/31] COLO failover: Implement COLO primary/secondary vm failover work,
zhanghailiang <=
- [Qemu-devel] [PATCH COLO-Frame v6 02/31] migration: Introduce capability 'colo' to migration, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 14/31] COLO failover: Introduce a new command to trigger a failover, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 17/31] COLO failover: Don't do failover during loading VM's state, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 21/31] COLO NIC: Implement colo nic device interface configure(), zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 24/31] COLO: Handle nfnetlink message from proxy module, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 19/31] COLO NIC: Init/remove colo nic devices when add/cleanup tap devices, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 26/31] COLO: Improve checkpoint efficiency by do additional periodic checkpoint, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 25/31] COLO: Do checkpoint according to the result of packets comparation, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 20/31] tap: Make launch_script() public, zhanghailiang, 2015/06/18
- [Qemu-devel] [PATCH COLO-Frame v6 16/31] qmp event: Add event notification for COLO error, zhanghailiang, 2015/06/18