[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[RFC PATCH 8/9] virtiofsd: Ensure crash consistency after reconnection
From: |
Jiachen Zhang |
Subject: |
[RFC PATCH 8/9] virtiofsd: Ensure crash consistency after reconnection |
Date: |
Wed, 16 Dec 2020 00:21:18 +0800 |
Now the lo_maps are linked lists based on continuous arrays which are
shared between QEMU and virtiofsd. Elem adding, removing, or lo_map
growing all involve multi-step updating of the shared memory region.
So we need to maintain the fault-atomicity (or crash-consistency) issue
of these operations.
We could choose to use a redo / undo logging to handle this, but it is
rather complex and might sequentialize the concurrent virtiofsd request
handling. Therefore we employ a more relaxed scheme, specifically:
- Ensure code execution order of the codelines involving consistency.
We adjust some execution order to make sure that the lo_maps can be
normally used after reconnections.
- Relax some error handling. For example, lo_link, lo_symlink, lo_mkdir
and lo_mknod should reply success when errno of the underlying FS
syscalls (such as linkat and mknodat) is EEXIST; lo_rmdir, lo_unlink
and lo_rename should succeed (rather than EIO) when lookup_name returns
ENOENT error.
Signed-off-by: Jiachen Zhang <zhangjiachen.jaycee@bytedance.com>
Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
---
tools/virtiofsd/fuse_lowlevel.c | 3 +
tools/virtiofsd/fuse_virtio.c | 28 +++++++
tools/virtiofsd/passthrough_ll.c | 123 +++++++++++++++++++++++--------
3 files changed, 123 insertions(+), 31 deletions(-)
diff --git a/tools/virtiofsd/fuse_lowlevel.c b/tools/virtiofsd/fuse_lowlevel.c
index f3358561e2..ddd8667e16 100644
--- a/tools/virtiofsd/fuse_lowlevel.c
+++ b/tools/virtiofsd/fuse_lowlevel.c
@@ -2521,6 +2521,9 @@ void fuse_session_process_buf_int(struct fuse_session *se,
if (in->opcode == FUSE_WRITE && se->op.write_buf) {
do_write_buf(req, in->nodeid, &iter, bufv);
} else {
+ if (in->opcode == FUSE_FORGET || in->opcode == FUSE_BATCH_FORGET) {
+ virtio_send_msg(req->se, req->ch, NULL, 0);
+ }
fuse_ll_ops[in->opcode].func(req, in->nodeid, &iter);
}
diff --git a/tools/virtiofsd/fuse_virtio.c b/tools/virtiofsd/fuse_virtio.c
index 03b03ddacf..8e8e9f48f3 100644
--- a/tools/virtiofsd/fuse_virtio.c
+++ b/tools/virtiofsd/fuse_virtio.c
@@ -203,6 +203,30 @@ static void copy_iov(struct iovec *src_iov, int src_count,
}
}
+static int virtio_no_msg(struct fuse_session *se, struct fuse_chan *ch)
+{
+ FVRequest *req = container_of(ch, FVRequest, ch);
+ struct fv_QueueInfo *qi = ch->qi;
+ VuDev *dev = &se->virtio_dev->dev;
+ VuVirtq *q = vu_get_queue(dev, qi->qidx);
+ VuVirtqElement *elem = &req->elem;
+ int ret = 0;
+
+ fuse_log(FUSE_LOG_DEBUG, "%s: elem %d no reply sent\n", __func__,
+ elem->index);
+
+ pthread_rwlock_rdlock(&qi->virtio_dev->vu_dispatch_rwlock);
+ pthread_mutex_lock(&qi->vq_lock);
+ vu_queue_push(dev, q, elem, 0);
+ vu_queue_notify(dev, q);
+ pthread_mutex_unlock(&qi->vq_lock);
+ pthread_rwlock_unlock(&qi->virtio_dev->vu_dispatch_rwlock);
+
+ req->reply_sent = true;
+
+ return ret;
+}
+
/*
* Called back by ll whenever it wants to send a reply/message back
* The 1st element of the iov starts with the fuse_out_header
@@ -218,6 +242,10 @@ int virtio_send_msg(struct fuse_session *se, struct
fuse_chan *ch,
VuVirtqElement *elem = &req->elem;
int ret = 0;
+ if (iov == NULL && count == 0) {
+ return virtio_no_msg(se, ch);
+ }
+
assert(count >= 1);
assert(iov[0].iov_len >= sizeof(struct fuse_out_header));
diff --git a/tools/virtiofsd/passthrough_ll.c b/tools/virtiofsd/passthrough_ll.c
index 8fee635396..b5750ef179 100644
--- a/tools/virtiofsd/passthrough_ll.c
+++ b/tools/virtiofsd/passthrough_ll.c
@@ -462,17 +462,22 @@ static int lo_map_grow(map_t map_type, size_t new_nelems)
}
lo.maps[map_type]->elems[new_nelems - 1].freelist = -1;
- lo.maps[map_type]->freelist = lo.maps[map_type]->nelems;
- lo.maps[map_type]->nelems = new_nelems;
-
if (lo.reconnect && lo.connected) {
/* sync the lo_map growing with QEMU */
uint64_t map_size = sizeof(struct lo_map) +
sizeof(struct lo_map_elem) * new_nelems;
vu_slave_send_shm(virtio_get_dev(se), lo.maps[map_type]->memfd,
map_size, map_type);
+ /*
+ * A compiler barrier to ensure map_size is sent to QEMU
+ * before we update lo.maps[map_type]->nelems.
+ */
+ barrier();
}
+ lo.maps[map_type]->freelist = lo.maps[map_type]->nelems;
+ lo.maps[map_type]->nelems = new_nelems;
+
return 1;
}
@@ -554,11 +559,15 @@ static void lo_map_remove(struct lo_map *map, size_t key)
return;
}
+ /*
+ * Crash consistency: we first set refcount to -1 (not in use),
+ * to invalidate the slot immediately, that may only cause a waste
+ * of space when crashing, instead of inconsistency.
+ */
g_atomic_int_set(&elem->refcount, -1);
elem->freelist = map->freelist;
map->freelist = key;
-
}
/* Assumes lo->mutex is held */
@@ -571,10 +580,6 @@ static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
return -1;
}
- g_atomic_int_set(&elem->refcount, 1);
-
- elem->fd = fd;
-
ssize_t fh = ((unsigned long)elem -
(unsigned long)lo_data(req)->maps[FD_MAP]->elems) /
sizeof(struct lo_map_elem);
@@ -585,6 +590,9 @@ static ssize_t lo_add_fd_mapping(fuse_req_t req, int fd)
vu_slave_send_fd_add(virtio_get_dev(se), fd, key);
}
+ g_atomic_int_set(&elem->refcount, 1);
+ elem->fd = fd;
+
return ((unsigned long)elem -
(unsigned long)lo_data(req)->maps[FD_MAP]->elems) /
sizeof(struct lo_map_elem);
@@ -1039,27 +1047,27 @@ static int lo_do_lookup(fuse_req_t req, fuse_ino_t
parent, const char *name,
struct lo_map_elem *elem;
pthread_mutex_lock(&lo->mutex);
elem = lo_add_inode_mapping(req, inode);
-
- /*
- * One for the caller and one for nlookup (released in
- * unref_inode_lolocked())
- */
- g_atomic_int_set(&elem->refcount, 2);
-
free(inode);
inode = &elem->inode;
inode->fuse_ino = ((unsigned long)elem -
(unsigned long)lo_data(req)->maps[INO_MAP]->elems) /
sizeof(struct lo_map_elem);
- g_hash_table_insert(lo->inodes, &inode->key, inode);
- pthread_mutex_unlock(&lo->mutex);
if (lo->reconnect) {
/* Send the newly opened fd to QEMU */
int key = inode->fuse_ino << MAP_TYPE_SHIFT | INO_MAP;
vu_slave_send_fd_add(virtio_get_dev(se), newfd, key);
}
+
+ /*
+ * One for the caller and one for nlookup (released in
+ * unref_inode_lolocked())
+ */
+ g_atomic_int_set(&elem->refcount, 2);
+
+ g_hash_table_insert(lo->inodes, &inode->key, inode);
+ pthread_mutex_unlock(&lo->mutex);
}
e->ino = inode->fuse_ino;
lo_inode_put(lo, inode);
@@ -1202,7 +1210,9 @@ static void lo_mknod_symlink(fuse_req_t req, fuse_ino_t
parent,
lo_restore_cred(&old);
if (res == -1) {
- goto out;
+ if (!(errno == EEXIST && lo->reconnect)) {
+ goto out;
+ }
}
saverr = lo_do_lookup(req, parent, name, &e);
@@ -1272,7 +1282,9 @@ static void lo_link(fuse_req_t req, fuse_ino_t ino,
fuse_ino_t parent,
AT_SYMLINK_FOLLOW);
if (res == -1) {
- goto out_err;
+ if (!(errno == EEXIST && lo->reconnect)) {
+ goto out_err;
+ }
}
res = fstatat(inode->fd, "", &e.attr, AT_EMPTY_PATH | AT_SYMLINK_NOFOLLOW);
@@ -1337,13 +1349,22 @@ static void lo_rmdir(fuse_req_t req, fuse_ino_t parent,
const char *name)
inode = lookup_name(req, parent, name);
if (!inode) {
- fuse_reply_err(req, EIO);
- return;
+ if (lo->reconnect && errno == ENOENT) {
+ /*
+ * Because we crashed in this function, so do not perform unlink
and
+ * reply 0 to complete this request.
+ */
+ fuse_reply_err(req, 0);
+ } else {
+ fuse_reply_err(req, EIO);
+ }
+ goto out;
}
res = unlinkat(lo_fd(req, parent), name, AT_REMOVEDIR);
fuse_reply_err(req, res == -1 ? errno : 0);
+out:
unref_inode_lolocked(lo, inode, 1);
lo_inode_put(lo, inode);
}
@@ -1371,11 +1392,19 @@ static void lo_rename(fuse_req_t req, fuse_ino_t
parent, const char *name,
goto out;
}
- oldinode = lookup_name(req, parent, name);
newinode = lookup_name(req, newparent, newname);
+ oldinode = lookup_name(req, parent, name);
if (!oldinode) {
- fuse_reply_err(req, EIO);
+ if (lo->reconnect && errno == ENOENT) {
+ /*
+ * Because we crashed in this function, so do not perform unlink
and
+ * reply 0 to complete this request.
+ */
+ fuse_reply_err(req, 0);
+ } else {
+ fuse_reply_err(req, EIO);
+ }
goto out;
}
@@ -1419,13 +1448,22 @@ static void lo_unlink(fuse_req_t req, fuse_ino_t
parent, const char *name)
inode = lookup_name(req, parent, name);
if (!inode) {
- fuse_reply_err(req, EIO);
- return;
+ if (lo->reconnect && errno == ENOENT) {
+ /*
+ * Because we crashed in this function, so do not perform unlink
and
+ * reply 0 to complete this request.
+ */
+ fuse_reply_err(req, 0);
+ } else {
+ fuse_reply_err(req, EIO);
+ }
+ goto out;
}
res = unlinkat(lo_fd(req, parent), name, 0);
fuse_reply_err(req, res == -1 ? errno : 0);
+out:
unref_inode_lolocked(lo, inode, 1);
lo_inode_put(lo, inode);
}
@@ -1599,8 +1637,6 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
if (lo->cache == CACHE_ALWAYS) {
fi->cache_readdir = 1;
}
- g_atomic_int_set(&elem->refcount, 1); /* paired with lo_releasedir() */
- pthread_mutex_unlock(&lo->mutex);
if (lo->reconnect) {
/* Send the newly opened fd to QEMU */
@@ -1608,7 +1644,10 @@ static void lo_opendir(fuse_req_t req, fuse_ino_t ino,
vu_slave_send_fd_add(virtio_get_dev(se), fd, key);
}
+ g_atomic_int_set(&elem->refcount, 1); /* paired with lo_releasedir() */
+ pthread_mutex_unlock(&lo->mutex);
fuse_reply_open(req, fi);
+
return;
out_errno:
@@ -1768,9 +1807,15 @@ static void lo_releasedir(fuse_req_t req, fuse_ino_t ino,
d = &elem->dirp;
pthread_mutex_unlock(&lo->mutex);
- lo_dirp_put(d, fi->fh); /* paired with lo_opendir() */
-
+ /*
+ * Reply the request before releasing the lo_map slot. Because the
+ * fi->fh slot may be re-allocated to a valid elem right after the slot
+ * is released, reply after releaseing may cause double-removing after
+ * reconnection, and the valid elem may be mistakenly removed.
+ */
fuse_reply_err(req, 0);
+
+ lo_dirp_put(d, fi->fh); /* paired with lo_opendir() */
}
static void update_open_flags(int writeback, int allow_direct_io,
@@ -2084,11 +2129,14 @@ static void lo_release(fuse_req_t req, fuse_ino_t ino,
pthread_mutex_lock(&lo->mutex);
elem = lo_map_get(lo->maps[FD_MAP], fi->fh);
if (elem) {
+ /*
+ * Reply the request before releasing the lo_map slot. Because similar
+ * with the lo_releasedir case, we want to avoid false double-removing.
+ */
+ fuse_reply_err(req, 0);
lo_map_remove(lo->maps[FD_MAP], fi->fh);
}
pthread_mutex_unlock(&lo->mutex);
-
- fuse_reply_err(req, 0);
}
static void lo_flush(fuse_req_t req, fuse_ino_t ino, struct fuse_file_info *fi)
@@ -3159,6 +3207,19 @@ static bool vu_set_fs_map(VuDev *dev, VhostUserMsg *vmsg)
tmp_memfd, 0);
lo.maps[map_type]->memfd = tmp_memfd;
+ /*
+ * Crash consistency: Check if virtiofsd is crashed in an inconsistent
+ * status when growing, and fix it when needed.
+ */
+ cur_map_size = sizeof(struct lo_map) +
+ sizeof(struct lo_map_elem) * lo.maps[map_type]->nelems;
+ if (cur_map_size != vmsg->payload.shm.size) {
+ size_t new_nelems = (vmsg->payload.shm.size - cur_map_size) /
+ sizeof(struct lo_map_elem);
+ lo_map_grow(map_type, lo.maps[map_type]->nelems + new_nelems);
+
+ }
+
return false;
}
--
2.20.1
- [RFC PATCH 0/9] Support for Virtio-fs daemon crash reconnection, Jiachen Zhang, 2020/12/15
- [RFC PATCH 1/9] vhost-user-fs: Add support for reconnection of vhost-user-fs backend, Jiachen Zhang, 2020/12/15
- [RFC PATCH 2/9] vhost: Add vhost-user message types for sending shared memory and file fds, Jiachen Zhang, 2020/12/15
- [RFC PATCH 3/9] vhost-user-fs: Support virtiofsd crash reconnection, Jiachen Zhang, 2020/12/15
- [RFC PATCH 4/9] libvhost-user: Add vhost-user message types for sending shared memory and file fds, Jiachen Zhang, 2020/12/15
- [RFC PATCH 5/9] virtiofsd: Convert the struct lo_map array to a more flatten layout, Jiachen Zhang, 2020/12/15
- [RFC PATCH 6/9] virtiofsd: Add two new options for crash reconnection, Jiachen Zhang, 2020/12/15
- [RFC PATCH 7/9] virtiofsd: Persist/restore lo_map and opened fds to/from QEMU, Jiachen Zhang, 2020/12/15
- [RFC PATCH 8/9] virtiofsd: Ensure crash consistency after reconnection,
Jiachen Zhang <=
- [RFC PATCH 9/9] virtiofsd: (work around) Comment qsort in inflight I/O tracking, Jiachen Zhang, 2020/12/15
- Re: [RFC PATCH 0/9] Support for Virtio-fs daemon crash reconnection, no-reply, 2020/12/15
- Re: [RFC PATCH 0/9] Support for Virtio-fs daemon crash reconnection, Marc-André Lureau, 2020/12/16