[sheepdog] [PATCH] sheepdog: fix overlapping metadata update
Liu Yuan
namei.unix at gmail.com
Wed Jul 29 06:02:35 CEST 2015
From: Liu Yuan <liuyuan at cmss.chinamobile.com>
Current sheepdog driver use a range update_inode(min_idx, max_idx) for batching
the updates. But there is subtle problem by determining min_idx and max_idx:
for a single create request, min_idx == max_idx, so actually we just update one
one bit as expected.
Suppose we have 2 create request, create(10) and create(20), then min == 10,
max==20 even though we just need to update index 10 and index 20, update_inode(10,20)
will actually update range from 10 to 20. This would work if all the update_inode()
requests won't overlap. But unfortunately, this is not true for some corner case.
So the problem arise as following:
req 1: update_inode(10,20)
req 2: update_inode(15,22)
req 1 and req 2 might have different value between [15,20] and cause problems
and can be illustrated as following by adding a printf in sd_write_done:
@@ -1976,6 +1976,7 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
mn = s->min_dirty_data_idx;
mx = s->max_dirty_data_idx;
+ printf("min %u, max %u\n", mn, mx);
if (mn <= mx) {
/* we need to update the vdi object. */
offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
...
min 4294967295, max 0
min 9221, max 9222
min 9224, max 9728
min 9223, max 9223
min 9729, max 9730
min 9731, max 9732
min 9733, max 9734
min 9736, max 10240
min 9735, max 10241
...
Every line represents a update_inode(min, max) last 2 lines show 2 requests
actually overlap while I ran mkfs.ext4 on a sheepdog volume. The overlapped
requests might be reordered via network and cause inode[idx] back to 0 after
being set by last request. Then a wrong remove request will be executed by sheep
internally and accidentally remove a victim object, which is reported at:
https://bugs.launchpad.net/sheepdog-project/+bug/1456421
The fix is simple that we just update inode one by one for aio_req. Since
aio_req is never overlapped, we'll have inode update never overlapped.
Cc: Jeff Cody <jcody at redhat.com>
Cc: Kevin Wolf <kwolf at redhat.com>
Cc: Stefan Hajnoczi <stefanha at redhat.com>
Cc: Teruaki Ishizaki <ishizaki.teruaki at lab.ntt.co.jp>
Cc: Vasiliy Tolstov <v.tolstov at selfip.ru>
Cc: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
---
block/sheepdog.c | 77 ++++++++++++++++----------------------------------------
1 file changed, 22 insertions(+), 55 deletions(-)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index bd7cbed..d1eeb81 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -342,9 +342,6 @@ typedef struct BDRVSheepdogState {
SheepdogInode inode;
- uint32_t min_dirty_data_idx;
- uint32_t max_dirty_data_idx;
-
char name[SD_MAX_VDI_LEN];
bool is_snapshot;
uint32_t cache_flags;
@@ -782,6 +779,26 @@ static coroutine_fn void reconnect_to_sdog(void *opaque)
}
}
+static void update_inode(BDRVSheepdogState *s, AIOReq *aio_req)
+{
+ struct iovec iov;
+ uint32_t offset, data_len;
+ SheepdogAIOCB *acb = aio_req->aiocb;
+ int idx = data_oid_to_idx(aio_req->oid);
+
+ offset = SD_INODE_HEADER_SIZE + sizeof(uint32_t) * idx;
+ data_len = sizeof(uint32_t);
+
+ iov.iov_base = &s->inode;
+ iov.iov_len = sizeof(s->inode);
+ aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
+ data_len, offset, 0, false, 0, offset);
+ QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
+ add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
+
+ return;
+}
+
/*
* Receive responses of the I/O requests.
*
@@ -820,25 +837,15 @@ static void coroutine_fn aio_read_response(void *opaque)
switch (acb->aiocb_type) {
case AIOCB_WRITE_UDATA:
- /* this coroutine context is no longer suitable for co_recv
- * because we may send data to update vdi objects */
- s->co_recv = NULL;
if (!is_data_obj(aio_req->oid)) {
break;
}
idx = data_oid_to_idx(aio_req->oid);
if (aio_req->create) {
- /*
- * If the object is newly created one, we need to update
- * the vdi object (metadata object). min_dirty_data_idx
- * and max_dirty_data_idx are changed to include updated
- * index between them.
- */
if (rsp.result == SD_RES_SUCCESS) {
s->inode.data_vdi_id[idx] = s->inode.vdi_id;
- s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx);
- s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx);
+ update_inode(s, aio_req);
}
/*
* Some requests may be blocked because simultaneous
@@ -1518,8 +1525,6 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags,
}
memcpy(&s->inode, buf, sizeof(s->inode));
- s->min_dirty_data_idx = UINT32_MAX;
- s->max_dirty_data_idx = 0;
bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE;
pstrcpy(s->name, sizeof(s->name), vdi);
@@ -1962,44 +1967,6 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset)
return ret;
}
-/*
- * This function is called after writing data objects. If we need to
- * update metadata, this sends a write request to the vdi object.
- * Otherwise, this switches back to sd_co_readv/writev.
- */
-static void coroutine_fn sd_write_done(SheepdogAIOCB *acb)
-{
- BDRVSheepdogState *s = acb->common.bs->opaque;
- struct iovec iov;
- AIOReq *aio_req;
- uint32_t offset, data_len, mn, mx;
-
- mn = s->min_dirty_data_idx;
- mx = s->max_dirty_data_idx;
- if (mn <= mx) {
- /* we need to update the vdi object. */
- offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) +
- mn * sizeof(s->inode.data_vdi_id[0]);
- data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]);
-
- s->min_dirty_data_idx = UINT32_MAX;
- s->max_dirty_data_idx = 0;
-
- iov.iov_base = &s->inode;
- iov.iov_len = sizeof(s->inode);
- aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id),
- data_len, offset, 0, false, 0, offset);
- QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings);
- add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA);
-
- acb->aio_done_func = sd_finish_aiocb;
- acb->aiocb_type = AIOCB_WRITE_UDATA;
- return;
- }
-
- sd_finish_aiocb(acb);
-}
-
/* Delete current working VDI on the snapshot chain */
static bool sd_delete(BDRVSheepdogState *s)
{
@@ -2231,7 +2198,7 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num,
}
acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors);
- acb->aio_done_func = sd_write_done;
+ acb->aio_done_func = sd_finish_aiocb;
acb->aiocb_type = AIOCB_WRITE_UDATA;
ret = sd_co_rw_vector(acb);
--
1.9.1
More information about the sheepdog
mailing list