[sheepdog] [PATCH] sheep: make create operations atomic
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Wed Sep 12 12:14:04 CEST 2012
Currently, CREATE_AND_WRITE operations are processed even if recovery
works of their target objects are on going (commit b3442e66). This
causes a problem in the following situation (and it acutually happens
with test 008):
1. There are three nodes, A, B, and C.
2. CREATE_AND_WRITE operations are sent to them, and at the same
time, a node D is added to cluster.
3. The node D recovers a object from node A, but the object is just
being created. The node D reads a zero filled object; it is
fallocated, but data is not written yet.
4. The CREATE_AND_WRITE operation is retried to node D, and it is
successfully finished.
5. The recovery process on node D overwrite the object with the zero
filled data.
Instead of adding another complex check in request_in_recovery, this
patch makes all CREATE_AND_WRITE operations atomic to keep code
simplicity. With this patch, store->write is splited into
store->write and store->create_and_write, and store->atomic_put is
removed.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
sheep/farm/farm.c | 4 ++--
sheep/ops.c | 26 +++++++++++++++++---------
sheep/plain_store.c | 24 +++++++++++-------------
sheep/recovery.c | 2 +-
sheep/request.c | 6 +++++-
sheep/sheep_priv.h | 9 +++++----
6 files changed, 41 insertions(+), 30 deletions(-)
diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index fb68c3a..86d9fb3 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -263,7 +263,7 @@ static int restore_objects_from_snap(uint32_t epoch)
}
io.length = h.size;
io.buf = buffer;
- ret = default_atomic_put(oid, &io);
+ ret = default_create_and_write(oid, &io);
if (ret != SD_RES_SUCCESS) {
eprintf("oid %"PRIx64" not restored\n", oid);
goto out;
@@ -319,10 +319,10 @@ struct store_driver farm = {
.name = "farm",
.init = farm_init,
.exist = default_exist,
+ .create_and_write = default_create_and_write,
.write = default_write,
.read = default_read,
.link = default_link,
- .atomic_put = default_atomic_put,
.end_recover = default_end_recover,
.snapshot = farm_snapshot,
.cleanup = default_cleanup,
diff --git a/sheep/ops.c b/sheep/ops.c
index 4542335..02f43b1 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -776,8 +776,18 @@ out:
return ret;
}
+static int do_create_and_write_obj(struct siocb *iocb, struct sd_req *hdr,
+ uint32_t epoch, void *data)
+{
+ iocb->buf = data;
+ iocb->length = hdr->data_length;
+ iocb->offset = hdr->obj.offset;
+
+ return sd_store->create_and_write(hdr->obj.oid, iocb);
+}
+
static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
- void *data, int create)
+ void *data)
{
uint64_t oid = hdr->obj.oid;
int ret = SD_RES_SUCCESS;
@@ -787,9 +797,7 @@ static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
iocb->length = hdr->data_length;
iocb->offset = hdr->obj.offset;
- if (is_vdi_obj(oid) && create)
- ret = sd_store->atomic_put(oid, iocb);
- else if (is_vdi_obj(oid) && sys->use_journal) {
+ if (is_vdi_obj(oid) && sys->use_journal) {
struct strbuf buf = STRBUF_INIT;
strbuf_addf(&buf, "%s%016" PRIx64, obj_path, oid);
@@ -799,11 +807,11 @@ static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
strbuf_release(&buf);
return SD_RES_EIO;
}
- ret = sd_store->write(oid, iocb, create);
+ ret = sd_store->write(oid, iocb);
jrnl_end(jd);
strbuf_release(&buf);
} else
- ret = sd_store->write(oid, iocb, create);
+ ret = sd_store->write(oid, iocb);
return ret;
}
@@ -817,7 +825,7 @@ int peer_write_obj(struct request *req)
memset(&iocb, 0, sizeof(iocb));
iocb.epoch = epoch;
iocb.flags = hdr->flags;
- return do_write_obj(&iocb, hdr, epoch, req->data, 0);
+ return do_write_obj(&iocb, hdr, epoch, req->data);
}
int peer_create_and_write_obj(struct request *req)
@@ -856,9 +864,9 @@ int peer_create_and_write_obj(struct request *req)
cow_hdr.data_length = SD_DATA_OBJ_SIZE;
cow_hdr.obj.offset = 0;
- ret = do_write_obj(&iocb, &cow_hdr, epoch, buf, 1);
+ ret = do_create_and_write_obj(&iocb, &cow_hdr, epoch, buf);
} else
- ret = do_write_obj(&iocb, hdr, epoch, req->data, 1);
+ ret = do_create_and_write_obj(&iocb, hdr, epoch, req->data);
if (SD_RES_SUCCESS == ret)
objlist_cache_insert(oid);
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 91ba527..0c543ef 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -123,9 +123,9 @@ static int err_to_sderr(uint64_t oid, int err)
return SD_RES_NO_OBJ;
}
-int default_write(uint64_t oid, struct siocb *iocb, int create)
+int default_write(uint64_t oid, struct siocb *iocb)
{
- int flags = get_open_flags(oid, create), fd, ret = SD_RES_SUCCESS;
+ int flags = get_open_flags(oid, false), fd, ret = SD_RES_SUCCESS;
char path[PATH_MAX];
ssize_t size;
@@ -141,11 +141,6 @@ int default_write(uint64_t oid, struct siocb *iocb, int create)
if (fd < 0)
return err_to_sderr(oid, errno);
- if (create && !(iocb->flags & SD_FLAG_CMD_COW)) {
- ret = prealloc(fd, get_objsize(oid));
- if (ret != SD_RES_SUCCESS)
- goto out;
- }
size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
if (size != iocb->length) {
eprintf("failed to write object %"PRIx64", path=%s, offset=%"
@@ -243,10 +238,7 @@ static int default_read_from_path(uint64_t oid, char *path,
return err_to_sderr(oid, errno);
size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
- if (size == 0) {
- /* the requested object is being created */
- ret = SD_RES_NO_OBJ;
- } else if (size != iocb->length) {
+ if (size != iocb->length) {
eprintf("failed to read object %"PRIx64", path=%s, offset=%"
PRId64", size=%"PRId32", result=%zd, %m\n", oid, path,
iocb->offset, iocb->length, size);
@@ -278,7 +270,7 @@ int default_read(uint64_t oid, struct siocb *iocb)
return ret;
}
-int default_atomic_put(uint64_t oid, struct siocb *iocb)
+int default_create_and_write(uint64_t oid, struct siocb *iocb)
{
char path[PATH_MAX], tmp_path[PATH_MAX];
int flags = get_open_flags(oid, true);
@@ -294,6 +286,12 @@ int default_atomic_put(uint64_t oid, struct siocb *iocb)
return SD_RES_EIO;
}
+ if (iocb->offset != 0 || iocb->length != get_objsize(oid)) {
+ ret = prealloc(fd, get_objsize(oid));
+ if (ret != SD_RES_SUCCESS)
+ goto out;
+ }
+
ret = xwrite(fd, iocb->buf, len);
if (ret != len) {
eprintf("failed to write object. %m\n");
@@ -470,10 +468,10 @@ struct store_driver plain_store = {
.name = "plain",
.init = default_init,
.exist = default_exist,
+ .create_and_write = default_create_and_write,
.write = default_write,
.read = default_read,
.link = default_link,
- .atomic_put = default_atomic_put,
.end_recover = default_end_recover,
.cleanup = default_cleanup,
.format = default_format,
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 1beb772..7d95cea 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -124,7 +124,7 @@ static int recover_object_from_replica(uint64_t oid,
iocb.epoch = epoch;
iocb.length = rlen;
iocb.buf = buf;
- ret = sd_store->atomic_put(oid, &iocb);
+ ret = sd_store->create_and_write(oid, &iocb);
if (ret != SD_RES_SUCCESS) {
ret = -1;
goto out;
diff --git a/sheep/request.c b/sheep/request.c
index ab9a9ef..31c2648 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -142,7 +142,11 @@ static int check_request_epoch(struct request *req)
static bool request_in_recovery(struct request *req)
{
- /* For CREATE request, we simply service it */
+ /*
+ * For CREATE request, we simply service it. CREATE operations are
+ * atomic, so it cannot happen for recover process to overwrite the
+ * created objects with the older data.
+ */
if (req->rq.opcode == SD_OP_CREATE_AND_WRITE_PEER ||
req->rq.opcode == SD_OP_CREATE_AND_WRITE_OBJ)
return false;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index bfc5b9d..341b530 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -148,13 +148,14 @@ struct store_driver {
const char *name;
int (*init)(char *path);
int (*exist)(uint64_t oid);
- int (*write)(uint64_t oid, struct siocb *, int create);
+ /* create_and_write must be an atomic operation*/
+ int (*create_and_write)(uint64_t oid, struct siocb *);
+ int (*write)(uint64_t oid, struct siocb *);
int (*read)(uint64_t oid, struct siocb *);
int (*format)(void);
int (*remove_object)(uint64_t oid);
/* Operations in recovery */
int (*link)(uint64_t oid, struct siocb *, uint32_t tgt_epoch);
- int (*atomic_put)(uint64_t oid, struct siocb *);
int (*begin_recover)(struct siocb *);
int (*end_recover)(uint32_t epoch, struct vnode_info *old_vnode_info);
int (*purge_obj)(void);
@@ -168,10 +169,10 @@ struct store_driver {
int default_init(char *p);
int default_exist(uint64_t oid);
-int default_write(uint64_t oid, struct siocb *iocb, int create);
+int default_create_and_write(uint64_t oid, struct siocb *iocb);
+int default_write(uint64_t oid, struct siocb *iocb);
int default_read(uint64_t oid, struct siocb *iocb);
int default_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch);
-int default_atomic_put(uint64_t oid, struct siocb *iocb);
int default_end_recover(uint32_t old_epoch, struct vnode_info *old_vnode_info);
int default_cleanup(void);
int default_format(void);
--
1.7.2.5
More information about the sheepdog
mailing list