[sheepdog] [PATCH] sheep: make create operations atomic

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Wed Sep 12 12:14:04 CEST 2012


Currently, CREATE_AND_WRITE operations are processed even if recovery
works of their target objects are on going (commit b3442e66).  This
causes a problem in the following situation (and it acutually happens
with test 008):

  1. There are three nodes, A, B, and C.
  2. CREATE_AND_WRITE operations are sent to them, and at the same
     time, a node D is added to cluster.
  3. The node D recovers a object from node A, but the object is just
     being created.  The node D reads a zero filled object; it is
     fallocated, but data is not written yet.
  4. The CREATE_AND_WRITE operation is retried to node D, and it is
     successfully finished.
  5. The recovery process on node D overwrite the object with the zero
     filled data.

Instead of adding another complex check in request_in_recovery, this
patch makes all CREATE_AND_WRITE operations atomic to keep code
simplicity.  With this patch, store->write is splited into
store->write and store->create_and_write, and store->atomic_put is
removed.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/farm/farm.c   |    4 ++--
 sheep/ops.c         |   26 +++++++++++++++++---------
 sheep/plain_store.c |   24 +++++++++++-------------
 sheep/recovery.c    |    2 +-
 sheep/request.c     |    6 +++++-
 sheep/sheep_priv.h  |    9 +++++----
 6 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index fb68c3a..86d9fb3 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -263,7 +263,7 @@ static int restore_objects_from_snap(uint32_t epoch)
 		}
 		io.length = h.size;
 		io.buf = buffer;
-		ret = default_atomic_put(oid, &io);
+		ret = default_create_and_write(oid, &io);
 		if (ret != SD_RES_SUCCESS) {
 			eprintf("oid %"PRIx64" not restored\n", oid);
 			goto out;
@@ -319,10 +319,10 @@ struct store_driver farm = {
 	.name = "farm",
 	.init = farm_init,
 	.exist = default_exist,
+	.create_and_write = default_create_and_write,
 	.write = default_write,
 	.read = default_read,
 	.link = default_link,
-	.atomic_put = default_atomic_put,
 	.end_recover = default_end_recover,
 	.snapshot = farm_snapshot,
 	.cleanup = default_cleanup,
diff --git a/sheep/ops.c b/sheep/ops.c
index 4542335..02f43b1 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -776,8 +776,18 @@ out:
 	return ret;
 }
 
+static int do_create_and_write_obj(struct siocb *iocb, struct sd_req *hdr,
+				   uint32_t epoch, void *data)
+{
+	iocb->buf = data;
+	iocb->length = hdr->data_length;
+	iocb->offset = hdr->obj.offset;
+
+	return sd_store->create_and_write(hdr->obj.oid, iocb);
+}
+
 static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
-		void *data, int create)
+			void *data)
 {
 	uint64_t oid = hdr->obj.oid;
 	int ret = SD_RES_SUCCESS;
@@ -787,9 +797,7 @@ static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
 	iocb->length = hdr->data_length;
 	iocb->offset = hdr->obj.offset;
 
-	if (is_vdi_obj(oid) && create)
-		ret = sd_store->atomic_put(oid, iocb);
-	else if (is_vdi_obj(oid) && sys->use_journal) {
+	if (is_vdi_obj(oid) && sys->use_journal) {
 		struct strbuf buf = STRBUF_INIT;
 
 		strbuf_addf(&buf, "%s%016" PRIx64, obj_path, oid);
@@ -799,11 +807,11 @@ static int do_write_obj(struct siocb *iocb, struct sd_req *hdr, uint32_t epoch,
 			strbuf_release(&buf);
 			return SD_RES_EIO;
 		}
-		ret = sd_store->write(oid, iocb, create);
+		ret = sd_store->write(oid, iocb);
 		jrnl_end(jd);
 		strbuf_release(&buf);
 	} else
-		ret = sd_store->write(oid, iocb, create);
+		ret = sd_store->write(oid, iocb);
 
 	return ret;
 }
@@ -817,7 +825,7 @@ int peer_write_obj(struct request *req)
 	memset(&iocb, 0, sizeof(iocb));
 	iocb.epoch = epoch;
 	iocb.flags = hdr->flags;
-	return do_write_obj(&iocb, hdr, epoch, req->data, 0);
+	return do_write_obj(&iocb, hdr, epoch, req->data);
 }
 
 int peer_create_and_write_obj(struct request *req)
@@ -856,9 +864,9 @@ int peer_create_and_write_obj(struct request *req)
 		cow_hdr.data_length = SD_DATA_OBJ_SIZE;
 		cow_hdr.obj.offset = 0;
 
-		ret = do_write_obj(&iocb, &cow_hdr, epoch, buf, 1);
+		ret = do_create_and_write_obj(&iocb, &cow_hdr, epoch, buf);
 	} else
-		ret = do_write_obj(&iocb, hdr, epoch, req->data, 1);
+		ret = do_create_and_write_obj(&iocb, hdr, epoch, req->data);
 
 	if (SD_RES_SUCCESS == ret)
 		objlist_cache_insert(oid);
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 91ba527..0c543ef 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -123,9 +123,9 @@ static int err_to_sderr(uint64_t oid, int err)
 	return SD_RES_NO_OBJ;
 }
 
-int default_write(uint64_t oid, struct siocb *iocb, int create)
+int default_write(uint64_t oid, struct siocb *iocb)
 {
-	int flags = get_open_flags(oid, create), fd, ret = SD_RES_SUCCESS;
+	int flags = get_open_flags(oid, false), fd, ret = SD_RES_SUCCESS;
 	char path[PATH_MAX];
 	ssize_t size;
 
@@ -141,11 +141,6 @@ int default_write(uint64_t oid, struct siocb *iocb, int create)
 	if (fd < 0)
 		return err_to_sderr(oid, errno);
 
-	if (create && !(iocb->flags & SD_FLAG_CMD_COW)) {
-		ret = prealloc(fd, get_objsize(oid));
-		if (ret != SD_RES_SUCCESS)
-			goto out;
-	}
 	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
 	if (size != iocb->length) {
 		eprintf("failed to write object %"PRIx64", path=%s, offset=%"
@@ -243,10 +238,7 @@ static int default_read_from_path(uint64_t oid, char *path,
 		return err_to_sderr(oid, errno);
 
 	size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
-	if (size == 0) {
-		/* the requested object is being created */
-		ret = SD_RES_NO_OBJ;
-	} else if (size != iocb->length) {
+	if (size != iocb->length) {
 		eprintf("failed to read object %"PRIx64", path=%s, offset=%"
 			PRId64", size=%"PRId32", result=%zd, %m\n", oid, path,
 			iocb->offset, iocb->length, size);
@@ -278,7 +270,7 @@ int default_read(uint64_t oid, struct siocb *iocb)
 	return ret;
 }
 
-int default_atomic_put(uint64_t oid, struct siocb *iocb)
+int default_create_and_write(uint64_t oid, struct siocb *iocb)
 {
 	char path[PATH_MAX], tmp_path[PATH_MAX];
 	int flags = get_open_flags(oid, true);
@@ -294,6 +286,12 @@ int default_atomic_put(uint64_t oid, struct siocb *iocb)
 		return SD_RES_EIO;
 	}
 
+	if (iocb->offset != 0 || iocb->length != get_objsize(oid)) {
+		ret = prealloc(fd, get_objsize(oid));
+		if (ret != SD_RES_SUCCESS)
+			goto out;
+	}
+
 	ret = xwrite(fd, iocb->buf, len);
 	if (ret != len) {
 		eprintf("failed to write object. %m\n");
@@ -470,10 +468,10 @@ struct store_driver plain_store = {
 	.name = "plain",
 	.init = default_init,
 	.exist = default_exist,
+	.create_and_write = default_create_and_write,
 	.write = default_write,
 	.read = default_read,
 	.link = default_link,
-	.atomic_put = default_atomic_put,
 	.end_recover = default_end_recover,
 	.cleanup = default_cleanup,
 	.format = default_format,
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 1beb772..7d95cea 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -124,7 +124,7 @@ static int recover_object_from_replica(uint64_t oid,
 		iocb.epoch = epoch;
 		iocb.length = rlen;
 		iocb.buf = buf;
-		ret = sd_store->atomic_put(oid, &iocb);
+		ret = sd_store->create_and_write(oid, &iocb);
 		if (ret != SD_RES_SUCCESS) {
 			ret = -1;
 			goto out;
diff --git a/sheep/request.c b/sheep/request.c
index ab9a9ef..31c2648 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -142,7 +142,11 @@ static int check_request_epoch(struct request *req)
 static bool request_in_recovery(struct request *req)
 {
 
-	/* For CREATE request, we simply service it */
+	/*
+	 * For CREATE request, we simply service it.  CREATE operations are
+	 * atomic, so it cannot happen for recover process to overwrite the
+	 * created objects with the older data.
+	 */
 	if (req->rq.opcode == SD_OP_CREATE_AND_WRITE_PEER ||
 	    req->rq.opcode == SD_OP_CREATE_AND_WRITE_OBJ)
 		return false;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index bfc5b9d..341b530 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -148,13 +148,14 @@ struct store_driver {
 	const char *name;
 	int (*init)(char *path);
 	int (*exist)(uint64_t oid);
-	int (*write)(uint64_t oid, struct siocb *, int create);
+	/* create_and_write must be an atomic operation*/
+	int (*create_and_write)(uint64_t oid, struct siocb *);
+	int (*write)(uint64_t oid, struct siocb *);
 	int (*read)(uint64_t oid, struct siocb *);
 	int (*format)(void);
 	int (*remove_object)(uint64_t oid);
 	/* Operations in recovery */
 	int (*link)(uint64_t oid, struct siocb *, uint32_t tgt_epoch);
-	int (*atomic_put)(uint64_t oid, struct siocb *);
 	int (*begin_recover)(struct siocb *);
 	int (*end_recover)(uint32_t epoch, struct vnode_info *old_vnode_info);
 	int (*purge_obj)(void);
@@ -168,10 +169,10 @@ struct store_driver {
 
 int default_init(char *p);
 int default_exist(uint64_t oid);
-int default_write(uint64_t oid, struct siocb *iocb, int create);
+int default_create_and_write(uint64_t oid, struct siocb *iocb);
+int default_write(uint64_t oid, struct siocb *iocb);
 int default_read(uint64_t oid, struct siocb *iocb);
 int default_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch);
-int default_atomic_put(uint64_t oid, struct siocb *iocb);
 int default_end_recover(uint32_t old_epoch, struct vnode_info *old_vnode_info);
 int default_cleanup(void);
 int default_format(void);
-- 
1.7.2.5




More information about the sheepdog mailing list