[Sheepdog] [PATCH 2/2] make vdi setattr atomic

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Fri Oct 14 07:59:01 CEST 2011


At Fri, 14 Oct 2011 02:42:38 +0900,
MORITA Kazutaka wrote:
> 
> 'collie vdi setattr' runs the following two:
> 
>  - allocates a vdi attr object id
>  - writes an attribute data to the object
> 
> So a race can happen between these two operations.
> 
> With this patch, 'collie vdi setattr' sends the attribute data when
> allocating the object id to write the attribute atomically.
> 
> Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> ---
>  collie/vdi.c    |   47 ++++++++++++++++++++---------------------------
>  include/sheep.h |   11 +++++++----
>  sheep/group.c   |    3 ++-
>  sheep/sdnet.c   |    9 +++++----
>  sheep/store.c   |    5 +++--
>  sheep/vdi.c     |   43 ++++++++++++++++++++++++++++++-------------
>  6 files changed, 67 insertions(+), 51 deletions(-)

Oops, sorry, I forgot to add sheep/sheep_priv.h.  Fixed version is
below.

==
From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Subject: [PATCH] make vdi setattr atomic

'collie vdi setattr' runs the following two:

 - allocates a vdi attr object id
 - writes an attribute data to the object

So a race can happen between these two operations.

With this patch, 'collie vdi setattr' sends the attribute data when
allocating the object id to write the attribute atomically.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 collie/vdi.c       |   47 ++++++++++++++++++++---------------------------
 include/sheep.h    |   11 +++++++----
 sheep/group.c      |    3 ++-
 sheep/sdnet.c      |    9 +++++----
 sheep/sheep_priv.h |    7 ++++---
 sheep/store.c      |    5 +++--
 sheep/vdi.c        |   43 ++++++++++++++++++++++++++++++-------------
 7 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/collie/vdi.c b/collie/vdi.c
index 245a6e2..6ac73cf 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -784,8 +784,9 @@ static int vdi_object(int argc, char **argv)
 }
 
 static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
-			     char *key, uint32_t *vid, uint64_t *oid,
-			     unsigned int *nr_copies, int creat, int excl)
+			     char *key, void *value, unsigned int value_len,
+			     uint32_t *vid, uint64_t *oid, unsigned int *nr_copies,
+			     int creat, int excl, int delete)
 {
 	struct sd_vdi_req hdr;
 	struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr;
@@ -793,7 +794,7 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
 	unsigned int wlen, rlen;
 	struct sheepdog_vdi_attr *vattr;
 
-	vattr = zalloc(SD_ATTR_HEADER_SIZE);
+	vattr = zalloc(SD_ATTR_HEADER_SIZE + value_len);
 	if (!vattr)
 		return SD_RES_NO_MEM;
 
@@ -801,6 +802,8 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
 	strncpy(vattr->tag, vdi_cmd_data.snapshot_tag, SD_MAX_VDI_TAG_LEN);
 	vattr->snap_id = vdi_cmd_data.snapshot_id;
 	strncpy(vattr->key, key, SD_MAX_VDI_ATTR_KEY_LEN);
+	if (value && value_len)
+		memcpy(vattr->value, value, value_len);
 
 	fd = connect_to(sdhost, sdport);
 	if (fd < 0) {
@@ -810,7 +813,7 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
 
 	memset(&hdr, 0, sizeof(hdr));
 	hdr.opcode = SD_OP_GET_VDI_ATTR;
-	wlen = SD_ATTR_HEADER_SIZE;
+	wlen = SD_ATTR_HEADER_SIZE + value_len;
 	rlen = 0;
 	hdr.proto_ver = SD_PROTO_VER;
 	hdr.data_length = wlen;
@@ -820,6 +823,8 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
 		hdr.flags |= SD_FLAG_CMD_CREAT;
 	if (excl)
 		hdr.flags |= SD_FLAG_CMD_EXCL;
+	if (delete)
+		hdr.flags |= SD_FLAG_CMD_DEL;
 
 	ret = exec_req(fd, (struct sd_req *)&hdr, vattr, &wlen, &rlen);
 	if (ret) {
@@ -844,8 +849,8 @@ out:
 
 static int vdi_setattr(int argc, char **argv)
 {
-	int ret;
-	uint64_t oid, attr_oid = 0;
+	int ret, value_len = 0;
+	uint64_t attr_oid = 0;
 	uint32_t vid = 0, nr_copies = 0;
 	char *vdiname = argv[optind++], *key, *value;
 	uint64_t offset;
@@ -878,10 +883,14 @@ reread:
 		}
 	}
 
+	if (value)
+		value_len = strlen(value);
+
 	ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag,
-				vdi_cmd_data.snapshot_id, key, &vid, &attr_oid,
+				vdi_cmd_data.snapshot_id, key, value,
+				value_len, &vid, &attr_oid,
 				&nr_copies, !vdi_cmd_data.delete,
-				vdi_cmd_data.exclusive);
+				vdi_cmd_data.exclusive, vdi_cmd_data.delete);
 	if (ret) {
 		if (ret == SD_RES_VDI_EXIST) {
 			fprintf(stderr, "the attribute already exists, %s\n", key);
@@ -893,27 +902,11 @@ reread:
 			fprintf(stderr, "vdi not found\n");
 			return EXIT_MISSING;
 		} else
-			fprintf(stderr, "failed to find attr oid, %s\n",
+			fprintf(stderr, "failed to set attr, %s\n",
 				sd_strerror(ret));
 		return EXIT_FAILURE;
 	}
 
-	oid = attr_oid;
-
-	if (vdi_cmd_data.delete)
-		ret = sd_write_object(oid, 0, (char *)"", 1,
-				      offsetof(struct sheepdog_inode, name), 0,
-				      nr_copies, 0);
-	else
-		ret = sd_write_object(oid, 0, value, strlen(value),
-				      SD_ATTR_HEADER_SIZE, SD_FLAG_CMD_TRUNCATE,
-				      nr_copies, 0);
-
-	if (ret != SD_RES_SUCCESS) {
-		fprintf(stderr, "failed to set attribute\n");
-		return EXIT_FAILURE;
-	}
-
 	return EXIT_SUCCESS;
 }
 
@@ -931,8 +924,8 @@ static int vdi_getattr(int argc, char **argv)
 	}
 
 	ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag,
-				vdi_cmd_data.snapshot_id, key, &vid, &attr_oid,
-				&nr_copies, 0, 0);
+				vdi_cmd_data.snapshot_id, key, NULL, 0,
+				&vid, &attr_oid, &nr_copies, 0, 0, 0);
 	if (ret == SD_RES_NO_OBJ) {
 		fprintf(stderr, "no such attribute, %s\n", key);
 		return EXIT_MISSING;
diff --git a/include/sheep.h b/include/sheep.h
index 31516d9..3c2d799 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -37,10 +37,13 @@
 #define SD_OP_KILL_NODE      0x88
 #define SD_OP_GET_VDI_ATTR   0x89
 
-#define SD_FLAG_CMD_DIRECT   0x10
-#define SD_FLAG_CMD_RECOVERY 0x20
-#define SD_FLAG_CMD_CREAT    0x40
-#define SD_FLAG_CMD_EXCL     0x80
+#define SD_FLAG_CMD_DIRECT   0x0010
+#define SD_FLAG_CMD_RECOVERY 0x0020
+
+/* flags for vdi attribute operations */
+#define SD_FLAG_CMD_CREAT    0x0100
+#define SD_FLAG_CMD_EXCL     0x0200
+#define SD_FLAG_CMD_DEL      0x0400
 
 #define SD_RES_OLD_NODE_VER  0x41 /* Remote node has an old epoch */
 #define SD_RES_NEW_NODE_VER  0x42 /* Remote node has a new epoch */
diff --git a/sheep/group.c b/sheep/group.c
index de3333e..8e9266f 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -898,7 +898,8 @@ static void vdi_op(struct vdi_op_message *msg)
 		ret = get_vdi_attr(hdr->epoch, data, hdr->data_length, vid,
 				   &attrid, nr_copies, ctime,
 				   hdr->flags & SD_FLAG_CMD_CREAT,
-				   hdr->flags & SD_FLAG_CMD_EXCL);
+				   hdr->flags & SD_FLAG_CMD_EXCL,
+				   hdr->flags & SD_FLAG_CMD_DEL);
 		break;
 	case SD_OP_RELEASE_VDI:
 		break;
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index db2e691..eb375cb 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -629,7 +629,7 @@ int create_listen_port(int port, void *data)
 int write_object(struct sheepdog_vnode_list_entry *e,
 		 int vnodes, int zones, uint32_t node_version,
 		 uint64_t oid, char *data, unsigned int datalen,
-		 uint64_t offset, int nr, int create)
+		 uint64_t offset, uint16_t flags, int nr, int create)
 {
 	struct sd_obj_req hdr;
 	int i, n, fd, ret, success = 0;
@@ -644,8 +644,8 @@ int write_object(struct sheepdog_vnode_list_entry *e,
 		n = obj_to_sheep(e, vnodes, oid, i);
 
 		if (is_myself(e[n].addr, e[n].port)) {
-			ret = write_object_local(oid, data, datalen, offset, nr,
-						 node_version, create);
+			ret = write_object_local(oid, data, datalen, offset,
+						 flags, nr, node_version, create);
 
 			if (ret != 0)
 				eprintf("fail %"PRIx64" %"PRIx32"\n", oid, ret);
@@ -673,7 +673,8 @@ int write_object(struct sheepdog_vnode_list_entry *e,
 		hdr.oid = oid;
 		hdr.copies = nr;
 
-		hdr.flags = SD_FLAG_CMD_WRITE | SD_FLAG_CMD_DIRECT;
+		hdr.flags = flags;
+		hdr.flags |= SD_FLAG_CMD_WRITE | SD_FLAG_CMD_DIRECT;
 		hdr.data_length = wlen;
 		hdr.offset = offset;
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 6ea03d5..45c5903 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -181,7 +181,7 @@ int read_vdis(char *data, int len, unsigned int *rsp_len);
 
 int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 		 uint32_t vid, uint32_t *attrid, int copies, uint64_t ctime,
-		 int creat, int excl);
+		 int write, int excl, int delete);
 
 int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries);
 void setup_ordered_sd_vnode_list(struct request *req);
@@ -199,7 +199,8 @@ int leave_cluster(void);
 void start_cpg_event_work(void);
 void store_queue_request(struct work *work, int idx);
 int write_object_local(uint64_t oid, char *data, unsigned int datalen,
-		       uint64_t offset, int copies, uint32_t epoch, int create);
+		       uint64_t offset, uint16_t flags, int copies,
+		       uint32_t epoch, int create);
 int read_object_local(uint64_t oid, char *data, unsigned int datalen,
 		      uint64_t offset, int copies, uint32_t epoch);
 
@@ -230,7 +231,7 @@ int is_recoverying_oid(uint64_t oid);
 int write_object(struct sheepdog_vnode_list_entry *e,
 		 int vnodes, int zones, uint32_t node_version,
 		 uint64_t oid, char *data, unsigned int datalen,
-		 uint64_t offset, int nr, int create);
+		 uint64_t offset, uint16_t flags, int nr, int create);
 int read_object(struct sheepdog_vnode_list_entry *e,
 		int vnodes, int zones, uint32_t node_version,
 		uint64_t oid, char *data, unsigned int datalen,
diff --git a/sheep/store.c b/sheep/store.c
index 12e54d9..57654ff 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -490,7 +490,8 @@ int update_epoch_store(uint32_t epoch)
 }
 
 int write_object_local(uint64_t oid, char *data, unsigned int datalen,
-		       uint64_t offset, int copies, uint32_t epoch, int create)
+		       uint64_t offset, uint16_t flags, int copies,
+		       uint32_t epoch, int create)
 {
 	int ret;
 	struct request *req;
@@ -507,7 +508,7 @@ int write_object_local(uint64_t oid, char *data, unsigned int datalen,
 	else
 		hdr->opcode = SD_OP_WRITE_OBJ;
 	hdr->copies = copies;
-	hdr->flags = SD_FLAG_CMD_WRITE;
+	hdr->flags = flags | SD_FLAG_CMD_WRITE;
 	hdr->offset = offset;
 	hdr->data_length = datalen;
 	req->data = data;
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 87a63c9..b192dcb 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -124,7 +124,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	if (is_snapshot && cur_vid != base_vid) {
 		ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 				   vid_to_vdi_oid(cur_vid), (char *)cur,
-				   SD_INODE_HEADER_SIZE, 0, copies, 0);
+				   SD_INODE_HEADER_SIZE, 0, 0, copies, 0);
 		if (ret != 0) {
 			vprintf(SDOG_ERR "failed\n");
 			ret = SD_RES_BASE_VDI_READ;
@@ -135,7 +135,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	if (base_vid) {
 		ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 				   vid_to_vdi_oid(base_vid), (char *)base,
-				   SD_INODE_HEADER_SIZE, 0, copies, 0);
+				   SD_INODE_HEADER_SIZE, 0, 0, copies, 0);
 		if (ret != 0) {
 			vprintf(SDOG_ERR "failed\n");
 			ret = SD_RES_BASE_VDI_WRITE;
@@ -145,7 +145,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 
 	ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 			   vid_to_vdi_oid(new_vid), (char *)new, sizeof(*new),
-			   0, copies, 1);
+			   0, 0, copies, 1);
 	if (ret != 0)
 		ret = SD_RES_VDI_WRITE;
 out:
@@ -390,7 +390,7 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 
 	ret = write_object(entries, nr_vnodes, nr_zones, epoch,
 			   vid_to_vdi_oid(*vid), (char *)inode,
-			   SD_INODE_HEADER_SIZE, 0, nr_reqs, 0);
+			   SD_INODE_HEADER_SIZE, 0, 0, nr_reqs, 0);
 	if (ret != 0) {
 		ret = SD_RES_EIO;
 		goto out;
@@ -654,13 +654,14 @@ err:
 
 int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 		 uint32_t vid, uint32_t *attrid, int copies, uint64_t ctime,
-		 int creat, int excl)
+		 int write, int excl, int delete)
 {
 	struct sheepdog_vnode_list_entry *entries;
 	struct sheepdog_vdi_attr tmp_attr;
 	uint64_t oid;
 	uint32_t end;
 	int ret, nr_zones, nr_vnodes;
+	int value_len;
 
 	entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
 	if (!entries) {
@@ -669,16 +670,13 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 		goto out;
 	}
 
-	if (data_len != SD_ATTR_HEADER_SIZE) {
-		ret = SD_RES_INVALID_PARMS;
-		goto out;
-	}
+	value_len = data_len - SD_ATTR_HEADER_SIZE;
 
 	vattr->ctime = ctime;
 
 	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
 
-	*attrid = fnv_64a_buf(vattr, data_len, FNV1A_64_INIT);
+	*attrid = fnv_64a_buf(vattr, SD_ATTR_HEADER_SIZE, FNV1A_64_INIT);
 	*attrid &= (UINT64_C(1) << VDI_SPACE_SHIFT) - 1;
 
 	end = *attrid - 1;
@@ -687,9 +685,9 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 		ret = read_object(entries, nr_vnodes, nr_zones, epoch, oid, (char *)&tmp_attr,
 				  sizeof(tmp_attr), 0, copies);
 
-		if (ret == -SD_RES_NO_OBJ && creat) {
+		if (ret == -SD_RES_NO_OBJ && write) {
 			ret = write_object(entries, nr_vnodes, nr_zones, epoch, oid,
-					   (char *)vattr, data_len, 0, copies, 1);
+					   (char *)vattr, data_len, 0, 0, copies, 1);
 			if (ret)
 				ret = SD_RES_EIO;
 			else
@@ -703,7 +701,26 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 		if (memcmp(&tmp_attr, vattr, sizeof(tmp_attr)) == 0) {
 			if (excl)
 				ret = SD_RES_VDI_EXIST;
-			else
+			else if (delete) {
+				ret = write_object(entries, nr_vnodes, nr_zones,
+						   epoch, oid, (char *)"", 1,
+						   offsetof(struct sheepdog_vdi_attr, name),
+						   0, copies, 0);
+				if (ret)
+					ret = SD_RES_EIO;
+				else
+					ret = SD_RES_SUCCESS;
+			} else if (write) {
+				ret = write_object(entries, nr_vnodes, nr_zones,
+						   epoch, oid, vattr->value,
+						   value_len, SD_ATTR_HEADER_SIZE,
+						   SD_FLAG_CMD_TRUNCATE, copies, 0);
+
+				if (ret)
+					ret = SD_RES_EIO;
+				else
+					ret = SD_RES_SUCCESS;
+			} else
 				ret = SD_RES_SUCCESS;
 			goto out;
 		}
-- 
1.7.2.5




More information about the sheepdog mailing list