[Sheepdog] [PATCH 2/2] make vdi setattr atomic
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Fri Oct 14 07:59:01 CEST 2011
At Fri, 14 Oct 2011 02:42:38 +0900,
MORITA Kazutaka wrote:
>
> 'collie vdi setattr' runs the following two:
>
> - allocates a vdi attr object id
> - writes an attribute data to the object
>
> So a race can happen between these two operations.
>
> With this patch, 'collie vdi setattr' sends the attribute data when
> allocating the object id to write the attribute atomically.
>
> Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
> ---
> collie/vdi.c | 47 ++++++++++++++++++++---------------------------
> include/sheep.h | 11 +++++++----
> sheep/group.c | 3 ++-
> sheep/sdnet.c | 9 +++++----
> sheep/store.c | 5 +++--
> sheep/vdi.c | 43 ++++++++++++++++++++++++++++++-------------
> 6 files changed, 67 insertions(+), 51 deletions(-)
Oops, sorry, I forgot to add sheep/sheep_priv.h. Fixed version is
below.
==
From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Subject: [PATCH] make vdi setattr atomic
'collie vdi setattr' runs the following two:
- allocates a vdi attr object id
- writes an attribute data to the object
So a race can happen between these two operations.
With this patch, 'collie vdi setattr' sends the attribute data when
allocating the object id to write the attribute atomically.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
collie/vdi.c | 47 ++++++++++++++++++++---------------------------
include/sheep.h | 11 +++++++----
sheep/group.c | 3 ++-
sheep/sdnet.c | 9 +++++----
sheep/sheep_priv.h | 7 ++++---
sheep/store.c | 5 +++--
sheep/vdi.c | 43 ++++++++++++++++++++++++++++++-------------
7 files changed, 71 insertions(+), 54 deletions(-)
diff --git a/collie/vdi.c b/collie/vdi.c
index 245a6e2..6ac73cf 100644
--- a/collie/vdi.c
+++ b/collie/vdi.c
@@ -784,8 +784,9 @@ static int vdi_object(int argc, char **argv)
}
static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
- char *key, uint32_t *vid, uint64_t *oid,
- unsigned int *nr_copies, int creat, int excl)
+ char *key, void *value, unsigned int value_len,
+ uint32_t *vid, uint64_t *oid, unsigned int *nr_copies,
+ int creat, int excl, int delete)
{
struct sd_vdi_req hdr;
struct sd_vdi_rsp *rsp = (struct sd_vdi_rsp *)&hdr;
@@ -793,7 +794,7 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
unsigned int wlen, rlen;
struct sheepdog_vdi_attr *vattr;
- vattr = zalloc(SD_ATTR_HEADER_SIZE);
+ vattr = zalloc(SD_ATTR_HEADER_SIZE + value_len);
if (!vattr)
return SD_RES_NO_MEM;
@@ -801,6 +802,8 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
strncpy(vattr->tag, vdi_cmd_data.snapshot_tag, SD_MAX_VDI_TAG_LEN);
vattr->snap_id = vdi_cmd_data.snapshot_id;
strncpy(vattr->key, key, SD_MAX_VDI_ATTR_KEY_LEN);
+ if (value && value_len)
+ memcpy(vattr->value, value, value_len);
fd = connect_to(sdhost, sdport);
if (fd < 0) {
@@ -810,7 +813,7 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
memset(&hdr, 0, sizeof(hdr));
hdr.opcode = SD_OP_GET_VDI_ATTR;
- wlen = SD_ATTR_HEADER_SIZE;
+ wlen = SD_ATTR_HEADER_SIZE + value_len;
rlen = 0;
hdr.proto_ver = SD_PROTO_VER;
hdr.data_length = wlen;
@@ -820,6 +823,8 @@ static int find_vdi_attr_oid(char *vdiname, char *tag, uint32_t snapid,
hdr.flags |= SD_FLAG_CMD_CREAT;
if (excl)
hdr.flags |= SD_FLAG_CMD_EXCL;
+ if (delete)
+ hdr.flags |= SD_FLAG_CMD_DEL;
ret = exec_req(fd, (struct sd_req *)&hdr, vattr, &wlen, &rlen);
if (ret) {
@@ -844,8 +849,8 @@ out:
static int vdi_setattr(int argc, char **argv)
{
- int ret;
- uint64_t oid, attr_oid = 0;
+ int ret, value_len = 0;
+ uint64_t attr_oid = 0;
uint32_t vid = 0, nr_copies = 0;
char *vdiname = argv[optind++], *key, *value;
uint64_t offset;
@@ -878,10 +883,14 @@ reread:
}
}
+ if (value)
+ value_len = strlen(value);
+
ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag,
- vdi_cmd_data.snapshot_id, key, &vid, &attr_oid,
+ vdi_cmd_data.snapshot_id, key, value,
+ value_len, &vid, &attr_oid,
&nr_copies, !vdi_cmd_data.delete,
- vdi_cmd_data.exclusive);
+ vdi_cmd_data.exclusive, vdi_cmd_data.delete);
if (ret) {
if (ret == SD_RES_VDI_EXIST) {
fprintf(stderr, "the attribute already exists, %s\n", key);
@@ -893,27 +902,11 @@ reread:
fprintf(stderr, "vdi not found\n");
return EXIT_MISSING;
} else
- fprintf(stderr, "failed to find attr oid, %s\n",
+ fprintf(stderr, "failed to set attr, %s\n",
sd_strerror(ret));
return EXIT_FAILURE;
}
- oid = attr_oid;
-
- if (vdi_cmd_data.delete)
- ret = sd_write_object(oid, 0, (char *)"", 1,
- offsetof(struct sheepdog_inode, name), 0,
- nr_copies, 0);
- else
- ret = sd_write_object(oid, 0, value, strlen(value),
- SD_ATTR_HEADER_SIZE, SD_FLAG_CMD_TRUNCATE,
- nr_copies, 0);
-
- if (ret != SD_RES_SUCCESS) {
- fprintf(stderr, "failed to set attribute\n");
- return EXIT_FAILURE;
- }
-
return EXIT_SUCCESS;
}
@@ -931,8 +924,8 @@ static int vdi_getattr(int argc, char **argv)
}
ret = find_vdi_attr_oid(vdiname, vdi_cmd_data.snapshot_tag,
- vdi_cmd_data.snapshot_id, key, &vid, &attr_oid,
- &nr_copies, 0, 0);
+ vdi_cmd_data.snapshot_id, key, NULL, 0,
+ &vid, &attr_oid, &nr_copies, 0, 0, 0);
if (ret == SD_RES_NO_OBJ) {
fprintf(stderr, "no such attribute, %s\n", key);
return EXIT_MISSING;
diff --git a/include/sheep.h b/include/sheep.h
index 31516d9..3c2d799 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -37,10 +37,13 @@
#define SD_OP_KILL_NODE 0x88
#define SD_OP_GET_VDI_ATTR 0x89
-#define SD_FLAG_CMD_DIRECT 0x10
-#define SD_FLAG_CMD_RECOVERY 0x20
-#define SD_FLAG_CMD_CREAT 0x40
-#define SD_FLAG_CMD_EXCL 0x80
+#define SD_FLAG_CMD_DIRECT 0x0010
+#define SD_FLAG_CMD_RECOVERY 0x0020
+
+/* flags for vdi attribute operations */
+#define SD_FLAG_CMD_CREAT 0x0100
+#define SD_FLAG_CMD_EXCL 0x0200
+#define SD_FLAG_CMD_DEL 0x0400
#define SD_RES_OLD_NODE_VER 0x41 /* Remote node has an old epoch */
#define SD_RES_NEW_NODE_VER 0x42 /* Remote node has a new epoch */
diff --git a/sheep/group.c b/sheep/group.c
index de3333e..8e9266f 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -898,7 +898,8 @@ static void vdi_op(struct vdi_op_message *msg)
ret = get_vdi_attr(hdr->epoch, data, hdr->data_length, vid,
&attrid, nr_copies, ctime,
hdr->flags & SD_FLAG_CMD_CREAT,
- hdr->flags & SD_FLAG_CMD_EXCL);
+ hdr->flags & SD_FLAG_CMD_EXCL,
+ hdr->flags & SD_FLAG_CMD_DEL);
break;
case SD_OP_RELEASE_VDI:
break;
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index db2e691..eb375cb 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -629,7 +629,7 @@ int create_listen_port(int port, void *data)
int write_object(struct sheepdog_vnode_list_entry *e,
int vnodes, int zones, uint32_t node_version,
uint64_t oid, char *data, unsigned int datalen,
- uint64_t offset, int nr, int create)
+ uint64_t offset, uint16_t flags, int nr, int create)
{
struct sd_obj_req hdr;
int i, n, fd, ret, success = 0;
@@ -644,8 +644,8 @@ int write_object(struct sheepdog_vnode_list_entry *e,
n = obj_to_sheep(e, vnodes, oid, i);
if (is_myself(e[n].addr, e[n].port)) {
- ret = write_object_local(oid, data, datalen, offset, nr,
- node_version, create);
+ ret = write_object_local(oid, data, datalen, offset,
+ flags, nr, node_version, create);
if (ret != 0)
eprintf("fail %"PRIx64" %"PRIx32"\n", oid, ret);
@@ -673,7 +673,8 @@ int write_object(struct sheepdog_vnode_list_entry *e,
hdr.oid = oid;
hdr.copies = nr;
- hdr.flags = SD_FLAG_CMD_WRITE | SD_FLAG_CMD_DIRECT;
+ hdr.flags = flags;
+ hdr.flags |= SD_FLAG_CMD_WRITE | SD_FLAG_CMD_DIRECT;
hdr.data_length = wlen;
hdr.offset = offset;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 6ea03d5..45c5903 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -181,7 +181,7 @@ int read_vdis(char *data, int len, unsigned int *rsp_len);
int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
uint32_t vid, uint32_t *attrid, int copies, uint64_t ctime,
- int creat, int excl);
+ int write, int excl, int delete);
int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries);
void setup_ordered_sd_vnode_list(struct request *req);
@@ -199,7 +199,8 @@ int leave_cluster(void);
void start_cpg_event_work(void);
void store_queue_request(struct work *work, int idx);
int write_object_local(uint64_t oid, char *data, unsigned int datalen,
- uint64_t offset, int copies, uint32_t epoch, int create);
+ uint64_t offset, uint16_t flags, int copies,
+ uint32_t epoch, int create);
int read_object_local(uint64_t oid, char *data, unsigned int datalen,
uint64_t offset, int copies, uint32_t epoch);
@@ -230,7 +231,7 @@ int is_recoverying_oid(uint64_t oid);
int write_object(struct sheepdog_vnode_list_entry *e,
int vnodes, int zones, uint32_t node_version,
uint64_t oid, char *data, unsigned int datalen,
- uint64_t offset, int nr, int create);
+ uint64_t offset, uint16_t flags, int nr, int create);
int read_object(struct sheepdog_vnode_list_entry *e,
int vnodes, int zones, uint32_t node_version,
uint64_t oid, char *data, unsigned int datalen,
diff --git a/sheep/store.c b/sheep/store.c
index 12e54d9..57654ff 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -490,7 +490,8 @@ int update_epoch_store(uint32_t epoch)
}
int write_object_local(uint64_t oid, char *data, unsigned int datalen,
- uint64_t offset, int copies, uint32_t epoch, int create)
+ uint64_t offset, uint16_t flags, int copies,
+ uint32_t epoch, int create)
{
int ret;
struct request *req;
@@ -507,7 +508,7 @@ int write_object_local(uint64_t oid, char *data, unsigned int datalen,
else
hdr->opcode = SD_OP_WRITE_OBJ;
hdr->copies = copies;
- hdr->flags = SD_FLAG_CMD_WRITE;
+ hdr->flags = flags | SD_FLAG_CMD_WRITE;
hdr->offset = offset;
hdr->data_length = datalen;
req->data = data;
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 87a63c9..b192dcb 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -124,7 +124,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
if (is_snapshot && cur_vid != base_vid) {
ret = write_object(entries, nr_vnodes, nr_zones, epoch,
vid_to_vdi_oid(cur_vid), (char *)cur,
- SD_INODE_HEADER_SIZE, 0, copies, 0);
+ SD_INODE_HEADER_SIZE, 0, 0, copies, 0);
if (ret != 0) {
vprintf(SDOG_ERR "failed\n");
ret = SD_RES_BASE_VDI_READ;
@@ -135,7 +135,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
if (base_vid) {
ret = write_object(entries, nr_vnodes, nr_zones, epoch,
vid_to_vdi_oid(base_vid), (char *)base,
- SD_INODE_HEADER_SIZE, 0, copies, 0);
+ SD_INODE_HEADER_SIZE, 0, 0, copies, 0);
if (ret != 0) {
vprintf(SDOG_ERR "failed\n");
ret = SD_RES_BASE_VDI_WRITE;
@@ -145,7 +145,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
ret = write_object(entries, nr_vnodes, nr_zones, epoch,
vid_to_vdi_oid(new_vid), (char *)new, sizeof(*new),
- 0, copies, 1);
+ 0, 0, copies, 1);
if (ret != 0)
ret = SD_RES_VDI_WRITE;
out:
@@ -390,7 +390,7 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
ret = write_object(entries, nr_vnodes, nr_zones, epoch,
vid_to_vdi_oid(*vid), (char *)inode,
- SD_INODE_HEADER_SIZE, 0, nr_reqs, 0);
+ SD_INODE_HEADER_SIZE, 0, 0, nr_reqs, 0);
if (ret != 0) {
ret = SD_RES_EIO;
goto out;
@@ -654,13 +654,14 @@ err:
int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
uint32_t vid, uint32_t *attrid, int copies, uint64_t ctime,
- int creat, int excl)
+ int write, int excl, int delete)
{
struct sheepdog_vnode_list_entry *entries;
struct sheepdog_vdi_attr tmp_attr;
uint64_t oid;
uint32_t end;
int ret, nr_zones, nr_vnodes;
+ int value_len;
entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
if (!entries) {
@@ -669,16 +670,13 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
goto out;
}
- if (data_len != SD_ATTR_HEADER_SIZE) {
- ret = SD_RES_INVALID_PARMS;
- goto out;
- }
+ value_len = data_len - SD_ATTR_HEADER_SIZE;
vattr->ctime = ctime;
get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
- *attrid = fnv_64a_buf(vattr, data_len, FNV1A_64_INIT);
+ *attrid = fnv_64a_buf(vattr, SD_ATTR_HEADER_SIZE, FNV1A_64_INIT);
*attrid &= (UINT64_C(1) << VDI_SPACE_SHIFT) - 1;
end = *attrid - 1;
@@ -687,9 +685,9 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
ret = read_object(entries, nr_vnodes, nr_zones, epoch, oid, (char *)&tmp_attr,
sizeof(tmp_attr), 0, copies);
- if (ret == -SD_RES_NO_OBJ && creat) {
+ if (ret == -SD_RES_NO_OBJ && write) {
ret = write_object(entries, nr_vnodes, nr_zones, epoch, oid,
- (char *)vattr, data_len, 0, copies, 1);
+ (char *)vattr, data_len, 0, 0, copies, 1);
if (ret)
ret = SD_RES_EIO;
else
@@ -703,7 +701,26 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
if (memcmp(&tmp_attr, vattr, sizeof(tmp_attr)) == 0) {
if (excl)
ret = SD_RES_VDI_EXIST;
- else
+ else if (delete) {
+ ret = write_object(entries, nr_vnodes, nr_zones,
+ epoch, oid, (char *)"", 1,
+ offsetof(struct sheepdog_vdi_attr, name),
+ 0, copies, 0);
+ if (ret)
+ ret = SD_RES_EIO;
+ else
+ ret = SD_RES_SUCCESS;
+ } else if (write) {
+ ret = write_object(entries, nr_vnodes, nr_zones,
+ epoch, oid, vattr->value,
+ value_len, SD_ATTR_HEADER_SIZE,
+ SD_FLAG_CMD_TRUNCATE, copies, 0);
+
+ if (ret)
+ ret = SD_RES_EIO;
+ else
+ ret = SD_RES_SUCCESS;
+ } else
ret = SD_RES_SUCCESS;
goto out;
}
--
1.7.2.5
More information about the sheepdog
mailing list