[Sheepdog] [PATCH] sheep: fix data consistency when reading objects for the first time

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Sat Dec 25 17:55:25 CET 2010


If a total node failure happens, the data consistency of replicated
objects could be broken.  This patch overwrites the replicated objects
with the same data, recovering data consistency, when qemu reads the
objects for the first time.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 include/sheepdog_proto.h |    5 ++++
 sheep/group.c            |   16 +++++++++++++
 sheep/sdnet.c            |   22 +++++++++++++++++-
 sheep/sheep_priv.h       |    9 +++++++
 sheep/store.c            |   54 ++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 105 insertions(+), 1 deletions(-)

diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 1e321a8..0602ff5 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -203,6 +203,11 @@ static inline int is_data_obj(uint64_t oid)
 	return !(VDI_BIT & oid);
 }
 
+static inline uint64_t data_oid_to_idx(uint64_t oid)
+{
+	return oid & (MAX_DATA_OBJS - 1); /* low bits of oid; exact only if MAX_DATA_OBJS is a power of two - TODO confirm */
+}
+
 static inline uint64_t vid_to_vdi_oid(uint32_t vid)
 {
 	return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT);
diff --git a/sheep/group.c b/sheep/group.c
index 096a328..ed50390 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -1500,6 +1500,21 @@ do_retry:
 					list_add_tail(&req->r_wlist, &failed_req_list);
 					continue;
 				}
+			} else if (req->rq.opcode == SD_OP_READ_OBJ) {
+				struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
+				uint32_t vdi_id = oid_to_vid(hdr->oid);
+				struct data_object_bmap *bmap;
+
+				req->check_consistency = 1;
+				if (is_data_obj(hdr->oid)) {
+					list_for_each_entry(bmap, &sys->consistent_obj_list, list) {
+						if (bmap->vdi_id == vdi_id) {
+							if (test_bit(data_oid_to_idx(hdr->oid), bmap->dobjs))
+								req->check_consistency = 0;
+							break;
+						}
+					}
+				}
 			}
 		}
 		queue_work(&req->work);
@@ -1722,6 +1737,7 @@ join_retry:
 
 	INIT_LIST_HEAD(&sys->outstanding_req_list);
 	INIT_LIST_HEAD(&sys->req_wait_for_obj_list);
+	INIT_LIST_HEAD(&sys->consistent_obj_list);
 
 	INIT_LIST_HEAD(&sys->cpg_event_siblings);
 	cpg_context_set(cpg_handle, sys);
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 6d7e7a3..2a6b706 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -135,8 +135,28 @@ static void __done(struct work *work, int idx)
 
 			list_add_tail(&cevent->cpg_event_list, &sys->cpg_event_siblings);
 			again = 1;
+		} else if (req->rp.result == SD_RES_SUCCESS && req->check_consistency) {
+			struct sd_obj_req *obj_hdr = (struct sd_obj_req *)&req->rq;
+			uint32_t vdi_id = oid_to_vid(obj_hdr->oid);
+			struct data_object_bmap *bmap;
+
+			list_for_each_entry(bmap, &sys->consistent_obj_list, list) {
+				if (bmap->vdi_id == vdi_id) {
+					set_bit(data_oid_to_idx(obj_hdr->oid), bmap->dobjs);
+					goto done;
+				}
+			}
+			bmap = zalloc(sizeof(*bmap));
+			if (bmap == NULL) {
+				eprintf("out of memory\n");
+				goto done;
+			}
+			dprintf("allocate a new object map\n");
+			bmap->vdi_id = vdi_id;
+			list_add(&bmap->list, &sys->consistent_obj_list);
+			set_bit(data_oid_to_idx(obj_hdr->oid), bmap->dobjs);
 		}
-
+done:
 		resume_pending_requests();
 		resume_recovery_work();
 	}
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index a8af306..62924f2 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -85,11 +85,19 @@ struct request {
 
 	struct sheepdog_node_list_entry entry[SD_MAX_NODES];
 	int nr_nodes;
+	int check_consistency;
 
 	req_end_t done;
 	struct work work;
 };
 
+struct data_object_bmap { /* one entry per VDI on sys->consistent_obj_list */
+	uint32_t vdi_id; /* matched against oid_to_vid() of the requested oid */
+	DECLARE_BITMAP(dobjs, MAX_DATA_OBJS); /* bit data_oid_to_idx(oid) is set once a consistency-checked request succeeded */
+
+	struct list_head list; /* linkage into sys->consistent_obj_list */
+};
+
 struct cluster_info {
 	cpg_handle_t handle;
 	/* set after finishing the JOIN procedure */
@@ -115,6 +123,7 @@ struct cluster_info {
 
 	struct list_head outstanding_req_list;
 	struct list_head req_wait_for_obj_list;
+	struct list_head consistent_obj_list;
 
 	uint32_t nr_sobjs;
 
diff --git a/sheep/store.c b/sheep/store.c
index 8789478..8c688c1 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -703,6 +703,53 @@ out:
 	return ret;
 }
 
+/*
+ * Read the whole object and write the same data back to overwrite the replicas.
+ * Returns the last forward_*_obj_req() result, or -1 if allocation fails.
+ */
+static int fix_object_consistency(struct request *req, int idx)
+{
+	int ret = -1; /* failure by default: the out-of-memory path jumps to out before ret is assigned */
+	unsigned int data_length;
+	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
+	struct sd_obj_req req_bak = *((struct sd_obj_req *)&req->rq);
+	struct sd_obj_rsp rsp_bak = *((struct sd_obj_rsp *)&req->rp);
+	void *data = req->data, *buf;
+
+	data_length = is_data_obj(hdr->oid) ?
+		SD_DATA_OBJ_SIZE : sizeof(struct sheepdog_inode);
+
+	buf = zalloc(data_length);
+	if (buf == NULL) {
+		eprintf("out of memory\n");
+		goto out;
+	}
+
+	req->data = buf; /* borrow the request for a full-object read into the scratch buffer */
+	hdr->offset = 0;
+	hdr->data_length = data_length;
+	hdr->opcode = SD_OP_READ_OBJ;
+	hdr->flags = 0;
+	ret = forward_read_obj_req(req, idx);
+	if (ret < 0) {
+		eprintf("failed to read object, %d\n", ret);
+		goto out;
+	}
+
+	hdr->opcode = SD_OP_WRITE_OBJ; /* write back exactly what was just read */
+	hdr->flags = SD_FLAG_CMD_WRITE;
+	ret = forward_write_obj_req(req, idx);
+	if (ret < 0)
+		eprintf("failed to write object, %d\n", ret);
+out:
+	free(buf); /* buf is NULL on the out-of-memory path; free(NULL) is a no-op */
+	req->data = data; /* restore the caller's buffer, request header and response */
+	*((struct sd_obj_req *)&req->rq) = req_bak;
+	*((struct sd_obj_rsp *)&req->rp) = rsp_bak;
+
+	return ret;
+}
+
 void store_queue_request(struct work *work, int idx)
 {
 	struct request *req = container_of(work, struct request, work);
@@ -730,6 +777,13 @@ void store_queue_request(struct work *work, int idx)
 	}
 
 	if (!(hdr->flags & SD_FLAG_CMD_DIRECT)) {
+		/* fix object consistency when we read the object for the first time */
+		if (req->check_consistency) {
+			ret = fix_object_consistency(req, idx);
+			if (ret < 0)
+				goto out;
+		}
+
 		if (hdr->flags & SD_FLAG_CMD_WRITE)
 			ret = forward_write_obj_req(req, idx);
 		else
-- 
1.5.6.5




More information about the sheepdog mailing list