[sheepdog] [PATCH 2/2] sheep: sync checkpoint in a fine grained manner

Mon Jun 8 15:34:45 CEST 2015

The current way of syncing checkpoints can issue an overly large
SD_OP_VDI_STATE_CHECKPOINT_CTL request when a cluster has a large
number of VDIs.

To avoid the problem, this patch makes the requests fine grained: one
request corresponds to a single VDI. With this method, the bloated
request can be avoided.

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 include/sheepdog_proto.h |   2 +
 sheep/group.c            | 127 ++++++++++++++++++++++++++++-------------------
 sheep/ops.c              |  10 ++--
 sheep/sheep_priv.h       |   3 +-
 sheep/vdi.c              |  24 ++++-----
 5 files changed, 97 insertions(+), 69 deletions(-)

diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 4b9dacc..af9b24f 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -198,6 +198,8 @@ struct sd_req {
 		struct {
 			uint32_t        get; /* 0 means free, 1 means get */
 			uint32_t        tgt_epoch;
+
+			uint32_t        vid; /* only used for get */
 		} vdi_state_checkpoint;
 		struct {
 			/* 1 means validate, 0 means invalidate */
diff --git a/sheep/group.c b/sheep/group.c
index e342453..53f1d36 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -576,7 +576,7 @@ static void do_get_vdis(struct work *work)
 
 static void collect_cinfo(void);
 
-static void get_vdis_done(struct work *work)
+main_fn static void get_vdis_done(struct work *work)
 {
 	struct get_vdis_work *w =
 		container_of(work, struct get_vdis_work, work);
@@ -691,42 +691,32 @@ struct cinfo_collection_work {
 	int epoch;
 	struct vnode_info *members;
 
-	int nr_vdi_states;
 	struct vdi_state *result;
+	uint32_t next_vid;
 };
 
 static struct cinfo_collection_work *collect_work;
 
 static struct vdi_state *do_cinfo_collection_work(uint32_t epoch,
-						  struct sd_node *n,
-						  int *nr_vdi_states)
+						  uint32_t vid,
+						  struct sd_node *n)
 {
 	struct vdi_state *vs = NULL;
-	unsigned int rlen = 4096;
 	struct sd_req hdr;
-	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	int ret;
 
-	vs = xcalloc(rlen, sizeof(*vs));
+	vs = xzalloc(sizeof(*vs));
 
-retry:
 	sd_init_req(&hdr, SD_OP_VDI_STATE_CHECKPOINT_CTL);
 	hdr.vdi_state_checkpoint.get = 1;
 	hdr.vdi_state_checkpoint.tgt_epoch = epoch;
-	hdr.data_length = rlen;
+	hdr.vdi_state_checkpoint.vid = vid;
+	hdr.data_length = sizeof(*vs);
 
 	ret = sheep_exec_req(&n->nid, &hdr, (char *)vs);
 	if (ret == SD_RES_SUCCESS) {
 		sd_debug("succeed to obtain checkpoint of vdi states");
-		*nr_vdi_states = rsp->data_length / sizeof(*vs);
 		return vs;
-	} else if (ret == SD_RES_BUFFER_SMALL) {
-		sd_debug("buffer is small for obtaining checkpoint of vdi states,"
-			 " doubling it (%lu -> %lu)", rlen * sizeof(*vs),
-			 rlen * 2 * sizeof(*vs));
-		rlen *= 2;
-		vs = xrealloc(vs, sizeof(*vs) * rlen);
-		goto retry;
 	}
 
 	sd_err("failed to obtain checkpoint of vdi states from node %s",
@@ -736,14 +726,13 @@ retry:
 
 static void cinfo_collection_work(struct work *work)
 {
-	struct sd_req hdr;
 	struct vdi_state *vs = NULL;
 	struct cinfo_collection_work *w =
 		container_of(work, struct cinfo_collection_work, work);
 	struct sd_node *n;
-	int ret, nr_vdi_states = 0;
 
-	sd_debug("start collection of cinfo...");
+	sd_debug("start collection of cinfo, epoch: %d, vid: %"PRIx32,
+		 w->epoch, w->next_vid);
 
 	sd_assert(w == collect_work);
 
@@ -751,18 +740,31 @@ static void cinfo_collection_work(struct work *work)
 		if (node_is_local(n))
 			continue;
 
-		vs = do_cinfo_collection_work(w->epoch, n, &nr_vdi_states);
-		if (vs)
-			goto get_succeed;
+		vs = do_cinfo_collection_work(w->epoch, w->next_vid, n);
+		if (vs) {
+			w->result = vs;
+			return;
+		}
 	}
 
+	/*
+	 * TODO: need to sleep and retry
+	 *
+	 * There is a possibility that other nodes is still preparing
+	 * their checkpoints.
+	 */
 	panic("getting a checkpoint of vdi state at epoch %d failed", w->epoch);
+}
 
-get_succeed:
-	w->nr_vdi_states = nr_vdi_states;
-	w->result = vs;
+main_fn static void free_cinfo(struct cinfo_collection_work *w)
+/* TODO: this should be done in worker */
+{
+	struct sd_node *n;
+	struct sd_req hdr;
+	int ret;
 
-	sd_debug("collecting cinfo done, freeing from remote nodes");
+	sd_info("collecting cinfo done at epoch %d, freeing from remote nodes",
+		w->epoch);
 
 	rb_for_each_entry(n, &w->members->nroot, rb) {
 		if (node_is_local(n))
@@ -772,49 +774,60 @@ get_succeed:
 		hdr.vdi_state_checkpoint.get = 0;
 		hdr.vdi_state_checkpoint.tgt_epoch = w->epoch;
 
-		ret = sheep_exec_req(&n->nid, &hdr, (char *)vs);
+		ret = sheep_exec_req(&n->nid, &hdr, NULL);
 		if (ret != SD_RES_SUCCESS)
 			sd_err("error at freeing a checkpoint of vdi state"
 			       " at epoch %d", w->epoch);
 	}
 
-	sd_debug("collection done");
+	sd_info("freeing done");
 }
 
-static void cinfo_collection_done(struct work *work)
+main_fn static void cinfo_collection_done(struct work *work)
 {
 	struct cinfo_collection_work *w =
 		container_of(work, struct cinfo_collection_work, work);
+	uint32_t next_vid;
+	struct vdi_state *vs = w->result;
 
 	sd_assert(w == collect_work);
 
-	for (int i = 0; i < w->nr_vdi_states; i++) {
-		struct vdi_state *vs = &w->result[i];
 
-		sd_debug("VID: %"PRIx32, vs->vid);
-		sd_debug("nr_copies: %d", vs->nr_copies);
-		sd_debug("snapshot: %d", vs->snapshot);
-		sd_debug("copy_policy: %d", vs->copy_policy);
-		sd_debug("block_size_shift: %"PRIu8, vs->block_size_shift);
-		sd_debug("lock_state: %x", vs->lock_state);
-		sd_debug("owner: %s",
-			 addr_to_str(vs->lock_owner.addr, vs->lock_owner.port));
+	sd_debug("VID: %"PRIx32, vs->vid);
+	sd_debug("nr_copies: %d", vs->nr_copies);
+	sd_debug("snapshot: %d", vs->snapshot);
+	sd_debug("copy_policy: %d", vs->copy_policy);
+	sd_debug("block_size_shift: %"PRIu8, vs->block_size_shift);
+	sd_debug("lock_state: %x", vs->lock_state);
+	sd_debug("owner: %s",
+		 addr_to_str(vs->lock_owner.addr, vs->lock_owner.port));
 
-		apply_vdi_lock_state(vs);
-	}
+	apply_vdi_lock_state(vs);
+	free(vs);
 
-	put_vnode_info(w->members);
-	free(w->result);
-	free(w);
-	collect_work = NULL;
+	next_vid = find_next_bit(sys->vdi_inuse, SD_NR_VDIS, w->next_vid + 1);
+	if (next_vid == SD_NR_VDIS) {
+		sd_info("collecting checkpoint of epoch %d completed", w->epoch);
+
+		free_cinfo(w);
+
+		put_vnode_info(w->members);
+		free(w);
+		collect_work = NULL;
 
-	play_logged_vdi_ops();
+		play_logged_vdi_ops();
+
+		sd_debug("cluster info collection finished");
+		sys->node_status = SD_NODE_STATUS_OK;
+
+		return;
+	}
 
-	sd_debug("cluster info collection finished");
-	sys->node_status = SD_NODE_STATUS_OK;
+	w->next_vid = next_vid;
+	queue_work(sys->block_wqueue, &collect_work->work);
 }
 
-static void collect_cinfo(void)
+main_fn static void collect_cinfo(void)
 {
 	if (!collect_work)
 		return;
@@ -822,6 +835,20 @@ static void collect_cinfo(void)
 	sd_debug("start cluster info collection for epoch %d",
 		 collect_work->epoch);
 
+	collect_work->next_vid = find_next_bit(sys->vdi_inuse, SD_NR_VDIS, 1);
+	if (collect_work->next_vid == SD_NR_VDIS) {
+		sd_debug("no VDIs are created yet");
+
+		put_vnode_info(collect_work->members);
+		free(collect_work);
+		collect_work = NULL;
+		sys->node_status = SD_NODE_STATUS_OK;
+
+		return;
+	}
+
+	sd_debug("initial vid: %"PRIx32, collect_work->next_vid);
+
 	collect_work->work.fn = cinfo_collection_work;
 	collect_work->work.done = cinfo_collection_done;
 	queue_work(sys->block_wqueue, &collect_work->work);
diff --git a/sheep/ops.c b/sheep/ops.c
index 223994c..a750884 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1460,16 +1460,18 @@ static int local_vdi_state_checkpoint_ctl(const struct sd_req *req,
 {
 	bool get = !!req->vdi_state_checkpoint.get;
 	int epoch = req->vdi_state_checkpoint.tgt_epoch;
-	int ret, length = 0;
+	uint32_t vid = req->vdi_state_checkpoint.vid;
+	int ret;
 
 	sd_info("%s vdi state checkpoint at epoch %d",
 		get ? "getting" : "freeing", epoch);
 
 	if (get) {
-		ret = get_vdi_state_checkpoint(epoch, data, req->data_length,
-					       &length);
+		sd_debug("target VID: %"PRIx32, vid);
+
+		ret = get_vdi_state_checkpoint(epoch, vid, data);
 		if (ret == SD_RES_SUCCESS)
-			rsp->data_length = length;
+			rsp->data_length = sizeof(struct vdi_state);
 		else {
 			sd_info("failed to get vdi state checkpoint: %s",
 			       sd_strerror(ret));
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 8c173a9..95900ed 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -372,8 +372,7 @@ bool vdi_lock(uint32_t vid, const struct node_id *owner, int type);
 bool vdi_unlock(uint32_t vid, const struct node_id *owner, int type);
 void apply_vdi_lock_state(struct vdi_state *vs);
 void create_vdi_state_checkpoint(int epoch);
-int get_vdi_state_checkpoint(int epoch, void *data, int data_len_max,
-			     int *data_len_result);
+int get_vdi_state_checkpoint(int epoch, uint32_t vid, void *data);
 void free_vdi_state_checkpoint(int epoch);
 void log_vdi_op_lock(uint32_t vid, const struct node_id *owner, int type);
 void log_vdi_op_unlock(uint32_t vid, const struct node_id *owner, int type);
diff --git a/sheep/vdi.c b/sheep/vdi.c
index aeb6e68..1689d23 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -2154,15 +2154,20 @@ main_fn void create_vdi_state_checkpoint(int epoch)
 	sd_debug("a number of vdi state: %d", checkpoint->nr_vs);
 }
 
-main_fn int get_vdi_state_checkpoint(int epoch, void *data, int data_len_max,
-				     int *data_len_result)
+main_fn int get_vdi_state_checkpoint(int epoch, uint32_t vid, void *data)
 {
 	struct vdi_state_checkpoint *checkpoint;
-	int len;
+	struct vdi_state *vs;
 
 	list_for_each_entry(checkpoint, &vdi_state_checkpoint_list, list) {
-		if (checkpoint->epoch == epoch)
-			goto found;
+		if (checkpoint->epoch == epoch) {
+			for (int i = 0; i < checkpoint->nr_vs; i++) {
+				if (checkpoint->vs[i].vid == vid) {
+					vs = &checkpoint->vs[i];
+					goto found;
+				}
+			}
+		}
 	}
 
 	sd_info("get request for not prepared vdi state checkpoint, epoch: %d",
@@ -2170,14 +2175,7 @@ main_fn int get_vdi_state_checkpoint(int epoch, void *data, int data_len_max,
 	return SD_RES_AGAIN;
 
 found:
-	len = sizeof(*checkpoint->vs) * checkpoint->nr_vs;
-	if (data_len_max < len) {
-		sd_info("maximum allowed length: %d, required length: %d",
-			data_len_max, len);
-		return SD_RES_BUFFER_SMALL;
-	}
-
-	memcpy(data, checkpoint->vs, len);
+	memcpy(data, vs, sizeof(*vs));
 	return SD_RES_SUCCESS;
 }
 
-- 
1.9.1