[sheepdog] [PATCH 2/2] sheep: remove stale objects after recovery fully completed

Sun Aug 26 09:54:42 CEST 2012

This sends SD_OP_COMPLETE_RECOVERY to all nodes when object recovery
finishes.  Sheep removes stale objects once it has received
SD_OP_COMPLETE_RECOVERY from all nodes.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 include/internal_proto.h |    1 +
 sheep/ops.c              |   54 +++++++++++++++++++++++++++++++++++++++++++--
 sheep/plain_store.c      |    2 +-
 sheep/recovery.c         |   30 ++++++++++++++++++++++++-
 sheep/sheep_priv.h       |    4 +-
 5 files changed, 84 insertions(+), 7 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 3d70ba9..c1d116a 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -64,6 +64,7 @@
 #define SD_OP_DISABLE_RECOVER 0xA9
 #define SD_OP_INFO_RECOVER 0xAA
 #define SD_OP_GET_VDI_COPIES 0xAB
+#define SD_OP_COMPLETE_RECOVERY 0xAC
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/sheep/ops.c b/sheep/ops.c
index c6a4f3b..499c773 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -536,8 +536,6 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
 				void *data)
 {
 	int ret;
-	struct siocb iocb = { 0 };
-	iocb.epoch = sys->epoch;
 
 	if (node_in_recovery())
 		return SD_RES_NODE_IN_RECOVERY;
@@ -546,7 +544,7 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
 		return SD_RES_SUCCESS;
 
 	if (sd_store->cleanup)
-		ret = sd_store->cleanup(&iocb);
+		ret = sd_store->cleanup();
 	else
 		ret = SD_RES_NO_SUPPORT;
 
@@ -561,6 +559,49 @@ static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp,
 	return objlist_cache_cleanup(vid);
 }
 
+/* Track per-epoch recovery reports; clean up once every node has reported. */
+static int cluster_recovery_completion(const struct sd_req *req,
+				       struct sd_rsp *rsp, void *data)
+{
+	static struct sd_node recovereds[SD_MAX_NODES];
+	static size_t nr_recovereds;
+	static int latest_epoch;
+	struct sd_node *node = data;	/* the node reporting completion */
+	struct vnode_info *vnode_info;
+	int i;
+
+	/* a late report for a superseded epoch must not be counted */
+	if (latest_epoch > req->epoch)
+		return SD_RES_SUCCESS;
+	if (latest_epoch < req->epoch) {
+		dprintf("new epoch %d\n", req->epoch);
+		latest_epoch = req->epoch;
+		nr_recovereds = 0;
+	}
+	/* repeated reports must never overrun recovereds[] */
+	if (nr_recovereds >= SD_MAX_NODES)
+		return SD_RES_SUCCESS;
+
+	recovereds[nr_recovereds++] = *node;
+	qsort(recovereds, nr_recovereds, sizeof(*recovereds), node_id_cmp);
+	dprintf("%s is recovered at epoch %d\n", node_to_str(node), req->epoch);
+	for (i = 0; i < nr_recovereds; i++)
+		dprintf("[%x] %s\n", i, node_to_str(recovereds + i));
+
+	if (sys->epoch != latest_epoch)
+		return SD_RES_SUCCESS;
+	vnode_info = get_vnode_info();
+	if (vnode_info->nr_nodes == nr_recovereds &&
+	    memcmp(vnode_info->nodes, recovereds,
+		   sizeof(*recovereds) * nr_recovereds) == 0) {
+		dprintf("all nodes are recovered at epoch %d\n", req->epoch);
+		if (sd_store->cleanup)
+			sd_store->cleanup();
+	}
+	put_vnode_info(vnode_info);
+	return SD_RES_SUCCESS;
+}
+
 static int local_set_cache_size(const struct sd_req *req, struct sd_rsp *rsp,
 				  void *data)
 {
@@ -947,6 +988,13 @@ static struct sd_op_template sd_ops[] = {
 		.process_main = cluster_notify_vdi_del,
 	},
 
+	[SD_OP_COMPLETE_RECOVERY] = {
+		.name = "COMPLETE_RECOVERY",
+		.type = SD_OP_TYPE_CLUSTER,
+		.force = 1,
+		.process_main = cluster_recovery_completion,
+	},
+
 	/* local operations */
 	[SD_OP_GET_STORE_LIST] = {
 		.name = "GET_STORE_LIST",
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 8028f7d..8888521 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -138,7 +138,7 @@ out:
 	return ret;
 }
 
-int default_cleanup(struct siocb *iocb)
+int default_cleanup(void)
 {
 	rmdir_r(stale_dir);
 	if (mkdir(stale_dir, 0755) < 0) {
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 72c90cd..dec7261 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -337,6 +337,31 @@ static inline void run_next_rw(struct recovery_work *rw)
 	dprintf("recovery work is superseded\n");
 }
 
+/* Work function: issue SD_OP_COMPLETE_RECOVERY with this node as payload. */
+static void notify_recovery_completion_work(struct work *work)
+{
+	struct recovery_work *rw;
+	struct sd_req req;
+
+	rw = container_of(work, struct recovery_work, work);
+
+	sd_init_req(&req, SD_OP_COMPLETE_RECOVERY);
+	req.data_length = sizeof(sys->this_node);
+	req.flags = SD_FLAG_CMD_WRITE;
+	req.epoch = rw->epoch;
+
+	if (exec_local_req(&req, &sys->this_node) != SD_RES_SUCCESS)
+		eprintf("failed to notify recovery completion, %d\n",
+			rw->epoch);
+}
+
+/* Done callback: hand the owning recovery_work to free_recovery_work(). */
+static void notify_recovery_completion_main(struct work *work)
+{
+	/* 'work' is the member embedded in a struct recovery_work */
+	free_recovery_work(container_of(work, struct recovery_work, work));
+}
+
 static inline void finish_recovery(struct recovery_work *rw)
 {
 	recovering_work = NULL;
@@ -345,7 +370,10 @@ static inline void finish_recovery(struct recovery_work *rw)
 	if (sd_store->end_recover)
 		sd_store->end_recover(sys->epoch - 1, rw->old_vinfo);
 
-	free_recovery_work(rw);
+	/* notify recovery completion to other nodes */
+	rw->work.fn = notify_recovery_completion_work;
+	rw->work.done = notify_recovery_completion_main;
+	queue_work(sys->recovery_wqueue, &rw->work);
 
 	dprintf("recovery complete: new epoch %"PRIu32"\n",
 		sys->recovered_epoch);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 224be51..1bc7e60 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -154,7 +154,7 @@ struct store_driver {
 	int (*purge_obj)(void);
 	/* Operations for snapshot */
 	int (*snapshot)(struct siocb *);
-	int (*cleanup)(struct siocb *);
+	int (*cleanup)(void);
 	int (*restore)(struct siocb *);
 	int (*get_snap_file)(struct siocb *);
 };
@@ -166,7 +166,7 @@ int default_read(uint64_t oid, struct siocb *iocb);
 int default_link(uint64_t oid, struct siocb *iocb, uint32_t tgt_epoch);
 int default_atomic_put(uint64_t oid, struct siocb *iocb);
 int default_end_recover(uint32_t old_epoch, struct vnode_info *old_vnode_info);
-int default_cleanup(struct siocb *iocb);
+int default_cleanup(void);
 int default_format(char *name);
 int default_remove_object(uint64_t oid);
 int default_purge_obj(void);
-- 
1.7.2.5