[sheepdog] [PATCH 5/5] sheep: wildcard recovery strategy

Sun Jul 12 07:54:32 CEST 2015

This patch adds a new type of recovery strategy, wildcard recovery.
Currently, sheepdog has no method for migrating objects when the hash
function used for object placement changes. The command added by the
previous patch is one solution to the problem, but it requires manually
copying objects across servers.

As an easier alternative, this patch adds a new option -W to the sheep
command. When the option is passed, sheep searches all nodes for each
object during recovery. Therefore, a cluster can be upgraded even if
the object placement strategy has changed (e.g. from 0.7.x to 0.9.y).

The feature will also be useful in the future for cases such as
switching from normal mode to diskvnodes.

TODO: erasure coded VDIs are not supported yet.

Cc: Masahiro Tsuji <tuji at atworks.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 include/internal_proto.h  |  1 +
 sheep/group.c             | 18 +++++++++----
 sheep/ops.c               |  5 ++--
 sheep/recovery.c          | 68 ++++++++++++++++++++++++++++++++++++++++++++---
 sheep/sheep.c             |  4 +++
 sheep/sheep_priv.h        |  6 ++++-
 sheep/store/md.c          |  2 +-
 sheep/store/plain_store.c |  5 ++--
 8 files changed, 94 insertions(+), 15 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index fe159a6..ee94c18 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -118,6 +118,7 @@
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
+#define SD_FLAG_CMD_WILDCARD 0x0100
 
 /* flags for VDI attribute operations */
 #define SD_FLAG_CMD_CREAT    0x0100
diff --git a/sheep/group.c b/sheep/group.c
index f134f74..4018acb 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -32,6 +32,8 @@ static main_thread(struct vnode_info *) current_vnode_info;
 static main_thread(struct list_head *) pending_block_list;
 static main_thread(struct list_head *) pending_notify_list;
 
+bool wildcard_recovery;
+
 static int get_zones_nr_from(struct rb_root *nroot)
 {
 	int nr_zones = 0, j;
@@ -974,11 +976,17 @@ static void update_cluster_info(const struct cluster_info *cinfo,
 				      sys->cinfo.epoch);
 
 			start_recovery(main_thread_get(current_vnode_info),
-				       old_vnode_info, true);
-		} else if (!was_cluster_shutdowned()) {
+				       old_vnode_info, true, false);
+		} else if (!was_cluster_shutdowned() || wildcard_recovery) {
 			start_recovery(main_thread_get(current_vnode_info),
 				       main_thread_get(current_vnode_info),
-				       false);
+				       false, wildcard_recovery);
+
+			/*
+			 * wildcard recovery runs only for the first recovery
+			 * after the sheep process is launched
+			 */
+			wildcard_recovery = false;
 		}
 		set_cluster_shutdown(false);
 	}
@@ -1295,7 +1303,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
 		if (ret != 0)
 			panic("cannot log current epoch %d", sys->cinfo.epoch);
 		start_recovery(main_thread_get(current_vnode_info),
-			       old_vnode_info, true);
+			       old_vnode_info, true, false);
 	}
 
 	put_vnode_info(old_vnode_info);
@@ -1339,7 +1347,7 @@ static void kick_node_recover(void)
 	ret = inc_and_log_epoch();
 	if (ret != 0)
 		panic("cannot log current epoch %d", sys->cinfo.epoch);
-	start_recovery(main_thread_get(current_vnode_info), old, true);
+	start_recovery(main_thread_get(current_vnode_info), old, true, false);
 	put_vnode_info(old);
 }
 
diff --git a/sheep/ops.c b/sheep/ops.c
index a750884..d1eb1fa 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -670,7 +670,7 @@ static int cluster_force_recover_main(const struct sd_req *req,
 
 	vnode_info = get_vnode_info();
 	old_vnode_info = alloc_vnode_info(&nroot);
-	start_recovery(vnode_info, old_vnode_info, true);
+	start_recovery(vnode_info, old_vnode_info, true, false);
 	put_vnode_info(vnode_info);
 	put_vnode_info(old_vnode_info);
 	return ret;
@@ -817,7 +817,7 @@ static int cluster_alter_vdi_copy(const struct sd_req *req, struct sd_rsp *rsp,
 	add_vdi_state(vid, nr_copies, false, 0, block_size_shift, 0);
 
 	vinfo = get_vnode_info();
-	start_recovery(vinfo, vinfo, false);
+	start_recovery(vinfo, vinfo, false, false);
 	put_vnode_info(vinfo);
 
 	return SD_RES_SUCCESS;
@@ -1056,6 +1056,7 @@ int peer_read_obj(struct request *req)
 	iocb.offset = hdr->obj.offset;
 	iocb.ec_index = hdr->obj.ec_index;
 	iocb.copy_policy = hdr->obj.copy_policy;
+	iocb.wildcard = !!(hdr->flags & SD_FLAG_CMD_WILDCARD);
 	ret = sd_store->read(hdr->obj.oid, &iocb);
 	if (ret != SD_RES_SUCCESS)
 		goto out;
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 56328dc..bb89a84 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -42,6 +42,8 @@ struct recovery_obj_work {
 	/* local replica in the stale directory */
 	uint32_t local_epoch;
 	uint8_t local_sha1[SHA1_DIGEST_SIZE];
+
+	bool wildcard;
 };
 
 /*
@@ -76,6 +78,8 @@ struct recovery_info {
 	uint32_t max_exec_count;
 	uint64_t queue_work_interval;
 	bool throttling;
+
+	bool wildcard;
 };
 
 struct recovery_timer {
@@ -252,7 +256,7 @@ done:
  */
 static int recover_object_from(struct recovery_obj_work *row,
 			       const struct sd_node *node,
-			       uint32_t tgt_epoch)
+			       uint32_t tgt_epoch, bool wildcard)
 {
 	uint64_t oid = row->oid;
 	uint32_t local_epoch = row->local_epoch;
@@ -299,6 +303,8 @@ static int recover_object_from(struct recovery_obj_work *row,
 	sd_init_req(&hdr, SD_OP_READ_PEER);
 	hdr.epoch = epoch;
 	hdr.flags = SD_FLAG_CMD_RECOVERY;
+	if (wildcard)
+		hdr.flags |= SD_FLAG_CMD_WILDCARD;
 	hdr.data_length = rlen;
 	hdr.obj.oid = oid;
 	hdr.obj.tgt_epoch = tgt_epoch;
@@ -349,7 +355,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
 		if (invalid_node(node, row->base.cur_vinfo))
 			continue;
 
-		ret = recover_object_from(row, node, tgt_epoch);
+		ret = recover_object_from(row, node, tgt_epoch, false);
 		switch (ret) {
 		case SD_RES_SUCCESS:
 			sd_debug("recovered oid %"PRIx64" from %d to epoch %d",
@@ -378,6 +384,50 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
 	return ret;
 }
 
+static int recover_object_wildcard(struct recovery_obj_work *row,
+				   struct vnode_info *old,
+				   uint32_t tgt_epoch)
+{
+	uint64_t oid = row->oid;
+	uint32_t epoch = row->base.epoch;
+	int ret = SD_RES_SUCCESS;
+	bool fully_replicated = true;
+	struct sd_node *n;
+
+	rb_for_each_entry(n, &old->nroot, rb) {
+		sd_info("doing wildcard recovery: at epoch %u, object %"PRIx64
+			", from %s", tgt_epoch, oid, node_to_str(n));
+
+		ret = recover_object_from(row, n, tgt_epoch, true);
+
+		switch (ret) {
+		case SD_RES_SUCCESS:
+			sd_debug("recovered oid %"PRIx64" from %d to epoch %d"
+				 " (wildcard)", oid, tgt_epoch, epoch);
+			return ret;
+		case SD_RES_OLD_NODE_VER:
+			/* move to the next epoch recovery */
+			return ret;
+		case SD_RES_NO_OBJ:
+			fully_replicated = false;
+			/* fall through */
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * sheep would return a stale object when
+	 *  - all the nodes hold the copies, and
+	 *  - all the nodes are gone
+	 * at the same epoch
+	 */
+	if (fully_replicated && ret != SD_RES_SUCCESS)
+		ret = SD_RES_STALE_OBJ;
+
+	return ret;
+}
+
 /*
  * Recover the object from its track in epoch history. That is,
  * the routine will try to recovery it from the nodes it has stayed,
@@ -397,7 +447,10 @@ again:
 	sd_debug("try recover object %"PRIx64" from epoch %"PRIu32, oid,
 		 tgt_epoch);
 
-	ret = recover_object_from_replica(row, old, tgt_epoch);
+	if (row->wildcard)
+		ret = recover_object_wildcard(row, old, tgt_epoch);
+	else
+		ret = recover_object_from_replica(row, old, tgt_epoch);
 
 	switch (ret) {
 	case SD_RES_SUCCESS:
@@ -609,6 +662,7 @@ static inline void direct_queue_recovery_work(uint64_t oid)
 	struct recovery_obj_work *row;
 	row = xzalloc(sizeof(*row));
 	row->oid = oid;
+	row->wildcard = rinfo->wildcard;
 
 	rw = &row->base;
 	rw->work.fn = recover_object_work;
@@ -1148,7 +1202,7 @@ out:
 }
 
 int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
-		   bool epoch_lifted)
+		   bool epoch_lifted, bool wildcard)
 {
 	struct recovery_info *rinfo;
 
@@ -1173,6 +1227,11 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
 	rinfo->cur_vinfo = grab_vnode_info(cur_vinfo);
 	rinfo->old_vinfo = grab_vnode_info(old_vinfo);
 
+	rinfo->wildcard = wildcard;
+	if (wildcard)
+		sd_info("starting wild card recovery, objects will be searched"
+			" from all nodes");
+
 	if (!node_is_gateway_only())
 		sd_store->update_epoch(rinfo->tgt_epoch);
 
@@ -1215,6 +1274,7 @@ static void queue_recovery_work(struct recovery_info *rinfo)
 	case RW_RECOVER_OBJ:
 		row = xzalloc(sizeof(*row));
 		row->oid = rinfo->oids[rinfo->next];
+		row->wildcard = rinfo->wildcard;
 
 		rw = &row->base;
 		rw->work.fn = recover_object_work;
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 6763a83..0f01d00 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -156,6 +156,7 @@ static struct sd_option sheep_options[] = {
 	{'v', "version", false, "show the version"},
 	{'V', "vnodes", true, "set number of vnodes", vnodes_help},
 	{'w', "cache", true, "enable object cache", cache_help},
+	{'W', "wildcard-recovery", false, "wildcard recovery for first time"},
 	{'y', "myaddr", true, "specify the address advertised to other sheep",
 	 myaddr_help},
 	{'z', "zone", true,
@@ -834,6 +835,9 @@ int main(int argc, char **argv)
 				exit(1);
 			}
 			break;
+		case 'W':
+			wildcard_recovery = true;
+			break;
 		default:
 			usage(1);
 			break;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 95900ed..181869e 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -213,6 +213,7 @@ struct siocb {
 	uint32_t offset;
 	uint8_t ec_index;
 	uint8_t copy_policy;
+	uint8_t wildcard;
 };
 
 /* This structure is used to pass parameters to vdi_* functions. */
@@ -454,7 +455,8 @@ int get_obj_list(const struct sd_req *, struct sd_rsp *, void *);
 int objlist_cache_cleanup(uint32_t vid);
 void objlist_cache_format(void);
 
-int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool);
+int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool,
+		   bool);
 bool oid_in_recovery(uint64_t oid);
 bool node_in_recovery(void);
 void get_recovery_state(struct recovery_state *state);
@@ -647,4 +649,6 @@ static inline int nfs_init(const char *options)
 }
 #endif
 
+extern bool wildcard_recovery;
+
 #endif
diff --git a/sheep/store/md.c b/sheep/store/md.c
index 96635b6..e3501ed 100644
--- a/sheep/store/md.c
+++ b/sheep/store/md.c
@@ -545,7 +545,7 @@ static inline void kick_recover(void)
 	if (is_cluster_diskmode(&sys->cinfo))
 		sys->cdrv->update_node(&sys->this_node);
 	else {
-		start_recovery(vinfo, vinfo, false);
+		start_recovery(vinfo, vinfo, false, false);
 		put_vnode_info(vinfo);
 	}
 }
diff --git a/sheep/store/plain_store.c b/sheep/store/plain_store.c
index 2cc9479..4daf1c9 100644
--- a/sheep/store/plain_store.c
+++ b/sheep/store/plain_store.c
@@ -294,8 +294,9 @@ int default_read(uint64_t oid, const struct siocb *iocb)
 	 * If the request is against the older epoch, try to read from
 	 * the stale directory
 	 */
-	if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 &&
-	    iocb->epoch < sys_epoch()) {
+	if (ret == SD_RES_NO_OBJ &&
+	    (iocb->wildcard ||
+	     (0 < iocb->epoch && iocb->epoch < sys_epoch()))) {
 		get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
 		ret = default_read_from_path(oid, path, iocb);
 	}
-- 
1.9.1