[sheepdog] [PATCH v2] sheep: collie cluster recover needs to start recovery

Wed Jun 6 12:38:15 CEST 2012

Currently we can easily get into a situation where we can't read objects
after losing a node in an offline cluster and then doing a manual recovery.

To fix this call start_recovery from cluster_manual_recover.  Also move
get_vnodes_from_epoch into group.c and rename it to fit with the rest of
the vnode_info functions now that it is used outside of recovery.c.

Signed-off-by: Christoph Hellwig <hch at lst.de>

diff --git a/sheep/group.c b/sheep/group.c
index c2679f2..a83590c 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -201,7 +201,8 @@ void oid_to_vnodes(struct vnode_info *vnode_info, uint64_t oid, int nr_copies,
 	}
 }
 
-struct vnode_info *alloc_vnode_info(struct sd_node *nodes, size_t nr_nodes)
+static struct vnode_info *alloc_vnode_info(struct sd_node *nodes,
+		size_t nr_nodes)
 {
 	struct vnode_info *vnode_info;
 
@@ -218,6 +219,23 @@ struct vnode_info *alloc_vnode_info(struct sd_node *nodes, size_t nr_nodes)
 	return vnode_info;
 }
 
+struct vnode_info *get_vnode_info_epoch(uint32_t epoch)
+{
+	struct sd_node nodes[SD_MAX_NODES];
+	int nr_nodes;
+
+	nr_nodes = epoch_log_read_nr(epoch, (void *)nodes, sizeof(nodes));
+	if (nr_nodes < 0) {
+		nr_nodes = epoch_log_read_remote(epoch, (void *)nodes,
+						 sizeof(nodes));
+		if (nr_nodes == 0)
+			return NULL;
+		nr_nodes /= sizeof(nodes[0]);
+	}
+
+	return alloc_vnode_info(nodes, nr_nodes);
+}
+
 int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
 			       void *data)
 {
diff --git a/sheep/ops.c b/sheep/ops.c
index 27ca07d..89870a1 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -420,6 +420,7 @@ static int local_get_epoch(struct request *req)
 static int cluster_manual_recover(const struct sd_req *req, struct sd_rsp *rsp,
 				void *data)
 {
+	struct vnode_info *old_vnode_info, *vnode_info;
 	int ret = SD_RES_SUCCESS;
 	uint8_t c;
 	uint16_t f;
@@ -429,21 +430,25 @@ static int cluster_manual_recover(const struct sd_req *req, struct sd_rsp *rsp,
 	 * 2) some nodes are physically down (same epoch condition).
 	 * In both case, the nodes(s) stat is WAIT_FOR_JOIN.
 	 */
-	if (!sys_stat_wait_join()) {
-		ret = SD_RES_MANUAL_RECOVER;
-		goto out;
-	}
+	if (!sys_stat_wait_join())
+		return SD_RES_MANUAL_RECOVER;
 
 	ret = get_cluster_copies(&c);
 	if (ret)
-		goto out;
+		return ret;
 	ret = get_cluster_flags(&f);
 	if (ret)
-		goto out;
+		return ret;
 
 	sys->nr_copies = c;
 	sys->flags = f;
 
+	old_vnode_info = get_vnode_info_epoch(sys->epoch);
+	if (!old_vnode_info) {
+		eprintf("cannot get vnode info for epoch %d\n", sys->epoch);
+		return SD_RES_EIO;
+	}
+
 	sys->epoch++; /* some nodes are left, so we get a new epoch */
 	ret = log_current_epoch();
 	if (ret) {
@@ -456,7 +461,12 @@ static int cluster_manual_recover(const struct sd_req *req, struct sd_rsp *rsp,
 		sys_stat_set(SD_STATUS_OK);
 	else
 		sys_stat_set(SD_STATUS_HALT);
+
+	vnode_info = get_vnode_info();
+	start_recovery(vnode_info, old_vnode_info);
+	put_vnode_info(vnode_info);
 out:
+	put_vnode_info(old_vnode_info);
 	return ret;
 }
 
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 9b715ea..64309df 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -55,23 +55,6 @@ static int obj_cmp(const void *oid1, const void *oid2)
 	return 0;
 }
 
-static struct vnode_info *get_vnodes_from_epoch(uint32_t epoch)
-{
-	struct sd_node nodes[SD_MAX_NODES];
-	int nr_nodes;
-
-	nr_nodes = epoch_log_read_nr(epoch, (void *)nodes, sizeof(nodes));
-	if (nr_nodes < 0) {
-		nr_nodes = epoch_log_read_remote(epoch, (void *)nodes,
-						 sizeof(nodes));
-		if (nr_nodes == 0)
-			return NULL;
-		nr_nodes /= sizeof(nodes[0]);
-	}
-
-	return alloc_vnode_info(nodes, nr_nodes);
-}
-
 static int recover_object_from_replica(uint64_t oid,
 				       struct sd_vnode *entry,
 				       uint32_t epoch, uint32_t tgt_epoch)
@@ -225,7 +208,7 @@ again:
 			goto err;
 		}
 
-		new_old = get_vnodes_from_epoch(tgt_epoch);
+		new_old = get_vnode_info_epoch(tgt_epoch);
 		if (!new_old) {
 			ret = -1;
 			goto err;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 60432c7..45d3852 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -232,10 +232,10 @@ int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
 		void *data);
 
 bool have_enough_zones(void);
-struct vnode_info *alloc_vnode_info(struct sd_node *nodes, size_t nr_nodes);
 struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
 struct vnode_info *get_vnode_info(void);
 void put_vnode_info(struct vnode_info *vnodes);
+struct vnode_info *get_vnode_info_epoch(uint32_t epoch);
 
 struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
 		int copy_idx);