[sheepdog] [PATCH] Revert "sheep: forbid revival of orphan objects"

Hitoshi Mitake mitake.hitoshi at gmail.com
Sat Feb 28 13:15:56 CET 2015


At Sat, 28 Feb 2015 18:42:56 +0800,
Liu Yuan wrote:
> 
> From: Liu Yuan <liuyuan at cmss.chinamobile.com>
> 
> The reverted patch actually couldn't pass our 061, which means that we will lose
> data in the following case:
> 
> 0 A cluster with 3 copies
> 1 3 nodes of the cluster failed at the same time
> 2 after sometime, the cluster is recovered. As we can expect, some poor objects
>   that happened to have all its copies dwell on 3 failed nodes, will not be
>   recovered. Instead, all the copies are in those failed nodes. We DON'T lose
>   them yet.
> 3 we start those 3 nodes to expect recover the previously unrecovered objects.
>   Unfortunately, the reverted patch will mark all 3 nodes as excluded and not
>   try to recover the objects at all. So we LOSE the objects permanentally after
>   a new epoch is reached, because a cleanup operation across the cluster will be
>   triggered and those good objects are cleaned up.
> 
> P.S., Merge the patch not passing our test framework is dangerouse!
> 
> Cc: Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
> Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
> ---
>  sheep/group.c      | 20 ++++++--------------
>  sheep/md.c         |  2 +-
>  sheep/ops.c        |  4 ++--
>  sheep/recovery.c   | 47 +----------------------------------------------
>  sheep/sheep_priv.h |  3 +--
>  5 files changed, 11 insertions(+), 65 deletions(-)

Applied, thanks.
Hitoshi

> 
> diff --git a/sheep/group.c b/sheep/group.c
> index 5ab2c0c..0fce7c0 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -868,7 +868,7 @@ static bool membership_changed(const struct cluster_info *cinfo,
>  static void update_cluster_info(const struct cluster_info *cinfo,
>  				const struct sd_node *joined,
>  				const struct rb_root *nroot,
> -				size_t nr_nodes, enum sd_status prev_status)
> +				size_t nr_nodes)
>  {
>  	struct vnode_info *old_vnode_info;
>  
> @@ -920,8 +920,6 @@ static void update_cluster_info(const struct cluster_info *cinfo,
>  
>  		if (membership_changed(cinfo, nroot, nr_nodes)) {
>  			int ret;
> -			struct sd_node *excluded = NULL;
> -
>  			if (old_vnode_info)
>  				put_vnode_info(old_vnode_info);
>  
> @@ -931,17 +929,12 @@ static void update_cluster_info(const struct cluster_info *cinfo,
>  				panic("cannot log current epoch %d",
>  				      sys->cinfo.epoch);
>  
> -			if (prev_status == SD_STATUS_OK ||
> -			    (prev_status == SD_STATUS_WAIT &&
> -			     !node_cmp(joined, &sys->this_node)))
> -				excluded = (struct sd_node *)joined;
> -
>  			start_recovery(main_thread_get(current_vnode_info),
> -				       old_vnode_info, true, excluded);
> +				       old_vnode_info, true);
>  		} else if (!was_cluster_shutdowned()) {
>  			start_recovery(main_thread_get(current_vnode_info),
>  				       main_thread_get(current_vnode_info),
> -				       false, NULL);
> +				       false);
>  		}
>  		set_cluster_shutdown(false);
>  	}
> @@ -1169,7 +1162,6 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
>  {
>  	const struct cluster_info *cinfo = opaque;
>  	struct sd_node *n;
> -	enum sd_status prev_status = sys->cinfo.status;
>  	uint16_t flags;
>  
>  	if (node_is_local(joined) && sys->gateway_only
> @@ -1196,7 +1188,7 @@ main_fn void sd_accept_handler(const struct sd_node *joined,
>  	if (sys->cinfo.status == SD_STATUS_SHUTDOWN)
>  		return;
>  
> -	update_cluster_info(cinfo, joined, nroot, nr_nodes, prev_status);
> +	update_cluster_info(cinfo, joined, nroot, nr_nodes);
>  
>  	if (node_is_local(joined)) {
>   		/* this output is used for testing */
> @@ -1259,7 +1251,7 @@ main_fn void sd_leave_handler(const struct sd_node *left,
>  		if (ret != 0)
>  			panic("cannot log current epoch %d", sys->cinfo.epoch);
>  		start_recovery(main_thread_get(current_vnode_info),
> -			       old_vnode_info, true, NULL);
> +			       old_vnode_info, true);
>  	}
>  
>  	put_vnode_info(old_vnode_info);
> @@ -1303,7 +1295,7 @@ static void kick_node_recover(void)
>  	ret = inc_and_log_epoch();
>  	if (ret != 0)
>  		panic("cannot log current epoch %d", sys->cinfo.epoch);
> -	start_recovery(main_thread_get(current_vnode_info), old, true, NULL);
> +	start_recovery(main_thread_get(current_vnode_info), old, true);
>  	put_vnode_info(old);
>  }
>  
> diff --git a/sheep/md.c b/sheep/md.c
> index 4f0cd49..c00d7a5 100644
> --- a/sheep/md.c
> +++ b/sheep/md.c
> @@ -531,7 +531,7 @@ static inline void kick_recover(void)
>  	if (is_cluster_diskmode(&sys->cinfo))
>  		sys->cdrv->update_node(&sys->this_node);
>  	else {
> -		start_recovery(vinfo, vinfo, false, NULL);
> +		start_recovery(vinfo, vinfo, false);
>  		put_vnode_info(vinfo);
>  	}
>  }
> diff --git a/sheep/ops.c b/sheep/ops.c
> index cba5573..8084011 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -667,7 +667,7 @@ static int cluster_force_recover_main(const struct sd_req *req,
>  
>  	vnode_info = get_vnode_info();
>  	old_vnode_info = alloc_vnode_info(&nroot);
> -	start_recovery(vnode_info, old_vnode_info, true, NULL);
> +	start_recovery(vnode_info, old_vnode_info, true);
>  	put_vnode_info(vnode_info);
>  	put_vnode_info(old_vnode_info);
>  	return ret;
> @@ -814,7 +814,7 @@ static int cluster_alter_vdi_copy(const struct sd_req *req, struct sd_rsp *rsp,
>  	add_vdi_state(vid, nr_copies, false, 0, block_size_shift, 0);
>  
>  	vinfo = get_vnode_info();
> -	start_recovery(vinfo, vinfo, false, NULL);
> +	start_recovery(vinfo, vinfo, false);
>  	put_vnode_info(vinfo);
>  
>  	return SD_RES_SUCCESS;
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index adbf5ba..8d94d4f 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -73,9 +73,6 @@ struct recovery_info {
>  	int max_epoch;
>  	struct vnode_info **vinfo_array;
>  	struct sd_mutex vinfo_lock;
> -
> -	struct sd_node *excluded;
> -
>  	uint32_t max_exec_count;
>  	uint64_t queue_work_interval;
>  	bool throttling;
> @@ -197,8 +194,6 @@ static void *read_erasure_object(uint64_t oid, uint8_t idx,
>  	uint8_t policy = get_vdi_copy_policy(oid_to_vid(oid));
>  	int edp = ec_policy_to_dp(policy, NULL, NULL);
>  	int ret;
> -	struct sd_node *excluded = row->base.rinfo->excluded;
> -
>  again:
>  	if (unlikely(old->nr_zones < edp)) {
>  		if (search_erasure_object(oid, idx, &old->nroot, rw,
> @@ -213,11 +208,6 @@ again:
>  		 oid, epoch, tgt_epoch, idx, node_to_str(node));
>  	if (invalid_node(node, rw->cur_vinfo))
>  		goto rollback;
> -	if (excluded && !node_cmp(excluded, node)) {
> -		sd_debug("skipping reading object from %s", node_to_str(node));
> -		goto rollback;
> -	}
> -
>  	sd_init_req(&hdr, SD_OP_READ_PEER);
>  	hdr.epoch = epoch;
>  	hdr.flags = SD_FLAG_CMD_RECOVERY;
> @@ -332,7 +322,6 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
>  	uint32_t epoch = row->base.epoch;
>  	int nr_copies, ret = SD_RES_SUCCESS, start = 0;
>  	bool fully_replicated = true;
> -	struct sd_node *excluded = row->base.rinfo->excluded;
>  
>  	nr_copies = get_obj_copy_number(oid, old->nr_zones);
>  
> @@ -358,12 +347,6 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
>  		if (invalid_node(node, row->base.cur_vinfo))
>  			continue;
>  
> -		if (excluded && !node_cmp(excluded, node)) {
> -			sd_debug("skipping reading from %s because it is"
> -				 " excluded", node_to_str(node));
> -			continue;
> -		}
> -
>  		ret = recover_object_from(row, node, tgt_epoch);
>  		switch (ret) {
>  		case SD_RES_SUCCESS:
> @@ -575,20 +558,12 @@ static void recover_object_work(struct work *work)
>  	uint64_t oid = row->oid;
>  	struct vnode_info *cur = rw->cur_vinfo;
>  	int ret, epoch;
> -	struct sd_node *excluded = rw->rinfo->excluded;
>  
>  	if (sd_store->exist(oid, local_ec_index(cur, oid))) {
>  		sd_debug("the object is already recovered");
>  		return;
>  	}
>  
> -	if (excluded && !node_cmp(excluded, &sys->this_node)) {
> -		sd_debug("skipping recovery from stale dir because"
> -			 " I'm newly joining node (%s)",
> -			 node_to_str(&sys->this_node));
> -		goto remote_recover;
> -	}
> -
>  	/* find object in the stale directory */
>  	if (!is_erasure_oid(oid))
>  		for (epoch = sys_epoch() - 1; epoch >= last_gathered_epoch;
> @@ -602,7 +577,6 @@ static void recover_object_work(struct work *work)
>  			}
>  		}
>  
> -remote_recover:
>  	ret = do_recover_object(row);
>  	if (ret != 0)
>  		sd_err("failed to recover object %"PRIx64, oid);
> @@ -1133,7 +1107,6 @@ static void prepare_object_list(struct work *work)
>  	int start = random() % nr_nodes, i, end = nr_nodes;
>  	uint64_t *oids;
>  	struct sd_node *nodes;
> -	struct sd_node *excluded = rw->rinfo->excluded;
>  
>  	if (node_is_gateway_only())
>  		return;
> @@ -1154,13 +1127,6 @@ again:
>  			goto out;
>  		}
>  
> -		if (excluded && !node_cmp(excluded, node)) {
> -			sd_info("skipping object list reading from %s because"
> -				"it is marked as excluded node",
> -				node_to_str(excluded));
> -			continue;
> -		}
> -
>  		oids = fetch_object_list(node, rw->epoch, &nr_oids);
>  		if (!oids)
>  			continue;
> @@ -1179,17 +1145,8 @@ out:
>  	free(nodes);
>  }
>  
> -/*
> - * notes on the parameter excluded of start_recovery():
> - * When a status of a cluster is SD_STATUS_OK and a new node is joining to the
> - * cluster, the recovery process should avoid reading objects from the newly
> - * joining node. Because the newly joining node can have orphan objects.
> - * If the cluster status is already SD_STATUS_OK, the cluster must have every
> - * replica, so the newly joining node shouldn't provide any objects for the
> - * recovery process.
> - */
>  int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
> -		   bool epoch_lifted, struct sd_node *excluded)
> +		   bool epoch_lifted)
>  {
>  	struct recovery_info *rinfo;
>  
> @@ -1214,8 +1171,6 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
>  	rinfo->cur_vinfo = grab_vnode_info(cur_vinfo);
>  	rinfo->old_vinfo = grab_vnode_info(old_vinfo);
>  
> -	rinfo->excluded = excluded;
> -
>  	if (!node_is_gateway_only())
>  		sd_store->update_epoch(rinfo->tgt_epoch);
>  
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 85af7a0..26afa89 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -428,8 +428,7 @@ int init_config_file(void);
>  int get_obj_list(const struct sd_req *, struct sd_rsp *, void *);
>  int objlist_cache_cleanup(uint32_t vid);
>  
> -int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool,
> -		   struct sd_node *);
> +int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *, bool);
>  bool oid_in_recovery(uint64_t oid);
>  bool node_in_recovery(void);
>  void get_recovery_state(struct recovery_state *state);
> -- 
> 1.9.1
> 
> -- 
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> https://lists.wpkg.org/mailman/listinfo/sheepdog



More information about the sheepdog mailing list