[Sheepdog] [PATCH 1/2] sheep: optimize sheep recovery logic
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Thu Nov 24 18:04:56 CET 2011
At Thu, 24 Nov 2011 20:03:17 +0800,
Liu Yuan wrote:
>
> From: Liu Yuan <tailai.ly at taobao.com>
>
> We don't need to iterate from epoch 1 to hdr->tgt_epoch, since once the
> node has recovered from a view (membership) change, the current epoch's
> objects carry all the object information needed for subsequent view changes.
>
> prev_rw_epoch is needed because we have to handle the situation below:
>
> init: node A, B, C.
>
> then D, E joined the cluster.
>
>                t
> epoch  1   2   3
>        A   A   A
>        B   B   B
>        C   C   C
>            D   D
>                E
>
> At time t:
> Since nodes now recover in parallel, A might have fully recovered while
> B and C have not.
>
>      prev_rw_epoch   recovered_epoch
> A    1               3
> B    1               1
> C    1               1
>
> Then B and C need to iterate from prev_rw_epoch to hdr->tgt_epoch, instead
> of from recovered_epoch to hdr->tgt_epoch, to get the needed object list
> information.
This is not correct. Note that new nodes can be added before recovery
has finished on all nodes.
For example:
1. There is only one node A at epoch 1. Node A has one object 'obj'.
     prev_rw_epoch   recovered_epoch   epoch
A    -               -                 1
2. Node B joins, and the node storing 'obj' changes to node B. Node A
finishes recovery, but node B has not yet.
     prev_rw_epoch   recovered_epoch   epoch
A    -               2                 2
B    -               -                 2
3. Node C joins. Node A soon finishes recovery at epoch 3, but node B has
not yet finished recovery at epoch 2.
     prev_rw_epoch   recovered_epoch   epoch
A    2               3                 3
B    -               -                 3
C    -               -                 3
4. Node B needs to read 'obj' from node A at epoch 1, so we cannot
remove epoch 1 on node A even though it is less than 'prev_rw_epoch'.
If 'prev_rw_epoch' is the minimum of 'recovered_epoch' over all nodes,
we can safely remove all epochs before 'prev_rw_epoch', because those
epochs are no longer needed by any node.
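
A minimal sketch of that idea in C (illustrative only; the helper name and
the assumption that each node's recovered_epoch has already been collected
are mine, not part of the posted patch):

#include <stdint.h>

/*
 * Sketch: 'recovered' is assumed to hold the recovered_epoch reported by
 * each of the nr_nodes nodes currently in the cluster; how those values
 * are gathered over the network is left out here.
 */
static uint32_t min_recovered_epoch(const uint32_t *recovered, int nr_nodes)
{
	uint32_t min_epoch = UINT32_MAX;
	int i;

	for (i = 0; i < nr_nodes; i++)
		if (recovered[i] < min_epoch)
			min_epoch = recovered[i];

	/*
	 * Epochs older than min_epoch are needed by no node, so their
	 * object directories could be removed safely.
	 */
	return min_epoch;
}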
Thanks,
Kazutaka
>
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> ---
> sheep/group.c | 2 ++
> sheep/ops.c | 1 +
> sheep/sheep_priv.h | 1 +
> sheep/store.c | 17 +++++++++++------
> 4 files changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/sheep/group.c b/sheep/group.c
> index 31d1f76..799e758 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -575,6 +575,8 @@ join_finished:
> set_cluster_copies(sys->nr_sobjs);
> set_cluster_flags(sys->flags);
> set_cluster_ctime(msg->ctime);
> + sys->prev_rw_epoch = sys->epoch;
> + sys->recovered_epoch = sys->epoch;
> }
> }
>
> diff --git a/sheep/ops.c b/sheep/ops.c
> index fd836c1..7ff6052 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -147,6 +147,7 @@ static int cluster_make_fs(const struct sd_req *req, struct sd_rsp *rsp,
>
> sys->epoch = 1;
> sys->recovered_epoch = 1;
> + sys->prev_rw_epoch = 1;
>
> ret = update_epoch_log(sys->epoch);
> if (ret)
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 618267e..dbc0328 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -146,6 +146,7 @@ struct cluster_info {
> unsigned int outstanding_data_size;
>
> uint32_t recovered_epoch;
> + uint32_t prev_rw_epoch;
>
> int use_directio;
>
> diff --git a/sheep/store.c b/sheep/store.c
> index d4c3f27..3f0b053 100644
> --- a/sheep/store.c
> +++ b/sheep/store.c
> @@ -106,7 +106,7 @@ int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *d
> DIR *dir;
> struct dirent *d;
> uint64_t oid;
> - uint32_t epoch;
> + uint32_t epoch, from, to;
> char path[1024];
> uint64_t *p = (uint64_t *)data;
> int nr = 0;
> @@ -126,7 +126,11 @@ int get_obj_list(const struct sd_list_req *hdr, struct sd_list_rsp *rsp, void *d
> }
>
> objlist = (uint64_t *)buf;
> - for (epoch = 1; epoch <= hdr->tgt_epoch; epoch++) {
> + dprintf("%d, %d, %d, %d\n", sys->prev_rw_epoch, sys->recovered_epoch,
> + hdr->tgt_epoch, sys->epoch);
> + from = sys->prev_rw_epoch;
> + to = sys->prev_rw_epoch > hdr->tgt_epoch ? sys->prev_rw_epoch : hdr->tgt_epoch;
> + for (epoch = from; epoch <= to; epoch++) {
> snprintf(path, sizeof(path), "%s%08u/", obj_path, epoch);
>
> dprintf("%"PRIu32", %s\n", sys->this_node.port, path);
> @@ -1527,6 +1531,7 @@ static void recover_done(struct work *work, int idx)
> {
> struct recovery_work *rw = container_of(work, struct recovery_work, work);
> uint64_t oid;
> + uint32_t rw_epoch = rw->epoch;
>
> if (rw->state == RW_INIT)
> rw->state = RW_RUN;
> @@ -1558,11 +1563,7 @@ static void recover_done(struct work *work, int idx)
> queue_work(sys->recovery_wqueue, &rw->work);
> return;
> }
> -
> - dprintf("recovery complete: new epoch %"PRIu32"\n", rw->epoch);
> recovering_work = NULL;
> -
> - sys->recovered_epoch = rw->epoch;
> resume_pending_requests();
>
> free(rw->oids);
> @@ -1574,6 +1575,10 @@ static void recover_done(struct work *work, int idx)
>
> recovering_work = rw;
> queue_work(sys->recovery_wqueue, &rw->work);
> + } else {
> + sys->prev_rw_epoch = sys->recovered_epoch;
> + sys->recovered_epoch = rw_epoch;
> + dprintf("recovery complete: new epoch %"PRIu32"\n", rw_epoch);
> }
> }
>
> --
> 1.7.8.rc3
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog