[sheepdog] [PATCH v4 5/6] sheep: cache vnode_info when doing recovery

Tue May 20 11:33:36 CEST 2014

On Tue, May 20, 2014 at 04:40:33PM +0800, Robin Dong wrote:
> From: Robin Dong <sanbai at taobao.com>
> 
> When sheepdog doing recovery in same low-performance machines, the CPU is
> very high. After using perf tools to check the hot point of performance in
> sheep daemon, we find out that the "alloc_vnode_info()" function cost lots
> of CPU circyles because the rollback_vnode_info() rebuilds the vnode_info
> by calling alloc_vnode_info() too frequently.
> 
> The solution is to cache result of alloc_vnode_info() for specific 'epoch'
> and 'nr_nodes' in the recovery context.
> 
> Signed-off-by: Robin Dong <sanbai at taobao.com>
> ---
>  sheep/group.c      | 12 ++++++++++++
>  sheep/recovery.c   | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------
>  sheep/sheep_priv.h |  4 ++++
>  3 files changed, 64 insertions(+), 6 deletions(-)
> 
> diff --git a/sheep/group.c b/sheep/group.c
> index 63e9ab9..360771d 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -175,6 +175,18 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
>  	return alloc_vnode_info(&nroot);
>  }
>  
> +int get_nodes_epoch(uint32_t epoch, struct vnode_info *cur_vinfo,
> +		    struct sd_node *nodes, int len)
> +{
> +	int nr_nodes;
> +
> +	nr_nodes = epoch_log_read(epoch, nodes, len);
> +	if (nr_nodes < 0)
> +		nr_nodes = epoch_log_read_remote(epoch, nodes, len,
> +						 NULL, cur_vinfo);
> +	return nr_nodes;
> +}
> +
>  int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
>  			void *data)
>  {
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index 27eb6c8..75e4b93 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -17,6 +17,7 @@ struct recovery_work {
>  	uint32_t epoch;
>  	uint32_t tgt_epoch;
>  
> +	struct recovery_info *rinfo;
>  	struct vnode_info *old_vinfo;
>  	struct vnode_info *cur_vinfo;
>  
> @@ -71,6 +72,10 @@ struct recovery_info {
>  
>  	struct vnode_info *old_vinfo;
>  	struct vnode_info *cur_vinfo;
> +
> +	int max_epoch;
> +	struct vnode_info **vinfo_array;
> +	struct sd_mutex vinfo_lock;
>  };
>  
>  static struct recovery_info *next_rinfo;
> @@ -95,23 +100,44 @@ static inline bool node_is_gateway_only(void)
>  	return sys->this_node.nr_vnodes == 0;
>  }
>  
> +static inline int vinfo_idx(uint32_t epoch, int nr_nodes)
> +{
> +	return epoch * SD_MAX_NODES + nr_nodes;
> +}
> +
>  static struct vnode_info *rollback_vnode_info(uint32_t *epoch,
> +					      struct recovery_info *rinfo,
>  					      struct vnode_info *cur)
>  {
> -	struct vnode_info *vinfo;
> +	struct sd_node nodes[SD_MAX_NODES];
> +	int nr_nodes, idx;
> +	struct rb_root nroot = RB_ROOT;
> +
>  rollback:
>  	*epoch -= 1;
>  	if (*epoch < last_gathered_epoch)
>  		return NULL;
>  
> -	vinfo = get_vnode_info_epoch(*epoch, cur);
> -	if (!vinfo) {
> +	nr_nodes = get_nodes_epoch(*epoch, cur, nodes, sizeof(nodes));
> +	if (!nr_nodes) {
>  		/* We rollback in case we don't get a valid epoch */
>  		sd_alert("cannot get epoch %d", *epoch);
>  		sd_alert("clients may see old data");
>  		goto rollback;
>  	}
> -	return vinfo;
> +	idx = vinfo_idx(*epoch, nr_nodes);
> +	/* double check */
> +	if (rinfo->vinfo_array[idx] == NULL) {
> +		sd_mutex_lock(&rinfo->vinfo_lock);
> +		if (rinfo->vinfo_array[idx] == NULL) {
> +			for (int i = 0; i < nr_nodes; i++)
> +				rb_insert(&nroot, &nodes[i], rb, node_cmp);
> +			rinfo->vinfo_array[idx] = alloc_vnode_info(&nroot);
> +		}
> +		sd_mutex_unlock(&rinfo->vinfo_lock);
> +	}
> +	refcount_inc(&(rinfo->vinfo_array[idx]->refcnt));
> +	return rinfo->vinfo_array[idx];
>  }
>  
>  /*
> @@ -202,7 +228,8 @@ again:
>  		break;
>  	default:
>  rollback:
> -		new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
> +		new_old = rollback_vnode_info(&tgt_epoch, rw->rinfo,
> +					      rw->cur_vinfo);
>  		if (!new_old) {
>  			sd_err("can not read %"PRIx64" idx %d", oid, idx);
>  			free(buf);
> @@ -385,7 +412,8 @@ again:
>  		/* fall through */
>  	default:
>  		/* No luck, roll back to an older configuration and try again */
> -		new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
> +		new_old = rollback_vnode_info(&tgt_epoch, rw->rinfo,
> +					      rw->cur_vinfo);
>  		if (!new_old) {
>  			sd_err("can not recover oid %"PRIx64, oid);
>  			ret = -1;
> @@ -669,10 +697,19 @@ static void free_recovery_obj_work(struct recovery_obj_work *row)
>  
>  static void free_recovery_info(struct recovery_info *rinfo)
>  {
> +	int idx;
> +
>  	put_vnode_info(rinfo->cur_vinfo);
>  	put_vnode_info(rinfo->old_vinfo);
>  	free(rinfo->oids);
>  	free(rinfo->prio_oids);
> +	for (int i = 0; i < rinfo->max_epoch; i++)
> +		for (int j = 0; j < SD_MAX_NODES; j++) {

For node as vnode unit, SD_MAX_NODES is very large (>6000), so this loop will
burn cpu, no?  I think this line will degrade the performance a bit.

Thanks
Yuan