[sheepdog] [PATCH v3] optimize epoch_log structure to reduce network and memory

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Thu Jul 31 07:03:09 CEST 2014


At Thu, 31 Jul 2014 11:58:52 +0800,
Liu Yuan wrote:
> 
> On Wed, Jul 30, 2014 at 02:26:20PM +0800, Ruoyu wrote:
> > Current epoch_log contains a long nodes array to sync nodes and
> > epoch in the cluster. It is simple, but there is a potential
> > performance issue because each epoch log occupies nearly 500
> > KBytes. If the cluster members change frequently, epoch is lifted
> > frequently, more and more unused data is transfered between
> > client and server. If we don't find a way, the performance will
> > go from bad to worse.
> > 
> > Although the max node number is 6144, we only use a few of them.
> > Therefore, the first solution is using a zero-length array,
> > client (dog) and server (sheep) will negotiate an appropriate
> > supported node number. This way will spend much less memory and
> > will run much faster than before.
> > 
> > v3:
> >  - epoch_log_read series functions are changed to propagate error
> >    to upper layer.
> > 
> > v2:
> >  - internal data structure is changed.
> 
> You need put change log under Signed-off-by line
> 
> > Signed-off-by: Ruoyu <liangry at ucweb.com>
> > --- 
> <-- Here we put change log for a single patch, otherwise put it in the
> introduction patch (git format-patch --cover-letter xxx).
> 
> >
> >  dog/cluster.c            | 40 +++++++++++++++++++++++----------
> >  dog/vdi.c                | 37 +++++++++++++++++++++++--------
> >  include/internal_proto.h |  2 +-
> >  include/sheepdog_proto.h |  1 +
> >  sheep/group.c            | 57 +++++++++++++++++++++++++-----------------------
> >  sheep/ops.c              | 55 +++++++++++++++++++++++++++-------------------
> >  sheep/sheep_priv.h       | 10 +++++----
> >  sheep/store.c            | 25 ++++++++++++---------
> >  8 files changed, 143 insertions(+), 84 deletions(-)
> > 
> > diff --git a/dog/cluster.c b/dog/cluster.c
> > index 188d4f4..508b65a 100644
> > --- a/dog/cluster.c
> > +++ b/dog/cluster.c
> > @@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
> >  	return EXIT_SUCCESS;
> >  }
> >  
> > -static void print_nodes(const struct epoch_log *logs, int epoch)
> > +static void print_nodes(const struct epoch_log *logs, uint16_t flags)
> >  {
> >  	int i, nr_disk;
> >  	const struct sd_node *entry;
> >  
> > -	for (i = 0; i < logs[epoch].nr_nodes; i++) {
> > -		entry = logs[epoch].nodes + i;
> > -		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
> > +	for (i = 0; i < logs->nr_nodes; i++) {
> > +		entry = logs->nodes + i;
> > +		if (flags & SD_CLUSTER_FLAG_DISKMODE) {
> >  			for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
> >  				if (entry->disks[nr_disk].disk_id == 0)
> >  					break;
> > @@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv)
> >  	int i, ret;
> >  	struct sd_req hdr;
> >  	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> > -	struct epoch_log *logs;
> > +	struct epoch_log *logs, *log;
> > +	char *next_log;
> >  	int nr_logs, log_length;
> >  	time_t ti, ct;
> >  	struct tm tm;
> >  	char time_str[128];
> > +	uint32_t support_nodes;
> >  
> > -	log_length = sd_epoch * sizeof(struct epoch_log);
> > +#define DEFAULT_SUPPORT_NODES 32
> 
> It is used by mutiple files, so define it in .h file. By the way, why not use
> sd_nodes_nr directly?
> 
> > +	support_nodes = DEFAULT_SUPPORT_NODES;
> > +	log_length = sd_epoch * (sizeof(struct epoch_log)
> > +			+ support_nodes * sizeof(struct sd_node));
> >  	logs = xmalloc(log_length);
> >  
> > +retry:
> >  	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
> >  	hdr.data_length = log_length;
> > +	hdr.cluster.support_nodes = support_nodes;
> >  
> >  	ret = dog_exec_req(&sd_nid, &hdr, logs);
> >  	if (ret < 0)
> >  		goto error;
> > +	if (rsp->result == SD_RES_BUFFER_SMALL) {
> > +		support_nodes *= 2;
> > +		log_length = sd_epoch * (sizeof(struct epoch_log)
> > +				+ support_nodes * sizeof(struct sd_node));
> > +		logs = xrealloc(logs, log_length);
> > +		goto retry;
> > +	}
> >  
> >  	/* show cluster status */
> >  	if (!raw_output)
> > @@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv)
> >  		printf("Epoch Time           Version\n");
> >  	}
> >  
> > -	nr_logs = rsp->data_length / sizeof(struct epoch_log);
> > +	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> > +			+ support_nodes * sizeof(struct sd_node));
> > +	next_log = (char *)logs;
> >  	for (i = 0; i < nr_logs; i++) {
> > -
> > -		ti = logs[i].time;
> > +		log = (struct epoch_log *)next_log;
> > +		ti = log->time;
> >  		if (raw_output) {
> >  			snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti);
> >  		} else {
> > @@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv)
> >  			strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm);
> >  		}
> >  
> > -		printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch);
> > +		printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
> >  		printf(" [");
> > -		print_nodes(logs, i);
> > +		print_nodes(log, logs->flags);
> >  		printf("]\n");
> > +		next_log = (char *)log->nodes
> > +				+ support_nodes * sizeof(struct sd_node);
> >  	}
> >  
> >  	free(logs);
> > diff --git a/dog/vdi.c b/dog/vdi.c
> > index 2e3f7b3..6f0b748 100644
> > --- a/dog/vdi.c
> > +++ b/dog/vdi.c
> > @@ -1096,47 +1096,64 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
> >  	struct sd_req hdr;
> >  	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> >  	const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
> > -	struct epoch_log *logs;
> > +	struct epoch_log *logs, *log;
> > +	char *next_log;
> >  	int nr_logs, log_length;
> > +	uint32_t support_nodes;
> >  
> > -	log_length = sd_epoch * sizeof(struct epoch_log);
> > +#define DEFAULT_SUPPORT_NODES 32
> > +	support_nodes = DEFAULT_SUPPORT_NODES;
> > +	log_length = sd_epoch * (sizeof(struct epoch_log)
> > +			+ support_nodes * sizeof(struct sd_node));
> >  	logs = xmalloc(log_length);
> >  
> > +retry:
> >  	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
> >  	hdr.data_length = log_length;
> > +	hdr.cluster.support_nodes = support_nodes;
> >  
> >  	ret = dog_exec_req(&sd_nid, &hdr, logs);
> >  	if (ret < 0)
> >  		goto error;
> >  
> > +	if (rsp->result == SD_RES_BUFFER_SMALL) {
> > +		support_nodes *= 2;
> > +		log_length = sd_epoch * (sizeof(struct epoch_log)
> > +				+ support_nodes * sizeof(struct sd_node));
> > +		logs = xrealloc(logs, log_length);
> > +		goto retry;
> > +	}
> >  	if (rsp->result != SD_RES_SUCCESS) {
> >  		printf("%s\n", sd_strerror(rsp->result));
> >  		goto error;
> >  	}
> >  
> > -	nr_logs = rsp->data_length / sizeof(struct epoch_log);
> > +	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> > +			+ support_nodes * sizeof(struct sd_node));
> > +	next_log = (char *)logs;
> >  	for (i = nr_logs - 1; i >= 0; i--) {
> >  		struct rb_root vroot = RB_ROOT;
> >  		struct rb_root nroot = RB_ROOT;
> >  
> > +		log = (struct epoch_log *)next_log;
> >  		printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
> > -		       oid, logs[i].epoch, nr_copies);
> > +		       oid, log->epoch, nr_copies);
> >  		printf("---------------------------------------------------\n");
> >  
> >  		/*
> >  		 * When # of nodes is less than nr_copies, we only print
> >  		 * remaining nodes that holds all the remaining copies.
> >  		 */
> > -		if (logs[i].nr_nodes < nr_copies) {
> > -			for (j = 0; j < logs[i].nr_nodes; j++) {
> > -				const struct node_id *n = &logs[i].nodes[j].nid;
> > +		if (log->nr_nodes < nr_copies) {
> > +			for (j = 0; j < log->nr_nodes; j++) {
> > +				const struct node_id *n = &log->nodes[j].nid;
> >  
> >  				printf("%s\n", addr_to_str(n->addr, n->port));
> >  			}
> >  			continue;
> >  		}
> > -		for (int k = 0; k < logs[i].nr_nodes; k++)
> > -			rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
> > +		for (int k = 0; k < log->nr_nodes; k++)
> > +			rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
> >  		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
> >  			disks_to_vnodes(&nroot, &vroot);
> >  		else
> > @@ -1148,6 +1165,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
> >  			printf("%s\n", addr_to_str(n->addr, n->port));
> >  		}
> >  		rb_destroy(&vroot, struct sd_vnode, rb);
> > +		next_log = (char *)log->nodes
> > +				+ support_nodes * sizeof(struct sd_node);
> >  	}
> >  
> >  	free(logs);
> > diff --git a/include/internal_proto.h b/include/internal_proto.h
> > index 37afb46..d61b5a5 100644
> > --- a/include/internal_proto.h
> > +++ b/include/internal_proto.h
> > @@ -221,7 +221,7 @@ struct epoch_log {
> >  	uint8_t  __pad[3];
> >  	uint16_t flags;
> >  	char drv_name[STORE_LEN];
> > -	struct sd_node nodes[SD_MAX_NODES];
> > +	struct sd_node nodes[0];
> >  };
> >  
> >  struct vdi_op_message {
> > diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> > index 7cfdccb..8b26a8b 100644
> > --- a/include/sheepdog_proto.h
> > +++ b/include/sheepdog_proto.h
> > @@ -165,6 +165,7 @@ struct sd_req {
> >  			uint8_t		copy_policy;
> >  			uint16_t	flags;
> >  			uint32_t	tag;
> > +			uint32_t	support_nodes;
> 
> support_nodes is not good. nodes_nr is simpler.

These sort of style problems can be fixed later easily. I prefer apply
first and fix later than increasing turn around of review. Frequent
commits can reduce possibility of rebasing and make development
speedy.

Thanks,
Hitoshi




More information about the sheepdog mailing list