[sheepdog] [PATCH v3] optimize epoch_log structure to reduce network and memory

Liu Yuan namei.unix at gmail.com
Thu Jul 31 05:58:52 CEST 2014


On Wed, Jul 30, 2014 at 02:26:20PM +0800, Ruoyu wrote:
> Current epoch_log contains a long nodes array to sync nodes and
> epoch in the cluster. It is simple, but there is a potential
> performance issue because each epoch log occupies nearly 500
> KBytes. If the cluster members change frequently, epoch is lifted
> frequently, more and more unused data is transfered between
> client and server. If we don't find a way, the performance will
> go from bad to worse.
> 
> Although the max node number is 6144, we only use a few of them.
> Therefore, the first solution is using a zero-length array,
> client (dog) and server (sheep) will negotiate an appropriate
> supported node number. This way will spend much less memory and
> will run much faster than before.
> 
> v3:
>  - epoch_log_read series functions are changed to propagate error
>    to upper layer.
> 
> v2:
>  - internal data structure is changed.

You need put change log under Signed-off-by line

> Signed-off-by: Ruoyu <liangry at ucweb.com>
> --- 
<-- Here we put change log for a single patch, otherwise put it in the
introduction patch (git format-patch --cover-letter xxx).

>
>  dog/cluster.c            | 40 +++++++++++++++++++++++----------
>  dog/vdi.c                | 37 +++++++++++++++++++++++--------
>  include/internal_proto.h |  2 +-
>  include/sheepdog_proto.h |  1 +
>  sheep/group.c            | 57 +++++++++++++++++++++++++-----------------------
>  sheep/ops.c              | 55 +++++++++++++++++++++++++++-------------------
>  sheep/sheep_priv.h       | 10 +++++----
>  sheep/store.c            | 25 ++++++++++++---------
>  8 files changed, 143 insertions(+), 84 deletions(-)
> 
> diff --git a/dog/cluster.c b/dog/cluster.c
> index 188d4f4..508b65a 100644
> --- a/dog/cluster.c
> +++ b/dog/cluster.c
> @@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
>  	return EXIT_SUCCESS;
>  }
>  
> -static void print_nodes(const struct epoch_log *logs, int epoch)
> +static void print_nodes(const struct epoch_log *logs, uint16_t flags)
>  {
>  	int i, nr_disk;
>  	const struct sd_node *entry;
>  
> -	for (i = 0; i < logs[epoch].nr_nodes; i++) {
> -		entry = logs[epoch].nodes + i;
> -		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
> +	for (i = 0; i < logs->nr_nodes; i++) {
> +		entry = logs->nodes + i;
> +		if (flags & SD_CLUSTER_FLAG_DISKMODE) {
>  			for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
>  				if (entry->disks[nr_disk].disk_id == 0)
>  					break;
> @@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv)
>  	int i, ret;
>  	struct sd_req hdr;
>  	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> -	struct epoch_log *logs;
> +	struct epoch_log *logs, *log;
> +	char *next_log;
>  	int nr_logs, log_length;
>  	time_t ti, ct;
>  	struct tm tm;
>  	char time_str[128];
> +	uint32_t support_nodes;
>  
> -	log_length = sd_epoch * sizeof(struct epoch_log);
> +#define DEFAULT_SUPPORT_NODES 32

It is used by mutiple files, so define it in .h file. By the way, why not use
sd_nodes_nr directly?

> +	support_nodes = DEFAULT_SUPPORT_NODES;
> +	log_length = sd_epoch * (sizeof(struct epoch_log)
> +			+ support_nodes * sizeof(struct sd_node));
>  	logs = xmalloc(log_length);
>  
> +retry:
>  	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
>  	hdr.data_length = log_length;
> +	hdr.cluster.support_nodes = support_nodes;
>  
>  	ret = dog_exec_req(&sd_nid, &hdr, logs);
>  	if (ret < 0)
>  		goto error;
> +	if (rsp->result == SD_RES_BUFFER_SMALL) {
> +		support_nodes *= 2;
> +		log_length = sd_epoch * (sizeof(struct epoch_log)
> +				+ support_nodes * sizeof(struct sd_node));
> +		logs = xrealloc(logs, log_length);
> +		goto retry;
> +	}
>  
>  	/* show cluster status */
>  	if (!raw_output)
> @@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv)
>  		printf("Epoch Time           Version\n");
>  	}
>  
> -	nr_logs = rsp->data_length / sizeof(struct epoch_log);
> +	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> +			+ support_nodes * sizeof(struct sd_node));
> +	next_log = (char *)logs;
>  	for (i = 0; i < nr_logs; i++) {
> -
> -		ti = logs[i].time;
> +		log = (struct epoch_log *)next_log;
> +		ti = log->time;
>  		if (raw_output) {
>  			snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti);
>  		} else {
> @@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv)
>  			strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm);
>  		}
>  
> -		printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch);
> +		printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
>  		printf(" [");
> -		print_nodes(logs, i);
> +		print_nodes(log, logs->flags);
>  		printf("]\n");
> +		next_log = (char *)log->nodes
> +				+ support_nodes * sizeof(struct sd_node);
>  	}
>  
>  	free(logs);
> diff --git a/dog/vdi.c b/dog/vdi.c
> index 2e3f7b3..6f0b748 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -1096,47 +1096,64 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
>  	struct sd_req hdr;
>  	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
>  	const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
> -	struct epoch_log *logs;
> +	struct epoch_log *logs, *log;
> +	char *next_log;
>  	int nr_logs, log_length;
> +	uint32_t support_nodes;
>  
> -	log_length = sd_epoch * sizeof(struct epoch_log);
> +#define DEFAULT_SUPPORT_NODES 32
> +	support_nodes = DEFAULT_SUPPORT_NODES;
> +	log_length = sd_epoch * (sizeof(struct epoch_log)
> +			+ support_nodes * sizeof(struct sd_node));
>  	logs = xmalloc(log_length);
>  
> +retry:
>  	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
>  	hdr.data_length = log_length;
> +	hdr.cluster.support_nodes = support_nodes;
>  
>  	ret = dog_exec_req(&sd_nid, &hdr, logs);
>  	if (ret < 0)
>  		goto error;
>  
> +	if (rsp->result == SD_RES_BUFFER_SMALL) {
> +		support_nodes *= 2;
> +		log_length = sd_epoch * (sizeof(struct epoch_log)
> +				+ support_nodes * sizeof(struct sd_node));
> +		logs = xrealloc(logs, log_length);
> +		goto retry;
> +	}
>  	if (rsp->result != SD_RES_SUCCESS) {
>  		printf("%s\n", sd_strerror(rsp->result));
>  		goto error;
>  	}
>  
> -	nr_logs = rsp->data_length / sizeof(struct epoch_log);
> +	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> +			+ support_nodes * sizeof(struct sd_node));
> +	next_log = (char *)logs;
>  	for (i = nr_logs - 1; i >= 0; i--) {
>  		struct rb_root vroot = RB_ROOT;
>  		struct rb_root nroot = RB_ROOT;
>  
> +		log = (struct epoch_log *)next_log;
>  		printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
> -		       oid, logs[i].epoch, nr_copies);
> +		       oid, log->epoch, nr_copies);
>  		printf("---------------------------------------------------\n");
>  
>  		/*
>  		 * When # of nodes is less than nr_copies, we only print
>  		 * remaining nodes that holds all the remaining copies.
>  		 */
> -		if (logs[i].nr_nodes < nr_copies) {
> -			for (j = 0; j < logs[i].nr_nodes; j++) {
> -				const struct node_id *n = &logs[i].nodes[j].nid;
> +		if (log->nr_nodes < nr_copies) {
> +			for (j = 0; j < log->nr_nodes; j++) {
> +				const struct node_id *n = &log->nodes[j].nid;
>  
>  				printf("%s\n", addr_to_str(n->addr, n->port));
>  			}
>  			continue;
>  		}
> -		for (int k = 0; k < logs[i].nr_nodes; k++)
> -			rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
> +		for (int k = 0; k < log->nr_nodes; k++)
> +			rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
>  		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
>  			disks_to_vnodes(&nroot, &vroot);
>  		else
> @@ -1148,6 +1165,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
>  			printf("%s\n", addr_to_str(n->addr, n->port));
>  		}
>  		rb_destroy(&vroot, struct sd_vnode, rb);
> +		next_log = (char *)log->nodes
> +				+ support_nodes * sizeof(struct sd_node);
>  	}
>  
>  	free(logs);
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index 37afb46..d61b5a5 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -221,7 +221,7 @@ struct epoch_log {
>  	uint8_t  __pad[3];
>  	uint16_t flags;
>  	char drv_name[STORE_LEN];
> -	struct sd_node nodes[SD_MAX_NODES];
> +	struct sd_node nodes[0];
>  };
>  
>  struct vdi_op_message {
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 7cfdccb..8b26a8b 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -165,6 +165,7 @@ struct sd_req {
>  			uint8_t		copy_policy;
>  			uint16_t	flags;
>  			uint32_t	tag;
> +			uint32_t	support_nodes;

support_nodes is not good. nodes_nr is simpler.



More information about the sheepdog mailing list