[sheepdog] [PATCH 2/2] optimize epoch_log structure to reduce network and memory overhead

Ruoyu liangry at ucweb.com
Mon Jul 7 14:42:53 CEST 2014


The current epoch_log structure contains a long, fixed-size nodes
array that is used to sync nodes and epochs in the cluster. This is
simple, but it is a potential performance issue because each epoch
log occupies nearly 500 KBytes. If the cluster membership changes
frequently, the epoch is incremented frequently as well. If we don't
address this, performance will go from bad to worse.

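Although the maximum number of nodes is 6144, in practice only a few
of them are used. Therefore, the first solution is to turn the nodes
array into a zero-length array and let the client (dog) and the
server (sheep) negotiate an appropriate number of supported nodes:
dog starts with a small default and doubles it whenever sheep
replies with SD_RES_BUFFER_SMALL. This approach uses much less
memory and runs much faster than before.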

Signed-off-by: Ruoyu <liangry at ucweb.com>
---
 dog/alter.c              |  3 +++
 dog/cluster.c            | 40 +++++++++++++++++++++++++++++-----------
 dog/dog.c                |  1 +
 dog/vdi.c                | 37 ++++++++++++++++++++++++++++---------
 include/internal_proto.h |  2 +-
 include/sheepdog_proto.h |  3 +++
 sheep/group.c            |  8 +++++++-
 sheep/ops.c              | 47 +++++++++++++++++++++++++++++++----------------
 sheep/store.c            |  4 ++--
 9 files changed, 105 insertions(+), 40 deletions(-)

diff --git a/dog/alter.c b/dog/alter.c
index 7af7f9f..9801cc2 100644
--- a/dog/alter.c
+++ b/dog/alter.c
@@ -63,8 +63,11 @@ static int alter_cluster_copy(int argc, char **argv)
 
 	log_length = sd_epoch * sizeof(struct epoch_log);
 	logs = xmalloc(log_length);
+
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.epoch_log.support_nodes = 0;
+
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
 		goto failure;
diff --git a/dog/cluster.c b/dog/cluster.c
index 69ec07c..4731767 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
 	return EXIT_SUCCESS;
 }
 
-static void print_nodes(const struct epoch_log *logs, int epoch)
+static void print_nodes(const struct epoch_log *logs, uint16_t flags)
 {
 	int i, nr_disk;
 	const struct sd_node *entry;
 
-	for (i = 0; i < logs[epoch].nr_nodes; i++) {
-		entry = logs[epoch].nodes + i;
-		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
+	for (i = 0; i < logs->nr_nodes; i++) {
+		entry = logs->nodes + i;
+		if (flags & SD_CLUSTER_FLAG_DISKMODE) {
 			for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
 				if (entry->disks[nr_disk].disk_id == 0)
 					break;
@@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv)
 	int i, ret;
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-	struct epoch_log *logs;
+	struct epoch_log *logs, *log;
+	char *next_log;
 	int nr_logs, log_length;
 	time_t ti, ct;
 	struct tm tm;
 	char time_str[128];
+	uint16_t support_nodes;
 
-	log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+	support_nodes = DEFAULT_SUPPORT_NODES;
+	log_length = sd_epoch * (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
 	logs = xmalloc(log_length);
 
+retry:
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.epoch_log.support_nodes = support_nodes;
 
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
 		goto error;
+	if (rsp->result == SD_RES_BUFFER_SMALL) {
+		support_nodes *= 2;
+		log_length = sd_epoch * (sizeof(struct epoch_log)
+				+ support_nodes * sizeof(struct sd_node));
+		logs = xrealloc(logs, log_length);
+		goto retry;
+	}
 
 	/* show cluster status */
 	if (!raw_output)
@@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv)
 		printf("Epoch Time           Version\n");
 	}
 
-	nr_logs = rsp->data_length / sizeof(struct epoch_log);
+	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
+	next_log = (char *)logs;
 	for (i = 0; i < nr_logs; i++) {
-
-		ti = logs[i].time;
+		log = (struct epoch_log *)next_log;
+		ti = log->time;
 		if (raw_output) {
 			snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti);
 		} else {
@@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv)
 			strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm);
 		}
 
-		printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch);
+		printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
 		printf(" [");
-		print_nodes(logs, i);
+		print_nodes(log, logs->flags);
 		printf("]\n");
+		next_log = (char *)log->nodes
+				+ support_nodes * sizeof(struct sd_node);
 	}
 
 	free(logs);
diff --git a/dog/dog.c b/dog/dog.c
index 46992ec..fda7906 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -119,6 +119,7 @@ int update_node_list(int max_nodes)
 
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.epoch_log.support_nodes = 0;
 
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
diff --git a/dog/vdi.c b/dog/vdi.c
index 49a2139..5fd0b7b 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -964,47 +964,64 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
-	struct epoch_log *logs;
+	struct epoch_log *logs, *log;
+	char *next_log;
 	int nr_logs, log_length;
+	uint16_t support_nodes;
 
-	log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+	support_nodes = DEFAULT_SUPPORT_NODES;
+	log_length = sd_epoch * (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
 	logs = xmalloc(log_length);
 
+retry:
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.epoch_log.support_nodes = support_nodes;
 
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
 		goto error;
 
+	if (rsp->result == SD_RES_BUFFER_SMALL) {
+		support_nodes *= 2;
+		log_length = sd_epoch * (sizeof(struct epoch_log)
+				+ support_nodes * sizeof(struct sd_node));
+		logs = xrealloc(logs, log_length);
+		goto retry;
+	}
 	if (rsp->result != SD_RES_SUCCESS) {
 		printf("%s\n", sd_strerror(rsp->result));
 		goto error;
 	}
 
-	nr_logs = rsp->data_length / sizeof(struct epoch_log);
+	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
+	next_log = (char *)logs;
 	for (i = nr_logs - 1; i >= 0; i--) {
 		struct rb_root vroot = RB_ROOT;
 		struct rb_root nroot = RB_ROOT;
 
+		log = (struct epoch_log *)next_log;
 		printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
-		       oid, logs[i].epoch, nr_copies);
+		       oid, log->epoch, nr_copies);
 		printf("---------------------------------------------------\n");
 
 		/*
 		 * When # of nodes is less than nr_copies, we only print
 		 * remaining nodes that holds all the remaining copies.
 		 */
-		if (logs[i].nr_nodes < nr_copies) {
-			for (j = 0; j < logs[i].nr_nodes; j++) {
-				const struct node_id *n = &logs[i].nodes[j].nid;
+		if (log->nr_nodes < nr_copies) {
+			for (j = 0; j < log->nr_nodes; j++) {
+				const struct node_id *n = &log->nodes[j].nid;
 
 				printf("%s\n", addr_to_str(n->addr, n->port));
 			}
 			continue;
 		}
-		for (int k = 0; k < logs[i].nr_nodes; k++)
-			rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
+		for (int k = 0; k < log->nr_nodes; k++)
+			rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
 		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
 			disks_to_vnodes(&nroot, &vroot);
 		else
@@ -1016,6 +1033,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 			printf("%s\n", addr_to_str(n->addr, n->port));
 		}
 		rb_destroy(&vroot, struct sd_vnode, rb);
+		next_log = (char *)log->nodes
+				+ support_nodes * sizeof(struct sd_node);
 	}
 
 	free(logs);
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 7ec2872..ad4d822 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -211,7 +211,7 @@ struct epoch_log {
 	uint8_t  __pad[3];
 	uint16_t flags;
 	char drv_name[STORE_LEN];
-	struct sd_node nodes[SD_MAX_NODES];
+	struct sd_node nodes[0];
 };
 
 struct vdi_op_message {
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 76fad51..1355ecb 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -180,6 +180,9 @@ struct sd_req {
 			uint8_t		addr[16];
 			uint16_t	port;
 		} node_addr;
+		struct {
+			uint16_t	support_nodes;
+		} epoch_log;
 
 		uint32_t		__pad[8];
 	};
diff --git a/sheep/group.c b/sheep/group.c
index adfd798..27e3574 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -352,7 +352,7 @@ error:
 int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 			  time_t *timestamp, struct vnode_info *vinfo)
 {
-	char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
+	char *buf = xzalloc(len + sizeof(time_t));
 	const struct sd_node *node;
 	int ret;
 
@@ -369,6 +369,10 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 		hdr.obj.tgt_epoch = epoch;
 		hdr.epoch = sys_epoch();
 		ret = sheep_exec_req(&node->nid, &hdr, buf);
+		if (ret == SD_RES_BUFFER_SMALL) {
+			free(buf);
+			return -2;
+		}
 		if (ret != SD_RES_SUCCESS)
 			continue;
 
@@ -377,6 +381,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 		if (timestamp)
 			memcpy(timestamp, buf + nodes_len, sizeof(*timestamp));
 
+		free(buf);
 		return nodes_len / sizeof(struct sd_node);
 	}
 
@@ -384,6 +389,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 	 * If no node has targeted epoch log, return 0 here to at least
 	 * allow reading older epoch logs.
 	 */
+	free(buf);
 	return 0;
 }
 
diff --git a/sheep/ops.c b/sheep/ops.c
index fb26077..12957d2 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -438,15 +438,19 @@ static int local_stat_cluster(struct request *req)
 {
 	struct sd_rsp *rsp = &req->rp;
 	struct epoch_log *elog;
+	char *next_elog;
 	int i, max_elogs;
 	uint32_t epoch;
+	uint16_t support_nodes = req->rq.epoch_log.support_nodes;
 
 	if (req->vinfo == NULL) {
 		sd_debug("cluster is not started up");
 		goto out;
 	}
 
-	max_elogs = req->rq.data_length / sizeof(*elog);
+	max_elogs = req->rq.data_length / (sizeof(*elog)
+			+ support_nodes * sizeof(struct sd_node));
+	next_elog = (char *)req->data;
 	epoch = get_latest_epoch();
 	for (i = 0; i < max_elogs; i++) {
 		int nr_nodes;
@@ -454,7 +458,7 @@ static int local_stat_cluster(struct request *req)
 		if (epoch <= 0)
 			break;
 
-		elog = (struct epoch_log *)req->data + i;
+		elog = (struct epoch_log *)next_elog;
 		memset(elog, 0, sizeof(*elog));
 
 		/* some filed only need to store in first elog */
@@ -469,20 +473,29 @@ static int local_stat_cluster(struct request *req)
 		}
 
 		elog->epoch = epoch;
-		nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes,
-							 sizeof(elog->nodes),
-							 (time_t *)&elog->time);
-		if (nr_nodes == -1)
-			nr_nodes = epoch_log_read_remote(epoch, elog->nodes,
-							 sizeof(elog->nodes),
-							 (time_t *)&elog->time,
-							 req->vinfo);
-		assert(nr_nodes >= 0);
-		assert(nr_nodes <= SD_MAX_NODES);
-		elog->nr_nodes = nr_nodes;
-
-
-		rsp->data_length += sizeof(*elog);
+		if (support_nodes > 0) {
+			nr_nodes = epoch_log_read_with_timestamp(
+					epoch, elog->nodes,
+					support_nodes * sizeof(struct sd_node),
+					(time_t *)&elog->time);
+			if (nr_nodes == -1)
+				nr_nodes = epoch_log_read_remote(
+					epoch, elog->nodes,
+					support_nodes * sizeof(struct sd_node),
+					(time_t *)&elog->time,
+					req->vinfo);
+			if (nr_nodes == -2)
+				return SD_RES_BUFFER_SMALL;
+			assert(nr_nodes >= 0);
+			assert(nr_nodes <= SD_MAX_NODES);
+			elog->nr_nodes = nr_nodes;
+		} else
+			elog->nr_nodes = 0;
+
+		next_elog = (char *)elog->nodes
+				+ support_nodes * sizeof(struct sd_node);
+		rsp->data_length += sizeof(*elog)
+				+ support_nodes * sizeof(struct sd_node);
 		epoch--;
 	}
 out:
@@ -520,6 +533,8 @@ static int local_get_epoch(struct request *req)
 					&timestamp);
 	if (nr_nodes == -1)
 		return SD_RES_NO_TAG;
+	if (nr_nodes == -2)
+		return SD_RES_BUFFER_SMALL;
 
 	nodes_len = nr_nodes * sizeof(struct sd_node);
 	memcpy((void *)((char *)req->data + nodes_len), &timestamp,
diff --git a/sheep/store.c b/sheep/store.c
index eee88c7..70fddb8 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -63,8 +63,8 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
 	}
 
 	if (len < epoch_stat.st_size - sizeof(*timestamp)) {
-		sd_err("invalid epoch %"PRIu32" log", epoch);
-		goto err;
+		close(fd);
+		return -2;
 	}
 
 	ret = xread(fd, nodes, epoch_stat.st_size - sizeof(*timestamp));
-- 
1.8.3.2





More information about the sheepdog mailing list