[sheepdog] [PATCH v2] optimize epoch_log structure to reduce network and memory

Ruoyu liangry at ucweb.com
Tue Jul 29 13:16:46 CEST 2014


The current epoch_log contains a long nodes array to sync nodes
and epoch in the cluster. It is simple, but there is a potential
performance issue because each epoch log occupies nearly 500
KBytes. If the cluster members change frequently, the epoch is
incremented frequently, and more and more unused data is
transferred between client and server. Unless this is addressed,
the performance will go from bad to worse.

Although the max node number is 6144, we only use a few of them.
Therefore, the first solution is to use a zero-length array: the
client (dog) and server (sheep) negotiate an appropriate
supported node number. This approach uses much less memory and
runs much faster than before.

This is patch v2. Compared to v1, which was submitted on Jul 7,
the internal data structure has been changed.

Signed-off-by: Ruoyu <liangry at ucweb.com>
---
 dog/cluster.c            | 40 +++++++++++++++++++++++++++++-----------
 dog/vdi.c                | 37 ++++++++++++++++++++++++++++---------
 include/internal_proto.h |  2 +-
 include/sheepdog_proto.h |  1 +
 sheep/group.c            |  8 +++++++-
 sheep/ops.c              | 47 +++++++++++++++++++++++++++++++----------------
 sheep/store.c            |  6 +++++-
 7 files changed, 102 insertions(+), 39 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index 188d4f4..508b65a 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
 	return EXIT_SUCCESS;
 }
 
-static void print_nodes(const struct epoch_log *logs, int epoch)
+static void print_nodes(const struct epoch_log *logs, uint16_t flags)
 {
 	int i, nr_disk;
 	const struct sd_node *entry;
 
-	for (i = 0; i < logs[epoch].nr_nodes; i++) {
-		entry = logs[epoch].nodes + i;
-		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
+	for (i = 0; i < logs->nr_nodes; i++) {
+		entry = logs->nodes + i;
+		if (flags & SD_CLUSTER_FLAG_DISKMODE) {
 			for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
 				if (entry->disks[nr_disk].disk_id == 0)
 					break;
@@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv)
 	int i, ret;
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-	struct epoch_log *logs;
+	struct epoch_log *logs, *log;
+	char *next_log;
 	int nr_logs, log_length;
 	time_t ti, ct;
 	struct tm tm;
 	char time_str[128];
+	uint32_t support_nodes;
 
-	log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+	support_nodes = DEFAULT_SUPPORT_NODES;
+	log_length = sd_epoch * (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
 	logs = xmalloc(log_length);
 
+retry:
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.cluster.support_nodes = support_nodes;
 
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
 		goto error;
+	if (rsp->result == SD_RES_BUFFER_SMALL) {
+		support_nodes *= 2;
+		log_length = sd_epoch * (sizeof(struct epoch_log)
+				+ support_nodes * sizeof(struct sd_node));
+		logs = xrealloc(logs, log_length);
+		goto retry;
+	}
 
 	/* show cluster status */
 	if (!raw_output)
@@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv)
 		printf("Epoch Time           Version\n");
 	}
 
-	nr_logs = rsp->data_length / sizeof(struct epoch_log);
+	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
+	next_log = (char *)logs;
 	for (i = 0; i < nr_logs; i++) {
-
-		ti = logs[i].time;
+		log = (struct epoch_log *)next_log;
+		ti = log->time;
 		if (raw_output) {
 			snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti);
 		} else {
@@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv)
 			strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm);
 		}
 
-		printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch);
+		printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
 		printf(" [");
-		print_nodes(logs, i);
+		print_nodes(log, logs->flags);
 		printf("]\n");
+		next_log = (char *)log->nodes
+				+ support_nodes * sizeof(struct sd_node);
 	}
 
 	free(logs);
diff --git a/dog/vdi.c b/dog/vdi.c
index 2e3f7b3..6f0b748 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1096,47 +1096,64 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
-	struct epoch_log *logs;
+	struct epoch_log *logs, *log;
+	char *next_log;
 	int nr_logs, log_length;
+	uint32_t support_nodes;
 
-	log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+	support_nodes = DEFAULT_SUPPORT_NODES;
+	log_length = sd_epoch * (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
 	logs = xmalloc(log_length);
 
+retry:
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.cluster.support_nodes = support_nodes;
 
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
 		goto error;
 
+	if (rsp->result == SD_RES_BUFFER_SMALL) {
+		support_nodes *= 2;
+		log_length = sd_epoch * (sizeof(struct epoch_log)
+				+ support_nodes * sizeof(struct sd_node));
+		logs = xrealloc(logs, log_length);
+		goto retry;
+	}
 	if (rsp->result != SD_RES_SUCCESS) {
 		printf("%s\n", sd_strerror(rsp->result));
 		goto error;
 	}
 
-	nr_logs = rsp->data_length / sizeof(struct epoch_log);
+	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+			+ support_nodes * sizeof(struct sd_node));
+	next_log = (char *)logs;
 	for (i = nr_logs - 1; i >= 0; i--) {
 		struct rb_root vroot = RB_ROOT;
 		struct rb_root nroot = RB_ROOT;
 
+		log = (struct epoch_log *)next_log;
 		printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
-		       oid, logs[i].epoch, nr_copies);
+		       oid, log->epoch, nr_copies);
 		printf("---------------------------------------------------\n");
 
 		/*
 		 * When # of nodes is less than nr_copies, we only print
 		 * remaining nodes that holds all the remaining copies.
 		 */
-		if (logs[i].nr_nodes < nr_copies) {
-			for (j = 0; j < logs[i].nr_nodes; j++) {
-				const struct node_id *n = &logs[i].nodes[j].nid;
+		if (log->nr_nodes < nr_copies) {
+			for (j = 0; j < log->nr_nodes; j++) {
+				const struct node_id *n = &log->nodes[j].nid;
 
 				printf("%s\n", addr_to_str(n->addr, n->port));
 			}
 			continue;
 		}
-		for (int k = 0; k < logs[i].nr_nodes; k++)
-			rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
+		for (int k = 0; k < log->nr_nodes; k++)
+			rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
 		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
 			disks_to_vnodes(&nroot, &vroot);
 		else
@@ -1148,6 +1165,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 			printf("%s\n", addr_to_str(n->addr, n->port));
 		}
 		rb_destroy(&vroot, struct sd_vnode, rb);
+		next_log = (char *)log->nodes
+				+ support_nodes * sizeof(struct sd_node);
 	}
 
 	free(logs);
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 37afb46..d61b5a5 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -221,7 +221,7 @@ struct epoch_log {
 	uint8_t  __pad[3];
 	uint16_t flags;
 	char drv_name[STORE_LEN];
-	struct sd_node nodes[SD_MAX_NODES];
+	struct sd_node nodes[0];
 };
 
 struct vdi_op_message {
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 7cfdccb..8b26a8b 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -165,6 +165,7 @@ struct sd_req {
 			uint8_t		copy_policy;
 			uint16_t	flags;
 			uint32_t	tag;
+			uint32_t	support_nodes;
 		} cluster;
 		struct {
 			uint32_t	old_vid;
diff --git a/sheep/group.c b/sheep/group.c
index f53ad0f..cb10301 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -352,7 +352,7 @@ error:
 int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 			  time_t *timestamp, struct vnode_info *vinfo)
 {
-	char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
+	char *buf = xzalloc(len + sizeof(time_t));
 	const struct sd_node *node;
 	int ret;
 
@@ -369,6 +369,10 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 		hdr.obj.tgt_epoch = epoch;
 		hdr.epoch = sys_epoch();
 		ret = sheep_exec_req(&node->nid, &hdr, buf);
+		if (ret == SD_RES_BUFFER_SMALL) {
+			free(buf);
+			return -2;
+		}
 		if (ret != SD_RES_SUCCESS)
 			continue;
 
@@ -376,6 +380,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 		memcpy((void *)nodes, buf, nodes_len);
 		if (timestamp)
 			memcpy(timestamp, buf + nodes_len, sizeof(*timestamp));
+		free(buf);
 
 		nr_nodes = nodes_len / sizeof(struct sd_node);
 		/* epoch file is missing in local node, try to create one */
@@ -387,6 +392,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 	 * If no node has targeted epoch log, return 0 here to at least
 	 * allow reading older epoch logs.
 	 */
+	free(buf);
 	return 0;
 }
 
diff --git a/sheep/ops.c b/sheep/ops.c
index 3d20c7d..072e1f7 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -436,15 +436,19 @@ static int local_stat_cluster(struct request *req)
 {
 	struct sd_rsp *rsp = &req->rp;
 	struct epoch_log *elog;
+	char *next_elog;
 	int i, max_elogs;
 	uint32_t epoch;
+	uint32_t support_nodes = req->rq.cluster.support_nodes;
 
 	if (req->vinfo == NULL) {
 		sd_debug("cluster is not started up");
 		goto out;
 	}
 
-	max_elogs = req->rq.data_length / sizeof(*elog);
+	max_elogs = req->rq.data_length / (sizeof(*elog)
+			+ support_nodes * sizeof(struct sd_node));
+	next_elog = (char *)req->data;
 	epoch = get_latest_epoch();
 	for (i = 0; i < max_elogs; i++) {
 		int nr_nodes;
@@ -452,7 +456,7 @@ static int local_stat_cluster(struct request *req)
 		if (epoch <= 0)
 			break;
 
-		elog = (struct epoch_log *)req->data + i;
+		elog = (struct epoch_log *)next_elog;
 		memset(elog, 0, sizeof(*elog));
 
 		/* some filed only need to store in first elog */
@@ -467,20 +471,29 @@ static int local_stat_cluster(struct request *req)
 		}
 
 		elog->epoch = epoch;
-		nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes,
-							 sizeof(elog->nodes),
-							 (time_t *)&elog->time);
-		if (nr_nodes == -1)
-			nr_nodes = epoch_log_read_remote(epoch, elog->nodes,
-							 sizeof(elog->nodes),
-							 (time_t *)&elog->time,
-							 req->vinfo);
-		assert(nr_nodes >= 0);
-		assert(nr_nodes <= SD_MAX_NODES);
-		elog->nr_nodes = nr_nodes;
-
-
-		rsp->data_length += sizeof(*elog);
+		if (support_nodes > 0) {
+			nr_nodes = epoch_log_read_with_timestamp(
+					epoch, elog->nodes,
+					support_nodes * sizeof(struct sd_node),
+					(time_t *)&elog->time);
+			if (nr_nodes == -1)
+				nr_nodes = epoch_log_read_remote(
+					epoch, elog->nodes,
+					support_nodes * sizeof(struct sd_node),
+					(time_t *)&elog->time,
+					req->vinfo);
+			if (nr_nodes == -2)
+				return SD_RES_BUFFER_SMALL;
+			assert(nr_nodes >= 0);
+			assert(nr_nodes <= SD_MAX_NODES);
+			elog->nr_nodes = nr_nodes;
+		} else
+			elog->nr_nodes = 0;
+
+		next_elog = (char *)elog->nodes
+				+ support_nodes * sizeof(struct sd_node);
+		rsp->data_length += sizeof(*elog)
+				+ support_nodes * sizeof(struct sd_node);
 		epoch--;
 	}
 out:
@@ -518,6 +531,8 @@ static int local_get_epoch(struct request *req)
 					&timestamp);
 	if (nr_nodes == -1)
 		return SD_RES_NO_TAG;
+	if (nr_nodes == -2)
+		return SD_RES_BUFFER_SMALL;
 
 	nodes_len = nr_nodes * sizeof(struct sd_node);
 	memcpy((void *)((char *)req->data + nodes_len), &timestamp,
diff --git a/sheep/store.c b/sheep/store.c
index ea445fc..e45f0f0 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -72,10 +72,14 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
 	}
 
 	buf_len = epoch_stat.st_size - sizeof(*timestamp);
-	if (buf_len < 0 || len < buf_len) {
+	if (buf_len < 0) {
 		sd_err("invalid epoch %"PRIu32" log", epoch);
 		goto err;
 	}
+	if (len < buf_len) {
+		close(fd);
+		return -2;
+	}
 
 	ret = xread(fd, nodes, buf_len);
 	if (ret < 0) {
-- 
1.8.3.2





More information about the sheepdog mailing list