[sheepdog] [PATCH v4] optimize epoch_log structure to reduce network and memory

Ruoyu liangry at ucweb.com
Thu Jul 31 09:27:10 CEST 2014


The current epoch_log contains a long nodes array used to sync
nodes and epochs in the cluster. It is simple, but there is a
potential performance issue because each epoch log occupies
nearly 500 KBytes. If the cluster membership changes frequently,
the epoch is incremented frequently, and more and more unused
data is transferred between client and server. Unless we address
this, performance will go from bad to worse.

Although the maximum number of nodes is 6144, we only use a few
of them. Therefore, the first solution is to use a zero-length
array: the client (dog) and the server (sheep) negotiate an
appropriate supported node number. This approach uses much less
memory and runs much faster than before.

Signed-off-by: Ruoyu <liangry at ucweb.com>

v4:
 - variable renamed.
 - use sd_nodes_nr as default node number.

v3:
 - the epoch_log_read family of functions now propagates errors
   to the upper layer.

v2:
 - internal data structure is changed.
---
 dog/cluster.c            | 39 +++++++++++++++++++++++----------
 dog/vdi.c                | 36 ++++++++++++++++++++++--------
 include/internal_proto.h |  2 +-
 include/sheepdog_proto.h |  1 +
 sheep/group.c            | 57 +++++++++++++++++++++++++-----------------------
 sheep/ops.c              | 55 +++++++++++++++++++++++++++-------------------
 sheep/sheep_priv.h       | 10 +++++----
 sheep/store.c            | 25 ++++++++++++---------
 8 files changed, 141 insertions(+), 84 deletions(-)

diff --git a/dog/cluster.c b/dog/cluster.c
index 188d4f4..e36e308 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
 	return EXIT_SUCCESS;
 }
 
-static void print_nodes(const struct epoch_log *logs, int epoch)
+static void print_nodes(const struct epoch_log *logs, uint16_t flags)
 {
 	int i, nr_disk;
 	const struct sd_node *entry;
 
-	for (i = 0; i < logs[epoch].nr_nodes; i++) {
-		entry = logs[epoch].nodes + i;
-		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
+	for (i = 0; i < logs->nr_nodes; i++) {
+		entry = logs->nodes + i;
+		if (flags & SD_CLUSTER_FLAG_DISKMODE) {
 			for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
 				if (entry->disks[nr_disk].disk_id == 0)
 					break;
@@ -169,21 +169,34 @@ static int cluster_info(int argc, char **argv)
 	int i, ret;
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-	struct epoch_log *logs;
+	struct epoch_log *logs, *log;
+	char *next_log;
 	int nr_logs, log_length;
 	time_t ti, ct;
 	struct tm tm;
 	char time_str[128];
+	uint32_t nodes_nr;
 
-	log_length = sd_epoch * sizeof(struct epoch_log);
+	nodes_nr = sd_nodes_nr;
+	log_length = sd_epoch * (sizeof(struct epoch_log)
+			+ nodes_nr * sizeof(struct sd_node));
 	logs = xmalloc(log_length);
 
+retry:
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.cluster.nodes_nr = nodes_nr;
 
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
 		goto error;
+	if (rsp->result == SD_RES_BUFFER_SMALL) {
+		nodes_nr *= 2;
+		log_length = sd_epoch * (sizeof(struct epoch_log)
+				+ nodes_nr * sizeof(struct sd_node));
+		logs = xrealloc(logs, log_length);
+		goto retry;
+	}
 
 	/* show cluster status */
 	if (!raw_output)
@@ -230,10 +243,12 @@ static int cluster_info(int argc, char **argv)
 		printf("Epoch Time           Version\n");
 	}
 
-	nr_logs = rsp->data_length / sizeof(struct epoch_log);
+	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+			+ nodes_nr * sizeof(struct sd_node));
+	next_log = (char *)logs;
 	for (i = 0; i < nr_logs; i++) {
-
-		ti = logs[i].time;
+		log = (struct epoch_log *)next_log;
+		ti = log->time;
 		if (raw_output) {
 			snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti);
 		} else {
@@ -241,10 +256,12 @@ static int cluster_info(int argc, char **argv)
 			strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm);
 		}
 
-		printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch);
+		printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
 		printf(" [");
-		print_nodes(logs, i);
+		print_nodes(log, logs->flags);
 		printf("]\n");
+		next_log = (char *)log->nodes
+				+ nodes_nr * sizeof(struct sd_node);
 	}
 
 	free(logs);
diff --git a/dog/vdi.c b/dog/vdi.c
index 2e3f7b3..18f8799 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -1096,47 +1096,63 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 	struct sd_req hdr;
 	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 	const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
-	struct epoch_log *logs;
+	struct epoch_log *logs, *log;
+	char *next_log;
 	int nr_logs, log_length;
+	uint32_t nodes_nr;
 
-	log_length = sd_epoch * sizeof(struct epoch_log);
+	nodes_nr = sd_nodes_nr;
+	log_length = sd_epoch * (sizeof(struct epoch_log)
+			+ nodes_nr * sizeof(struct sd_node));
 	logs = xmalloc(log_length);
 
+retry:
 	sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
 	hdr.data_length = log_length;
+	hdr.cluster.nodes_nr = nodes_nr;
 
 	ret = dog_exec_req(&sd_nid, &hdr, logs);
 	if (ret < 0)
 		goto error;
 
+	if (rsp->result == SD_RES_BUFFER_SMALL) {
+		nodes_nr *= 2;
+		log_length = sd_epoch * (sizeof(struct epoch_log)
+				+ nodes_nr * sizeof(struct sd_node));
+		logs = xrealloc(logs, log_length);
+		goto retry;
+	}
 	if (rsp->result != SD_RES_SUCCESS) {
 		printf("%s\n", sd_strerror(rsp->result));
 		goto error;
 	}
 
-	nr_logs = rsp->data_length / sizeof(struct epoch_log);
+	nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+			+ nodes_nr * sizeof(struct sd_node));
+	next_log = (char *)logs;
 	for (i = nr_logs - 1; i >= 0; i--) {
 		struct rb_root vroot = RB_ROOT;
 		struct rb_root nroot = RB_ROOT;
 
+		log = (struct epoch_log *)next_log;
 		printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
-		       oid, logs[i].epoch, nr_copies);
+		       oid, log->epoch, nr_copies);
 		printf("---------------------------------------------------\n");
 
 		/*
 		 * When # of nodes is less than nr_copies, we only print
 		 * remaining nodes that holds all the remaining copies.
 		 */
-		if (logs[i].nr_nodes < nr_copies) {
-			for (j = 0; j < logs[i].nr_nodes; j++) {
-				const struct node_id *n = &logs[i].nodes[j].nid;
+		if (log->nr_nodes < nr_copies) {
+			for (j = 0; j < log->nr_nodes; j++) {
+				const struct node_id *n = &log->nodes[j].nid;
 
 				printf("%s\n", addr_to_str(n->addr, n->port));
 			}
 			continue;
 		}
-		for (int k = 0; k < logs[i].nr_nodes; k++)
-			rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
+		for (int k = 0; k < log->nr_nodes; k++)
+			rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
 		if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
 			disks_to_vnodes(&nroot, &vroot);
 		else
@@ -1148,6 +1164,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
 			printf("%s\n", addr_to_str(n->addr, n->port));
 		}
 		rb_destroy(&vroot, struct sd_vnode, rb);
+		next_log = (char *)log->nodes
+				+ nodes_nr * sizeof(struct sd_node);
 	}
 
 	free(logs);
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 37afb46..d61b5a5 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -221,7 +221,7 @@ struct epoch_log {
 	uint8_t  __pad[3];
 	uint16_t flags;
 	char drv_name[STORE_LEN];
-	struct sd_node nodes[SD_MAX_NODES];
+	struct sd_node nodes[0];
 };
 
 struct vdi_op_message {
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 7cfdccb..349aa07 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -165,6 +165,7 @@ struct sd_req {
 			uint8_t		copy_policy;
 			uint16_t	flags;
 			uint32_t	tag;
+			uint32_t	nodes_nr;
 		} cluster;
 		struct {
 			uint32_t	old_vid;
diff --git a/sheep/group.c b/sheep/group.c
index f53ad0f..9597495 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -160,13 +160,13 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
 {
 	struct sd_node nodes[SD_MAX_NODES];
 	struct rb_root nroot = RB_ROOT;
-	int nr_nodes;
+	int nr_nodes = 0, ret;
 
-	nr_nodes = epoch_log_read(epoch, nodes, sizeof(nodes));
-	if (nr_nodes < 0) {
-		nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes),
-						 NULL, cur_vinfo);
-		if (nr_nodes == 0)
+	ret = epoch_log_read(epoch, nodes, sizeof(nodes), &nr_nodes);
+	if (ret != SD_RES_SUCCESS) {
+		ret = epoch_log_read_remote(epoch, nodes, sizeof(nodes),
+						 &nr_nodes, NULL, cur_vinfo);
+		if (ret != SD_RES_SUCCESS)
 			return NULL;
 	}
 	for (int i = 0; i < nr_nodes; i++)
@@ -178,12 +178,12 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
 int get_nodes_epoch(uint32_t epoch, struct vnode_info *cur_vinfo,
 		    struct sd_node *nodes, int len)
 {
-	int nr_nodes;
+	int nr_nodes = 0, ret;
 
-	nr_nodes = epoch_log_read(epoch, nodes, len);
-	if (nr_nodes < 0)
-		nr_nodes = epoch_log_read_remote(epoch, nodes, len,
-						 NULL, cur_vinfo);
+	ret = epoch_log_read(epoch, nodes, len, &nr_nodes);
+	if (ret != SD_RES_SUCCESS)
+		epoch_log_read_remote(epoch, nodes, len, &nr_nodes,
+				NULL, cur_vinfo);
 	return nr_nodes;
 }
 
@@ -350,16 +350,17 @@ error:
 }
 
 int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
-			  time_t *timestamp, struct vnode_info *vinfo)
+					  int *nr_nodes, time_t *timestamp,
+					  struct vnode_info *vinfo)
 {
-	char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
+	char *buf = xzalloc(len + sizeof(time_t));
 	const struct sd_node *node;
 	int ret;
 
 	rb_for_each_entry(node, &vinfo->nroot, rb) {
 		struct sd_req hdr;
 		struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
-		int nodes_len, nr_nodes;
+		int nodes_len;
 
 		if (node_is_local(node))
 			continue;
@@ -369,6 +370,10 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 		hdr.obj.tgt_epoch = epoch;
 		hdr.epoch = sys_epoch();
 		ret = sheep_exec_req(&node->nid, &hdr, buf);
+		if (ret == SD_RES_BUFFER_SMALL) {
+			free(buf);
+			return ret;
+		}
 		if (ret != SD_RES_SUCCESS)
 			continue;
 
@@ -376,18 +381,16 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
 		memcpy((void *)nodes, buf, nodes_len);
 		if (timestamp)
 			memcpy(timestamp, buf + nodes_len, sizeof(*timestamp));
+		free(buf);
 
-		nr_nodes = nodes_len / sizeof(struct sd_node);
+		*nr_nodes = nodes_len / sizeof(struct sd_node);
 		/* epoch file is missing in local node, try to create one */
-		update_epoch_log(epoch, nodes, nr_nodes);
-		return nr_nodes;
+		update_epoch_log(epoch, nodes, *nr_nodes);
+		return SD_RES_SUCCESS;
 	}
 
-	/*
-	 * If no node has targeted epoch log, return 0 here to at least
-	 * allow reading older epoch logs.
-	 */
-	return 0;
+	free(buf);
+	return SD_RES_NO_TAG;
 }
 
 static bool cluster_ctime_check(const struct cluster_info *cinfo)
@@ -1054,7 +1057,7 @@ main_fn void sd_update_node_handler(struct sd_node *node)
 int create_cluster(int port, int64_t zone, int nr_vnodes,
 		   bool explicit_addr)
 {
-	int ret;
+	int nr_nodes = 0, ret;
 
 	if (!sys->cdrv) {
 		sys->cdrv = find_cdrv(DEFAULT_CLUSTER_DRIVER);
@@ -1089,11 +1092,11 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
 
 	sys->cinfo.epoch = get_latest_epoch();
 	if (sys->cinfo.epoch) {
-		sys->cinfo.nr_nodes = epoch_log_read(sys->cinfo.epoch,
-						     sys->cinfo.nodes,
-						     sizeof(sys->cinfo.nodes));
-		if (sys->cinfo.nr_nodes == -1)
+		ret = epoch_log_read(sys->cinfo.epoch, sys->cinfo.nodes,
+				sizeof(sys->cinfo.nodes), &nr_nodes);
+		if (ret != SD_RES_SUCCESS)
 			return -1;
+		sys->cinfo.nr_nodes = nr_nodes;
 	}
 	sys->cinfo.status = SD_STATUS_WAIT;
 
diff --git a/sheep/ops.c b/sheep/ops.c
index 3d20c7d..a2e94c5 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -436,23 +436,27 @@ static int local_stat_cluster(struct request *req)
 {
 	struct sd_rsp *rsp = &req->rp;
 	struct epoch_log *elog;
+	char *next_elog;
 	int i, max_elogs;
 	uint32_t epoch;
+	uint32_t nodes_nr = req->rq.cluster.nodes_nr;
 
 	if (req->vinfo == NULL) {
 		sd_debug("cluster is not started up");
 		goto out;
 	}
 
-	max_elogs = req->rq.data_length / sizeof(*elog);
+	max_elogs = req->rq.data_length / (sizeof(*elog)
+			+ nodes_nr * sizeof(struct sd_node));
+	next_elog = (char *)req->data;
 	epoch = get_latest_epoch();
 	for (i = 0; i < max_elogs; i++) {
-		int nr_nodes;
+		int nr_nodes = 0, ret;
 
 		if (epoch <= 0)
 			break;
 
-		elog = (struct epoch_log *)req->data + i;
+		elog = (struct epoch_log *)next_elog;
 		memset(elog, 0, sizeof(*elog));
 
 		/* some filed only need to store in first elog */
@@ -467,20 +471,27 @@ static int local_stat_cluster(struct request *req)
 		}
 
 		elog->epoch = epoch;
-		nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes,
-							 sizeof(elog->nodes),
-							 (time_t *)&elog->time);
-		if (nr_nodes == -1)
-			nr_nodes = epoch_log_read_remote(epoch, elog->nodes,
-							 sizeof(elog->nodes),
-							 (time_t *)&elog->time,
-							 req->vinfo);
-		assert(nr_nodes >= 0);
-		assert(nr_nodes <= SD_MAX_NODES);
-		elog->nr_nodes = nr_nodes;
-
-
-		rsp->data_length += sizeof(*elog);
+		if (nodes_nr > 0) {
+			ret = epoch_log_read_with_timestamp(
+					epoch, elog->nodes,
+					nodes_nr * sizeof(struct sd_node),
+					&nr_nodes, (time_t *)&elog->time);
+			if (ret == SD_RES_NO_TAG)
+				ret = epoch_log_read_remote(
+					epoch, elog->nodes,
+					nodes_nr * sizeof(struct sd_node),
+					&nr_nodes, (time_t *)&elog->time,
+					req->vinfo);
+			if (ret == SD_RES_BUFFER_SMALL)
+				return ret;
+			elog->nr_nodes = nr_nodes;
+		} else
+			elog->nr_nodes = 0;
+
+		next_elog = (char *)elog->nodes
+				+ nodes_nr * sizeof(struct sd_node);
+		rsp->data_length += sizeof(*elog)
+				+ nodes_nr * sizeof(struct sd_node);
 		epoch--;
 	}
 out:
@@ -507,17 +518,17 @@ static int local_get_obj_list(struct request *req)
 static int local_get_epoch(struct request *req)
 {
 	uint32_t epoch = req->rq.obj.tgt_epoch;
-	int nr_nodes, nodes_len;
+	int nr_nodes = 0, nodes_len, ret;
 	time_t timestamp;
 
 	sd_debug("%d", epoch);
 
-	nr_nodes =
+	ret =
 		epoch_log_read_with_timestamp(epoch, req->data,
 					req->rq.data_length - sizeof(timestamp),
-					&timestamp);
-	if (nr_nodes == -1)
-		return SD_RES_NO_TAG;
+					&nr_nodes, &timestamp);
+	if (ret != SD_RES_SUCCESS)
+		return ret;
 
 	nodes_len = nr_nodes * sizeof(struct sd_node);
 	memcpy((void *)((char *)req->data + nodes_len), &timestamp,
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 4402bb7..36bb8f9 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -394,11 +394,13 @@ int set_cluster_shutdown(bool);
 int store_file_write(void *buffer, size_t len);
 void *store_file_read(void);
 
-int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len);
+int epoch_log_read(uint32_t epoch, struct sd_node *nodes,
+				int len, int *nr_nodes);
 int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
-				int len, time_t *timestamp);
-int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
-			  time_t *timestamp, struct vnode_info *vinfo);
+				int len, int *nr_nodes, time_t *timestamp);
+int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes,
+				int len, int *nr_nodes, time_t *timestamp,
+				struct vnode_info *vinfo);
 uint32_t get_latest_epoch(void);
 void init_config_path(const char *base_path);
 int init_config_file(void);
diff --git a/sheep/store.c b/sheep/store.c
index ea445fc..80e0406 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -51,9 +51,9 @@ int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
 }
 
 static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
-			     time_t *timestamp)
+			     int *nr_nodes, time_t *timestamp)
 {
-	int fd, ret, nr_nodes, buf_len;
+	int fd, ret, buf_len;
 	char path[PATH_MAX];
 	struct stat epoch_stat;
 
@@ -72,10 +72,14 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
 	}
 
 	buf_len = epoch_stat.st_size - sizeof(*timestamp);
-	if (buf_len < 0 || len < buf_len) {
+	if (buf_len < 0) {
 		sd_err("invalid epoch %"PRIu32" log", epoch);
 		goto err;
 	}
+	if (len < buf_len) {
+		close(fd);
+		return SD_RES_BUFFER_SMALL;
+	}
 
 	ret = xread(fd, nodes, buf_len);
 	if (ret < 0) {
@@ -89,7 +93,7 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
 		goto err;
 	}
 
-	nr_nodes = ret / sizeof(struct sd_node);
+	*nr_nodes = ret / sizeof(struct sd_node);
 
 	if (timestamp) {
 		ret = xread(fd, timestamp, sizeof(*timestamp));
@@ -100,22 +104,23 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
 	}
 
 	close(fd);
-	return nr_nodes;
+	return SD_RES_SUCCESS;
 err:
 	if (fd >= 0)
 		close(fd);
-	return -1;
+	return SD_RES_NO_TAG;
 }
 
-int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len)
+int epoch_log_read(uint32_t epoch, struct sd_node *nodes,
+				int len, int *nr_nodes)
 {
-	return do_epoch_log_read(epoch, nodes, len, NULL);
+	return do_epoch_log_read(epoch, nodes, len, nr_nodes, NULL);
 }
 
 int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
-				int len, time_t *timestamp)
+				int len, int *nr_nodes, time_t *timestamp)
 {
-	return do_epoch_log_read(epoch, nodes, len, timestamp);
+	return do_epoch_log_read(epoch, nodes, len, nr_nodes, timestamp);
 }
 
 uint32_t get_latest_epoch(void)
-- 
1.8.3.2





More information about the sheepdog mailing list