[sheepdog] [PATCH 2/2] optimize epoch_log structure to reduce network and memory overhead
Ruoyu
liangry at ucweb.com
Mon Jul 7 14:42:53 CEST 2014
The current epoch_log structure contains a long nodes array that is
used to sync nodes and epochs across the cluster. This is simple, but
it is a potential performance issue because each epoch log occupies
nearly 500 KBytes. If the cluster membership changes frequently, the
epoch is incremented frequently as well, so unless we shrink each
epoch log, performance will go from bad to worse.
Although the maximum number of nodes is 6144, we only use a few of
them. Therefore, the first solution is to use a zero-length array and
let the client (dog) and the server (sheep) negotiate an appropriate
number of supported nodes: dog starts with room for 32 nodes per epoch
log and doubles it each time sheep replies with SD_RES_BUFFER_SMALL.
This approach uses much less memory and runs much faster than before.
Signed-off-by: Ruoyu <liangry at ucweb.com>
---
dog/alter.c | 3 +++
dog/cluster.c | 40 +++++++++++++++++++++++++++++-----------
dog/dog.c | 1 +
dog/vdi.c | 37 ++++++++++++++++++++++++++++---------
include/internal_proto.h | 2 +-
include/sheepdog_proto.h | 3 +++
sheep/group.c | 8 +++++++-
sheep/ops.c | 47 +++++++++++++++++++++++++++++++----------------
sheep/store.c | 4 ++--
9 files changed, 105 insertions(+), 40 deletions(-)
diff --git a/dog/alter.c b/dog/alter.c
index 7af7f9f..9801cc2 100644
--- a/dog/alter.c
+++ b/dog/alter.c
@@ -63,8 +63,11 @@ static int alter_cluster_copy(int argc, char **argv)
log_length = sd_epoch * sizeof(struct epoch_log);
logs = xmalloc(log_length);
+
sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
hdr.data_length = log_length;
+ hdr.epoch_log.support_nodes = 0;
+
ret = dog_exec_req(&sd_nid, &hdr, logs);
if (ret < 0)
goto failure;
diff --git a/dog/cluster.c b/dog/cluster.c
index 69ec07c..4731767 100644
--- a/dog/cluster.c
+++ b/dog/cluster.c
@@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
return EXIT_SUCCESS;
}
-static void print_nodes(const struct epoch_log *logs, int epoch)
+static void print_nodes(const struct epoch_log *logs, uint16_t flags)
{
int i, nr_disk;
const struct sd_node *entry;
- for (i = 0; i < logs[epoch].nr_nodes; i++) {
- entry = logs[epoch].nodes + i;
- if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
+ for (i = 0; i < logs->nr_nodes; i++) {
+ entry = logs->nodes + i;
+ if (flags & SD_CLUSTER_FLAG_DISKMODE) {
for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
if (entry->disks[nr_disk].disk_id == 0)
break;
@@ -169,21 +169,35 @@ static int cluster_info(int argc, char **argv)
int i, ret;
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
- struct epoch_log *logs;
+ struct epoch_log *logs, *log;
+ char *next_log;
int nr_logs, log_length;
time_t ti, ct;
struct tm tm;
char time_str[128];
+ uint16_t support_nodes;
- log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+ support_nodes = DEFAULT_SUPPORT_NODES;
+ log_length = sd_epoch * (sizeof(struct epoch_log)
+ + support_nodes * sizeof(struct sd_node));
logs = xmalloc(log_length);
+retry:
sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
hdr.data_length = log_length;
+ hdr.epoch_log.support_nodes = support_nodes;
ret = dog_exec_req(&sd_nid, &hdr, logs);
if (ret < 0)
goto error;
+ if (rsp->result == SD_RES_BUFFER_SMALL) {
+ support_nodes *= 2;
+ log_length = sd_epoch * (sizeof(struct epoch_log)
+ + support_nodes * sizeof(struct sd_node));
+ logs = xrealloc(logs, log_length);
+ goto retry;
+ }
/* show cluster status */
if (!raw_output)
@@ -230,10 +244,12 @@ static int cluster_info(int argc, char **argv)
printf("Epoch Time Version\n");
}
- nr_logs = rsp->data_length / sizeof(struct epoch_log);
+ nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+ + support_nodes * sizeof(struct sd_node));
+ next_log = (char *)logs;
for (i = 0; i < nr_logs; i++) {
-
- ti = logs[i].time;
+ log = (struct epoch_log *)next_log;
+ ti = log->time;
if (raw_output) {
snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti);
} else {
@@ -241,10 +257,12 @@ static int cluster_info(int argc, char **argv)
strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm);
}
- printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch);
+ printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
printf(" [");
- print_nodes(logs, i);
+ print_nodes(log, logs->flags);
printf("]\n");
+ next_log = (char *)log->nodes
+ + support_nodes * sizeof(struct sd_node);
}
free(logs);
diff --git a/dog/dog.c b/dog/dog.c
index 46992ec..fda7906 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -119,6 +119,7 @@ int update_node_list(int max_nodes)
sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
hdr.data_length = log_length;
+ hdr.epoch_log.support_nodes = 0;
ret = dog_exec_req(&sd_nid, &hdr, logs);
if (ret < 0)
diff --git a/dog/vdi.c b/dog/vdi.c
index 49a2139..5fd0b7b 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -964,47 +964,64 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
- struct epoch_log *logs;
+ struct epoch_log *logs, *log;
+ char *next_log;
int nr_logs, log_length;
+ uint16_t support_nodes;
- log_length = sd_epoch * sizeof(struct epoch_log);
+#define DEFAULT_SUPPORT_NODES 32
+ support_nodes = DEFAULT_SUPPORT_NODES;
+ log_length = sd_epoch * (sizeof(struct epoch_log)
+ + support_nodes * sizeof(struct sd_node));
logs = xmalloc(log_length);
+retry:
sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
hdr.data_length = log_length;
+ hdr.epoch_log.support_nodes = support_nodes;
ret = dog_exec_req(&sd_nid, &hdr, logs);
if (ret < 0)
goto error;
+ if (rsp->result == SD_RES_BUFFER_SMALL) {
+ support_nodes *= 2;
+ log_length = sd_epoch * (sizeof(struct epoch_log)
+ + support_nodes * sizeof(struct sd_node));
+ logs = xrealloc(logs, log_length);
+ goto retry;
+ }
if (rsp->result != SD_RES_SUCCESS) {
printf("%s\n", sd_strerror(rsp->result));
goto error;
}
- nr_logs = rsp->data_length / sizeof(struct epoch_log);
+ nr_logs = rsp->data_length / (sizeof(struct epoch_log)
+ + support_nodes * sizeof(struct sd_node));
+ next_log = (char *)logs;
for (i = nr_logs - 1; i >= 0; i--) {
struct rb_root vroot = RB_ROOT;
struct rb_root nroot = RB_ROOT;
+ log = (struct epoch_log *)next_log;
printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
- oid, logs[i].epoch, nr_copies);
+ oid, log->epoch, nr_copies);
printf("---------------------------------------------------\n");
/*
* When # of nodes is less than nr_copies, we only print
* remaining nodes that holds all the remaining copies.
*/
- if (logs[i].nr_nodes < nr_copies) {
- for (j = 0; j < logs[i].nr_nodes; j++) {
- const struct node_id *n = &logs[i].nodes[j].nid;
+ if (log->nr_nodes < nr_copies) {
+ for (j = 0; j < log->nr_nodes; j++) {
+ const struct node_id *n = &log->nodes[j].nid;
printf("%s\n", addr_to_str(n->addr, n->port));
}
continue;
}
- for (int k = 0; k < logs[i].nr_nodes; k++)
- rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
+ for (int k = 0; k < log->nr_nodes; k++)
+ rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
disks_to_vnodes(&nroot, &vroot);
else
@@ -1016,6 +1033,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
printf("%s\n", addr_to_str(n->addr, n->port));
}
rb_destroy(&vroot, struct sd_vnode, rb);
+ next_log = (char *)log->nodes
+ + support_nodes * sizeof(struct sd_node);
}
free(logs);
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 7ec2872..ad4d822 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -211,7 +211,7 @@ struct epoch_log {
uint8_t __pad[3];
uint16_t flags;
char drv_name[STORE_LEN];
- struct sd_node nodes[SD_MAX_NODES];
+ struct sd_node nodes[0];
};
struct vdi_op_message {
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 76fad51..1355ecb 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -180,6 +180,9 @@ struct sd_req {
uint8_t addr[16];
uint16_t port;
} node_addr;
+ struct {
+ uint16_t support_nodes;
+ } epoch_log;
uint32_t __pad[8];
};
diff --git a/sheep/group.c b/sheep/group.c
index adfd798..27e3574 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -352,7 +352,7 @@ error:
int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
time_t *timestamp, struct vnode_info *vinfo)
{
- char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
+ char *buf = xzalloc(len + sizeof(time_t));
const struct sd_node *node;
int ret;
@@ -369,6 +369,10 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
hdr.obj.tgt_epoch = epoch;
hdr.epoch = sys_epoch();
ret = sheep_exec_req(&node->nid, &hdr, buf);
+ if (ret == SD_RES_BUFFER_SMALL) {
+ free(buf);
+ return -2;
+ }
if (ret != SD_RES_SUCCESS)
continue;
@@ -377,6 +381,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
if (timestamp)
memcpy(timestamp, buf + nodes_len, sizeof(*timestamp));
+ free(buf);
return nodes_len / sizeof(struct sd_node);
}
@@ -384,6 +389,7 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
* If no node has targeted epoch log, return 0 here to at least
* allow reading older epoch logs.
*/
+ free(buf);
return 0;
}
diff --git a/sheep/ops.c b/sheep/ops.c
index fb26077..12957d2 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -438,15 +438,19 @@ static int local_stat_cluster(struct request *req)
{
struct sd_rsp *rsp = &req->rp;
struct epoch_log *elog;
+ char *next_elog;
int i, max_elogs;
uint32_t epoch;
+ uint16_t support_nodes = req->rq.epoch_log.support_nodes;
if (req->vinfo == NULL) {
sd_debug("cluster is not started up");
goto out;
}
- max_elogs = req->rq.data_length / sizeof(*elog);
+ max_elogs = req->rq.data_length / (sizeof(*elog)
+ + support_nodes * sizeof(struct sd_node));
+ next_elog = (char *)req->data;
epoch = get_latest_epoch();
for (i = 0; i < max_elogs; i++) {
int nr_nodes;
@@ -454,7 +458,7 @@ static int local_stat_cluster(struct request *req)
if (epoch <= 0)
break;
- elog = (struct epoch_log *)req->data + i;
+ elog = (struct epoch_log *)next_elog;
memset(elog, 0, sizeof(*elog));
/* some filed only need to store in first elog */
@@ -469,20 +473,29 @@ static int local_stat_cluster(struct request *req)
}
elog->epoch = epoch;
- nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes,
- sizeof(elog->nodes),
- (time_t *)&elog->time);
- if (nr_nodes == -1)
- nr_nodes = epoch_log_read_remote(epoch, elog->nodes,
- sizeof(elog->nodes),
- (time_t *)&elog->time,
- req->vinfo);
- assert(nr_nodes >= 0);
- assert(nr_nodes <= SD_MAX_NODES);
- elog->nr_nodes = nr_nodes;
-
-
- rsp->data_length += sizeof(*elog);
+ if (support_nodes > 0) {
+ nr_nodes = epoch_log_read_with_timestamp(
+ epoch, elog->nodes,
+ support_nodes * sizeof(struct sd_node),
+ (time_t *)&elog->time);
+ if (nr_nodes == -1)
+ nr_nodes = epoch_log_read_remote(
+ epoch, elog->nodes,
+ support_nodes * sizeof(struct sd_node),
+ (time_t *)&elog->time,
+ req->vinfo);
+ if (nr_nodes == -2)
+ return SD_RES_BUFFER_SMALL;
+ assert(nr_nodes >= 0);
+ assert(nr_nodes <= SD_MAX_NODES);
+ elog->nr_nodes = nr_nodes;
+ } else
+ elog->nr_nodes = 0;
+
+ next_elog = (char *)elog->nodes
+ + support_nodes * sizeof(struct sd_node);
+ rsp->data_length += sizeof(*elog)
+ + support_nodes * sizeof(struct sd_node);
epoch--;
}
out:
@@ -520,6 +533,8 @@ static int local_get_epoch(struct request *req)
&timestamp);
if (nr_nodes == -1)
return SD_RES_NO_TAG;
+ if (nr_nodes == -2)
+ return SD_RES_BUFFER_SMALL;
nodes_len = nr_nodes * sizeof(struct sd_node);
memcpy((void *)((char *)req->data + nodes_len), &timestamp,
diff --git a/sheep/store.c b/sheep/store.c
index eee88c7..70fddb8 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -63,8 +63,8 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
}
if (len < epoch_stat.st_size - sizeof(*timestamp)) {
- sd_err("invalid epoch %"PRIu32" log", epoch);
- goto err;
+ close(fd);
+ return -2;
}
ret = xread(fd, nodes, epoch_stat.st_size - sizeof(*timestamp));
--
1.8.3.2
More information about the sheepdog
mailing list