[sheepdog] [PATCH v4] optimize epoch_log structure to reduce network and memory
Hitoshi Mitake
mitake.hitoshi at gmail.com
Sun Aug 3 15:54:48 CEST 2014
At Thu, 31 Jul 2014 15:27:10 +0800,
Ruoyu wrote:
>
> Current epoch_log contains a long nodes array to sync nodes and
> epoch in the cluster. It is simple, but there is a potential
> performance issue because each epoch log occupies nearly 500
> KBytes. If the cluster members change frequently, epoch is lifted
> frequently, more and more unused data is transferred between
> client and server. If we don't find a way, the performance will
> go from bad to worse.
>
> Although the max node number is 6144, we only use a few of them.
> Therefore, the first solution is using a zero-length array,
> client (dog) and server (sheep) will negotiate an appropriate
> supported node number. This way will spend much less memory and
> will run much faster than before.
>
> Signed-off-by: Ruoyu <liangry at ucweb.com>
>
> v4:
> - variable renamed.
> - use sd_nodes_nr as default node number.
>
> v3:
> - epoch_log_read series functions are changed to propagate error
> to upper layer.
>
> v2:
> - internal data structure is changed.
> ---
> dog/cluster.c | 39 +++++++++++++++++++++++----------
> dog/vdi.c | 36 ++++++++++++++++++++++--------
> include/internal_proto.h | 2 +-
> include/sheepdog_proto.h | 1 +
> sheep/group.c | 57 +++++++++++++++++++++++++-----------------------
> sheep/ops.c | 55 +++++++++++++++++++++++++++-------------------
> sheep/sheep_priv.h | 10 +++++----
> sheep/store.c | 25 ++++++++++++---------
> 8 files changed, 141 insertions(+), 84 deletions(-)
Applied, thanks.
Hitoshi
>
> diff --git a/dog/cluster.c b/dog/cluster.c
> index 188d4f4..e36e308 100644
> --- a/dog/cluster.c
> +++ b/dog/cluster.c
> @@ -141,14 +141,14 @@ static int cluster_format(int argc, char **argv)
> return EXIT_SUCCESS;
> }
>
> -static void print_nodes(const struct epoch_log *logs, int epoch)
> +static void print_nodes(const struct epoch_log *logs, uint16_t flags)
> {
> int i, nr_disk;
> const struct sd_node *entry;
>
> - for (i = 0; i < logs[epoch].nr_nodes; i++) {
> - entry = logs[epoch].nodes + i;
> - if (logs->flags & SD_CLUSTER_FLAG_DISKMODE) {
> + for (i = 0; i < logs->nr_nodes; i++) {
> + entry = logs->nodes + i;
> + if (flags & SD_CLUSTER_FLAG_DISKMODE) {
> for (nr_disk = 0; nr_disk < DISK_MAX; nr_disk++) {
> if (entry->disks[nr_disk].disk_id == 0)
> break;
> @@ -169,21 +169,34 @@ static int cluster_info(int argc, char **argv)
> int i, ret;
> struct sd_req hdr;
> struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> - struct epoch_log *logs;
> + struct epoch_log *logs, *log;
> + char *next_log;
> int nr_logs, log_length;
> time_t ti, ct;
> struct tm tm;
> char time_str[128];
> + uint32_t nodes_nr;
>
> - log_length = sd_epoch * sizeof(struct epoch_log);
> + nodes_nr = sd_nodes_nr;
> + log_length = sd_epoch * (sizeof(struct epoch_log)
> + + nodes_nr * sizeof(struct sd_node));
> logs = xmalloc(log_length);
>
> +retry:
> sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
> hdr.data_length = log_length;
> + hdr.cluster.nodes_nr = nodes_nr;
>
> ret = dog_exec_req(&sd_nid, &hdr, logs);
> if (ret < 0)
> goto error;
> + if (rsp->result == SD_RES_BUFFER_SMALL) {
> + nodes_nr *= 2;
> + log_length = sd_epoch * (sizeof(struct epoch_log)
> + + nodes_nr * sizeof(struct sd_node));
> + logs = xrealloc(logs, log_length);
> + goto retry;
> + }
>
> /* show cluster status */
> if (!raw_output)
> @@ -230,10 +243,12 @@ static int cluster_info(int argc, char **argv)
> printf("Epoch Time Version\n");
> }
>
> - nr_logs = rsp->data_length / sizeof(struct epoch_log);
> + nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> + + nodes_nr * sizeof(struct sd_node));
> + next_log = (char *)logs;
> for (i = 0; i < nr_logs; i++) {
> -
> - ti = logs[i].time;
> + log = (struct epoch_log *)next_log;
> + ti = log->time;
> if (raw_output) {
> snprintf(time_str, sizeof(time_str), "%" PRIu64, (uint64_t) ti);
> } else {
> @@ -241,10 +256,12 @@ static int cluster_info(int argc, char **argv)
> strftime(time_str, sizeof(time_str), "%Y-%m-%d %H:%M:%S", &tm);
> }
>
> - printf(raw_output ? "%s %d" : "%s %6d", time_str, logs[i].epoch);
> + printf(raw_output ? "%s %d" : "%s %6d", time_str, log->epoch);
> printf(" [");
> - print_nodes(logs, i);
> + print_nodes(log, logs->flags);
> printf("]\n");
> + next_log = (char *)log->nodes
> + + nodes_nr * sizeof(struct sd_node);
> }
>
> free(logs);
> diff --git a/dog/vdi.c b/dog/vdi.c
> index 2e3f7b3..18f8799 100644
> --- a/dog/vdi.c
> +++ b/dog/vdi.c
> @@ -1096,47 +1096,63 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
> struct sd_req hdr;
> struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
> - struct epoch_log *logs;
> + struct epoch_log *logs, *log;
> + char *next_log;
> int nr_logs, log_length;
> + uint32_t nodes_nr;
>
> - log_length = sd_epoch * sizeof(struct epoch_log);
> + nodes_nr = sd_nodes_nr;
> + log_length = sd_epoch * (sizeof(struct epoch_log)
> + + nodes_nr * sizeof(struct sd_node));
> logs = xmalloc(log_length);
>
> +retry:
> sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
> hdr.data_length = log_length;
> + hdr.cluster.nodes_nr = nodes_nr;
>
> ret = dog_exec_req(&sd_nid, &hdr, logs);
> if (ret < 0)
> goto error;
>
> + if (rsp->result == SD_RES_BUFFER_SMALL) {
> + nodes_nr *= 2;
> + log_length = sd_epoch * (sizeof(struct epoch_log)
> + + nodes_nr * sizeof(struct sd_node));
> + logs = xrealloc(logs, log_length);
> + goto retry;
> + }
> if (rsp->result != SD_RES_SUCCESS) {
> printf("%s\n", sd_strerror(rsp->result));
> goto error;
> }
>
> - nr_logs = rsp->data_length / sizeof(struct epoch_log);
> + nr_logs = rsp->data_length / (sizeof(struct epoch_log)
> + + nodes_nr * sizeof(struct sd_node));
> + next_log = (char *)logs;
> for (i = nr_logs - 1; i >= 0; i--) {
> struct rb_root vroot = RB_ROOT;
> struct rb_root nroot = RB_ROOT;
>
> + log = (struct epoch_log *)next_log;
> printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
> - oid, logs[i].epoch, nr_copies);
> + oid, log->epoch, nr_copies);
> printf("---------------------------------------------------\n");
>
> /*
> * When # of nodes is less than nr_copies, we only print
> * remaining nodes that holds all the remaining copies.
> */
> - if (logs[i].nr_nodes < nr_copies) {
> - for (j = 0; j < logs[i].nr_nodes; j++) {
> - const struct node_id *n = &logs[i].nodes[j].nid;
> + if (log->nr_nodes < nr_copies) {
> + for (j = 0; j < log->nr_nodes; j++) {
> + const struct node_id *n = &log->nodes[j].nid;
>
> printf("%s\n", addr_to_str(n->addr, n->port));
> }
> continue;
> }
> - for (int k = 0; k < logs[i].nr_nodes; k++)
> - rb_insert(&nroot, &logs[i].nodes[k], rb, node_cmp);
> + for (int k = 0; k < log->nr_nodes; k++)
> + rb_insert(&nroot, &log->nodes[k], rb, node_cmp);
> if (logs->flags & SD_CLUSTER_FLAG_DISKMODE)
> disks_to_vnodes(&nroot, &vroot);
> else
> @@ -1148,6 +1164,8 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
> printf("%s\n", addr_to_str(n->addr, n->port));
> }
> rb_destroy(&vroot, struct sd_vnode, rb);
> + next_log = (char *)log->nodes
> + + nodes_nr * sizeof(struct sd_node);
> }
>
> free(logs);
> diff --git a/include/internal_proto.h b/include/internal_proto.h
> index 37afb46..d61b5a5 100644
> --- a/include/internal_proto.h
> +++ b/include/internal_proto.h
> @@ -221,7 +221,7 @@ struct epoch_log {
> uint8_t __pad[3];
> uint16_t flags;
> char drv_name[STORE_LEN];
> - struct sd_node nodes[SD_MAX_NODES];
> + struct sd_node nodes[0];
> };
>
> struct vdi_op_message {
> diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
> index 7cfdccb..349aa07 100644
> --- a/include/sheepdog_proto.h
> +++ b/include/sheepdog_proto.h
> @@ -165,6 +165,7 @@ struct sd_req {
> uint8_t copy_policy;
> uint16_t flags;
> uint32_t tag;
> + uint32_t nodes_nr;
> } cluster;
> struct {
> uint32_t old_vid;
> diff --git a/sheep/group.c b/sheep/group.c
> index f53ad0f..9597495 100644
> --- a/sheep/group.c
> +++ b/sheep/group.c
> @@ -160,13 +160,13 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
> {
> struct sd_node nodes[SD_MAX_NODES];
> struct rb_root nroot = RB_ROOT;
> - int nr_nodes;
> + int nr_nodes = 0, ret;
>
> - nr_nodes = epoch_log_read(epoch, nodes, sizeof(nodes));
> - if (nr_nodes < 0) {
> - nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes),
> - NULL, cur_vinfo);
> - if (nr_nodes == 0)
> + ret = epoch_log_read(epoch, nodes, sizeof(nodes), &nr_nodes);
> + if (ret != SD_RES_SUCCESS) {
> + ret = epoch_log_read_remote(epoch, nodes, sizeof(nodes),
> + &nr_nodes, NULL, cur_vinfo);
> + if (ret != SD_RES_SUCCESS)
> return NULL;
> }
> for (int i = 0; i < nr_nodes; i++)
> @@ -178,12 +178,12 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
> int get_nodes_epoch(uint32_t epoch, struct vnode_info *cur_vinfo,
> struct sd_node *nodes, int len)
> {
> - int nr_nodes;
> + int nr_nodes = 0, ret;
>
> - nr_nodes = epoch_log_read(epoch, nodes, len);
> - if (nr_nodes < 0)
> - nr_nodes = epoch_log_read_remote(epoch, nodes, len,
> - NULL, cur_vinfo);
> + ret = epoch_log_read(epoch, nodes, len, &nr_nodes);
> + if (ret != SD_RES_SUCCESS)
> + epoch_log_read_remote(epoch, nodes, len, &nr_nodes,
> + NULL, cur_vinfo);
> return nr_nodes;
> }
>
> @@ -350,16 +350,17 @@ error:
> }
>
> int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> - time_t *timestamp, struct vnode_info *vinfo)
> + int *nr_nodes, time_t *timestamp,
> + struct vnode_info *vinfo)
> {
> - char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
> + char *buf = xzalloc(len + sizeof(time_t));
> const struct sd_node *node;
> int ret;
>
> rb_for_each_entry(node, &vinfo->nroot, rb) {
> struct sd_req hdr;
> struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
> - int nodes_len, nr_nodes;
> + int nodes_len;
>
> if (node_is_local(node))
> continue;
> @@ -369,6 +370,10 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> hdr.obj.tgt_epoch = epoch;
> hdr.epoch = sys_epoch();
> ret = sheep_exec_req(&node->nid, &hdr, buf);
> + if (ret == SD_RES_BUFFER_SMALL) {
> + free(buf);
> + return ret;
> + }
> if (ret != SD_RES_SUCCESS)
> continue;
>
> @@ -376,18 +381,16 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> memcpy((void *)nodes, buf, nodes_len);
> if (timestamp)
> memcpy(timestamp, buf + nodes_len, sizeof(*timestamp));
> + free(buf);
>
> - nr_nodes = nodes_len / sizeof(struct sd_node);
> + *nr_nodes = nodes_len / sizeof(struct sd_node);
> /* epoch file is missing in local node, try to create one */
> - update_epoch_log(epoch, nodes, nr_nodes);
> - return nr_nodes;
> + update_epoch_log(epoch, nodes, *nr_nodes);
> + return SD_RES_SUCCESS;
> }
>
> - /*
> - * If no node has targeted epoch log, return 0 here to at least
> - * allow reading older epoch logs.
> - */
> - return 0;
> + free(buf);
> + return SD_RES_NO_TAG;
> }
>
> static bool cluster_ctime_check(const struct cluster_info *cinfo)
> @@ -1054,7 +1057,7 @@ main_fn void sd_update_node_handler(struct sd_node *node)
> int create_cluster(int port, int64_t zone, int nr_vnodes,
> bool explicit_addr)
> {
> - int ret;
> + int nr_nodes = 0, ret;
>
> if (!sys->cdrv) {
> sys->cdrv = find_cdrv(DEFAULT_CLUSTER_DRIVER);
> @@ -1089,11 +1092,11 @@ int create_cluster(int port, int64_t zone, int nr_vnodes,
>
> sys->cinfo.epoch = get_latest_epoch();
> if (sys->cinfo.epoch) {
> - sys->cinfo.nr_nodes = epoch_log_read(sys->cinfo.epoch,
> - sys->cinfo.nodes,
> - sizeof(sys->cinfo.nodes));
> - if (sys->cinfo.nr_nodes == -1)
> + ret = epoch_log_read(sys->cinfo.epoch, sys->cinfo.nodes,
> + sizeof(sys->cinfo.nodes), &nr_nodes);
> + if (ret != SD_RES_SUCCESS)
> return -1;
> + sys->cinfo.nr_nodes = nr_nodes;
> }
> sys->cinfo.status = SD_STATUS_WAIT;
>
> diff --git a/sheep/ops.c b/sheep/ops.c
> index 3d20c7d..a2e94c5 100644
> --- a/sheep/ops.c
> +++ b/sheep/ops.c
> @@ -436,23 +436,27 @@ static int local_stat_cluster(struct request *req)
> {
> struct sd_rsp *rsp = &req->rp;
> struct epoch_log *elog;
> + char *next_elog;
> int i, max_elogs;
> uint32_t epoch;
> + uint32_t nodes_nr = req->rq.cluster.nodes_nr;
>
> if (req->vinfo == NULL) {
> sd_debug("cluster is not started up");
> goto out;
> }
>
> - max_elogs = req->rq.data_length / sizeof(*elog);
> + max_elogs = req->rq.data_length / (sizeof(*elog)
> + + nodes_nr * sizeof(struct sd_node));
> + next_elog = (char *)req->data;
> epoch = get_latest_epoch();
> for (i = 0; i < max_elogs; i++) {
> - int nr_nodes;
> + int nr_nodes = 0, ret;
>
> if (epoch <= 0)
> break;
>
> - elog = (struct epoch_log *)req->data + i;
> + elog = (struct epoch_log *)next_elog;
> memset(elog, 0, sizeof(*elog));
>
> /* some filed only need to store in first elog */
> @@ -467,20 +471,27 @@ static int local_stat_cluster(struct request *req)
> }
>
> elog->epoch = epoch;
> - nr_nodes = epoch_log_read_with_timestamp(epoch, elog->nodes,
> - sizeof(elog->nodes),
> - (time_t *)&elog->time);
> - if (nr_nodes == -1)
> - nr_nodes = epoch_log_read_remote(epoch, elog->nodes,
> - sizeof(elog->nodes),
> - (time_t *)&elog->time,
> - req->vinfo);
> - assert(nr_nodes >= 0);
> - assert(nr_nodes <= SD_MAX_NODES);
> - elog->nr_nodes = nr_nodes;
> -
> -
> - rsp->data_length += sizeof(*elog);
> + if (nodes_nr > 0) {
> + ret = epoch_log_read_with_timestamp(
> + epoch, elog->nodes,
> + nodes_nr * sizeof(struct sd_node),
> + &nr_nodes, (time_t *)&elog->time);
> + if (ret == SD_RES_NO_TAG)
> + ret = epoch_log_read_remote(
> + epoch, elog->nodes,
> + nodes_nr * sizeof(struct sd_node),
> + &nr_nodes, (time_t *)&elog->time,
> + req->vinfo);
> + if (ret == SD_RES_BUFFER_SMALL)
> + return ret;
> + elog->nr_nodes = nr_nodes;
> + } else
> + elog->nr_nodes = 0;
> +
> + next_elog = (char *)elog->nodes
> + + nodes_nr * sizeof(struct sd_node);
> + rsp->data_length += sizeof(*elog)
> + + nodes_nr * sizeof(struct sd_node);
> epoch--;
> }
> out:
> @@ -507,17 +518,17 @@ static int local_get_obj_list(struct request *req)
> static int local_get_epoch(struct request *req)
> {
> uint32_t epoch = req->rq.obj.tgt_epoch;
> - int nr_nodes, nodes_len;
> + int nr_nodes = 0, nodes_len, ret;
> time_t timestamp;
>
> sd_debug("%d", epoch);
>
> - nr_nodes =
> + ret =
> epoch_log_read_with_timestamp(epoch, req->data,
> req->rq.data_length - sizeof(timestamp),
> - &timestamp);
> - if (nr_nodes == -1)
> - return SD_RES_NO_TAG;
> + &nr_nodes, &timestamp);
> + if (ret != SD_RES_SUCCESS)
> + return ret;
>
> nodes_len = nr_nodes * sizeof(struct sd_node);
> memcpy((void *)((char *)req->data + nodes_len), &timestamp,
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index 4402bb7..36bb8f9 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -394,11 +394,13 @@ int set_cluster_shutdown(bool);
> int store_file_write(void *buffer, size_t len);
> void *store_file_read(void);
>
> -int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len);
> +int epoch_log_read(uint32_t epoch, struct sd_node *nodes,
> + int len, int *nr_nodes);
> int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
> - int len, time_t *timestamp);
> -int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
> - time_t *timestamp, struct vnode_info *vinfo);
> + int len, int *nr_nodes, time_t *timestamp);
> +int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes,
> + int len, int *nr_nodes, time_t *timestamp,
> + struct vnode_info *vinfo);
> uint32_t get_latest_epoch(void);
> void init_config_path(const char *base_path);
> int init_config_file(void);
> diff --git a/sheep/store.c b/sheep/store.c
> index ea445fc..80e0406 100644
> --- a/sheep/store.c
> +++ b/sheep/store.c
> @@ -51,9 +51,9 @@ int update_epoch_log(uint32_t epoch, struct sd_node *nodes, size_t nr_nodes)
> }
>
> static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
> - time_t *timestamp)
> + int *nr_nodes, time_t *timestamp)
> {
> - int fd, ret, nr_nodes, buf_len;
> + int fd, ret, buf_len;
> char path[PATH_MAX];
> struct stat epoch_stat;
>
> @@ -72,10 +72,14 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
> }
>
> buf_len = epoch_stat.st_size - sizeof(*timestamp);
> - if (buf_len < 0 || len < buf_len) {
> + if (buf_len < 0) {
> sd_err("invalid epoch %"PRIu32" log", epoch);
> goto err;
> }
> + if (len < buf_len) {
> + close(fd);
> + return SD_RES_BUFFER_SMALL;
> + }
>
> ret = xread(fd, nodes, buf_len);
> if (ret < 0) {
> @@ -89,7 +93,7 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
> goto err;
> }
>
> - nr_nodes = ret / sizeof(struct sd_node);
> + *nr_nodes = ret / sizeof(struct sd_node);
>
> if (timestamp) {
> ret = xread(fd, timestamp, sizeof(*timestamp));
> @@ -100,22 +104,23 @@ static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
> }
>
> close(fd);
> - return nr_nodes;
> + return SD_RES_SUCCESS;
> err:
> if (fd >= 0)
> close(fd);
> - return -1;
> + return SD_RES_NO_TAG;
> }
>
> -int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len)
> +int epoch_log_read(uint32_t epoch, struct sd_node *nodes,
> + int len, int *nr_nodes)
> {
> - return do_epoch_log_read(epoch, nodes, len, NULL);
> + return do_epoch_log_read(epoch, nodes, len, nr_nodes, NULL);
> }
>
> int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
> - int len, time_t *timestamp)
> + int len, int *nr_nodes, time_t *timestamp)
> {
> - return do_epoch_log_read(epoch, nodes, len, timestamp);
> + return do_epoch_log_read(epoch, nodes, len, nr_nodes, timestamp);
> }
>
> uint32_t get_latest_epoch(void)
> --
> 1.8.3.2
>
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog
More information about the sheepdog
mailing list