[sheepdog] [PATCH v2] sheep: make epoch_log_read() clean
Hitoshi Mitake
mitake.hitoshi at lab.ntt.co.jp
Tue Mar 19 03:36:40 CET 2013
Current epoch_log_read() and the way of treating epoch files are
confusing. Because they heavily depend on the condition:
sizeof(time_t) < sizeof(struct sd_node). The condition is true
currently, but this requirement is implicit and hard to understand.
In addition, epoch_log_read() doesn't handle a rise of EINTR when it
calls read(). This behavior might cause a subtle bug.
This patch eliminates the implicit requirement and adds the handling
of EINTR. For doing it, this patch implements a new function,
epoch_log_read_with_timestamp(). If callers require a timestamp at
tails of epoch files, they should call this one instead of
epoch_log_read().
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
v2
- don't change the member "time" of struct epoch_log for keeping alignment
- avoid needless memcpy() for timestamp
include/internal_proto.h | 2 +-
sheep/group.c | 18 +++++++++++---
sheep/ops.c | 23 +++++++++++------
sheep/sheep_priv.h | 6 ++++-
sheep/store.c | 59 ++++++++++++++++++++++++++++++++++++---------
5 files changed, 82 insertions(+), 26 deletions(-)
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 3677405..6f1fdb3 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -191,7 +191,7 @@ struct sd_node {
struct epoch_log {
uint64_t ctime;
- uint64_t time;
+ uint64_t time; /* treated as time_t */
uint32_t epoch;
uint32_t nr_nodes;
uint32_t nr_copies;
diff --git a/sheep/group.c b/sheep/group.c
index 2f8814f..b991e5e 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -19,6 +19,7 @@
#include <pthread.h>
#include <urcu/uatomic.h>
#include <math.h>
+#include <time.h>
#include "sheepdog_proto.h"
#include "sheep_priv.h"
@@ -189,7 +190,8 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch)
nr_nodes = epoch_log_read(epoch, nodes, sizeof(nodes));
if (nr_nodes < 0) {
- nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes));
+ nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes),
+ NULL);
if (nr_nodes == 0)
return NULL;
}
@@ -453,16 +455,19 @@ static void clear_exceptional_node_lists(void)
list_del(&n->list);
}
-int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len)
+int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
+ time_t *timestamp)
{
int i, nr, ret;
struct vnode_info *vinfo = get_vnode_info();
+ char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
nr = vinfo->nr_nodes;
for (i = 0; i < nr; i++) {
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
const struct sd_node *node = vinfo->nodes + i;
+ int nodes_len;
if (node_is_local(node))
continue;
@@ -475,12 +480,17 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len)
hdr.data_length = len;
hdr.obj.tgt_epoch = epoch;
hdr.epoch = sys_epoch();
- ret = sheep_exec_req(&node->nid, &hdr, nodes);
+ ret = sheep_exec_req(&node->nid, &hdr, buf);
if (ret != SD_RES_SUCCESS)
continue;
+ nodes_len = rsp->data_length - sizeof(timestamp);
+ memcpy((void *)nodes, buf, nodes_len);
+ if (timestamp)
+ memcpy(timestamp, buf + nodes_len, sizeof(timestamp));
+
put_vnode_info(vinfo);
- return rsp->data_length / sizeof(*nodes);
+ return nodes_len / sizeof(struct sd_node);
}
/*
diff --git a/sheep/ops.c b/sheep/ops.c
index b9634ae..a230eb6 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -421,17 +421,17 @@ static int local_stat_cluster(struct request *req)
memset(log, 0, sizeof(*log));
log->epoch = epoch;
log->ctime = get_cluster_ctime();
- log->nr_nodes = epoch_log_read(epoch, log->nodes,
- sizeof(log->nodes));
+ log->nr_nodes = epoch_log_read_with_timestamp(epoch, log->nodes,
+ sizeof(log->nodes),
+ (time_t *)&log->time);
if (log->nr_nodes == -1)
log->nr_nodes = epoch_log_read_remote(epoch, log->nodes,
- sizeof(log->nodes));
+ sizeof(log->nodes),
+ (time_t *)&log->time);
log->nr_copies = sys->nr_copies;
rsp->data_length += sizeof(*log);
- /* FIXME: this hack would require sizeof(time_t) < sizeof(log->nodes[0]) */
- log->time = *(uint64_t *)(&log->nodes[log->nr_nodes]);
epoch--;
}
@@ -460,15 +460,22 @@ static int local_get_obj_list(struct request *req)
static int local_get_epoch(struct request *req)
{
uint32_t epoch = req->rq.obj.tgt_epoch;
- int nr_nodes;
+ int nr_nodes, nodes_len;
+ time_t timestamp;
sd_dprintf("%d", epoch);
- nr_nodes = epoch_log_read(epoch, req->data, req->rq.data_length);
+ nr_nodes =
+ epoch_log_read_with_timestamp(epoch, req->data,
+ req->rq.data_length - sizeof(timestamp),
+ ×tamp);
if (nr_nodes == -1)
return SD_RES_NO_TAG;
- req->rp.data_length = nr_nodes * sizeof(struct sd_node) + sizeof(time_t);
+ nodes_len = nr_nodes * sizeof(struct sd_node);
+ memcpy((void *)((char *)req->data + nodes_len), ×tamp,
+ sizeof(timestamp));
+ req->rp.data_length = nodes_len + sizeof(time_t);
return SD_RES_SUCCESS;
}
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 003f3c9..bdb829e 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -14,6 +14,7 @@
#include <inttypes.h>
#include <stdbool.h>
#include <urcu/uatomic.h>
+#include <time.h>
#include "sheepdog_proto.h"
#include "event.h"
@@ -289,7 +290,10 @@ int store_file_write(void *buffer, size_t len);
void *store_file_read(void);
int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len);
-int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len);
+int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
+ int len, time_t *timestamp);
+int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
+ time_t *timestamp);
uint32_t get_latest_epoch(void);
int init_config_path(const char *base_path);
int set_cluster_ctime(uint64_t ctime);
diff --git a/sheep/store.c b/sheep/store.c
index bd5e0ec..c327b9b 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -74,27 +74,62 @@ err_open:
return -1;
}
-int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len)
+static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
+ time_t *timestamp)
{
- int fd;
+ int fd, ret, nr_nodes;
char path[PATH_MAX];
+ struct stat epoch_stat;
snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
fd = open(path, O_RDONLY);
- if (fd < 0) {
- sd_eprintf("failed to open epoch %"PRIu32" log", epoch);
- return -1;
- }
+ if (fd < 0)
+ goto err;
- len = read(fd, nodes, len);
+ memset(&epoch_stat, 0, sizeof(epoch_stat));
+ ret = fstat(fd, &epoch_stat);
+ if (ret < 0)
+ goto err;
- close(fd);
+ if (len < epoch_stat.st_size - sizeof(time_t))
+ goto err;
- if (len < 0) {
- sd_eprintf("failed to read epoch %"PRIu32" log", epoch);
- return -1;
+ ret = xread(fd, nodes, epoch_stat.st_size - sizeof(timestamp));
+ if (ret < 0)
+ goto err;
+
+ assert(ret % sizeof(struct sd_node) == 0);
+ nr_nodes = ret / sizeof(struct sd_node);
+
+ if (timestamp) {
+ ret = xread(fd, timestamp, sizeof(timestamp));
+ if (ret != sizeof(timestamp))
+ goto err;
}
- return len / sizeof(*nodes);
+
+ ret = nr_nodes;
+ goto end;
+
+err:
+ sd_eprintf("failed to open epoch %"PRIu32" log", epoch);
+ ret = -1;
+
+end:
+ if (0 <= fd)
+ close(fd);
+
+ return ret;
+}
+
+int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len)
+{
+ return do_epoch_log_read(epoch, nodes, len, NULL);
+}
+
+int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
+ int len, time_t *timestamp)
+{
+ return do_epoch_log_read(epoch, nodes, len, timestamp);
}
uint32_t get_latest_epoch(void)
--
1.7.2.5
More information about the sheepdog
mailing list