[sheepdog] [PATCH v2] sheep: make epoch_log_read() clean

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Tue Mar 19 03:36:40 CET 2013


Current epoch_log_read() and the way of treating epoch files are
confusing. Because they heavily depend on the condition:
sizeof(time_t) < sizeof(struct sd_node). The condition is true
currently, but this requirement is implicit and hard to understand.

In addition, epoch_log_read() doesn't handle a rise of EINTR when it
calls read(). This behavior might cause a subtle bug.

This patch eliminates the implicit requirement and adds the handling
of EINTR. For doing it, this patch implements a new function,
epoch_log_read_with_timestamp(). If callers require a timestamp at
tails of epoch files, they should call this one instead of
epoch_log_read().

Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---

v2
 - don't change the member "time" of struct epoch_log for keeping alignment
 - avoid needless memcpy() for timestamp

 include/internal_proto.h |    2 +-
 sheep/group.c            |   18 +++++++++++---
 sheep/ops.c              |   23 +++++++++++------
 sheep/sheep_priv.h       |    6 ++++-
 sheep/store.c            |   59 ++++++++++++++++++++++++++++++++++++---------
 5 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 3677405..6f1fdb3 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -191,7 +191,7 @@ struct sd_node {
 
 struct epoch_log {
 	uint64_t ctime;
-	uint64_t time;
+	uint64_t time;		/* treated as time_t */
 	uint32_t epoch;
 	uint32_t nr_nodes;
 	uint32_t nr_copies;
diff --git a/sheep/group.c b/sheep/group.c
index 2f8814f..b991e5e 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -19,6 +19,7 @@
 #include <pthread.h>
 #include <urcu/uatomic.h>
 #include <math.h>
+#include <time.h>
 
 #include "sheepdog_proto.h"
 #include "sheep_priv.h"
@@ -189,7 +190,8 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch)
 
 	nr_nodes = epoch_log_read(epoch, nodes, sizeof(nodes));
 	if (nr_nodes < 0) {
-		nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes));
+		nr_nodes = epoch_log_read_remote(epoch, nodes, sizeof(nodes),
+						NULL);
 		if (nr_nodes == 0)
 			return NULL;
 	}
@@ -453,16 +455,19 @@ static void clear_exceptional_node_lists(void)
 		list_del(&n->list);
 }
 
-int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len)
+int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
+			time_t *timestamp)
 {
 	int i, nr, ret;
 	struct vnode_info *vinfo = get_vnode_info();
+	char buf[SD_MAX_NODES * sizeof(struct sd_node) + sizeof(time_t)];
 
 	nr = vinfo->nr_nodes;
 	for (i = 0; i < nr; i++) {
 		struct sd_req hdr;
 		struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
 		const struct sd_node *node = vinfo->nodes + i;
+		int nodes_len;
 
 		if (node_is_local(node))
 			continue;
@@ -475,12 +480,17 @@ int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len)
 		hdr.data_length = len;
 		hdr.obj.tgt_epoch = epoch;
 		hdr.epoch = sys_epoch();
-		ret = sheep_exec_req(&node->nid, &hdr, nodes);
+		ret = sheep_exec_req(&node->nid, &hdr, buf);
 		if (ret != SD_RES_SUCCESS)
 			continue;
 
+		nodes_len = rsp->data_length - sizeof(timestamp);
+		memcpy((void *)nodes, buf, nodes_len);
+		if (timestamp)
+			memcpy(timestamp, buf + nodes_len, sizeof(timestamp));
+
 		put_vnode_info(vinfo);
-		return rsp->data_length / sizeof(*nodes);
+		return nodes_len / sizeof(struct sd_node);
 	}
 
 	/*
diff --git a/sheep/ops.c b/sheep/ops.c
index b9634ae..a230eb6 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -421,17 +421,17 @@ static int local_stat_cluster(struct request *req)
 		memset(log, 0, sizeof(*log));
 		log->epoch = epoch;
 		log->ctime = get_cluster_ctime();
-		log->nr_nodes = epoch_log_read(epoch, log->nodes,
-					       sizeof(log->nodes));
+		log->nr_nodes = epoch_log_read_with_timestamp(epoch, log->nodes,
+							sizeof(log->nodes),
+							(time_t *)&log->time);
 		if (log->nr_nodes == -1)
 			log->nr_nodes = epoch_log_read_remote(epoch, log->nodes,
-							      sizeof(log->nodes));
+							sizeof(log->nodes),
+							(time_t *)&log->time);
 
 		log->nr_copies = sys->nr_copies;
 
 		rsp->data_length += sizeof(*log);
-		/* FIXME: this hack would require sizeof(time_t) < sizeof(log->nodes[0]) */
-		log->time = *(uint64_t *)(&log->nodes[log->nr_nodes]);
 		epoch--;
 	}
 
@@ -460,15 +460,22 @@ static int local_get_obj_list(struct request *req)
 static int local_get_epoch(struct request *req)
 {
 	uint32_t epoch = req->rq.obj.tgt_epoch;
-	int nr_nodes;
+	int nr_nodes, nodes_len;
+	time_t timestamp;
 
 	sd_dprintf("%d", epoch);
 
-	nr_nodes = epoch_log_read(epoch, req->data, req->rq.data_length);
+	nr_nodes =
+		epoch_log_read_with_timestamp(epoch, req->data,
+					req->rq.data_length - sizeof(timestamp),
+					&timestamp);
 	if (nr_nodes == -1)
 		return SD_RES_NO_TAG;
 
-	req->rp.data_length = nr_nodes * sizeof(struct sd_node) + sizeof(time_t);
+	nodes_len = nr_nodes * sizeof(struct sd_node);
+	memcpy((void *)((char *)req->data + nodes_len), &timestamp,
+		sizeof(timestamp));
+	req->rp.data_length = nodes_len + sizeof(time_t);
 	return SD_RES_SUCCESS;
 }
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 003f3c9..bdb829e 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -14,6 +14,7 @@
 #include <inttypes.h>
 #include <stdbool.h>
 #include <urcu/uatomic.h>
+#include <time.h>
 
 #include "sheepdog_proto.h"
 #include "event.h"
@@ -289,7 +290,10 @@ int store_file_write(void *buffer, size_t len);
 void *store_file_read(void);
 
 int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len);
-int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len);
+int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
+				int len, time_t *timestamp);
+int epoch_log_read_remote(uint32_t epoch, struct sd_node *nodes, int len,
+			time_t *timestamp);
 uint32_t get_latest_epoch(void);
 int init_config_path(const char *base_path);
 int set_cluster_ctime(uint64_t ctime);
diff --git a/sheep/store.c b/sheep/store.c
index bd5e0ec..c327b9b 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -74,27 +74,62 @@ err_open:
 	return -1;
 }
 
-int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len)
+static int do_epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len,
+			time_t *timestamp)
 {
-	int fd;
+	int fd, ret, nr_nodes;
 	char path[PATH_MAX];
+	struct stat epoch_stat;
 
 	snprintf(path, sizeof(path), "%s%08u", epoch_path, epoch);
 	fd = open(path, O_RDONLY);
-	if (fd < 0) {
-		sd_eprintf("failed to open epoch %"PRIu32" log", epoch);
-		return -1;
-	}
+	if (fd < 0)
+		goto err;
 
-	len = read(fd, nodes, len);
+	memset(&epoch_stat, 0, sizeof(epoch_stat));
+	ret = fstat(fd, &epoch_stat);
+	if (ret < 0)
+		goto err;
 
-	close(fd);
+	if (len < epoch_stat.st_size - sizeof(time_t))
+		goto err;
 
-	if (len < 0) {
-		sd_eprintf("failed to read epoch %"PRIu32" log", epoch);
-		return -1;
+	ret = xread(fd, nodes, epoch_stat.st_size - sizeof(timestamp));
+	if (ret < 0)
+		goto err;
+
+	assert(ret % sizeof(struct sd_node) == 0);
+	nr_nodes = ret / sizeof(struct sd_node);
+
+	if (timestamp) {
+		ret = xread(fd, timestamp, sizeof(timestamp));
+		if (ret != sizeof(timestamp))
+			goto err;
 	}
-	return len / sizeof(*nodes);
+
+	ret = nr_nodes;
+	goto end;
+
+err:
+	sd_eprintf("failed to open epoch %"PRIu32" log", epoch);
+	ret = -1;
+
+end:
+	if (0 <= fd)
+		close(fd);
+
+	return ret;
+}
+
+int epoch_log_read(uint32_t epoch, struct sd_node *nodes, int len)
+{
+	return do_epoch_log_read(epoch, nodes, len, NULL);
+}
+
+int epoch_log_read_with_timestamp(uint32_t epoch, struct sd_node *nodes,
+				int len, time_t *timestamp)
+{
+	return do_epoch_log_read(epoch, nodes, len, timestamp);
 }
 
 uint32_t get_latest_epoch(void)
-- 
1.7.2.5




More information about the sheepdog mailing list