[Sheepdog] [PATCH v2 3/3] farm: avoid unnecessary IO operation when recovering

Li Wenpeng levin108 at gmail.com
Tue Mar 13 03:10:25 CET 2012


From: levin li <xingke.lwp at taobao.com>

When the cluster is recovering, we only need to write to the snapshot
the objects which no longer belong to the node, instead of writing
all the objects. This reduces the number of IO operations.

When we try to read an object, we first read it from the local
object directory; if it is not found there, we read it from the snapshot.

Signed-off-by: levin li <xingke.lwp at taobao.com>
---
 sheep/farm/farm.c  |  136 +++++++++++++++++++++++++++++-----------------------
 sheep/farm/farm.h  |    3 +-
 sheep/farm/snap.c  |    9 +---
 sheep/farm/trunk.c |   63 ++++++++++++++++++++++++
 sheep/store.c      |   14 +++--
 5 files changed, 151 insertions(+), 74 deletions(-)

diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index 6db7dab..8453de4 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -279,6 +279,40 @@ static int farm_get_objlist(struct siocb *iocb)
 	return SD_RES_SUCCESS;
 }
 
+
+static void *read_working_object(uint64_t oid, int length)
+{
+	void *buf = NULL;
+	char path[PATH_MAX];
+	int fd, ret;
+
+	snprintf(path, sizeof(path), "%s%016" PRIx64, obj_path, oid);
+
+	fd = open(path, O_RDONLY, def_fmode);
+	if (fd < 0) {
+		eprintf("failed to open %s: %m\n", path);
+		goto out;
+	}
+
+	buf = malloc(length);
+	if (!buf) {
+		eprintf("no memory to allocate buffer.\n");
+		goto out;
+	}
+
+	ret = xread(fd, buf, length);
+	if (length != ret) {
+		eprintf("object read error.\n");
+		free(buf);
+		buf = NULL;
+		goto out;
+	}
+	close(fd);
+
+out:
+	return buf;
+}
+
 static void *retrieve_object_from_snap(uint64_t oid, int epoch)
 {
 	struct sha1_file_hdr hdr;
@@ -299,11 +333,11 @@ static void *retrieve_object_from_snap(uint64_t oid, int epoch)
 		struct sha1_file_hdr h;
 		if (trunk_buf->oid != oid)
 			continue;
+
 		buffer = sha1_file_read(trunk_buf->sha1, &h);
-		if (!buffer)
-			goto out;
 		break;
 	}
+
 out:
 	dprintf("oid %"PRIx64", epoch %d, %s\n", oid, epoch, buffer ? "succeed" : "fail");
 	free(trunk_free);
@@ -312,8 +346,26 @@ out:
 
 static int farm_read(uint64_t oid, struct siocb *iocb)
 {
+	int i;
+
 	if (iocb->epoch < sys->epoch) {
-		void *buffer = retrieve_object_from_snap(oid, iocb->epoch);
+		void *buffer;
+		buffer = read_working_object(oid, iocb->length);
+		if (!buffer) {
+			/* If reading the object from the targeted epoch fails,
+			 * we need to read it from a later epoch: at some epochs
+			 * we don't write the object to the snapshot because we
+			 * assume it is in the current local object directory, but
+			 * it may have been removed from the local directory in the
+			 * next epoch. In that case we should try to retrieve the
+			 * object upwards, since an object that is about to be
+			 * removed gets written to the snapshot at a later epoch. */
+			for (i = iocb->epoch; i < sys->epoch; i++) {
+				buffer = retrieve_object_from_snap(oid, i);
+				if (buffer)
+					break;
+			}
+		}
 		if (!buffer)
 			return SD_RES_NO_OBJ;
 		memcpy(iocb->buf, buffer, iocb->length);
@@ -368,31 +420,41 @@ out:
 static int farm_link(uint64_t oid, struct siocb *iocb, int tgt_epoch)
 {
 	int ret = SD_RES_EIO;
-	void *buf;
+	void *buf = NULL;
 	struct siocb io = { 0 };
+	int i;
 
 	dprintf("try link %"PRIx64" from snapshot with epoch %d\n", oid, tgt_epoch);
-	buf = retrieve_object_from_snap(oid, tgt_epoch);
+
+	for (i = tgt_epoch; i < sys->epoch; i++) {
+		buf = retrieve_object_from_snap(oid, i);
+		if (buf)
+			break;
+	}
 	if (!buf)
-		goto fail;
+		goto out;
 
 	io.length = SD_DATA_OBJ_SIZE;
 	io.buf = buf;
 	ret = farm_atomic_put(oid, &io);
-fail:
+out:
 	free(buf);
 	return ret;
 }
 
-static int farm_begin_recover(struct siocb *iocb)
+static int farm_end_recover(struct siocb *iocb)
 {
 	unsigned char snap_sha1[SHA1_LEN];
+	unsigned char trunk_sha1[SHA1_LEN];
 	int epoch = iocb->epoch - 1;
 
 	if (epoch == 0)
 		return SD_RES_SUCCESS;
 	dprintf("epoch %d\n", epoch);
-	if (snap_file_write(epoch, snap_sha1, 0) < 0)
+	if (trunk_file_write_recovery(trunk_sha1) < 0)
+		return SD_RES_EIO;
+
+	if (snap_file_write(epoch, trunk_sha1, snap_sha1, 0) < 0)
 		return SD_RES_EIO;
 
 	if (snap_log_write(iocb->epoch - 1, snap_sha1, 0) < 0)
@@ -401,58 +463,10 @@ static int farm_begin_recover(struct siocb *iocb)
 	return SD_RES_SUCCESS;
 }
 
-static int oid_stale(uint64_t oid)
-{
-	int i, vidx;
-	struct sd_vnode *vnodes = sys->vnodes;
-
-	for (i = 0; i < sys->nr_sobjs; i++) {
-		vidx = obj_to_sheep(vnodes, sys->nr_vnodes, oid, i);
-		if (is_myself(vnodes[vidx].addr, vnodes[vidx].port))
-			return 0;
-	}
-	return 1;
-}
-
-static int farm_end_recover(struct siocb *iocb)
-{
-	DIR *dir;
-	struct dirent *d;
-	uint64_t oid;
-	int ret = SD_RES_EIO;
-
-	dprintf("%d\n", iocb->epoch);
-	dir = opendir(obj_path);
-	if (!dir)
-		goto out;
-
-	while ((d = readdir(dir))) {
-		if (!strncmp(d->d_name, ".", 1))
-			continue;
-		oid = strtoull(d->d_name, NULL, 16);
-		if (oid == 0 || oid == ULLONG_MAX)
-			continue;
-		if (oid_stale(oid)) {
-			char p[PATH_MAX];
-			snprintf(p, sizeof(p), "%s%s", obj_path, d->d_name);
-			if (unlink(p) < 0) {
-				eprintf("%s:%m\n", p);
-				goto out_close;
-			}
-			trunk_put_entry(oid);
-			dprintf("remove oid %s\n", d->d_name);
-		}
-	}
-	ret = SD_RES_SUCCESS;
-out_close:
-	closedir(dir);
-out:
-	return ret;
-}
-
 static int farm_snapshot(struct siocb *iocb)
 {
 	unsigned char snap_sha1[SHA1_LEN];
+	unsigned char trunk_sha1[SHA1_LEN];
 	void *buffer;
 	int log_nr, ret = SD_RES_EIO, epoch;
 
@@ -462,7 +476,10 @@ static int farm_snapshot(struct siocb *iocb)
 
 	epoch = log_nr + 1;
 	dprintf("user epoch %d\n", epoch);
-	if (snap_file_write(epoch, snap_sha1, 1) < 0)
+	if (trunk_file_write(trunk_sha1, 1) < 0)
+		goto out;
+
+	if (snap_file_write(epoch, trunk_sha1, snap_sha1, 1) < 0)
 		goto out;
 
 	if (snap_log_write(epoch, snap_sha1, 1) < 0)
@@ -618,7 +635,6 @@ struct store_driver farm = {
 	.get_objlist = farm_get_objlist,
 	.link = farm_link,
 	.atomic_put = farm_atomic_put,
-	.begin_recover = farm_begin_recover,
 	.end_recover = farm_end_recover,
 	.snapshot = farm_snapshot,
 	.restore = farm_restore,
diff --git a/sheep/farm/farm.h b/sheep/farm/farm.h
index a9a1bbf..e7978b8 100644
--- a/sheep/farm/farm.h
+++ b/sheep/farm/farm.h
@@ -62,6 +62,7 @@ extern int sha1_file_try_delete(const unsigned char *sha1);
 
 /* trunk.c */
 extern int trunk_init(void);
+extern int trunk_file_write_recovery(unsigned char *outsha1);
 extern int trunk_file_write(unsigned char *outsha1, int user);
 extern void *trunk_file_read(unsigned char *sha1, struct sha1_file_hdr *);
 extern int trunk_update_entry(uint64_t oid);
@@ -73,7 +74,7 @@ extern int trunk_get_working_objlist(uint64_t *list);
 /* snap.c */
 extern int snap_init(void);
 extern void *snap_file_read(unsigned char *sha1, struct sha1_file_hdr *outhdr);
-extern int snap_file_write(int epoch, unsigned char *outsha1, int user);
+extern int snap_file_write(int epoch, unsigned char *trunksha1, unsigned char *outsha1, int user);
 extern void *snap_log_read(int *, int user);
 extern int snap_log_write(int epoch, unsigned char *sha1, int user);
 
diff --git a/sheep/farm/snap.c b/sheep/farm/snap.c
index 6c4b829..65fcc0c 100644
--- a/sheep/farm/snap.c
+++ b/sheep/farm/snap.c
@@ -138,24 +138,19 @@ void *snap_file_read(unsigned char *sha1, struct sha1_file_hdr *outhdr)
 	return buffer;
 }
 
-int snap_file_write(int epoch, unsigned char *outsha1, int user)
+int snap_file_write(int epoch, unsigned char *trunksha1, unsigned char *outsha1, int user)
 {
 	int ret = 0;
 	struct strbuf buf = STRBUF_INIT;
 	struct sd_node nodes[SD_MAX_NODES];
 	int tgt_epoch = user ? sys->epoch : epoch;
 	uint64_t epoch_size = epoch_log_read(tgt_epoch, (char *)nodes, sizeof(nodes));
-	unsigned char trunk[SHA1_LEN];
 	struct sha1_file_hdr hdr = { .size = epoch_size + SHA1_LEN,
 				     .priv = tgt_epoch };
 	memcpy(hdr.tag, TAG_SNAP, TAG_LEN);
 
-	if (trunk_file_write(trunk, user) < 0) {
-		ret =  -1;
-		goto err;
-	}
 	strbuf_add(&buf, &hdr, sizeof(hdr));
-	strbuf_add(&buf, trunk, SHA1_LEN);
+	strbuf_add(&buf, trunksha1, SHA1_LEN);
 	strbuf_add(&buf, (char *)nodes, epoch_size);
 	if (sha1_file_write((void *)buf.buf, buf.len, outsha1) < 0) {
 		ret = -1;
diff --git a/sheep/farm/trunk.c b/sheep/farm/trunk.c
index 2949704..6725c47 100644
--- a/sheep/farm/trunk.c
+++ b/sheep/farm/trunk.c
@@ -26,6 +26,7 @@
 #include "list.h"
 #include "util.h"
 #include "sheepdog_proto.h"
+#include "sheep_priv.h"
 
 #define TRUNK_ENTRY_DIRTY	0x00000001
 
@@ -279,6 +280,68 @@ static struct omap_entry *omap_file_insert(struct strbuf *buf, struct omap_entry
 	return NULL;
 }
 
+static int oid_stale(uint64_t oid)
+{
+	int i, vidx;
+	struct sd_vnode *vnodes = sys->vnodes;
+
+	for (i = 0; i < sys->nr_sobjs; i++) {
+		vidx = obj_to_sheep(vnodes, sys->nr_vnodes, oid, i);
+		if (is_myself(vnodes[vidx].addr, vnodes[vidx].port))
+			return 0;
+	}
+	return 1;
+}
+
+int trunk_file_write_recovery(unsigned char *outsha1)
+{
+	struct trunk_entry_incore *entry, *t;
+	struct strbuf buf;
+	char p[PATH_MAX];
+	uint64_t data_size = sizeof(struct trunk_entry) * trunk_entry_active_nr;
+	struct sha1_file_hdr hdr = { .size = data_size,
+				     .priv = trunk_entry_active_nr };
+	int ret = -1;
+	uint64_t oid;
+
+	memcpy(hdr.tag, TAG_TRUNK, TAG_LEN);
+	strbuf_init(&buf, sizeof(hdr) + data_size);
+	strbuf_add(&buf, &hdr, sizeof(hdr));
+
+	list_for_each_entry_safe(entry, t, &trunk_active_list, active_list) {
+		oid = entry->raw.oid;
+		if (oid_stale(oid)) {
+			dprintf("stale oid %"PRIx64"\n", oid);
+			if (trunk_entry_no_sha1(entry) || trunk_entry_is_dirty(entry)) {
+				if (fill_entry_new_sha1(entry) < 0) {
+					eprintf("write sha1 object fail.\n");
+					goto out;
+				}
+			}
+
+			strbuf_add(&buf, &entry->raw, sizeof(struct trunk_entry));
+
+			snprintf(p, sizeof(p), "%s%016"PRIx64, obj_path, entry->raw.oid);
+			if (unlink(p) < 0) {
+				eprintf("%s:%m\n", p);
+				goto out;
+			}
+			dprintf("remove file %"PRIx64"\n", entry->raw.oid);
+			put_entry(entry);
+		}
+	}
+
+	if (sha1_file_write((void *)buf.buf, buf.len, outsha1) < 0) {
+		dprintf("sha1 file write fail.\n");
+		goto out;
+	}
+
+	ret = SD_RES_SUCCESS;
+out:
+	strbuf_release(&buf);
+	return ret;
+}
+
 int trunk_file_write(unsigned char *outsha1, int user)
 {
 	struct strbuf buf, omap_buf = STRBUF_INIT;
diff --git a/sheep/store.c b/sheep/store.c
index 0069eaf..f64e126 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -1293,8 +1293,16 @@ static int recover_object_from_replica(uint64_t oid,
 		return -1;
 	}
 
+	if (is_vdi_obj(oid))
+		rlen = sizeof(struct sheepdog_inode);
+	else if (is_vdi_attr_obj(oid))
+		rlen = SD_MAX_VDI_ATTR_VALUE_LEN;
+	else
+		rlen = SD_DATA_OBJ_SIZE;
+
 	if (is_myself(entry->addr, entry->port)) {
 		iocb.epoch = epoch;
+		iocb.length = rlen;
 		ret = sd_store->link(oid, &iocb, tgt_epoch);
 		if (ret == SD_RES_SUCCESS) {
 			ret = 0;
@@ -1313,12 +1321,6 @@ static int recover_object_from_replica(uint64_t oid,
 		ret = -1;
 		goto out;
 	}
-	if (is_vdi_obj(oid))
-		rlen = sizeof(struct sheepdog_inode);
-	else if (is_vdi_attr_obj(oid))
-		rlen = SD_MAX_VDI_ATTR_VALUE_LEN;
-	else
-		rlen = SD_DATA_OBJ_SIZE;
 
 	memset(&hdr, 0, sizeof(hdr));
 	hdr.opcode = SD_OP_READ_OBJ;
-- 
1.7.1




More information about the sheepdog mailing list