[Sheepdog] [PATCH 3/5] sheep: remove object list file

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Thu Sep 1 11:46:42 CEST 2011


Sheepdog creates an object list file (*.list) when the epoch is
changed.  This file is used for the response of SD_OP_GET_OBJ_LIST.
But when many nodes are added to the cluster at the same time, it
takes a long time to create many list files, and SD_OP_GET_OBJ_LIST
can result in a timeout error.

This patch completely removes the object list file from Sheepdog.  In
the response of SD_OP_GET_OBJ_LIST, sheep simply calls readdir() on
the store directory, and lists all the object IDs stored on the local
node.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/store.c |  121 ++++++++++++++++----------------------------------------
 1 files changed, 35 insertions(+), 86 deletions(-)

diff --git a/sheep/store.c b/sheep/store.c
index 2a491f2..05e4ace 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -107,6 +107,10 @@ static int stat_sheep(uint64_t *store_size, uint64_t *store_free, uint32_t epoch
 	return SD_RES_SUCCESS;
 }
 
+static int merge_objlist(struct sheepdog_vnode_list_entry *entries, int nr_entries,
+			 uint64_t *list1, int nr_list1,
+			 uint64_t *list2, int nr_list2, int nr_objs);
+
 static int get_obj_list(struct request *req)
 {
 	DIR *dir;
@@ -114,12 +118,12 @@ static int get_obj_list(struct request *req)
 	struct sd_list_req *hdr = (struct sd_list_req *)&req->rq;
 	struct sd_list_rsp *rsp = (struct sd_list_rsp *)&req->rp;
 	uint64_t oid;
-	uint32_t epoch = hdr->tgt_epoch;
+	uint32_t epoch;
 	char path[1024];
 	uint64_t *p = (uint64_t *)req->data;
 	int nr = 0;
 	uint64_t *objlist = NULL;
-	int obj_nr = 0, fd, i;
+	int obj_nr, i;
 	int res = SD_RES_SUCCESS;
 	int buf_len;
 	char *buf;
@@ -133,65 +137,34 @@ static int get_obj_list(struct request *req)
 		goto out;
 	}
 
-	if (epoch == 1)
-		goto local;
-
-	snprintf(path, sizeof(path), "%s%08u/list", obj_path, epoch);
-
-	fd = open(path, O_RDONLY);
-	if (fd < 0) {
-		eprintf("failed to open %s, %s\n", path, strerror(errno));
-		res = SD_RES_EIO;
-		goto out;
-	}
-	obj_nr = read(fd, buf, buf_len);
-	dprintf("read objct list from %s, %"PRIu32"\n", path, obj_nr);
-
-	obj_nr /= sizeof(uint64_t);
 	objlist = (uint64_t *)buf;
-	for (i = 0; i < obj_nr; i++) {
-		p[nr++] = objlist[i];
+	obj_nr = 0;
+	for (epoch = 1; epoch <= hdr->tgt_epoch; epoch++) {
+		snprintf(path, sizeof(path), "%s%08u/", obj_path, epoch);
 
-		if (nr * sizeof(uint64_t) >= hdr->data_length)
-			break;
-	}
-	close(fd);
+		dprintf("%"PRIu32", %s\n", sys->this_node.port, path);
 
-local:
-	snprintf(path, sizeof(path), "%s%08u/", obj_path, hdr->tgt_epoch);
-
-	dprintf("%"PRIu32", %s\n", sys->this_node.port, path);
-
-	dir = opendir(path);
-	if (!dir) {
-		eprintf("%s\n", path);
-		res = SD_RES_EIO;
-		goto out;
-	}
-
-	while ((d = readdir(dir))) {
-		if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
-			continue;
-
-		oid = strtoull(d->d_name, NULL, 16);
-		if (oid == 0)
+		dir = opendir(path);
+		if (!dir) {
+			eprintf("%s\n", path);
 			continue;
+		}
 
-		for (i = 0; i < obj_nr; i++)
-			if (objlist[i] == oid)
-				break;
-		if (i < obj_nr)
-			continue;
+		while ((d = readdir(dir))) {
+			if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+				continue;
 
-		p[nr++] = oid;
+			oid = strtoull(d->d_name, NULL, 16);
+			if (oid == 0)
+				continue;
 
-		if (nr * sizeof(uint64_t) >= hdr->data_length)
-			break;
-	}
+			objlist[obj_nr++] = oid;
+		}
 
-	eprintf("nr = %"PRIu32"\n", nr);
+		closedir(dir);
 
-	closedir(dir);
+		nr = merge_objlist(NULL, 0, p, nr, objlist, obj_nr, 0);
+	}
 out:
 	free(buf);
 	rsp->data_length = nr * sizeof(uint64_t);
@@ -1677,17 +1650,20 @@ static int merge_objlist(struct sheepdog_vnode_list_entry *entries, int nr_entri
 			 uint64_t *list2, int nr_list2, int nr_objs)
 {
 	int i, j, idx;
+	int old_nr_list1 = nr_list1;
 
 	for (i = 0; i < nr_list2; i++) {
-		for (j = 0; j < nr_objs; j++) {
-			idx = obj_to_sheep(entries, nr_entries, list2[i], j);
-			if (is_myself(entries[idx].addr, entries[idx].port))
-				break;
+		if (entries) {
+			for (j = 0; j < nr_objs; j++) {
+				idx = obj_to_sheep(entries, nr_entries, list2[i], j);
+				if (is_myself(entries[idx].addr, entries[idx].port))
+					break;
+			}
+			if (j == nr_objs)
+				continue;
 		}
-		if (j == nr_objs)
-			continue;
 
-		if (bsearch(list2 + i, list1, nr_list1, sizeof(*list1), obj_cmp))
+		if (bsearch(list2 + i, list1, old_nr_list1, sizeof(*list1), obj_cmp))
 			continue;
 
 		list1[nr_list1++] = list2[i];
@@ -1761,9 +1737,6 @@ static void __start_recovery(struct work *work, int idx)
 	struct recovery_work *rw = container_of(work, struct recovery_work, work);
 	uint32_t epoch = rw->epoch;
 	int nr_objs;
-	int fd;
-	char path[PATH_MAX], tmp_path[PATH_MAX];
-	int ret;
 
 	dprintf("%u\n", epoch);
 
@@ -1801,30 +1774,6 @@ static void __start_recovery(struct work *work, int idx)
 		goto fail;
 	}
 
-	snprintf(path, sizeof(path), "%s%08u/list", obj_path, epoch);
-	snprintf(tmp_path, sizeof(tmp_path), "%s%08u/list.tmp", obj_path, epoch);
-
-	dprintf("write object list to %s\n", tmp_path);
-	fd = open(tmp_path, O_RDWR | O_CREAT | O_SYNC, def_fmode);
-	if (fd < 0) {
-		eprintf("failed to open %s, %s, %m\n", tmp_path, strerror(errno));
-		goto fail;
-	}
-	ret = write(fd, rw->oids, sizeof(*rw->oids) * rw->count);
-	if (ret != sizeof(uint64_t) * rw->count) {
-		eprintf("failed to write to %s, %m\n", tmp_path);
-		close(fd);
-		goto fail;
-	}
-	close(fd);
-
-	dprintf("rename %s to %s\n", tmp_path, path);
-	ret = rename(tmp_path, path);
-	if (ret < 0) {
-		eprintf("failed to rename %s to %s, %m\n", tmp_path, path);
-		goto fail;
-	}
-
 	return;
 fail:
 	rw->count = 0;
-- 
1.7.2.5




More information about the sheepdog mailing list