[Sheepdog] [PATCH 3/5] sheep: remove object list file
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Thu Sep 1 11:46:42 CEST 2011
Sheepdog creates an object list file (*.list) when the epoch is
changed. This file is used to build the response to SD_OP_GET_OBJ_LIST.
But when many nodes are added to the cluster at the same time, it
takes a long time to create the many list files, and SD_OP_GET_OBJ_LIST
can result in a timeout error.
This patch completely removes the object list file from Sheepdog. To
build the response to SD_OP_GET_OBJ_LIST, sheep simply calls readdir()
on the store directory, and lists all the object IDs stored on the
local node.
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
sheep/store.c | 121 ++++++++++++++++----------------------------------------
1 files changed, 35 insertions(+), 86 deletions(-)
diff --git a/sheep/store.c b/sheep/store.c
index 2a491f2..05e4ace 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -107,6 +107,10 @@ static int stat_sheep(uint64_t *store_size, uint64_t *store_free, uint32_t epoch
return SD_RES_SUCCESS;
}
+static int merge_objlist(struct sheepdog_vnode_list_entry *entries, int nr_entries,
+ uint64_t *list1, int nr_list1,
+ uint64_t *list2, int nr_list2, int nr_objs);
+
static int get_obj_list(struct request *req)
{
DIR *dir;
@@ -114,12 +118,12 @@ static int get_obj_list(struct request *req)
struct sd_list_req *hdr = (struct sd_list_req *)&req->rq;
struct sd_list_rsp *rsp = (struct sd_list_rsp *)&req->rp;
uint64_t oid;
- uint32_t epoch = hdr->tgt_epoch;
+ uint32_t epoch;
char path[1024];
uint64_t *p = (uint64_t *)req->data;
int nr = 0;
uint64_t *objlist = NULL;
- int obj_nr = 0, fd, i;
+ int obj_nr, i;
int res = SD_RES_SUCCESS;
int buf_len;
char *buf;
@@ -133,65 +137,34 @@ static int get_obj_list(struct request *req)
goto out;
}
- if (epoch == 1)
- goto local;
-
- snprintf(path, sizeof(path), "%s%08u/list", obj_path, epoch);
-
- fd = open(path, O_RDONLY);
- if (fd < 0) {
- eprintf("failed to open %s, %s\n", path, strerror(errno));
- res = SD_RES_EIO;
- goto out;
- }
- obj_nr = read(fd, buf, buf_len);
- dprintf("read objct list from %s, %"PRIu32"\n", path, obj_nr);
-
- obj_nr /= sizeof(uint64_t);
objlist = (uint64_t *)buf;
- for (i = 0; i < obj_nr; i++) {
- p[nr++] = objlist[i];
+ obj_nr = 0;
+ for (epoch = 1; epoch <= hdr->tgt_epoch; epoch++) {
+ snprintf(path, sizeof(path), "%s%08u/", obj_path, epoch);
- if (nr * sizeof(uint64_t) >= hdr->data_length)
- break;
- }
- close(fd);
+ dprintf("%"PRIu32", %s\n", sys->this_node.port, path);
-local:
- snprintf(path, sizeof(path), "%s%08u/", obj_path, hdr->tgt_epoch);
-
- dprintf("%"PRIu32", %s\n", sys->this_node.port, path);
-
- dir = opendir(path);
- if (!dir) {
- eprintf("%s\n", path);
- res = SD_RES_EIO;
- goto out;
- }
-
- while ((d = readdir(dir))) {
- if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
- continue;
-
- oid = strtoull(d->d_name, NULL, 16);
- if (oid == 0)
+ dir = opendir(path);
+ if (!dir) {
+ eprintf("%s\n", path);
continue;
+ }
- for (i = 0; i < obj_nr; i++)
- if (objlist[i] == oid)
- break;
- if (i < obj_nr)
- continue;
+ while ((d = readdir(dir))) {
+ if (!strcmp(d->d_name, ".") || !strcmp(d->d_name, ".."))
+ continue;
- p[nr++] = oid;
+ oid = strtoull(d->d_name, NULL, 16);
+ if (oid == 0)
+ continue;
- if (nr * sizeof(uint64_t) >= hdr->data_length)
- break;
- }
+ objlist[obj_nr++] = oid;
+ }
- eprintf("nr = %"PRIu32"\n", nr);
+ closedir(dir);
- closedir(dir);
+ nr = merge_objlist(NULL, 0, p, nr, objlist, obj_nr, 0);
+ }
out:
free(buf);
rsp->data_length = nr * sizeof(uint64_t);
@@ -1677,17 +1650,20 @@ static int merge_objlist(struct sheepdog_vnode_list_entry *entries, int nr_entri
uint64_t *list2, int nr_list2, int nr_objs)
{
int i, j, idx;
+ int old_nr_list1 = nr_list1;
for (i = 0; i < nr_list2; i++) {
- for (j = 0; j < nr_objs; j++) {
- idx = obj_to_sheep(entries, nr_entries, list2[i], j);
- if (is_myself(entries[idx].addr, entries[idx].port))
- break;
+ if (entries) {
+ for (j = 0; j < nr_objs; j++) {
+ idx = obj_to_sheep(entries, nr_entries, list2[i], j);
+ if (is_myself(entries[idx].addr, entries[idx].port))
+ break;
+ }
+ if (j == nr_objs)
+ continue;
}
- if (j == nr_objs)
- continue;
- if (bsearch(list2 + i, list1, nr_list1, sizeof(*list1), obj_cmp))
+ if (bsearch(list2 + i, list1, old_nr_list1, sizeof(*list1), obj_cmp))
continue;
list1[nr_list1++] = list2[i];
@@ -1761,9 +1737,6 @@ static void __start_recovery(struct work *work, int idx)
struct recovery_work *rw = container_of(work, struct recovery_work, work);
uint32_t epoch = rw->epoch;
int nr_objs;
- int fd;
- char path[PATH_MAX], tmp_path[PATH_MAX];
- int ret;
dprintf("%u\n", epoch);
@@ -1801,30 +1774,6 @@ static void __start_recovery(struct work *work, int idx)
goto fail;
}
- snprintf(path, sizeof(path), "%s%08u/list", obj_path, epoch);
- snprintf(tmp_path, sizeof(tmp_path), "%s%08u/list.tmp", obj_path, epoch);
-
- dprintf("write object list to %s\n", tmp_path);
- fd = open(tmp_path, O_RDWR | O_CREAT | O_SYNC, def_fmode);
- if (fd < 0) {
- eprintf("failed to open %s, %s, %m\n", tmp_path, strerror(errno));
- goto fail;
- }
- ret = write(fd, rw->oids, sizeof(*rw->oids) * rw->count);
- if (ret != sizeof(uint64_t) * rw->count) {
- eprintf("failed to write to %s, %m\n", tmp_path);
- close(fd);
- goto fail;
- }
- close(fd);
-
- dprintf("rename %s to %s\n", tmp_path, path);
- ret = rename(tmp_path, path);
- if (ret < 0) {
- eprintf("failed to rename %s to %s, %m\n", tmp_path, path);
- goto fail;
- }
-
return;
fail:
rw->count = 0;
--
1.7.2.5
More information about the sheepdog
mailing list