[Sheepdog] [PATCH 0/2] the super object removal
FUJITA Tomonori
fujita.tomonori at lab.ntt.co.jp
Fri Mar 26 10:11:55 CET 2010
On Fri, 26 Mar 2010 10:17:19 +0900
FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp> wrote:
> We use the super object to manage the list of the existing VDIs.
>
> Unlike data objects, the super object is a directory and the directory
> is replicated on multiple nodes. I concluded that the super object
> recovery code is too tricky and complicated.
>
> So this patchset removes the super object. We manage the list of the
> existing VDIs like p2p applications. At startup, the nodes build the
> list of VDIs.
>
> TODO: restart support
Done. This can be applied on top of the patchset.
=
From: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
Subject: [PATCH] add reboot support without the super object
Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
collie/group.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++----
collie/net.c | 6 ++++-
collie/store.c | 52 +++++++++++++++++++++++++++++++++++-----
include/meta.h | 5 ++++
lib/net.c | 5 +++-
shepherd/shepherd.c | 2 +
6 files changed, 120 insertions(+), 14 deletions(-)
diff --git a/collie/group.c b/collie/group.c
index 836de83..2067870 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -199,9 +199,6 @@ void cluster_queue_request(struct work *work, int idx)
rsp->result = SD_RES_SUCCESS;
break;
- case SD_OP_READ_VDIS:
- rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
- break;
default:
/* forward request to group */
goto forward;
@@ -431,6 +428,58 @@ static void join(struct join_message *msg)
msg->cluster_status = sys->status;
}
+/* OR the in-use VDI bitmap reported by every other node into sys->vdi_inuse. */
+static void get_vdi_bitmap_from_all(void)
+{
+	struct sd_req hdr;
+	struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
+	int i, j, ret, nr_nodes, fd;
+	/* fixme: we need this until starting up. */
+	static DECLARE_BITMAP(tmp_vdi_inuse, SD_NR_VDIS);
+	struct sheepdog_node_list_entry entry[SD_MAX_NODES];
+	unsigned int rlen, wlen;
+	char host[128];
+
+	/* we don't need the proper order but this is the simplest way. */
+	nr_nodes = build_node_list(&sys->sd_node_list, entry);
+
+	for (i = 0; i < nr_nodes; i++) {
+		if (!memcmp(&sys->this_node, &entry[i], sizeof(sys->this_node)))
+			continue;
+
+		addr_to_str(host, sizeof(host), entry[i].addr, 0);
+
+		fd = connect_to(host, entry[i].port);
+		if (fd < 0) {
+			vprintf(SDOG_ERR "can't get the vdi bitmap %s, %m\n", host);
+			continue; /* no socket: nothing to query or close */
+		}
+
+		vprintf(SDOG_ERR "get the vdi bitmap %d %s\n", i, host);
+
+		memset(&hdr, 0, sizeof(hdr));
+		hdr.opcode = SD_OP_READ_VDIS;
+		hdr.epoch = sys->epoch;
+		hdr.data_length = sizeof(tmp_vdi_inuse);
+		rlen = hdr.data_length;
+		wlen = 0;
+
+		ret = exec_req(fd, &hdr, (char *)tmp_vdi_inuse,
+			       &wlen, &rlen);
+
+		close(fd);
+
+		if (ret || rsp->result != SD_RES_SUCCESS) {
+			vprintf(SDOG_ERR "can't get the vdi bitmap %d %d\n", ret,
+				rsp->result);
+			continue; /* reply unusable: don't merge the buffer */
+		}
+
+		for (j = 0; j < ARRAY_SIZE(sys->vdi_inuse); j++)
+			sys->vdi_inuse[j] |= tmp_vdi_inuse[j];
+	}
+}
+
static void update_cluster_info(struct join_message *msg)
{
int i;
@@ -498,9 +547,14 @@ out:
if (sys->status == SD_STATUS_STARTUP && msg->cluster_status == SD_STATUS_OK)
sys->epoch = get_latest_epoch();
- if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS)
- sys->status = msg->cluster_status;
+ if (sys->status != SD_STATUS_INCONSISTENT_EPOCHS) {
+ if (msg->cluster_status == SD_STATUS_OK) {
+ get_vdi_bitmap_from_all();
+ set_global_nr_copies(sys->nr_sobjs);
+ }
+ sys->status = msg->cluster_status;
+ }
return;
}
diff --git a/collie/net.c b/collie/net.c
index 137790c..09b2452 100644
--- a/collie/net.c
+++ b/collie/net.c
@@ -54,6 +54,7 @@ static void queue_request(struct request *req)
case SD_OP_MAKE_FS:
case SD_OP_GET_NODE_LIST:
case SD_OP_READ_EPOCH:
+ case SD_OP_READ_VDIS:
break;
default:
if (sys->status == SD_STATUS_STARTUP)
@@ -88,9 +89,12 @@ static void queue_request(struct request *req)
case SD_OP_MAKE_FS:
case SD_OP_SHUTDOWN:
case SD_OP_STAT_CLUSTER:
- case SD_OP_READ_VDIS:
req->work.fn = cluster_queue_request;
break;
+ case SD_OP_READ_VDIS:
+ rsp->result = read_vdis(req->data, hdr->data_length, &rsp->data_length);
+ req->done(req);
+ return;
default:
eprintf("unknown operation %d\n", hdr->opcode);
rsp->result = SD_RES_SYSTEM_ERROR;
diff --git a/collie/store.c b/collie/store.c
index 429124c..5c870b6 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -439,9 +439,6 @@ static int store_queue_request_local(struct request *req, char *buf, uint32_t ep
goto out;
}
- if (!is_data_obj(oid))
- break;
-
if (hdr->flags & SD_FLAG_CMD_COW) {
dprintf("%" PRIu64 "\n", hdr->cow_oid);
@@ -567,7 +564,7 @@ void store_queue_request(struct work *work, int idx)
ret = store_queue_request_local(req, buf, epoch);
out:
if (ret != SD_RES_SUCCESS) {
- dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %x\n",
+ dprintf("failed, %d, %x, %" PRIx64" , %u, %u, %d\n",
idx, opcode, oid, epoch, req_epoch, ret);
rsp->result = ret;
}
@@ -1075,6 +1072,8 @@ static int init_path(char *d, int *new)
{
int ret, retry = 0;
struct stat s;
+
+ *new = 0;
again:
ret = stat(d, &s);
if (ret) {
@@ -1123,12 +1122,50 @@ static int init_obj_path(char *base_path)
static int init_epoch_path(char *base_path)
{
- int new;
+ int new, ret;
+ uint32_t epoch;
+ DIR *dir;
+ char path[1024];
+ struct dirent *dent;
+ uint64_t oid;
epoch_path = zalloc(strlen(base_path) + strlen(EPOCH_PATH) + 1);
sprintf(epoch_path, "%s" EPOCH_PATH, base_path);
- return init_path(epoch_path, &new);
+ ret = init_path(epoch_path, &new);
+ if (new || ret)
+ return ret;
+
+ epoch = get_latest_epoch();
+
+ snprintf(path, sizeof(path), "%s/%08u", obj_path, epoch);
+
+ vprintf(SDOG_INFO "found the epoch dir, %s\n", path);
+
+ dir = opendir(path);
+ if (!dir) {
+ vprintf(SDOG_ERR "failed to open the epoch dir, %m\n");
+ return SD_RES_EIO;
+ }
+
+ while ((dent = readdir(dir))) {
+ if (!strcmp(dent->d_name, ".") ||
+ !strcmp(dent->d_name, ".."))
+ continue;
+
+ oid = strtoull(dent->d_name, NULL, 16);
+
+ if (is_data_obj(oid))
+ continue;
+
+ vprintf(SDOG_DEBUG "found the vdi obj, %" PRIx64 " %lu\n",
+ oid, oid_to_bit(oid));
+
+ set_bit(oid_to_bit(oid), sys->vdi_inuse);
+ }
+ closedir(dir);
+
+ return 0;
}
static int init_mnt_path(char *base_path)
@@ -1254,6 +1291,7 @@ static int global_nr_copies(uint32_t *copies, int set)
}
} else {
if (ret != sizeof(*copies)) {
+ eprintf("use 'user_xattr' option?\n");
return SD_RES_SYSTEM_ERROR;
}
}
@@ -1268,5 +1306,5 @@ int set_global_nr_copies(uint32_t copies)
int get_global_nr_copies(uint32_t *copies)
{
- return global_nr_copies(copies, 1);
+ return global_nr_copies(copies, 0);
}
diff --git a/include/meta.h b/include/meta.h
index 99fc38a..338f660 100644
--- a/include/meta.h
+++ b/include/meta.h
@@ -70,4 +70,9 @@ static inline uint64_t bit_to_oid(unsigned long nr)
return ((unsigned long long)nr << VDI_SPACE_SHIFT) | VDI_BIT;
}
+static inline unsigned long oid_to_bit(uint64_t oid)
+{
+ return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; /* undoes bit_to_oid(): drop VDI_BIT, shift back down */
+}
+
#endif
diff --git a/lib/net.c b/lib/net.c
index c85ee2d..ff261e5 100644
--- a/lib/net.c
+++ b/lib/net.c
@@ -412,8 +412,11 @@ int read_object(struct sheepdog_node_list_entry *e,
addr_to_str(name, sizeof(name), e[n].addr, 0);
fd = connect_to(name, e[n].port);
- if (fd < 0)
+ if (fd < 0) {
+ printf("%s(%d): %s, %m\n", __func__, __LINE__,
+ name);
return -1;
+ }
memset(&hdr, 0, sizeof(hdr));
hdr.epoch = node_version;
diff --git a/shepherd/shepherd.c b/shepherd/shepherd.c
index 0d7cecb..55a4fe0 100644
--- a/shepherd/shepherd.c
+++ b/shepherd/shepherd.c
@@ -449,6 +449,8 @@ int parse_vdi(vdi_parser_func_t func, void *data)
if (ret == sizeof(i))
func(i.oid, i.name, i.snap_id, 0, &i, data);
+ else
+ printf("error %lu %" PRIx64 ", %d\n", nr, bit_to_oid(nr), ret);
}
--
1.7.0
More information about the sheepdog
mailing list