[Sheepdog] [PATCH 13/14] remove failed node from cpg ring
MORITA Kazutaka
morita.kazutaka at lab.ntt.co.jp
Sat May 14 09:03:58 CEST 2011
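This patch handles a local disk crash or a file system unmount that
occurs while Sheepdog is running. When access to a local object fails
with SD_RES_EIO, the node sends an SD_MSG_LEAVE message; on delivery,
the node is removed from the node list and the epoch is updated.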
Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
sheep/group.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++--
sheep/sdnet.c | 26 ++++++++++++++++++++++++
sheep/sheep_priv.h | 2 +
sheep/store.c | 11 ++++++++-
4 files changed, 89 insertions(+), 5 deletions(-)
diff --git a/sheep/group.c b/sheep/group.c
index 6deb4a2..cb693ff 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -65,6 +65,10 @@ struct join_message {
} nodes[SD_MAX_NODES];
};
+struct leave_message {
+ struct message_header header; /* header only; no extra payload */
+};
+
struct vdi_op_message {
struct message_header header;
struct sd_vdi_req req;
@@ -841,6 +845,9 @@ static void __sd_deliver_done(struct cpg_event *cevent)
struct message_header *m;
char name[128];
int do_recovery;
+ struct node *node;
+ struct sheepdog_node_list_entry e[SD_MAX_NODES];
+ int nr;
m = w->msg;
@@ -849,13 +856,33 @@ static void __sd_deliver_done(struct cpg_event *cevent)
case SD_MSG_JOIN:
update_cluster_info((struct join_message *)m);
break;
+ case SD_MSG_LEAVE:
+ node = find_node(&sys->sd_node_list, m->nodeid, m->pid);
+ if (node) {
+ sys->nr_vnodes = 0;
+
+ list_del(&node->list);
+ free(node);
+ if (sys->status == SD_STATUS_OK) {
+ nr = get_ordered_sd_node_list(e);
+ dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
+ epoch_log_write(sys->epoch + 1, (char *)e,
+ nr * sizeof(struct sheepdog_node_list_entry));
+
+ sys->epoch++;
+
+ update_epoch_store(sys->epoch);
+ }
+ }
+ break;
default:
eprintf("unknown message %d\n", m->op);
break;
}
}
- do_recovery = (m->state == DM_FIN && m->op == SD_MSG_JOIN);
+ do_recovery = (m->state == DM_FIN &&
+ (m->op == SD_MSG_JOIN || m->op == SD_MSG_LEAVE));
dprintf("op: %d, state: %u, size: %d, from: %s\n",
m->op, m->state, m->msg_length,
@@ -1364,6 +1391,11 @@ do_retry:
list_del(&cevent->cpg_event_list);
if (is_io_request(req->rq.opcode)) {
+ int copies = sys->nr_sobjs;
+
+ if (copies > req->nr_nodes)
+ copies = req->nr_nodes;
+
if (__is_access_to_recoverying_objects(req)) {
if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
req->rp.result = SD_RES_NEW_NODE_VER;
@@ -1383,9 +1415,9 @@ do_retry:
sys->nr_outstanding_io++;
if (is_access_local(req->entry, req->nr_vnodes,
- ((struct sd_obj_req *)&req->rq)->oid, sys->nr_sobjs) ||
+ ((struct sd_obj_req *)&req->rq)->oid, copies) ||
is_access_local(req->entry, req->nr_vnodes,
- ((struct sd_obj_req *)&req->rq)->cow_oid, sys->nr_sobjs)) {
+ ((struct sd_obj_req *)&req->rq)->cow_oid, copies)) {
int ret = check_epoch(req);
if (ret != SD_RES_SUCCESS) {
req->rp.result = ret;
@@ -1628,3 +1660,20 @@ join_retry:
register_event(fd, group_handler, NULL);
return 0;
}
+
+/* Send SD_MSG_LEAVE for this node; afterwards it only works as a gateway. */
+int leave_cluster(void)
+{
+ struct leave_message msg;
+
+ memset(&msg, 0, sizeof(msg));
+ msg.header.proto_ver = SD_SHEEP_PROTO_VER;
+ msg.header.op = SD_MSG_LEAVE;
+ msg.header.state = DM_FIN; /* DM_FIN + LEAVE sets do_recovery on delivery */
+ msg.header.msg_length = sizeof(msg);
+ msg.header.from = sys->this_node;
+ msg.header.nodeid = sys->this_nodeid; /* LEAVE handler's find_node() key */
+ msg.header.pid = sys->this_pid; /* LEAVE handler's find_node() key */
+
+ return send_message(sys->handle, (struct message_header *)&msg);
+}
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 0251f71..089e7f6 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -59,6 +59,9 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes,
if (oid == 0)
return 0;
+ if (copies > nr_nodes)
+ copies = nr_nodes;
+
for (i = 0; i < copies; i++) {
n = obj_to_sheep(e, nr_nodes, oid, i);
@@ -82,6 +85,8 @@ static void setup_access_to_local_objects(struct request *req)
copies = hdr->copies;
if (!copies)
copies = sys->nr_sobjs;
+ if (copies > req->nr_nodes)
+ copies = req->nr_nodes;
if (is_access_local(req->entry, req->nr_vnodes, hdr->oid, copies))
req->local_oid = hdr->oid;
@@ -92,6 +97,10 @@ static void __done(struct work *work, int idx)
struct request *req = container_of(work, struct request, work);
struct sd_req *hdr = (struct sd_req *)&req->rq;
int again = 0;
+ int copies = sys->nr_sobjs;
+
+ if (copies > req->nr_nodes)
+ copies = req->nr_nodes;
switch (hdr->opcode) {
case SD_OP_NEW_VDI:
@@ -151,6 +160,23 @@ static void __done(struct work *work, int idx)
bmap->vdi_id = vdi_id;
list_add(&bmap->list, &sys->consistent_obj_list);
set_bit(data_oid_to_idx(obj_hdr->oid), bmap->dobjs);
+ } else if (is_access_local(req->entry, req->nr_vnodes,
+ ((struct sd_obj_req *)&req->rq)->oid, copies) &&
+ req->rp.result == SD_RES_EIO) {
+ eprintf("leave from cluster\n");
+ leave_cluster();
+
+ if (req->rq.flags & SD_FLAG_CMD_DIRECT)
+ /* hack to retry */
+ req->rp.result = SD_RES_NETWORK_ERROR;
+ else {
+ req->rq.epoch = sys->epoch;
+ setup_ordered_sd_vnode_list(req);
+ setup_access_to_local_objects(req);
+
+ list_add_tail(&cevent->cpg_event_list, &sys->cpg_event_siblings);
+ again = 1;
+ }
}
done:
resume_pending_requests();
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 82840de..e0be2cc 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -28,6 +28,7 @@
#define SD_MSG_JOIN 0x01
#define SD_MSG_VDI_OP 0x02
#define SD_MSG_MASTER_CHANGED 0x03
+#define SD_MSG_LEAVE 0x04
#define SD_STATUS_OK 0x00
#define SD_STATUS_WAIT_FOR_FORMAT 0x01
@@ -172,6 +173,7 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes,
void resume_pending_requests(void);
int create_cluster(int port);
+int leave_cluster(void);
void start_cpg_event_work(void);
void store_queue_request(struct work *work, int idx);
diff --git a/sheep/store.c b/sheep/store.c
index 5f0de2a..6cb60de 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -489,9 +489,16 @@ static int ob_open(uint32_t epoch, uint64_t oid, int aflags, int *ret)
fd = open(path, flags, def_fmode);
if (fd < 0) {
eprintf("failed to open %s, %s\n", path, strerror(errno));
- if (errno == ENOENT)
+ if (errno == ENOENT) {
+ struct stat s;
+
*ret = SD_RES_NO_OBJ;
- else
+ if (stat(obj_path, &s) < 0) {
+ /* store directory is corrupted */
+ eprintf("corrupted\n");
+ *ret = SD_RES_EIO;
+ }
+ } else
*ret = SD_RES_UNKNOWN;
} else
*ret = 0;
--
1.5.6.5
More information about the sheepdog
mailing list