[Sheepdog] [PATCH 13/14] remove failed node from cpg ring

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Sat May 14 09:03:58 CEST 2011


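This patch handles a local disk crash or a file system unmount that
occurs while Sheepdog is running.  When an access to a local object
fails with SD_RES_EIO, the node sends an SD_MSG_LEAVE message; on
delivery the node is removed from the Sheepdog node list and, if the
cluster status is SD_STATUS_OK, the epoch is incremented.  After
leaving, the node only works as a gateway.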

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/group.c      |   55 +++++++++++++++++++++++++++++++++++++++++++++++++--
 sheep/sdnet.c      |   26 ++++++++++++++++++++++++
 sheep/sheep_priv.h |    2 +
 sheep/store.c      |   11 ++++++++-
 4 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 6deb4a2..cb693ff 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -65,6 +65,10 @@ struct join_message {
 	} nodes[SD_MAX_NODES];
 };
 
+struct leave_message {
+	struct message_header header;	/* a leave message has no payload beyond the common header */
+};
+
 struct vdi_op_message {
 	struct message_header header;
 	struct sd_vdi_req req;
@@ -841,6 +845,9 @@ static void __sd_deliver_done(struct cpg_event *cevent)
 	struct message_header *m;
 	char name[128];
 	int do_recovery;
+	struct node *node;
+	struct sheepdog_node_list_entry e[SD_MAX_NODES];
+	int nr;
 
 	m = w->msg;
 
@@ -849,13 +856,33 @@ static void __sd_deliver_done(struct cpg_event *cevent)
 		case SD_MSG_JOIN:
 			update_cluster_info((struct join_message *)m);
 			break;
+		case SD_MSG_LEAVE:
+			node = find_node(&sys->sd_node_list, m->nodeid, m->pid);
+			if (node) {
+				sys->nr_vnodes = 0;
+
+				list_del(&node->list);
+				free(node);
+				if (sys->status == SD_STATUS_OK) {
+					nr = get_ordered_sd_node_list(e);
+					dprintf("update epoch, %d, %d\n", sys->epoch + 1, nr);
+					epoch_log_write(sys->epoch + 1, (char *)e,
+							nr * sizeof(struct sheepdog_node_list_entry));
+
+					sys->epoch++;
+
+					update_epoch_store(sys->epoch);
+				}
+			}
+			break;
 		default:
 			eprintf("unknown message %d\n", m->op);
 			break;
 		}
 	}
 
-	do_recovery = (m->state == DM_FIN && m->op == SD_MSG_JOIN);
+	do_recovery = (m->state == DM_FIN &&
+		       (m->op == SD_MSG_JOIN || m->op == SD_MSG_LEAVE));
 
 	dprintf("op: %d, state: %u, size: %d, from: %s\n",
 		m->op, m->state, m->msg_length,
@@ -1364,6 +1391,11 @@ do_retry:
 		list_del(&cevent->cpg_event_list);
 
 		if (is_io_request(req->rq.opcode)) {
+			int copies = sys->nr_sobjs;
+
+			if (copies > req->nr_nodes)
+				copies = req->nr_nodes;
+
 			if (__is_access_to_recoverying_objects(req)) {
 				if (req->rq.flags & SD_FLAG_CMD_DIRECT) {
 					req->rp.result = SD_RES_NEW_NODE_VER;
@@ -1383,9 +1415,9 @@ do_retry:
 			sys->nr_outstanding_io++;
 
 			if (is_access_local(req->entry, req->nr_vnodes,
-					    ((struct sd_obj_req *)&req->rq)->oid, sys->nr_sobjs) ||
+					    ((struct sd_obj_req *)&req->rq)->oid, copies) ||
 			    is_access_local(req->entry, req->nr_vnodes,
-					    ((struct sd_obj_req *)&req->rq)->cow_oid, sys->nr_sobjs)) {
+					    ((struct sd_obj_req *)&req->rq)->cow_oid, copies)) {
 				int ret = check_epoch(req);
 				if (ret != SD_RES_SUCCESS) {
 					req->rp.result = ret;
@@ -1628,3 +1660,20 @@ join_retry:
 	register_event(fd, group_handler, NULL);
 	return 0;
 }
+
+/* Send an SD_MSG_LEAVE message (state DM_FIN) identifying this node and return send_message()'s result; after this function is called, this node only works as a gateway. */
+int leave_cluster(void)
+{
+	struct leave_message msg;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.header.proto_ver = SD_SHEEP_PROTO_VER;
+	msg.header.op = SD_MSG_LEAVE;
+	msg.header.state = DM_FIN;
+	msg.header.msg_length = sizeof(msg);
+	msg.header.from = sys->this_node;
+	msg.header.nodeid = sys->this_nodeid;
+	msg.header.pid = sys->this_pid;
+
+	return send_message(sys->handle, (struct message_header *)&msg);
+}
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 0251f71..089e7f6 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -59,6 +59,9 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes,
 	if (oid == 0)
 		return 0;
 
+	if (copies > nr_nodes)
+		copies = nr_nodes;
+
 	for (i = 0; i < copies; i++) {
 		n = obj_to_sheep(e, nr_nodes, oid, i);
 
@@ -82,6 +85,8 @@ static void setup_access_to_local_objects(struct request *req)
 	copies = hdr->copies;
 	if (!copies)
 		copies = sys->nr_sobjs;
+	if (copies > req->nr_nodes)
+		copies = req->nr_nodes;
 
 	if (is_access_local(req->entry, req->nr_vnodes, hdr->oid, copies))
 		req->local_oid = hdr->oid;
@@ -92,6 +97,10 @@ static void __done(struct work *work, int idx)
 	struct request *req = container_of(work, struct request, work);
 	struct sd_req *hdr = (struct sd_req *)&req->rq;
 	int again = 0;
+	int copies = sys->nr_sobjs;
+
+	if (copies > req->nr_nodes)
+		copies = req->nr_nodes;
 
 	switch (hdr->opcode) {
 	case SD_OP_NEW_VDI:
@@ -151,6 +160,23 @@ static void __done(struct work *work, int idx)
 			bmap->vdi_id = vdi_id;
 			list_add(&bmap->list, &sys->consistent_obj_list);
 			set_bit(data_oid_to_idx(obj_hdr->oid), bmap->dobjs);
+		} else if (is_access_local(req->entry, req->nr_vnodes,
+					   ((struct sd_obj_req *)&req->rq)->oid, copies) &&
+			   req->rp.result == SD_RES_EIO) {
+			eprintf("leave from cluster\n");
+			leave_cluster();
+
+			if (req->rq.flags & SD_FLAG_CMD_DIRECT)
+				/* hack to retry */
+				req->rp.result = SD_RES_NETWORK_ERROR;
+			else {
+				req->rq.epoch = sys->epoch;
+				setup_ordered_sd_vnode_list(req);
+				setup_access_to_local_objects(req);
+
+				list_add_tail(&cevent->cpg_event_list, &sys->cpg_event_siblings);
+				again = 1;
+			}
 		}
 done:
 		resume_pending_requests();
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 82840de..e0be2cc 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -28,6 +28,7 @@
 #define SD_MSG_JOIN             0x01
 #define SD_MSG_VDI_OP           0x02
 #define SD_MSG_MASTER_CHANGED   0x03
+#define SD_MSG_LEAVE            0x04
 
 #define SD_STATUS_OK                0x00
 #define SD_STATUS_WAIT_FOR_FORMAT   0x01
@@ -172,6 +173,7 @@ int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes,
 void resume_pending_requests(void);
 
 int create_cluster(int port);
+int leave_cluster(void);
 
 void start_cpg_event_work(void);
 void store_queue_request(struct work *work, int idx);
diff --git a/sheep/store.c b/sheep/store.c
index 5f0de2a..6cb60de 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -489,9 +489,16 @@ static int ob_open(uint32_t epoch, uint64_t oid, int aflags, int *ret)
 	fd = open(path, flags, def_fmode);
 	if (fd < 0) {
 		eprintf("failed to open %s, %s\n", path, strerror(errno));
-		if (errno == ENOENT)
+		if (errno == ENOENT) {
+			struct stat s;
+
 			*ret = SD_RES_NO_OBJ;
-		else
+			if (stat(obj_path, &s) < 0) {
+				/* store directory is corrupted */
+				eprintf("corrupted\n");
+				*ret = SD_RES_EIO;
+			}
+		} else
 			*ret = SD_RES_UNKNOWN;
 	} else
 		*ret = 0;
-- 
1.5.6.5




More information about the sheepdog mailing list