[sheepdog] [PATCH] sheep: remove block/unblock callbacks

Liu Yuan namei.unix at gmail.com
Wed Jun 6 11:29:56 CEST 2012


From: Liu Yuan <tailai.ly at taobao.com>

This patch tries to completely remove the block/unblock callbacks as well as
the sd_block_handler() code from both the cluster drivers and the core sheep
code, simplifying the code a lot and also boosting performance a lot. We
actually have a very nice construct for handling blocking requests: our
top/bottom style work queue. So the requests can be nicely ordered by
block_wqueue.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/cluster/accord.c    |   39 -------------------------
 sheep/cluster/corosync.c  |   59 ++++----------------------------------
 sheep/cluster/local.c     |   40 --------------------------
 sheep/cluster/zookeeper.c |   50 +-------------------------------
 sheep/group.c             |   69 +++++++++++----------------------------------
 5 files changed, 24 insertions(+), 233 deletions(-)

diff --git a/sheep/cluster/accord.c b/sheep/cluster/accord.c
index c3e0320..27ddb99 100644
--- a/sheep/cluster/accord.c
+++ b/sheep/cluster/accord.c
@@ -30,7 +30,6 @@ enum acrd_event_type {
 	EVENT_JOIN_REQUEST = 1,
 	EVENT_JOIN_RESPONSE,
 	EVENT_LEAVE,
-	EVENT_BLOCK,
 	EVENT_NOTIFY,
 };
 
@@ -46,8 +45,6 @@ struct acrd_event {
 	uint64_t ids[SD_MAX_NODES];
 
 	enum cluster_join_result join_result;
-
-	int callbacked; /* set non-zero after sd_block_handler() was called */
 };
 
 static struct sd_node this_node;
@@ -279,7 +276,6 @@ static int add_event(struct acrd_handle *ah, enum acrd_event_type type,
 		memmove(i, i + 1, sizeof(*i) * (ev.nr_nodes - idx));
 		break;
 	case EVENT_NOTIFY:
-	case EVENT_BLOCK:
 		break;
 	case EVENT_JOIN_RESPONSE:
 		abort();
@@ -427,29 +423,6 @@ static int accord_notify(void *msg, size_t msg_len)
 	return add_event(ahandle, EVENT_NOTIFY, &this_node, msg, msg_len);
 }
 
-static void accord_block(void)
-{
-	add_event(ahandle, EVENT_BLOCK, &this_node, NULL, 0);
-}
-
-static void accord_unblock(void *msg, size_t msg_len)
-{
-	struct acrd_event ev;
-
-	pthread_mutex_lock(&queue_lock);
-
-	acrd_queue_pop(ahandle, &ev);
-
-	ev.type = EVENT_NOTIFY;
-	ev.buf_len = msg_len;
-	if (msg)
-		memcpy(ev.buf, msg, msg_len);
-
-	acrd_queue_push_back(ahandle, &ev);
-
-	pthread_mutex_unlock(&queue_lock);
-}
-
 static void acrd_handler(int listen_fd, int events, void *data)
 {
 	int ret;
@@ -510,16 +483,6 @@ static void acrd_handler(int listen_fd, int events, void *data)
 	case EVENT_LEAVE:
 		sd_leave_handler(&ev.sender, ev.nodes, ev.nr_nodes);
 		break;
-	case EVENT_BLOCK:
-		if (node_cmp(&ev.sender, &this_node) == 0 && !ev.callbacked) {
-			ev.callbacked = 1;
-
-			acrd_queue_push_back(ahandle, &ev);
-			sd_block_handler();
-		} else {
-			acrd_queue_push_back(ahandle, NULL);
-		}
-		break;
 	case EVENT_NOTIFY:
 		sd_notify_handler(&ev.sender, ev.buf, ev.buf_len);
 		break;
@@ -592,8 +555,6 @@ struct cluster_driver cdrv_accord = {
 	.join       = accord_join,
 	.leave      = accord_leave,
 	.notify     = accord_notify,
-	.block      = accord_block,
-	.unblock    = accord_unblock,
 };
 
 cdrv_register(cdrv_accord);
diff --git a/sheep/cluster/corosync.c b/sheep/cluster/corosync.c
index e6655a0..6ecbf27 100644
--- a/sheep/cluster/corosync.c
+++ b/sheep/cluster/corosync.c
@@ -43,7 +43,6 @@ enum corosync_event_type {
 	COROSYNC_EVENT_TYPE_JOIN_REQUEST,
 	COROSYNC_EVENT_TYPE_JOIN_RESPONSE,
 	COROSYNC_EVENT_TYPE_LEAVE,
-	COROSYNC_EVENT_TYPE_BLOCK,
 	COROSYNC_EVENT_TYPE_NOTIFY,
 };
 
@@ -53,8 +52,6 @@ enum corosync_message_type {
 	COROSYNC_MSG_TYPE_JOIN_RESPONSE,
 	COROSYNC_MSG_TYPE_LEAVE,
 	COROSYNC_MSG_TYPE_NOTIFY,
-	COROSYNC_MSG_TYPE_BLOCK,
-	COROSYNC_MSG_TYPE_UNBLOCK,
 };
 
 struct corosync_event {
@@ -291,8 +288,8 @@ static int __corosync_dispatch_one(struct corosync_event *cevent)
 		case CJ_RES_JOIN_LATER:
 			build_node_list(cpg_nodes, nr_cpg_nodes, entries);
 			sd_join_handler(&cevent->sender.ent, entries,
-						       nr_cpg_nodes, cevent->result,
-						       cevent->msg);
+					nr_cpg_nodes, cevent->result,
+					cevent->msg);
 			break;
 		}
 		break;
@@ -307,21 +304,11 @@ static int __corosync_dispatch_one(struct corosync_event *cevent)
 		build_node_list(cpg_nodes, nr_cpg_nodes, entries);
 		sd_leave_handler(&cevent->sender.ent, entries, nr_cpg_nodes);
 		break;
-	case COROSYNC_EVENT_TYPE_BLOCK:
-		if (cpg_node_equal(&cevent->sender, &this_node) &&
-		    !cevent->callbacked) {
-			sd_block_handler();
-			cevent->callbacked = 1;
-		}
-
-		/* block the rest messages until unblock message comes */
-		return 0;
 	case COROSYNC_EVENT_TYPE_NOTIFY:
 		sd_notify_handler(&cevent->sender.ent, cevent->msg,
-						 cevent->msg_len);
+				  cevent->msg_len);
 		break;
 	}
-
 	return 1;
 }
 
@@ -330,7 +317,8 @@ static void __corosync_dispatch(void)
 	struct corosync_event *cevent;
 
 	while (!list_empty(&corosync_event_list)) {
-		cevent = list_first_entry(&corosync_event_list, typeof(*cevent), list);
+		cevent = list_first_entry(&corosync_event_list, typeof(*cevent),
+					  list);
 
 		/* update join status */
 		if (!join_finished) {
@@ -362,7 +350,6 @@ static void __corosync_dispatch(void)
 		} else {
 			switch (cevent->type) {
 			case COROSYNC_MSG_TYPE_JOIN_REQUEST:
-			case COROSYNC_MSG_TYPE_BLOCK:
 				return;
 			default:
 				break;
@@ -420,16 +407,12 @@ static void cdrv_cpg_deliver(cpg_handle_t handle,
 		cevent->sender = cmsg->sender;
 		cevent->msg_len = cmsg->msg_len;
 		break;
-	case COROSYNC_MSG_TYPE_BLOCK:
 	case COROSYNC_MSG_TYPE_NOTIFY:
 		cevent = zalloc(sizeof(*cevent));
 		if (!cevent)
 			panic("failed to allocate memory\n");
 
-		if (cmsg->type == COROSYNC_MSG_TYPE_BLOCK)
-			cevent->type = COROSYNC_EVENT_TYPE_BLOCK;
-		else
-			cevent->type = COROSYNC_EVENT_TYPE_NOTIFY;
+		cevent->type = COROSYNC_EVENT_TYPE_NOTIFY;
 
 		cevent->sender = cmsg->sender;
 		cevent->msg_len = cmsg->msg_len;
@@ -480,14 +463,6 @@ static void cdrv_cpg_deliver(cpg_handle_t handle,
 		       sizeof(*cmsg->nodes) * cmsg->nr_nodes);
 
 		break;
-	case COROSYNC_MSG_TYPE_UNBLOCK:
-		cevent = update_event(COROSYNC_EVENT_TYPE_BLOCK, &cmsg->sender,
-				      cmsg->msg, cmsg->msg_len);
-		if (!cevent)
-			break;
-
-		cevent->type = COROSYNC_EVENT_TYPE_NOTIFY;
-		break;
 	}
 
 	__corosync_dispatch();
@@ -542,14 +517,6 @@ static void cdrv_cpg_confchg(cpg_handle_t handle,
 			continue;
 		}
 
-		cevent = find_event(COROSYNC_EVENT_TYPE_BLOCK, left_sheep + i);
-		if (cevent) {
-			/* the node left before sending UNBLOCK */
-			list_del(&cevent->list);
-			free(cevent->msg);
-			free(cevent);
-		}
-
 		cevent = zalloc(sizeof(*cevent));
 		if (!cevent)
 			panic("failed to allocate memory\n");
@@ -640,18 +607,6 @@ static int corosync_leave(void)
 			    NULL, 0);
 }
 
-static void corosync_block(void)
-{
-	send_message(COROSYNC_MSG_TYPE_BLOCK, 0, &this_node, NULL, 0,
-			    NULL, 0);
-}
-
-static void corosync_unblock(void *msg, size_t msg_len)
-{
-	send_message(COROSYNC_MSG_TYPE_UNBLOCK, 0, &this_node, NULL, 0,
-		     msg, msg_len);
-}
-
 static int corosync_notify(void *msg, size_t msg_len)
 {
 	return send_message(COROSYNC_MSG_TYPE_NOTIFY, 0, &this_node,
@@ -739,8 +694,6 @@ struct cluster_driver cdrv_corosync = {
 	.join       = corosync_join,
 	.leave      = corosync_leave,
 	.notify     = corosync_notify,
-	.block      = corosync_block,
-	.unblock    = corosync_unblock,
 };
 
 cdrv_register(cdrv_corosync);
diff --git a/sheep/cluster/local.c b/sheep/cluster/local.c
index 4f2d8e8..a1f0cc6 100644
--- a/sheep/cluster/local.c
+++ b/sheep/cluster/local.c
@@ -36,7 +36,6 @@ enum local_event_type {
 	EVENT_JOIN_REQUEST = 1,
 	EVENT_JOIN_RESPONSE,
 	EVENT_LEAVE,
-	EVENT_BLOCK,
 	EVENT_NOTIFY,
 };
 
@@ -52,8 +51,6 @@ struct local_event {
 	pid_t pids[SD_MAX_NODES];
 
 	enum cluster_join_result join_result;
-
-	int callbacked; /* set non-zero after sd_block_handler() was called */
 };
 
 
@@ -244,7 +241,6 @@ static void add_event(enum local_event_type type, struct sd_node *node,
 		memmove(p, p + 1, sizeof(*p) * (ev.nr_nodes - idx));
 		break;
 	case EVENT_NOTIFY:
-	case EVENT_BLOCK:
 		break;
 	case EVENT_JOIN_RESPONSE:
 		abort();
@@ -314,34 +310,6 @@ static int local_notify(void *msg, size_t msg_len)
 	return 0;
 }
 
-static void local_block(void)
-{
-	shm_queue_lock();
-
-	add_event(EVENT_BLOCK, &this_node, NULL, 0);
-
-	shm_queue_unlock();
-}
-
-static void local_unblock(void *msg, size_t msg_len)
-{
-	struct local_event *ev;
-
-	shm_queue_lock();
-
-	ev = shm_queue_peek();
-
-	ev->type = EVENT_NOTIFY;
-	ev->buf_len = msg_len;
-	if (msg)
-		memcpy(ev->buf, msg, msg_len);
-	msync(ev, sizeof(*ev), MS_SYNC);
-
-	shm_queue_notify();
-
-	shm_queue_unlock();
-}
-
 static void local_handler(int listen_fd, int events, void *data)
 {
 	struct signalfd_siginfo siginfo;
@@ -404,12 +372,6 @@ static void local_handler(int listen_fd, int events, void *data)
 		sd_leave_handler(&ev->sender, ev->nodes, ev->nr_nodes);
 		shm_queue_pop();
 		break;
-	case EVENT_BLOCK:
-		if (node_eq(&ev->sender, &this_node) && !ev->callbacked) {
-			sd_block_handler();
-			ev->callbacked = 1;
-		}
-		break;
 	case EVENT_NOTIFY:
 		sd_notify_handler(&ev->sender, ev->buf, ev->buf_len);
 		shm_queue_pop();
@@ -467,8 +429,6 @@ struct cluster_driver cdrv_local = {
 	.join       = local_join,
 	.leave      = local_leave,
 	.notify     = local_notify,
-	.block      = local_block,
-	.unblock    = local_unblock,
 };
 
 cdrv_register(cdrv_local);
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 8499cb4..7924920 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -44,7 +44,6 @@ enum zk_event_type {
 	EVENT_JOIN_REQUEST = 1,
 	EVENT_JOIN_RESPONSE,
 	EVENT_LEAVE,
-	EVENT_BLOCK,
 	EVENT_NOTIFY,
 };
 
@@ -60,8 +59,6 @@ struct zk_event {
 
 	enum cluster_join_result join_result;
 
-	int callbacked; /* set non-zero after sd_block_handler() was called */
-
 	size_t buf_len;
 	uint8_t buf[SD_MAX_EVENT_BUF_SIZE];
 };
@@ -82,7 +79,7 @@ static size_t nr_zk_nodes;
 
 static inline int is_blocking_event(struct zk_event *ev)
 {
-	return ev->type == EVENT_BLOCK || ev->type == EVENT_JOIN_REQUEST;
+	return ev->type == EVENT_JOIN_REQUEST;
 }
 
 /* zookeeper API wrapper */
@@ -497,7 +494,6 @@ static int add_event(zhandle_t *zh, enum zk_event_type type,
 	ev.type = type;
 	ev.sender = *znode;
 	ev.buf_len = buf_len;
-	ev.callbacked = 0;
 	if (buf)
 		memcpy(ev.buf, buf, buf_len);
 	zk_queue_push(zh, &ev);
@@ -514,7 +510,6 @@ static int leave_event(zhandle_t *zh, struct zk_node *znode)
 	ev->type = EVENT_LEAVE;
 	ev->sender = *znode;
 	ev->buf_len = 0;
-	ev->callbacked = 0;
 
 	nr_levents = uatomic_add_return(&nr_zk_levents, 1);
 	dprintf("nr_zk_levents:%d, tail:%u\n", nr_levents, zk_levent_tail);
@@ -611,34 +606,6 @@ static int zk_notify(void *msg, size_t msg_len)
 	return add_event(zhandle, EVENT_NOTIFY, &this_node, msg, msg_len);
 }
 
-static void zk_block(void)
-{
-	add_event(zhandle, EVENT_BLOCK, &this_node, NULL, 0);
-}
-
-static void zk_unblock(void *msg, size_t msg_len)
-{
-	int rc;
-	struct zk_event ev;
-	eventfd_t value = 1;
-
-	rc = zk_queue_pop(zhandle, &ev);
-	assert(rc == 0);
-
-	ev.type = EVENT_NOTIFY;
-	ev.buf_len = msg_len;
-	if (msg)
-		memcpy(ev.buf, msg, msg_len);
-
-	zk_queue_push_back(zhandle, &ev);
-
-	uatomic_dec(&zk_notify_blocked);
-
-	/* this notify is necessary */
-	dprintf("write event to efd:%d\n", efd);
-	eventfd_write(efd, value);
-}
-
 static void zk_handler(int listen_fd, int events, void *data)
 {
 	int ret, rc;
@@ -764,19 +731,6 @@ static void zk_handler(int listen_fd, int events, void *data)
 		build_node_list(zk_node_btroot);
 		sd_leave_handler(&ev.sender.node, sd_nodes, nr_sd_nodes);
 		break;
-	case EVENT_BLOCK:
-		dprintf("BLOCK\n");
-		if (node_eq(&ev.sender.node, &this_node.node)
-				&& !ev.callbacked) {
-			uatomic_inc(&zk_notify_blocked);
-			ev.callbacked = 1;
-			zk_queue_push_back(zhandle, &ev);
-			sd_block_handler();
-		} else {
-			zk_queue_push_back(zhandle, NULL);
-		}
-
-		break;
 	case EVENT_NOTIFY:
 		dprintf("NOTIFY\n");
 		sd_notify_handler(&ev.sender.node, ev.buf, ev.buf_len);
@@ -835,8 +789,6 @@ struct cluster_driver cdrv_zookeeper = {
 	.join       = zk_join,
 	.leave      = zk_leave,
 	.notify     = zk_notify,
-	.block      = zk_block,
-	.unblock    = zk_unblock,
 };
 
 cdrv_register(cdrv_zookeeper);
diff --git a/sheep/group.c b/sheep/group.c
index c2679f2..638200f 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -252,7 +252,7 @@ int get_nr_copies(struct vnode_info *vnode_info)
 }
 
 static struct vdi_op_message *prepare_cluster_msg(struct request *req,
-		size_t *sizep)
+						  size_t *sizep)
 {
 	struct vdi_op_message *msg;
 	size_t size;
@@ -264,11 +264,7 @@ static struct vdi_op_message *prepare_cluster_msg(struct request *req,
 
 	assert(size <= SD_MAX_EVENT_BUF_SIZE);
 
-	msg = zalloc(size);
-	if (!msg) {
-		eprintf("failed to allocate memory\n");
-		return NULL;
-	}
+	msg = xzalloc(size);
 
 	memcpy(&msg->req, &req->rq, sizeof(struct sd_req));
 	memcpy(&msg->rsp, &req->rp, sizeof(struct sd_rsp));
@@ -283,9 +279,11 @@ static struct vdi_op_message *prepare_cluster_msg(struct request *req,
 static void do_cluster_request(struct work *work)
 {
 	struct request *req = container_of(work, struct request, work);
-	int ret;
+	int ret = SD_RES_SUCCESS;
+
+	if (has_process_work(req->op))
+		ret = do_process_work(req);
 
-	ret = do_process_work(req);
 	req->rp.result = ret;
 }
 
@@ -296,32 +294,14 @@ static void cluster_op_done(struct work *work)
 	size_t size;
 
 	msg = prepare_cluster_msg(req, &size);
-	if (!msg)
-		panic();
 
-	sys->cdrv->unblock(msg, size);
+	/* Kick off the cluster to process_main() */
+	sys->cdrv->notify(msg, size);
 
 	free(msg);
 }
 
 /*
- * Perform a blocked cluster operation.
- *
- * Must run in the main thread as it access unlocked state like
- * sys->pending_list.
- */
-void sd_block_handler(void)
-{
-	struct request *req = list_first_entry(&sys->pending_list,
-						struct request, pending_list);
-
-	req->work.fn = do_cluster_request;
-	req->work.done = cluster_op_done;
-
-	queue_work(sys->block_wqueue, &req->work);
-}
-
-/*
  * Execute a cluster operation by letting the cluster driver send it to all
  * nodes in the cluster.
  *
@@ -332,24 +312,10 @@ void queue_cluster_request(struct request *req)
 {
 	eprintf("%p %x\n", req, req->rq.opcode);
 
-	if (has_process_work(req->op)) {
-		list_add_tail(&req->pending_list, &sys->pending_list);
-		sys->cdrv->block();
-	} else {
-		struct vdi_op_message *msg;
-		size_t size;
-
-		msg = prepare_cluster_msg(req, &size);
-		if (!msg)
-			return;
-
-		list_add_tail(&req->pending_list, &sys->pending_list);
-
-		msg->rsp.result = SD_RES_SUCCESS;
-		sys->cdrv->notify(msg, size);
-
-		free(msg);
-	}
+	list_add_tail(&req->pending_list, &sys->pending_list);
+	req->work.fn = do_cluster_request;
+	req->work.done = cluster_op_done;
+	queue_work(sys->block_wqueue, &req->work);
 }
 
 static inline int get_nodes_nr_from(struct list_head *l)
@@ -748,20 +714,19 @@ void sd_notify_handler(struct sd_node *sender, void *data, size_t data_len)
 	struct vdi_op_message *msg = data;
 	struct sd_op_template *op = get_sd_op(msg->req.opcode);
 	int ret = msg->rsp.result;
-	struct request *req = NULL;
 
 	dprintf("size: %zd, from: %s\n", data_len, node_to_str(sender));
 
+	if (ret == SD_RES_SUCCESS && has_process_main(op))
+		ret = do_process_main(op, &msg->req, &msg->rsp, msg->data);
+
 	if (is_myself(sender->addr, sender->port)) {
+		struct request *req;
+
 		req = list_first_entry(&sys->pending_list, struct request,
 				       pending_list);
 		list_del(&req->pending_list);
-	}
-
-	if (ret == SD_RES_SUCCESS && has_process_main(op))
-		ret = do_process_main(op, &msg->req, &msg->rsp, msg->data);
 
-	if (req) {
 		msg->rsp.result = ret;
 		if (has_process_main(req->op))
 			memcpy(req->data, msg->data, msg->rsp.data_length);
-- 
1.7.10.2




More information about the sheepdog mailing list