From: levin li <xingke.lwp at taobao.com>

If a request arrives with an epoch newer than the system epoch, we
should not simply complete it with SD_RES_NEW_NODE_VER; if we did, the
sender would busily retry the request, which may keep the CPU too busy
to process other requests.

Instead, we push requests carrying a newer epoch onto a wait_epoch_queue
so that they wait for epoch consistency. After the epoch changes, we wake
up the requests in the queue, which avoids busy retrying.

Signed-off-by: levin li <xingke.lwp at taobao.com>
---
 sheep/group.c      |    1 +
 sheep/recovery.c   |    2 ++
 sheep/sdnet.c      |   28 ++++++++++++++++++++++++----
 sheep/sheep_priv.h |    2 ++
 4 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index e2632ac..8673b7a 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -1381,6 +1381,7 @@ int create_cluster(int port, int64_t zone, int nr_vnodes)
 
 	INIT_LIST_HEAD(&sys->request_queue);
 	INIT_LIST_HEAD(&sys->event_queue);
+	INIT_LIST_HEAD(&sys->wait_rw_queue);
 
 	ret = send_join_request(&sys->this_node);
 	if (ret != 0)
diff --git a/sheep/recovery.c b/sheep/recovery.c
index f341fc6..de4bc62 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -692,5 +692,7 @@ int start_recovery(uint32_t epoch)
 
 	queue_work(sys->recovery_wqueue, &rw->work);
 
+	resume_wait_epoch_requests();
+
 	return 0;
 }
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 5e9cb3b..cd3311e 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -174,14 +174,23 @@ static int check_epoch(struct request *req)
 	int ret = SD_RES_SUCCESS;
 
 	if (before(req_epoch, sys->epoch)) {
-		ret = SD_RES_OLD_NODE_VER;
 		eprintf("old node version %u, %u, %x\n",
 			sys->epoch, req_epoch, opcode);
+		/* make the gateway retry. */
+		req->rp.result = SD_RES_OLD_NODE_VER;
+		req->rp.epoch = sys->epoch;
+		req->work.done(&req->work);
+		ret = req->rp.result;
 	} else if (after(req_epoch, sys->epoch)) {
-		ret = SD_RES_NEW_NODE_VER;
 		eprintf("new node version %u, %u, %x\n",
 			sys->epoch, req_epoch, opcode);
+
+		/* wait for epoch consistency. */
+		req->rp.result = SD_RES_NEW_NODE_VER;
+		list_add_tail(&req->request_list, &sys->wait_rw_queue);
+		ret = req->rp.result;
 	}
+
 	return ret;
 }
 
@@ -201,9 +210,7 @@ static int check_request(struct request *req)
 	else {
 		int ret = check_epoch(req);
 		if (ret != SD_RES_SUCCESS) {
-			req->rp.result = ret;
 			sys->nr_outstanding_io++;
-			req->work.done(&req->work);
 			return -1;
 		}
 	}
@@ -251,6 +258,19 @@ void resume_pending_requests(void)
 	process_request_event_queues();
 }
 
+void resume_wait_epoch_requests(void)
+{
+	struct request *req, *t;
+
+	list_for_each_entry_safe(req, t, &sys->wait_rw_queue,
+				 request_list) {
+
+		list_del(&req->request_list);
+		list_add_tail(&req->request_list, &sys->request_queue);
+		process_request_event_queues();
+	}
+}
+
 static void queue_request(struct request *req)
 {
 	struct sd_req *hdr = &req->rq;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 3d5a964..ba29a2a 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -137,6 +137,7 @@ struct cluster_info {
 	struct list_head request_queue;
 	struct list_head event_queue;
+	struct list_head wait_rw_queue;
 	struct event_struct *cur_cevent;
 	int nr_outstanding_io;
 	int nr_outstanding_reqs;
@@ -267,6 +268,7 @@ int get_nr_copies(struct vnode_info *vnode_info);
 int is_access_to_busy_objects(uint64_t oid);
 
 void resume_pending_requests(void);
+void resume_wait_epoch_requests(void);
 
 int create_cluster(int port, int64_t zone, int nr_vnodes);
 int leave_cluster(void);
-- 
1.7.10
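
Note (not part of the patch): the following is a minimal, self-contained
sketch of the wait-and-resume pattern the patch implements, using a plain
singly linked queue instead of sheepdog's list_head and work-queue
machinery. All names here (fake_request, defer_request, handle_request,
resume_deferred_requests) are hypothetical and only illustrate the idea of
parking ahead-of-epoch requests and re-dispatching them once the local
epoch catches up.

/*
 * Illustrative sketch only: park requests whose epoch is ahead of the
 * local epoch, then re-dispatch them after the epoch advances, instead
 * of failing them and forcing the sender into a busy retry loop.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct fake_request {
	uint32_t epoch;			/* epoch carried by the request */
	struct fake_request *next;	/* queue linkage */
};

static uint32_t sys_epoch = 3;			/* local node epoch */
static struct fake_request *wait_epoch_queue;	/* requests waiting for a newer epoch */

/* Park a request whose epoch is ahead of ours instead of failing it. */
static void defer_request(struct fake_request *req)
{
	req->next = wait_epoch_queue;
	wait_epoch_queue = req;
	printf("deferred request with epoch %u (local epoch %u)\n",
	       req->epoch, sys_epoch);
}

/* Dispatch or defer, mirroring the decision made in check_epoch(). */
static void handle_request(struct fake_request *req)
{
	if (req->epoch > sys_epoch)
		defer_request(req);	/* would otherwise busy-retry */
	else
		printf("processing request with epoch %u\n", req->epoch);
}

/* Called after the local epoch moves forward, like resume_wait_epoch_requests(). */
static void resume_deferred_requests(void)
{
	struct fake_request *req = wait_epoch_queue;

	wait_epoch_queue = NULL;
	while (req) {
		struct fake_request *next = req->next;

		handle_request(req);	/* re-dispatch; may defer again if still ahead */
		req = next;
	}
}

int main(void)
{
	struct fake_request *ahead = malloc(sizeof(*ahead));

	ahead->epoch = 4;
	ahead->next = NULL;

	handle_request(ahead);		/* parked: epoch 4 > local epoch 3 */

	sys_epoch = 4;			/* e.g. recovery bumped the epoch */
	resume_deferred_requests();	/* the parked request is processed now */

	free(ahead);
	return 0;
}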