[sheepdog] [PATCH] sheep: don't let outstanding IO requests block confchg events

Liu Yuan namei.unix at gmail.com
Mon May 28 09:07:20 CEST 2012


From: Liu Yuan <tailai.ly at taobao.com>

We already treat an object with in-flight IO as a busy object, which sits on
sys->outstanding_req_list. A recovery request for such an object is therefore
queued on sys->req_wait_for_obj_list, from which it will be resumed later.

So there is no need to block the confchg event on outstanding IO, and the
confchg can be processed as soon as possible. Confchg should take precedence
over IO requests because:

Suppose every node is doing heavy IO while the cluster is in recovery.
Every node is issuing IO requests while also doing recovery. Outstanding
IO and the unfinished confchg event block each other (close to a deadlock),
all nodes are busy retrying those pending IOs (a livelock), and recovery
requests are mostly denied service, so neither the outstanding IO nor the
recovery makes progress toward completion.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/farm/farm.c  |   51 ++++++++++++++++++++++++++++-----------------------
 sheep/group.c      |    3 +--
 sheep/sdnet.c      |    2 --
 sheep/sheep_priv.h |    1 -
 4 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index 1575d24..6b820ce 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -110,34 +110,39 @@ static int farm_write(uint64_t oid, struct siocb *iocb, int create)
 	char path[PATH_MAX];
 	ssize_t size;
 
-	if (is_vdi_obj(oid))
-		flags &= ~O_DIRECT;
+	if (iocb->epoch < sys_epoch()) {
+		dprintf("%"PRIu32" sys %"PRIu32"\n", iocb->epoch, sys_epoch());
+		return SD_RES_OLD_NODE_VER;
+	} else {
+		if (is_vdi_obj(oid))
+			flags &= ~O_DIRECT;
 
-	if (create)
-		flags |= O_CREAT | O_TRUNC;
+		if (create)
+			flags |= O_CREAT | O_TRUNC;
 
-	sprintf(path, "%s%016"PRIx64, obj_path, oid);
-	fd = open(path, flags, def_fmode);
-	if (fd < 0)
-		return err_to_sderr(oid, errno);
-
-	if (create && !(iocb->flags & SD_FLAG_CMD_COW)) {
-		ret = prealloc(fd, is_vdi_obj(oid) ?
-			       SD_INODE_SIZE : SD_DATA_OBJ_SIZE);
-		if (ret != SD_RES_SUCCESS)
+		sprintf(path, "%s%016"PRIx64, obj_path, oid);
+		fd = open(path, flags, def_fmode);
+		if (fd < 0)
+			return err_to_sderr(oid, errno);
+
+		if (create && !(iocb->flags & SD_FLAG_CMD_COW)) {
+			ret = prealloc(fd, is_vdi_obj(oid) ?
+				       SD_INODE_SIZE : SD_DATA_OBJ_SIZE);
+			if (ret != SD_RES_SUCCESS)
+				goto out;
+		}
+		size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
+		if (size != iocb->length) {
+			eprintf("%m\n");
+			ret = SD_RES_EIO;
 			goto out;
-	}
-	size = xpwrite(fd, iocb->buf, iocb->length, iocb->offset);
-	if (size != iocb->length) {
-		eprintf("%m\n");
-		ret = SD_RES_EIO;
-		goto out;
-	}
+		}
 
-	trunk_update_entry(oid);
+		trunk_update_entry(oid);
 out:
-	close(fd);
-	return ret;
+		close(fd);
+		return ret;
+	}
 }
 
 static int write_last_sector(int fd, uint32_t length)
diff --git a/sheep/group.c b/sheep/group.c
index 50a53c1..f3b95cc 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -1096,7 +1096,6 @@ static void process_request_queue(void)
 		if (is_io_op(req->op)) {
 			list_add_tail(&req->request_list,
 				      &sys->outstanding_req_list);
-			sys->nr_outstanding_io++;
 
 			if (need_consistency_check(req))
 				set_consistency_check(req);
@@ -1125,7 +1124,7 @@ static inline void process_event_queue(void)
 	 * we need to serialize events so we don't call queue_work
 	 * if one event is running by executing event_fn() or event_done().
 	 */
-	if (event_running || sys->nr_outstanding_io)
+	if (event_running)
 		return;
 
 	cevent = list_first_entry(&sys->event_queue,
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 4224220..f4408f7 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -97,7 +97,6 @@ static void io_op_done(struct work *work)
 	struct sd_req *hdr = &req->rq;
 
 	list_del(&req->request_list);
-	sys->nr_outstanding_io--;
 
 	switch (req->rp.result) {
 	case SD_RES_OLD_NODE_VER:
@@ -193,7 +192,6 @@ static int check_request(struct request *req)
 		/* ask gateway to retry. */
 		req->rp.result = SD_RES_OLD_NODE_VER;
 		req->rp.epoch = sys->epoch;
-		sys->nr_outstanding_io++;
 		req->work.done(&req->work);
 		return -1;
 	} else if (after(req->rq.epoch, sys->epoch)) {
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 69ece1c..6114a21 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -140,7 +140,6 @@ struct cluster_info {
 	struct list_head wait_rw_queue;
 	struct list_head wait_obj_queue;
 	struct event_struct *cur_cevent;
-	int nr_outstanding_io;
 	int nr_outstanding_reqs;
 	unsigned int outstanding_data_size;
 
-- 
1.7.10.2




More information about the sheepdog mailing list