[Sheepdog] [PATCH 2/2] avoid the race between recovery and IO requests

FUJITA Tomonori fujita.tomonori at lab.ntt.co.jp
Fri May 7 07:15:50 CEST 2010


- we can't perform IO requests against the object that we are
recovering.

- we can't recover the object that we are performing IO requests
  against.

Signed-off-by: FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp>
---
 collie/collie.h |    5 +++++
 collie/group.c  |   35 +++++++++++++++++++----------------
 collie/net.c    |   30 ++++++++++++++++++------------
 collie/store.c  |   45 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 87 insertions(+), 28 deletions(-)

diff --git a/collie/collie.h b/collie/collie.h
index 17f9dec..b53c824 100644
--- a/collie/collie.h
+++ b/collie/collie.h
@@ -126,6 +126,9 @@ int read_vdis(char *data, int len, unsigned int *rsp_len);
 
 int setup_ordered_sd_node_list(struct request *req);
 int get_ordered_sd_node_list(struct sheepdog_node_list_entry *entries);
+int is_access_to_busy_objects(uint64_t oid);
+
+void resume_pending_requests(void);
 
 int create_cluster(int port);
 
@@ -155,6 +158,8 @@ int set_cluster_ctime(uint64_t ctime);
 uint64_t get_cluster_ctime(void);
 
 int start_recovery(uint32_t epoch, uint32_t *failed_vdis, int nr_failed_vdis);
+void resume_recovery_work(void);
+int is_recoverying_oid(uint64_t oid);
 
 static inline int is_myself(struct sheepdog_node_list_entry *e)
 {
diff --git a/collie/group.c b/collie/group.c
index 66930c4..105a5f9 100644
--- a/collie/group.c
+++ b/collie/group.c
@@ -1320,30 +1320,33 @@ static int check_epoch(struct request *req)
 	return ret;
 }
 
-static int is_access_to_busy_objects(struct request *req)
+int is_access_to_busy_objects(uint64_t oid)
 {
-	struct request *o_req;
+	struct request *req;
 
-	if (!req->local_oid[0] && !req->local_oid[1])
+	if (!oid)
 		return 0;
 
-	list_for_each_entry(o_req, &sys->outstanding_req_list, r_wlist) {
-
-		if (req->local_oid[0]) {
-			if (req->local_oid[0] == o_req->local_oid[0] ||
-			    req->local_oid[0] == o_req->local_oid[1])
+	list_for_each_entry(req, &sys->outstanding_req_list, r_wlist) {
+		if (oid == req->local_oid[0] || oid == req->local_oid[1])
-				return 1;
+			return 1;
-		}
-
-		if (req->local_oid[1]) {
-			if (req->local_oid[1] == o_req->local_oid[0] ||
-			    req->local_oid[1] == o_req->local_oid[1])
-				return 1;
-		}
 	}
 	return 0;
 }
 
+static int __is_access_to_busy_objects(struct request *req)
+{
+	if (is_access_to_busy_objects(req->local_oid[0]) ||
+	    is_access_to_busy_objects(req->local_oid[1]))
+		return 1;
+
+	if (is_recoverying_oid(req->local_oid[0]) ||
+	    is_recoverying_oid(req->local_oid[1]))
+		return 1;
+
+	return 0;
+}
+
 /* can be called only by the main process */
 void start_cpg_event_work(void)
 {
@@ -1387,7 +1390,7 @@ void start_cpg_event_work(void)
 		list_del(&cevent->cpg_event_list);
 
 		if (is_io_request(req->rq.opcode)) {
-			if (is_access_to_busy_objects(req)) {
+			if (__is_access_to_busy_objects(req)) {
 				list_add_tail(&req->r_wlist, &sys->req_wait_for_obj_list);
 				continue;
 			}
diff --git a/collie/net.c b/collie/net.c
index f641022..d3e6ce5 100644
--- a/collie/net.c
+++ b/collie/net.c
@@ -35,6 +35,22 @@ int is_io_request(unsigned op)
 	return ret;
 }
 
+void resume_pending_requests(void)
+{
+	struct request *next, *tmp;
+
+	list_for_each_entry_safe(next, tmp, &sys->req_wait_for_obj_list,
+				 r_wlist) {
+		struct cpg_event *cevent = &next->cev;
+
+		list_del(&next->r_wlist);
+		list_add_tail(&cevent->cpg_event_list, &sys->cpg_event_siblings);
+	}
+
+	if (!sys->nr_outstanding_io && !list_empty(&sys->cpg_event_siblings))
+		start_cpg_event_work();
+}
+
 static void __done(struct work *work, int idx)
 {
 	struct request *req = container_of(work, struct request, work);
@@ -53,7 +69,6 @@ static void __done(struct work *work, int idx)
 	}
 
 	if (is_io_request(hdr->opcode)) {
-		struct request *next, *tmp;
 		list_del(&req->r_wlist);
 
 		sys->nr_outstanding_io--;
@@ -63,17 +78,8 @@ static void __done(struct work *work, int idx)
 		 * of sys->cpg_event_siblings.
 		 */
 
-		list_for_each_entry_safe(next, tmp, &sys->req_wait_for_obj_list,
-					 r_wlist) {
-			struct cpg_event *cevent = &next->cev;
-
-			list_del(&next->r_wlist);
-			list_add_tail(&cevent->cpg_event_list, &sys->cpg_event_siblings);
-		}
-
-		if (!sys->nr_outstanding_io &&
-		    !list_empty(&sys->cpg_event_siblings))
-			start_cpg_event_work();
+		resume_pending_requests();
+		resume_recovery_work();
 	}
 
 	req->done(req);
diff --git a/collie/store.c b/collie/store.c
index d21a1c3..5fc8e28 100644
--- a/collie/store.c
+++ b/collie/store.c
@@ -1255,17 +1255,57 @@ out:
 		rw->done++;
 }
 
+static struct recovery_work *suspended_recovery_work;
+static uint64_t recovering_oid;
+
 static void __start_recovery(struct work *work, int idx);
 
 static void recover_timer(void *data)
 {
 	struct recovery_work *rw = (struct recovery_work *)data;
+	uint64_t oid = *(((uint64_t *)rw->buf) + rw->done);
+
+	if (is_access_to_busy_objects(oid)) {
+		suspended_recovery_work = rw;
+		return;
+	}
+
+	recovering_oid = oid;
 	queue_work(&rw->work);
 }
 
+void resume_recovery_work(void)
+{
+	struct recovery_work *rw;
+	uint64_t oid;
+
+	if (!suspended_recovery_work)
+		return;
+
+	rw = suspended_recovery_work;
+
+	oid = *(((uint64_t *)rw->buf) + rw->done);
+	if (is_access_to_busy_objects(oid))
+		return;
+
+	suspended_recovery_work = NULL;
+	recovering_oid = oid;
+	queue_work(&rw->work);
+}
+
+int is_recoverying_oid(uint64_t oid)
+{
+	return recovering_oid && recovering_oid == oid;
+}
+
 static void recover_done(struct work *work, int idx)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work, work);
+	uint64_t oid = *(((uint64_t *)rw->buf) + rw->done);
+
+	recovering_oid = 0;
+
+	resume_pending_requests();
 
 	if (rw->retry) {
 		rw->retry = 0;
@@ -1279,6 +1319,11 @@ static void recover_done(struct work *work, int idx)
 	if (rw->done < rw->count && list_empty(&recovery_work_list)) {
 		rw->work.fn = recover_one;
 
+		if (is_access_to_busy_objects(oid)) {
+			suspended_recovery_work = rw;
+			return;
+		}
+		recovering_oid = oid;
 		queue_work(&rw->work);
 		return;
 	}
-- 
1.6.5




More information about the sheepdog mailing list