[sheepdog] [PATCH v3 10/13] recovery: don't notify completion for md recovery

Mon May 27 13:36:13 CEST 2013

If we do, stale objects will be removed before the recovery that follows the
md recovery tries to recover objects from other nodes.

Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 sheep/ops.c      |   12 ------------
 sheep/recovery.c |   22 ++++++++++++++++++----
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/sheep/ops.c b/sheep/ops.c
index 9911afb..0d35dad 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -661,18 +661,6 @@ static int cluster_recovery_completion(const struct sd_req *req,
 		nr_recovereds = 0;
 	}
 
-	/*
-	 * Disk failure might send duplicate notification, ingore it.
-	 *
-	 * We can't simply stop disk recovery from sending notication because
-	 * disk recovery might supersede node recovery, which indeed need
-	 * to send notification
-	 */
-	for (i = 0; i < nr_recovereds; i++)
-		if (node_eq(node, recovereds + i)) {
-			sd_dprintf("duplicate %s", node_to_str(node));
-			return SD_RES_SUCCESS;
-		}
 	recovereds[nr_recovereds++] = *node;
 	xqsort(recovereds, nr_recovereds, node_cmp);
 
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 21d76e2..46a7baa 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -69,6 +69,7 @@ struct recovery_info {
 	 * and no recovery work is running
 	 */
 	bool suspended;
+	bool notify_complete;
 
 	int count;
 	uint64_t *oids;
@@ -432,11 +433,19 @@ static void free_recovery_info(struct recovery_info *rinfo)
 static inline bool run_next_rw(void)
 {
 	struct recovery_info *nrinfo = uatomic_xchg_ptr(&next_rinfo, NULL);
+	struct recovery_info *cur = main_thread_get(current_rinfo);
 
 	if (nrinfo == NULL)
 		return false;
 
-	free_recovery_info(main_thread_get(current_rinfo));
+	/*
+	 * When md recovery supersedes a reweight or node recovery, we still
+	 * need to notify completion.
+	 */
+	if (!nrinfo->notify_complete && cur->notify_complete)
+		nrinfo->notify_complete = true;
+
+	free_recovery_info(cur);
 
 	sd_store->update_epoch(nrinfo->epoch);
 
@@ -479,10 +488,11 @@ static inline void finish_recovery(struct recovery_info *rinfo)
 
 	wakeup_all_requests();
 
-	rinfo->state = RW_NOTIFY_COMPLETION;
+	if (rinfo->notify_complete) {
+		rinfo->state = RW_NOTIFY_COMPLETION;
+		queue_recovery_work(rinfo);
+	}
 
-	/* notify recovery completion to other nodes */
-	queue_recovery_work(rinfo);
 	free_recovery_info(rinfo);
 
 	sd_dprintf("recovery complete: new epoch %"PRIu32, recovered_epoch);
@@ -786,6 +796,10 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
 	rinfo->state = RW_PREPARE_LIST;
 	rinfo->epoch = sys->epoch;
 	rinfo->count = 0;
+	if (epoch_lifted)
+		rinfo->notify_complete = true; /* Reweight or node recovery */
+	else
+		rinfo->notify_complete = false; /* MD recovery */
 
 	rinfo->cur_vinfo = grab_vnode_info(cur_vinfo);
 	rinfo->old_vinfo = grab_vnode_info(old_vinfo);
-- 
1.7.9.5