[sheepdog] [PATCH] recovery: notify completion only when all objects are fresh

Fri May 31 14:55:41 CEST 2013

From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

To reduce the risk of data loss, we shouldn't remove stale objects if
any sheep has failed to recover its objects.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 sheep/recovery.c |    9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index dba89fb..e8edbca 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -85,6 +85,7 @@ struct recovery_info {
 
 struct recovery_info *next_rinfo;
 static main_thread(struct recovery_info *) current_rinfo;
+static bool safe_mode; /* set true when something critical happens */
 
 static void queue_recovery_work(struct recovery_info *rinfo);
 
@@ -260,6 +261,7 @@ again:
 		sd_printf(SDOG_ALERT, "cannot access any replicas of "
 			  "%"PRIx64" at epoch %d", oid, tgt_epoch);
 		sd_printf(SDOG_ALERT, "clients may see old data");
+		safe_mode = true;
 		/* fall through */
 	default:
 		/* No luck, roll back to an older configuration and try again */
@@ -276,6 +278,7 @@ rollback:
 			/* We rollback in case we don't get a valid epoch */
 			sd_printf(SDOG_ALERT, "cannot get epoch %d", tgt_epoch);
 			sd_printf(SDOG_ALERT, "clients may see old data");
+			safe_mode = true;
 			goto rollback;
 		}
 
@@ -461,6 +464,11 @@ static void notify_recovery_completion_work(struct work *work)
 	struct sd_req hdr;
 	int ret;
 
+	if (safe_mode) {
+		sd_iprintf("skip notifying recovery completion");
+		return;
+	}
+
 	sd_init_req(&hdr, SD_OP_COMPLETE_RECOVERY);
 	hdr.obj.tgt_epoch = rw->epoch;
 	hdr.flags = SD_FLAG_CMD_WRITE;
@@ -687,6 +695,7 @@ retry:
 			  e->nid.port);
 		sd_printf(SDOG_ALERT, "some objects may be not recovered at "
 			  "epoch %d", epoch);
+		safe_mode = true;
 		free(buf);
 		return NULL;
 	}
-- 
1.7.9.5