[sheepdog] [PATCH v3 10/10] sheep: show error message when object may be lost

Thu May 9 12:15:44 CEST 2013

From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

From the point of view of block storage semantics, sheepdog must never show
old data to clients.  If sheep may have broken data consistency, it should
raise an alert so that users are aware of the problem.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 include/internal_proto.h |    1 +
 include/sheep.h          |    1 +
 sheep/recovery.c         |   28 +++++++++++++++++++++++++++-
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index f124b9e..995e213 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -97,6 +97,7 @@
 #define SD_RES_KILLED           0x8D /* Node is killed */
 #define SD_RES_OID_EXIST        0x8E /* Object ID exists already */
 #define SD_RES_AGAIN            0x8F /* Ask to try again */
+#define SD_RES_STALE_OBJ        0x90 /* Object may be stale */
 
 #define SD_FLAG_NOHALT       0x0004 /* Serve the IO rquest even lack of nodes */
 #define SD_FLAG_QUORUM       0x0008 /* Serve the IO rquest as long we are quorate */
diff --git a/include/sheep.h b/include/sheep.h
index 8ba4bbb..456fd07 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -221,6 +221,7 @@ static inline const char *sd_strerror(int err)
 		[SD_RES_KILLED] = "Node is killed",
 		[SD_RES_OID_EXIST] = "Object ID exists already",
 		[SD_RES_AGAIN] = "Ask to try again",
+		[SD_RES_STALE_OBJ] = "Object may be stale",
 	};
 
 	if (descs[err] == NULL) {
diff --git a/sheep/recovery.c b/sheep/recovery.c
index fb13e5a..7e461b5 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -122,6 +122,7 @@ static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
 	int nr_copies, ret, start = 0;
 	void *buf = NULL;
 	struct siocb iocb = { 0 };
+	bool fully_replicated = true;
 
 	nr_copies = get_obj_copy_number(oid, old->nr_zones);
 
@@ -182,10 +183,23 @@ static int recover_object_from_replica(uint64_t oid, struct vnode_info *old,
 		case SD_RES_OLD_NODE_VER:
 			/* move to the next epoch recovery */
 			goto out;
+		case SD_RES_NO_OBJ:
+			fully_replicated = false;
+			/* fall through */
 		default:
 			break;
 		}
 	}
+
+	/*
+	 * sheep would return a stale object when
+	 *  - all the nodes hold the copies, and
+	 *  - all the nodes are gone
+	 * at the same epoch
+	 */
+	if (fully_replicated && ret != SD_RES_SUCCESS)
+		ret = SD_RES_STALE_OBJ;
+
 out:
 	free(buf);
 	return ret;
@@ -220,6 +234,11 @@ again:
 	case SD_RES_OLD_NODE_VER:
 		row->stop = true;
 		break;
+	case SD_RES_STALE_OBJ:
+		sd_printf(SDOG_ALERT, "cannot access any replicas of "
+			  "%"PRIx64" at epoch %d", oid, tgt_epoch);
+		sd_printf(SDOG_ALERT, "clients may see old data");
+		/* fall through */
 	default:
 		/* No luck, roll back to an older configuration and try again */
 rollback:
@@ -231,9 +250,12 @@ rollback:
 		}
 
 		new_old = get_vnode_info_epoch(tgt_epoch, rw->cur_vinfo);
-		if (!new_old)
+		if (!new_old) {
 			/* We rollback in case we don't get a valid epoch */
+			sd_printf(SDOG_ALERT, "cannot get epoch %d", tgt_epoch);
+			sd_printf(SDOG_ALERT, "clients may see old data");
 			goto rollback;
+		}
 
 		put_vnode_info(cur);
 		cur = old;
@@ -623,6 +645,10 @@ retry:
 		buf = xrealloc(buf, buf_size);
 		goto retry;
 	default:
+		sd_printf(SDOG_ALERT, "cannot get object list from %s:%d", name,
+			  e->nid.port);
+		sd_printf(SDOG_ALERT, "some objects may be not recovered at "
+			  "epoch %d", epoch);
 		free(buf);
 		return NULL;
 	}
-- 
1.7.9.5