[sheepdog] [PATCH 4/6] md: implement automatic recovery

Liu Yuan namei.unix at gmail.com
Tue Mar 26 10:46:23 CET 2013


From: Liu Yuan <tailai.ly at taobao.com>

I chose to take advantage of the current node recovery mechanism because:
 1. It supports multiple failure events
 2. It provides a nice race-free wait queue mechanism for VM IO requests and
    recovery IO requests.
 3. It has ready-made logic that can recover objects from other nodes. This is
    crucial to md recovery because we distribute objects across disks without
    extra copies.
 4. Maintaining one mechanism is always better than maintaining two.

So a disk failure will simply be seen as a node recovery without the epoch
being lifted, and md recovery inherits all the merits of node recovery:
 1. handling of multiple disk failures
 2. serving VM IO requests while in recovery
 3. hot-plugging and hot-unplugging of disks (not implemented yet)

Besides, another nice spin-off is that with unified recovery, node and disk
recovery can suspend and supersede each other (not implemented yet).

The missing parts will be implemented in later patches.
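
To illustrate, here is a minimal standalone sketch of the EIO flow this
patch implements (simplified names and types, not the actual sheep code;
the md lock and the ordered work queue are left out): a faulty disk is
unplugged, recovery is kicked without lifting the epoch, and the requester
is told to retry until objects have been re-placed.

  #include <stdio.h>

  enum { RES_SUCCESS, RES_EIO, RES_NETWORK_ERROR };

  static int nr_disks = 3;

  static void kick_recover(void)
  {
          /* in the patch: start_recovery(vinfo, vinfo), i.e. the same
           * vnode info for the old and new views, so no epoch change */
          printf("recovery kicked, %d disks left\n", nr_disks);
  }

  static int handle_eio(void)
  {
          if (nr_disks == 0)
                  return RES_EIO;           /* all disks broken, give up */
          nr_disks--;                       /* unplug the faulty disk */
          if (nr_disks > 0)
                  kick_recover();
          return RES_NETWORK_ERROR;         /* make the requester retry */
  }

  int main(void)
  {
          while (handle_eio() != RES_EIO)
                  ;
          return 0;
  }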

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/md.c          |  236 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 sheep/plain_store.c |   16 +---
 sheep/sheep.c       |    4 +-
 sheep/sheep_priv.h  |    3 +
 4 files changed, 242 insertions(+), 17 deletions(-)

diff --git a/sheep/md.c b/sheep/md.c
index 3569acc..621934d 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -20,6 +20,7 @@
 #include <math.h>
 #include <sys/xattr.h>
 #include <dirent.h>
+#include <pthread.h>
 
 #include "sheep_priv.h"
 
@@ -41,9 +42,21 @@ struct vdisk {
 static struct disk md_disks[MD_MAX_DISK];
 static struct vdisk md_vds[MD_MAX_VDISK];
 
-static int md_nr_disks;
+static pthread_rwlock_t md_lock = PTHREAD_RWLOCK_INITIALIZER;
+static uint32_t md_nr_disks; /* Protected by md_lock */
 static int md_nr_vds;
 
+static inline uint32_t nr_online_disks(void)
+{
+	uint32_t nr;
+
+	pthread_rwlock_rdlock(&md_lock);
+	nr = md_nr_disks;
+	pthread_rwlock_unlock(&md_lock);
+
+	return nr;
+}
+
 static struct vdisk *oid_to_vdisk_from(struct vdisk *vds, int nr, uint64_t oid)
 {
 	uint64_t id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
@@ -187,9 +200,24 @@ uint64_t md_init_space(void)
 char *get_object_path(uint64_t oid)
 {
 	struct vdisk *vd;
+	char *p;
 
 	if (!sys->enable_md)
 		return obj_path;
+
+	pthread_rwlock_rdlock(&md_lock);
+	vd = oid_to_vdisk(oid);
+	p = md_disks[vd->idx].path;
+	pthread_rwlock_unlock(&md_lock);
+	sd_dprintf("%d, %s", vd->idx, p);
+
+	return p;
+}
+
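+/* Same as get_object_path() but assumes the caller already holds md_lock */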
+static char *get_object_path_nolock(uint64_t oid)
+{
+	struct vdisk *vd;
+
 	vd = oid_to_vdisk(oid);
 	return md_disks[vd->idx].path;
 }
@@ -239,21 +267,68 @@ static int for_each_object_in_path(char *path,
 	return ret;
 }
 
+/* Iterate objects in path and stop once func() returns SD_RES_SUCCESS */
+static int for_each_object_in_path2(char *path,
+				    int (*func)(uint64_t, char *, void *),
+				    bool cleanup, void *arg)
+{
+	DIR *dir;
+	struct dirent *d;
+	uint64_t oid;
+	int ret = SD_RES_EIO;
+	char p[PATH_MAX];
+
+	dir = opendir(path);
+	if (!dir) {
+		sd_eprintf("failed to open %s, %m", path);
+		return SD_RES_EIO;
+	}
+
+	while ((d = readdir(dir))) {
+		if (!strncmp(d->d_name, ".", 1))
+			continue;
+
+		oid = strtoull(d->d_name, NULL, 16);
+		if (oid == 0 || oid == ULLONG_MAX)
+			continue;
+
+		/* don't call callback against temporary objects */
+		if (strlen(d->d_name) == 20 &&
+		    strcmp(d->d_name + 16, ".tmp") == 0) {
+			if (cleanup) {
+				snprintf(p, PATH_MAX, "%s/%016"PRIx64".tmp",
+					 path, oid);
+				sd_dprintf("remove tmp object %s", p);
+				unlink(p);
+			}
+			continue;
+		}
+
+		ret = func(oid, path, arg);
+		if (ret == SD_RES_SUCCESS)
+			break;
+	}
+	closedir(dir);
+	return ret;
+}
+
 int for_each_object_in_wd(int (*func)(uint64_t oid, char *path, void *arg),
 			  bool cleanup, void *arg)
 {
-	int i, ret;
+	int i, ret = SD_RES_SUCCESS;
 
 	if (!sys->enable_md)
 		return for_each_object_in_path(obj_path, func, cleanup, arg);
 
+	pthread_rwlock_rdlock(&md_lock);
 	for (i = 0; i < md_nr_disks; i++) {
 		ret = for_each_object_in_path(md_disks[i].path, func,
 					      cleanup, arg);
 		if (ret != SD_RES_SUCCESS)
-			return ret;
+			break;
 	}
-	return SD_RES_SUCCESS;
+	pthread_rwlock_unlock(&md_lock);
+	return ret;
 }
 
 int for_each_obj_path(int (*func)(char *path))
@@ -263,10 +338,163 @@ int for_each_obj_path(int (*func)(char *path))
 	if (!sys->enable_md)
 		return func(obj_path);
 
+	pthread_rwlock_rdlock(&md_lock);
 	for (i = 0; i < md_nr_disks; i++) {
 		ret = func(md_disks[i].path);
-		if (ret != SD_RES_SUCCESS)
+		if (ret != SD_RES_SUCCESS) {
+			pthread_rwlock_unlock(&md_lock);
 			return ret;
+		}
 	}
+	pthread_rwlock_unlock(&md_lock);
+	return SD_RES_SUCCESS;
+}
+
+struct md_work {
+	struct work work;
+	char path[PATH_MAX];
+};
+
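+/* Caller must hold md_lock */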
+static int path_to_disk_idx(char *path)
+{
+	int i;
+
+	for (i = 0; i < md_nr_disks; i++)
+		if (strcmp(md_disks[i].path, path) == 0)
+			return i;
+
+	return -1;
+}
+
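+/*
+ * Restart recovery with the current vnode info as both the old and the new
+ * view, so the epoch is not lifted and the node recovery machinery simply
+ * re-places objects across the remaining disks.
+ */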
+static inline void kick_recover(void)
+{
+	struct vnode_info *vinfo = get_vnode_info();
+
+	start_recovery(vinfo, vinfo);
+	put_vnode_info(vinfo);
+}
+
+static void unplug_disk(int idx)
+{
+	int i;
+
+	/*
+	 * We need to keep the last disk path so that we can generate EIO
+	 * when all disks are broken
+	 */
+	for (i = idx; i < md_nr_disks - 1; i++)
+		md_disks[i] = md_disks[i + 1];
+	md_nr_disks--;
+	sys->disk_space = md_init_space();
+	if (md_nr_disks > 0)
+		kick_recover();
+}
+
+static void md_do_recover(struct work *work)
+{
+	struct md_work *mw = container_of(work, struct md_work, work);
+	int idx;
+
+	pthread_rwlock_wrlock(&md_lock);
+	idx = path_to_disk_idx(mw->path);
+	if (idx < 0)
+		/* Just ignore the duplicate EIO of the same path */
+		goto out;
+	unplug_disk(idx);
+out:
+	pthread_rwlock_unlock(&md_lock);
+	free(mw);
+}
+
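+/*
+ * Called from the store error path on an unexpected IO error: defer the
+ * disk unplug to the ordered md work queue and return a retryable error,
+ * so the requester retries instead of seeing EIO.
+ */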
+int md_handle_eio(char *fault_path)
+{
+	struct md_work *mw;
+
+	if (!sys->enable_md)
+		return SD_RES_EIO;
+
+	if (nr_online_disks() == 0)
+		return SD_RES_EIO;
+
+	mw = xzalloc(sizeof(*mw));
+	mw->work.done = md_do_recover;
+	pstrcpy(mw->path, PATH_MAX, fault_path);
+	queue_work(sys->md_wqueue, &mw->work);
+
+	/* Trick the requester into retrying */
+	return SD_RES_NETWORK_ERROR;
+}
+
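+/*
+ * If the target object lives under 'path', move it to the location it
+ * hashes to in the current disk layout.  Called with md_lock held, hence
+ * the nolock path lookup.
+ */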
+static int check_and_move(uint64_t oid, char *path, void *arg)
+{
+	uint64_t tgt_oid = *(uint64_t *)arg;
+	char old[PATH_MAX], new[PATH_MAX];
+
+	if (oid != tgt_oid)
+		return SD_RES_EIO;
+
+	snprintf(old, PATH_MAX, "%s/%016" PRIx64, path, oid);
+	snprintf(new, PATH_MAX, "%s/%016" PRIx64, get_object_path_nolock(oid),
+		 oid);
+	if (rename(old, new) < 0) {
+		sd_eprintf("rename of %"PRIx64" failed, %m", oid);
+		return SD_RES_EIO;
+	}
+
+	sd_dprintf("%"PRIx64" from %s to %s", oid, old, new);
 	return SD_RES_SUCCESS;
 }
+
+static int for_each_object(int (*func)(uint64_t oid, char *path, void *arg),
+			   void *arg)
+{
+	int i, ret = SD_RES_EIO;
+
+	pthread_rwlock_rdlock(&md_lock);
+	for (i = 0; i < md_nr_disks; i++) {
+		ret = for_each_object_in_path2(md_disks[i].path, func, false,
+					       arg);
+		if (ret == SD_RES_SUCCESS)
+			break;
+	}
+	pthread_rwlock_unlock(&md_lock);
+	return ret;
+}
+
+static inline bool md_access(char *path)
+{
+	if (access(path, R_OK | W_OK) < 0) {
+		if (errno != ENOENT)
+			sd_eprintf("failed to check %s, %m", path);
+		return false;
+	}
+
+	return true;
+}
+
+static bool md_handle_exist(uint64_t oid)
+{
+	char path[PATH_MAX];
+
+	snprintf(path, PATH_MAX, "%s/%016" PRIx64, get_object_path(oid), oid);
+	if (md_access(path))
+		return true;
+	/*
+	 * We have to iterate the WD because we don't have an epoch-like
+	 * history to track where objects were placed across multiple disk
+	 * failures. A brute-force iteration simplifies the code a lot.
+	 * FIXME: optimize me.
+	 */
+	if (for_each_object(check_and_move, &oid) == SD_RES_SUCCESS)
+		return true;
+
+	return false;
+}
+
+bool md_exist(uint64_t oid)
+{
+	char path[PATH_MAX];
+
+	if (!sys->enable_md) {
+		snprintf(path, PATH_MAX, "%s/%016" PRIx64, obj_path, oid);
+		return md_access(path);
+	}
+
+	return md_handle_exist(oid);
+}
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index f9741bb..6b41238 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -55,16 +55,7 @@ static int get_stale_obj_path(uint64_t oid, uint32_t epoch, char *path)
 
 bool default_exist(uint64_t oid)
 {
-	char path[PATH_MAX];
-
-	get_obj_path(oid, path);
-	if (access(path, R_OK | W_OK) < 0) {
-		if (errno != ENOENT)
-			sd_eprintf("failed to check object %"PRIx64", %m", oid);
-		return false;
-	}
-
-	return true;
+	return md_exist(oid);
 }
 
 static int err_to_sderr(char *path, uint64_t oid, int err)
@@ -72,11 +63,12 @@ static int err_to_sderr(char *path, uint64_t oid, int err)
 	struct stat s;
 	char *dir = dirname(path);
 
+	sd_dprintf("%s", dir);
 	switch (err) {
 	case ENOENT:
 		if (stat(dir, &s) < 0) {
 			sd_eprintf("%s corrupted", dir);
-			return SD_RES_EIO;
+			return md_handle_eio(dir);
 		}
 		sd_dprintf("object %016" PRIx64 " not found locally", oid);
 		return SD_RES_NO_OBJ;
@@ -86,7 +78,7 @@ static int err_to_sderr(char *path, uint64_t oid, int err)
 		return SD_RES_NO_SPACE;
 	default:
 		sd_eprintf("oid=%"PRIx64", %m", oid);
-		return SD_RES_EIO;
+		return md_handle_eio(dir);
 	}
 }
 
diff --git a/sheep/sheep.c b/sheep/sheep.c
index a72a1ef..4fa7d58 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -340,6 +340,7 @@ static int init_work_queues(void)
 	sys->deletion_wqueue = init_ordered_work_queue("deletion");
 	sys->block_wqueue = init_ordered_work_queue("block");
 	sys->sockfd_wqueue = init_ordered_work_queue("sockfd");
+	sys->md_wqueue = init_ordered_work_queue("md");
 	if (sys->enable_object_cache) {
 		sys->oc_reclaim_wqueue = init_ordered_work_queue("oc_reclaim");
 		sys->oc_push_wqueue = init_work_queue("oc_push", WQ_DYNAMIC);
@@ -347,7 +348,8 @@ static int init_work_queues(void)
 			return -1;
 	}
 	if (!sys->gateway_wqueue || !sys->io_wqueue || !sys->recovery_wqueue ||
-	    !sys->deletion_wqueue || !sys->block_wqueue || !sys->sockfd_wqueue)
+	    !sys->deletion_wqueue || !sys->block_wqueue ||
+	    !sys->sockfd_wqueue || !sys->md_wqueue)
 			return -1;
 	return 0;
 }
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 21aec29..d395a5f 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -116,6 +116,7 @@ struct cluster_info {
 	struct work_queue *sockfd_wqueue;
 	struct work_queue *oc_reclaim_wqueue;
 	struct work_queue *oc_push_wqueue;
+	struct work_queue *md_wqueue;
 
 	bool enable_object_cache;
 
@@ -419,5 +420,7 @@ int journal_file_write(uint64_t oid, const char *buf, size_t size, off_t, bool);
 int md_init_disk(char *path);
 uint64_t md_init_space(void);
 char *get_object_path(uint64_t oid);
+int md_handle_eio(char *fault_path);
+bool md_exist(uint64_t oid);
 
 #endif
-- 
1.7.9.5