[sheepdog] [PATCH v3] sheep: fix missing .stale problem

Liu Yuan namei.unix at gmail.com
Mon Mar 2 06:21:36 CET 2015


From: Liu Yuan <liuyuan at cmss.chinamobile.com>

If .stale is missing, the recovery algorithm is almost broken. We have this
problem because following logic for .stale:

store->format() purge all the directories including .stale, rmdir(.stale)
   |
   V
store->init() then mkdir(.stale)

This order was previously strickly honored but dbf0e8782 that pushed the purge
work into the worker thread, broke this ordering. On many systems, the actual
execution order for .stale becomes:

mkdir(.stale) -> rmdir(.stale)

because rmdir(.stale) is deffered after store->init(), so this poor node will
never have .stale exist.

Since the store->cleanup() is the actual user of async rmdir, the fix is just
add a new async interface and call it and have others call sync one.

Cc: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
---
 include/util.h      |  1 +
 lib/util.c          | 18 ++++++++++++++----
 sheep/plain_store.c |  6 +++++-
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/include/util.h b/include/util.h
index aba7b32..c5b5ac9 100644
--- a/include/util.h
+++ b/include/util.h
@@ -107,6 +107,7 @@ void pstrcpy(char *buf, int buf_size, const char *str);
 char *chomp(char *str);
 int rmdir_r(const char *dir_path);
 int purge_directory(const char *dir_path);
+int purge_directory_async(const char *dir_path);
 bool is_numeric(const char *p);
 const char *data_to_str(void *data, size_t data_length);
 int install_sighandler(int signum, void (*handler)(int, siginfo_t *, void *),
diff --git a/lib/util.c b/lib/util.c
index 73cf1af..aad6d96 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -417,7 +417,7 @@ static void purge_work_done(struct work *work)
 }
 
 /* Purge directory recursively */
-int purge_directory(const char *dir_path)
+static int raw_purge_directory(const char *dir_path, bool async)
 {
 	int ret = 0;
 	struct stat s;
@@ -433,7 +433,7 @@ int purge_directory(const char *dir_path)
 		return -errno;
 	}
 
-	if (util_wqueue) {
+	if (async && util_wqueue) {
 		/* we have workqueue for it, don't unlink in this thread */
 		w = xzalloc(sizeof(*w));
 		w->nr_units = 0;
@@ -452,7 +452,7 @@ int purge_directory(const char *dir_path)
 			goto out;
 		}
 
-		if (util_wqueue) {
+		if (async && util_wqueue) {
 			struct purge_work_unit *unit;
 
 			unit = &w->units[w->nr_units++];
@@ -482,7 +482,7 @@ int purge_directory(const char *dir_path)
 		}
 	}
 
-	if (util_wqueue) {
+	if (async && util_wqueue) {
 		w->work.fn = purge_work_fn;
 		w->work.done = purge_work_done;
 		queue_work(util_wqueue, &w->work);
@@ -493,6 +493,16 @@ out:
 	return ret;
 }
 
+int purge_directory(const char *dir_path)
+{
+	return raw_purge_directory(dir_path, false);
+}
+
+int purge_directory_async(const char *dir_path)
+{
+	return raw_purge_directory(dir_path, true);
+}
+
 /* remove directory recursively */
 int rmdir_r(const char *dir_path)
 {
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 9614afa..5ea2946 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -255,7 +255,11 @@ static int purge_stale_dir(const char *path)
 	char p[PATH_MAX];
 
 	snprintf(p, PATH_MAX, "%s/.stale", path);
-	return purge_dir(p);
+
+	if (purge_directory_async(p) < 0)
+		return SD_RES_EIO;
+
+	return SD_RES_SUCCESS;
 }
 
 int default_cleanup(void)
-- 
1.9.1




More information about the sheepdog mailing list