[sheepdog] [PATCH] sheep: fix missing .stale problem

Liu Yuan namei.unix at gmail.com
Sat Feb 28 10:35:04 CET 2015


On Sat, Feb 28, 2015 at 05:20:29PM +0800, Liu Yuan wrote:
> On Sat, Feb 28, 2015 at 05:14:56PM +0800, Liu Yuan wrote:
> > From: Liu Yuan <liuyuan at cmss.chinamobile.com>
> > 
> > If .stale is missing, the recovery algorithm is almost broken. We have this
> > problem because following logic for .stale:
> > 
> > store->format() purge all the directories including .stale, rmdir(.stale)
> >    |
> >    V
> > store->init() then mkdir(.stale)
> > 
> > This order was previously strickly honored but dbf0e8782 that pushed the purge
> > work into the worker thread, broke this ordering. On many systems, the actual
> > execution order for .stale becomes:
> > 
> > mkdir(.stale) -> rmdir(.stale)
> > 
> > because rmdir(.stale) is deffered after store->init(), so this poor node will
> > never have .stale exist.
> > 
> > Since the store->cleanup() is the actual user of async rmdir, the fix is just
> > add a new async interface and call it and have others call sync one.
> > 
> > Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
> > ---
> >  include/util.h      |  1 +
> >  lib/util.c          | 18 ++++++++++++++----
> >  sheep/plain_store.c |  6 +++++-
> >  3 files changed, 20 insertions(+), 5 deletions(-)
> > 
> > diff --git a/include/util.h b/include/util.h
> > index aba7b32..c5b5ac9 100644
> > --- a/include/util.h
> > +++ b/include/util.h
> > @@ -107,6 +107,7 @@ void pstrcpy(char *buf, int buf_size, const char *str);
> >  char *chomp(char *str);
> >  int rmdir_r(const char *dir_path);
> >  int purge_directory(const char *dir_path);
> > +int purge_directory_async(const char *dir_path);
> >  bool is_numeric(const char *p);
> >  const char *data_to_str(void *data, size_t data_length);
> >  int install_sighandler(int signum, void (*handler)(int, siginfo_t *, void *),
> > diff --git a/lib/util.c b/lib/util.c
> > index 73cf1af..f563c1e 100644
> > --- a/lib/util.c
> > +++ b/lib/util.c
> > @@ -417,7 +417,7 @@ static void purge_work_done(struct work *work)
> >  }
> >  
> >  /* Purge directory recursively */
> > -int purge_directory(const char *dir_path)
> > +static int raw_purge_directory(const char *dir_path, bool async)
> >  {
> >  	int ret = 0;
> >  	struct stat s;
> > @@ -433,7 +433,7 @@ int purge_directory(const char *dir_path)
> >  		return -errno;
> >  	}
> >  
> > -	if (util_wqueue) {
> > +	if (async) {
> >  		/* we have workqueue for it, don't unlink in this thread */
> >  		w = xzalloc(sizeof(*w));
> >  		w->nr_units = 0;
> > @@ -452,7 +452,7 @@ int purge_directory(const char *dir_path)
> >  			goto out;
> >  		}
> >  
> > -		if (util_wqueue) {
> > +		if (async) {
> >  			struct purge_work_unit *unit;
> >  
> >  			unit = &w->units[w->nr_units++];
> > @@ -482,7 +482,7 @@ int purge_directory(const char *dir_path)
> >  		}
> >  	}
> >  
> > -	if (util_wqueue) {
> > +	if (async) {
> >  		w->work.fn = purge_work_fn;
> >  		w->work.done = purge_work_done;
> >  		queue_work(util_wqueue, &w->work);
> > @@ -493,6 +493,16 @@ out:
> >  	return ret;
> >  }
> >  
> > +int purge_directory(const char *dir_path)
> > +{
> > +	return raw_purge_directory(dir_path, false);
> > +}
> > +
> > +int purge_directory_async(const char *dir_path)
> > +{
> > +	return raw_purge_directory(dir_path, true);
> > +}
> > +
> >  /* remove directory recursively */
> >  int rmdir_r(const char *dir_path)
> >  {
> > diff --git a/sheep/plain_store.c b/sheep/plain_store.c
> > index 9614afa..87fc1b0 100644
> > --- a/sheep/plain_store.c
> > +++ b/sheep/plain_store.c
> > @@ -255,7 +255,11 @@ static int purge_stale_dir(const char *path)
> >  	char p[PATH_MAX];
> >  
> >  	snprintf(p, PATH_MAX, "%s/.stale", path);
> > -	return purge_dir(p);
> > +
> > +	if (purge_directory_async(path) < 0)
> > +		return SD_RES_EIO;
> > +
> > +	return SD_RES_SUCCESS;
> >  }
> >  
> >  int default_cleanup(void)
> > -- 
> > 1.9.1
> > 
> 
> Please ignore this patch. It only partialy fix the problem dbf0e8782 brought.

Oops, it was my mistake, following is the v2

>From 53eacde3dd23861765db24aa12eeacefcecb7fc5 Mon Sep 17 00:00:00 2001
From: Liu Yuan <liuyuan at cmss.chinamobile.com>
Date: Sat, 28 Feb 2015 17:30:42 +0800
Subject: [PATCH v2] sheep: fix missing .stale problem

If .stale is missing, the recovery algorithm is almost broken. We have this
problem because following logic for .stale:

store->format() purge all the directories including .stale, rmdir(.stale)
   |
   V
store->init() then mkdir(.stale)

This order was previously strickly honored but dbf0e8782 that pushed the purge
work into the worker thread, broke this ordering. On many systems, the actual
execution order for .stale becomes:

mkdir(.stale) -> rmdir(.stale)

because rmdir(.stale) is deffered after store->init(), so this poor node will
never have .stale exist.

Since the store->cleanup() is the actual user of async rmdir, the fix is just
add a new async interface and call it and have others call sync one.

Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>

diff --git a/include/util.h b/include/util.h
index aba7b32..c5b5ac9 100644
--- a/include/util.h
+++ b/include/util.h
@@ -107,6 +107,7 @@ void pstrcpy(char *buf, int buf_size, const char *str);
 char *chomp(char *str);
 int rmdir_r(const char *dir_path);
 int purge_directory(const char *dir_path);
+int purge_directory_async(const char *dir_path);
 bool is_numeric(const char *p);
 const char *data_to_str(void *data, size_t data_length);
 int install_sighandler(int signum, void (*handler)(int, siginfo_t *, void *),
diff --git a/lib/util.c b/lib/util.c
index 73cf1af..4d06bb8 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -417,7 +417,7 @@ static void purge_work_done(struct work *work)
 }
 
 /* Purge directory recursively */
-int purge_directory(const char *dir_path)
+static int raw_purge_directory(const char *dir_path, bool async)
 {
 	int ret = 0;
 	struct stat s;
@@ -433,7 +433,7 @@ int purge_directory(const char *dir_path)
 		return -errno;
 	}
 
-	if (util_wqueue) {
+	if (async) {
 		/* we have workqueue for it, don't unlink in this thread */
 		w = xzalloc(sizeof(*w));
 		w->nr_units = 0;
@@ -452,13 +452,14 @@ int purge_directory(const char *dir_path)
 			goto out;
 		}
 
-		if (util_wqueue) {
+		if (async) {
 			struct purge_work_unit *unit;
 
 			unit = &w->units[w->nr_units++];
 
 			unit->is_dir = S_ISDIR(s.st_mode);
 			strcpy(unit->path, path);
+			sd_err("%s", unit->path);
 
 			if (w->nr_units == w->units_size) {
 				w->units_size *= 2;
@@ -482,7 +483,7 @@ int purge_directory(const char *dir_path)
 		}
 	}
 
-	if (util_wqueue) {
+	if (async) {
 		w->work.fn = purge_work_fn;
 		w->work.done = purge_work_done;
 		queue_work(util_wqueue, &w->work);
@@ -493,6 +494,16 @@ out:
 	return ret;
 }
 
+int purge_directory(const char *dir_path)
+{
+	return raw_purge_directory(dir_path, false);
+}
+
+int purge_directory_async(const char *dir_path)
+{
+	return raw_purge_directory(dir_path, true);
+}
+
 /* remove directory recursively */
 int rmdir_r(const char *dir_path)
 {
@@ -502,6 +513,7 @@ int rmdir_r(const char *dir_path)
 	if (ret == 0)
 		ret = rmdir(dir_path);
 
+	sd_notice("%s", dir_path);
 	return ret;
 }
 
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 9614afa..5ea2946 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -255,7 +255,11 @@ static int purge_stale_dir(const char *path)
 	char p[PATH_MAX];
 
 	snprintf(p, PATH_MAX, "%s/.stale", path);
-	return purge_dir(p);
+
+	if (purge_directory_async(p) < 0)
+		return SD_RES_EIO;
+
+	return SD_RES_SUCCESS;
 }
 
 int default_cleanup(void)



More information about the sheepdog mailing list