[sheepdog] [PATCH] sheep: fix missing .stale problem

Liu Yuan namei.unix at gmail.com
Mon Mar 2 02:49:54 CET 2015


On Sat, Feb 28, 2015 at 05:35:04PM +0800, Liu Yuan wrote:
> On Sat, Feb 28, 2015 at 05:20:29PM +0800, Liu Yuan wrote:
> > On Sat, Feb 28, 2015 at 05:14:56PM +0800, Liu Yuan wrote:
> > > From: Liu Yuan <liuyuan at cmss.chinamobile.com>
> > > 
> > > If .stale is missing, the recovery algorithm is almost broken. We have this
> > > problem because following logic for .stale:
> > > 
> > > store->format() purge all the directories including .stale, rmdir(.stale)
> > >    |
> > >    V
> > > store->init() then mkdir(.stale)
> > > 
> > > This order was previously strickly honored but dbf0e8782 that pushed the purge
> > > work into the worker thread, broke this ordering. On many systems, the actual
> > > execution order for .stale becomes:
> > > 
> > > mkdir(.stale) -> rmdir(.stale)
> > > 
> > > because rmdir(.stale) is deffered after store->init(), so this poor node will
> > > never have .stale exist.
> > > 
> > > Since the store->cleanup() is the actual user of async rmdir, the fix is just
> > > add a new async interface and call it and have others call sync one.
> > > 
> > > Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
> > > ---
> > >  include/util.h      |  1 +
> > >  lib/util.c          | 18 ++++++++++++++----
> > >  sheep/plain_store.c |  6 +++++-
> > >  3 files changed, 20 insertions(+), 5 deletions(-)
> > > 
> > > diff --git a/include/util.h b/include/util.h
> > > index aba7b32..c5b5ac9 100644
> > > --- a/include/util.h
> > > +++ b/include/util.h
> > > @@ -107,6 +107,7 @@ void pstrcpy(char *buf, int buf_size, const char *str);
> > >  char *chomp(char *str);
> > >  int rmdir_r(const char *dir_path);
> > >  int purge_directory(const char *dir_path);
> > > +int purge_directory_async(const char *dir_path);
> > >  bool is_numeric(const char *p);
> > >  const char *data_to_str(void *data, size_t data_length);
> > >  int install_sighandler(int signum, void (*handler)(int, siginfo_t *, void *),
> > > diff --git a/lib/util.c b/lib/util.c
> > > index 73cf1af..f563c1e 100644
> > > --- a/lib/util.c
> > > +++ b/lib/util.c
> > > @@ -417,7 +417,7 @@ static void purge_work_done(struct work *work)
> > >  }
> > >  
> > >  /* Purge directory recursively */
> > > -int purge_directory(const char *dir_path)
> > > +static int raw_purge_directory(const char *dir_path, bool async)
> > >  {
> > >  	int ret = 0;
> > >  	struct stat s;
> > > @@ -433,7 +433,7 @@ int purge_directory(const char *dir_path)
> > >  		return -errno;
> > >  	}
> > >  
> > > -	if (util_wqueue) {
> > > +	if (async) {
> > >  		/* we have workqueue for it, don't unlink in this thread */
> > >  		w = xzalloc(sizeof(*w));
> > >  		w->nr_units = 0;
> > > @@ -452,7 +452,7 @@ int purge_directory(const char *dir_path)
> > >  			goto out;
> > >  		}
> > >  
> > > -		if (util_wqueue) {
> > > +		if (async) {
> > >  			struct purge_work_unit *unit;
> > >  
> > >  			unit = &w->units[w->nr_units++];
> > > @@ -482,7 +482,7 @@ int purge_directory(const char *dir_path)
> > >  		}
> > >  	}
> > >  
> > > -	if (util_wqueue) {
> > > +	if (async) {
> > >  		w->work.fn = purge_work_fn;
> > >  		w->work.done = purge_work_done;
> > >  		queue_work(util_wqueue, &w->work);
> > > @@ -493,6 +493,16 @@ out:
> > >  	return ret;
> > >  }
> > >  
> > > +int purge_directory(const char *dir_path)
> > > +{
> > > +	return raw_purge_directory(dir_path, false);
> > > +}
> > > +
> > > +int purge_directory_async(const char *dir_path)
> > > +{
> > > +	return raw_purge_directory(dir_path, true);
> > > +}
> > > +
> > >  /* remove directory recursively */
> > >  int rmdir_r(const char *dir_path)
> > >  {
> > > diff --git a/sheep/plain_store.c b/sheep/plain_store.c
> > > index 9614afa..87fc1b0 100644
> > > --- a/sheep/plain_store.c
> > > +++ b/sheep/plain_store.c
> > > @@ -255,7 +255,11 @@ static int purge_stale_dir(const char *path)
> > >  	char p[PATH_MAX];
> > >  
> > >  	snprintf(p, PATH_MAX, "%s/.stale", path);
> > > -	return purge_dir(p);
> > > +
> > > +	if (purge_directory_async(path) < 0)
> > > +		return SD_RES_EIO;
> > > +
> > > +	return SD_RES_SUCCESS;
> > >  }
> > >  
> > >  int default_cleanup(void)
> > > -- 
> > > 1.9.1
> > > 
> > 
> > Please ignore this patch. It only partialy fix the problem dbf0e8782 brought.
> 
> Oops, it was my mistake, following is the v2
> 
> From 53eacde3dd23861765db24aa12eeacefcecb7fc5 Mon Sep 17 00:00:00 2001
> From: Liu Yuan <liuyuan at cmss.chinamobile.com>
> Date: Sat, 28 Feb 2015 17:30:42 +0800
> Subject: [PATCH v2] sheep: fix missing .stale problem
> 
> If .stale is missing, the recovery algorithm is almost broken. We have this
> problem because following logic for .stale:
> 
> store->format() purge all the directories including .stale, rmdir(.stale)
>    |
>    V
> store->init() then mkdir(.stale)
> 
> This order was previously strickly honored but dbf0e8782 that pushed the purge
> work into the worker thread, broke this ordering. On many systems, the actual
> execution order for .stale becomes:
> 
> mkdir(.stale) -> rmdir(.stale)
> 
> because rmdir(.stale) is deffered after store->init(), so this poor node will
> never have .stale exist.
> 
> Since the store->cleanup() is the actual user of async rmdir, the fix is just
> add a new async interface and call it and have others call sync one.
> 
> Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
> 
> diff --git a/include/util.h b/include/util.h
> index aba7b32..c5b5ac9 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -107,6 +107,7 @@ void pstrcpy(char *buf, int buf_size, const char *str);
>  char *chomp(char *str);
>  int rmdir_r(const char *dir_path);
>  int purge_directory(const char *dir_path);
> +int purge_directory_async(const char *dir_path);
>  bool is_numeric(const char *p);
>  const char *data_to_str(void *data, size_t data_length);
>  int install_sighandler(int signum, void (*handler)(int, siginfo_t *, void *),
> diff --git a/lib/util.c b/lib/util.c
> index 73cf1af..4d06bb8 100644
> --- a/lib/util.c
> +++ b/lib/util.c
> @@ -417,7 +417,7 @@ static void purge_work_done(struct work *work)
>  }
>  
>  /* Purge directory recursively */
> -int purge_directory(const char *dir_path)
> +static int raw_purge_directory(const char *dir_path, bool async)
>  {
>  	int ret = 0;
>  	struct stat s;
> @@ -433,7 +433,7 @@ int purge_directory(const char *dir_path)
>  		return -errno;
>  	}
>  
> -	if (util_wqueue) {
> +	if (async) {
>  		/* we have workqueue for it, don't unlink in this thread */
>  		w = xzalloc(sizeof(*w));
>  		w->nr_units = 0;
> @@ -452,13 +452,14 @@ int purge_directory(const char *dir_path)
>  			goto out;
>  		}
>  
> -		if (util_wqueue) {
> +		if (async) {
>  			struct purge_work_unit *unit;
>  
>  			unit = &w->units[w->nr_units++];
>  
>  			unit->is_dir = S_ISDIR(s.st_mode);
>  			strcpy(unit->path, path);
> +			sd_err("%s", unit->path);
>  
>  			if (w->nr_units == w->units_size) {
>  				w->units_size *= 2;
> @@ -482,7 +483,7 @@ int purge_directory(const char *dir_path)
>  		}
>  	}
>  
> -	if (util_wqueue) {
> +	if (async) {
>  		w->work.fn = purge_work_fn;
>  		w->work.done = purge_work_done;
>  		queue_work(util_wqueue, &w->work);
> @@ -493,6 +494,16 @@ out:
>  	return ret;
>  }
>  
> +int purge_directory(const char *dir_path)
> +{
> +	return raw_purge_directory(dir_path, false);
> +}
> +
> +int purge_directory_async(const char *dir_path)
> +{
> +	return raw_purge_directory(dir_path, true);
> +}
> +
>  /* remove directory recursively */
>  int rmdir_r(const char *dir_path)
>  {
> @@ -502,6 +513,7 @@ int rmdir_r(const char *dir_path)
>  	if (ret == 0)
>  		ret = rmdir(dir_path);
>  
> +	sd_notice("%s", dir_path);
>  	return ret;
>  }
>  
> diff --git a/sheep/plain_store.c b/sheep/plain_store.c
> index 9614afa..5ea2946 100644
> --- a/sheep/plain_store.c
> +++ b/sheep/plain_store.c
> @@ -255,7 +255,11 @@ static int purge_stale_dir(const char *path)
>  	char p[PATH_MAX];
>  
>  	snprintf(p, PATH_MAX, "%s/.stale", path);
> -	return purge_dir(p);
> +
> +	if (purge_directory_async(p) < 0)
> +		return SD_RES_EIO;
> +
> +	return SD_RES_SUCCESS;
>  }
>  
>  int default_cleanup(void)

Hitoshi? ping...



More information about the sheepdog mailing list