[sheepdog] [PATCH] sheep: fix missing .stale problem
Liu Yuan
namei.unix at gmail.com
Mon Mar 2 02:49:54 CET 2015
On Sat, Feb 28, 2015 at 05:35:04PM +0800, Liu Yuan wrote:
> On Sat, Feb 28, 2015 at 05:20:29PM +0800, Liu Yuan wrote:
> > On Sat, Feb 28, 2015 at 05:14:56PM +0800, Liu Yuan wrote:
> > > From: Liu Yuan <liuyuan at cmss.chinamobile.com>
> > >
> > > If .stale is missing, the recovery algorithm is almost broken. We have this
> > > problem because following logic for .stale:
> > >
> > > store->format() purge all the directories including .stale, rmdir(.stale)
> > > |
> > > V
> > > store->init() then mkdir(.stale)
> > >
> > > This order was previously strickly honored but dbf0e8782 that pushed the purge
> > > work into the worker thread, broke this ordering. On many systems, the actual
> > > execution order for .stale becomes:
> > >
> > > mkdir(.stale) -> rmdir(.stale)
> > >
> > > because rmdir(.stale) is deffered after store->init(), so this poor node will
> > > never have .stale exist.
> > >
> > > Since the store->cleanup() is the actual user of async rmdir, the fix is just
> > > add a new async interface and call it and have others call sync one.
> > >
> > > Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
> > > ---
> > > include/util.h | 1 +
> > > lib/util.c | 18 ++++++++++++++----
> > > sheep/plain_store.c | 6 +++++-
> > > 3 files changed, 20 insertions(+), 5 deletions(-)
> > >
> > > diff --git a/include/util.h b/include/util.h
> > > index aba7b32..c5b5ac9 100644
> > > --- a/include/util.h
> > > +++ b/include/util.h
> > > @@ -107,6 +107,7 @@ void pstrcpy(char *buf, int buf_size, const char *str);
> > > char *chomp(char *str);
> > > int rmdir_r(const char *dir_path);
> > > int purge_directory(const char *dir_path);
> > > +int purge_directory_async(const char *dir_path);
> > > bool is_numeric(const char *p);
> > > const char *data_to_str(void *data, size_t data_length);
> > > int install_sighandler(int signum, void (*handler)(int, siginfo_t *, void *),
> > > diff --git a/lib/util.c b/lib/util.c
> > > index 73cf1af..f563c1e 100644
> > > --- a/lib/util.c
> > > +++ b/lib/util.c
> > > @@ -417,7 +417,7 @@ static void purge_work_done(struct work *work)
> > > }
> > >
> > > /* Purge directory recursively */
> > > -int purge_directory(const char *dir_path)
> > > +static int raw_purge_directory(const char *dir_path, bool async)
> > > {
> > > int ret = 0;
> > > struct stat s;
> > > @@ -433,7 +433,7 @@ int purge_directory(const char *dir_path)
> > > return -errno;
> > > }
> > >
> > > - if (util_wqueue) {
> > > + if (async) {
> > > /* we have workqueue for it, don't unlink in this thread */
> > > w = xzalloc(sizeof(*w));
> > > w->nr_units = 0;
> > > @@ -452,7 +452,7 @@ int purge_directory(const char *dir_path)
> > > goto out;
> > > }
> > >
> > > - if (util_wqueue) {
> > > + if (async) {
> > > struct purge_work_unit *unit;
> > >
> > > unit = &w->units[w->nr_units++];
> > > @@ -482,7 +482,7 @@ int purge_directory(const char *dir_path)
> > > }
> > > }
> > >
> > > - if (util_wqueue) {
> > > + if (async) {
> > > w->work.fn = purge_work_fn;
> > > w->work.done = purge_work_done;
> > > queue_work(util_wqueue, &w->work);
> > > @@ -493,6 +493,16 @@ out:
> > > return ret;
> > > }
> > >
> > > +int purge_directory(const char *dir_path)
> > > +{
> > > + return raw_purge_directory(dir_path, false);
> > > +}
> > > +
> > > +int purge_directory_async(const char *dir_path)
> > > +{
> > > + return raw_purge_directory(dir_path, true);
> > > +}
> > > +
> > > /* remove directory recursively */
> > > int rmdir_r(const char *dir_path)
> > > {
> > > diff --git a/sheep/plain_store.c b/sheep/plain_store.c
> > > index 9614afa..87fc1b0 100644
> > > --- a/sheep/plain_store.c
> > > +++ b/sheep/plain_store.c
> > > @@ -255,7 +255,11 @@ static int purge_stale_dir(const char *path)
> > > char p[PATH_MAX];
> > >
> > > snprintf(p, PATH_MAX, "%s/.stale", path);
> > > - return purge_dir(p);
> > > +
> > > + if (purge_directory_async(path) < 0)
> > > + return SD_RES_EIO;
> > > +
> > > + return SD_RES_SUCCESS;
> > > }
> > >
> > > int default_cleanup(void)
> > > --
> > > 1.9.1
> > >
> >
> > Please ignore this patch. It only partialy fix the problem dbf0e8782 brought.
>
> Oops, it was my mistake, following is the v2
>
> From 53eacde3dd23861765db24aa12eeacefcecb7fc5 Mon Sep 17 00:00:00 2001
> From: Liu Yuan <liuyuan at cmss.chinamobile.com>
> Date: Sat, 28 Feb 2015 17:30:42 +0800
> Subject: [PATCH v2] sheep: fix missing .stale problem
>
> If .stale is missing, the recovery algorithm is almost broken. We have this
> problem because following logic for .stale:
>
> store->format() purge all the directories including .stale, rmdir(.stale)
> |
> V
> store->init() then mkdir(.stale)
>
> This order was previously strickly honored but dbf0e8782 that pushed the purge
> work into the worker thread, broke this ordering. On many systems, the actual
> execution order for .stale becomes:
>
> mkdir(.stale) -> rmdir(.stale)
>
> because rmdir(.stale) is deffered after store->init(), so this poor node will
> never have .stale exist.
>
> Since the store->cleanup() is the actual user of async rmdir, the fix is just
> add a new async interface and call it and have others call sync one.
>
> Signed-off-by: Liu Yuan <liuyuan at cmss.chinamobile.com>
>
> diff --git a/include/util.h b/include/util.h
> index aba7b32..c5b5ac9 100644
> --- a/include/util.h
> +++ b/include/util.h
> @@ -107,6 +107,7 @@ void pstrcpy(char *buf, int buf_size, const char *str);
> char *chomp(char *str);
> int rmdir_r(const char *dir_path);
> int purge_directory(const char *dir_path);
> +int purge_directory_async(const char *dir_path);
> bool is_numeric(const char *p);
> const char *data_to_str(void *data, size_t data_length);
> int install_sighandler(int signum, void (*handler)(int, siginfo_t *, void *),
> diff --git a/lib/util.c b/lib/util.c
> index 73cf1af..4d06bb8 100644
> --- a/lib/util.c
> +++ b/lib/util.c
> @@ -417,7 +417,7 @@ static void purge_work_done(struct work *work)
> }
>
> /* Purge directory recursively */
> -int purge_directory(const char *dir_path)
> +static int raw_purge_directory(const char *dir_path, bool async)
> {
> int ret = 0;
> struct stat s;
> @@ -433,7 +433,7 @@ int purge_directory(const char *dir_path)
> return -errno;
> }
>
> - if (util_wqueue) {
> + if (async) {
> /* we have workqueue for it, don't unlink in this thread */
> w = xzalloc(sizeof(*w));
> w->nr_units = 0;
> @@ -452,13 +452,14 @@ int purge_directory(const char *dir_path)
> goto out;
> }
>
> - if (util_wqueue) {
> + if (async) {
> struct purge_work_unit *unit;
>
> unit = &w->units[w->nr_units++];
>
> unit->is_dir = S_ISDIR(s.st_mode);
> strcpy(unit->path, path);
> + sd_err("%s", unit->path);
>
> if (w->nr_units == w->units_size) {
> w->units_size *= 2;
> @@ -482,7 +483,7 @@ int purge_directory(const char *dir_path)
> }
> }
>
> - if (util_wqueue) {
> + if (async) {
> w->work.fn = purge_work_fn;
> w->work.done = purge_work_done;
> queue_work(util_wqueue, &w->work);
> @@ -493,6 +494,16 @@ out:
> return ret;
> }
>
> +int purge_directory(const char *dir_path)
> +{
> + return raw_purge_directory(dir_path, false);
> +}
> +
> +int purge_directory_async(const char *dir_path)
> +{
> + return raw_purge_directory(dir_path, true);
> +}
> +
> /* remove directory recursively */
> int rmdir_r(const char *dir_path)
> {
> @@ -502,6 +513,7 @@ int rmdir_r(const char *dir_path)
> if (ret == 0)
> ret = rmdir(dir_path);
>
> + sd_notice("%s", dir_path);
> return ret;
> }
>
> diff --git a/sheep/plain_store.c b/sheep/plain_store.c
> index 9614afa..5ea2946 100644
> --- a/sheep/plain_store.c
> +++ b/sheep/plain_store.c
> @@ -255,7 +255,11 @@ static int purge_stale_dir(const char *path)
> char p[PATH_MAX];
>
> snprintf(p, PATH_MAX, "%s/.stale", path);
> - return purge_dir(p);
> +
> + if (purge_directory_async(p) < 0)
> + return SD_RES_EIO;
> +
> + return SD_RES_SUCCESS;
> }
>
> int default_cleanup(void)
Hitoshi? ping...
More information about the sheepdog
mailing list