[sheepdog] [PATCH v2] sheep/recovery: multi-threading recovery process
Hitoshi Mitake
mitake.hitoshi at gmail.com
Wed Jan 29 08:28:35 CET 2014
At Tue, 28 Jan 2014 18:01:42 +0800,
Liu Yuan wrote:
>
> Rationale for multi-threaded recovery:
>
> 1. When one node is added, we find that all the VMs on the other nodes get
> noticeably affected until 50% of the data has been transferred to the new node.
>
> 2. For node failure, running VMs might not have problems, but speeding up the
> recovery process benefits VM IO, with fewer chances of writes being blocked,
> and also improves reliability.
>
> 3. Disk failure in a node is similar to adding a node. All the data on the
> broken disk will be recovered on other disks in this node. Speedy recovery
> not only improves data reliability but also causes less write blocking on
> the lost data.
>
> Our oid scheduling algorithm is intact; we simply add multi-threading on top
> of the current recovery algorithm with minimal changes:
>
> - we still have the ->oids array to denote the oids to be recovered
> - we start up 2 * nr_disks threads for recovery
> - the tricky part is that we need to wait for all running threads to complete
> before starting the next recovery event when multiple node/disk events occur
> (see the sketch below)
>
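> Below is a minimal stand-alone sketch of the bookkeeping this patch aims for
> (not the sheep code itself; NR_DISKS, NR_OBJECTS and the pthread plumbing are
> made up for illustration). 2 * nr_disks workers pull oids from ->oids via
> ->next, and a pending node/disk event has to wait until ->running_threads_nr
> drains back to zero:
>
>     #include <inttypes.h>
>     #include <pthread.h>
>     #include <stdint.h>
>     #include <stdio.h>
>     #include <unistd.h>
>
>     #define NR_DISKS   4
>     #define NR_OBJECTS 32
>
>     static struct {
>         uint64_t oids[NR_OBJECTS];
>         uint64_t count;
>         uint64_t next;               /* index of the next oid to hand out */
>         uint64_t done;               /* number of oids fully recovered */
>         uint32_t running_threads_nr; /* workers currently recovering an oid */
>         pthread_mutex_t lock;
>         pthread_cond_t cond;
>     } rinfo = {
>         .lock = PTHREAD_MUTEX_INITIALIZER,
>         .cond = PTHREAD_COND_INITIALIZER,
>     };
>
>     static void recover_object(uint64_t oid)
>     {
>         (void)oid;
>         usleep(1000); /* stand-in for the real network/disk transfer */
>     }
>
>     static void *recovery_worker(void *arg)
>     {
>         (void)arg;
>         for (;;) {
>             uint64_t oid;
>
>             pthread_mutex_lock(&rinfo.lock);
>             if (rinfo.next >= rinfo.count) { /* nothing left to hand out */
>                 pthread_mutex_unlock(&rinfo.lock);
>                 break;
>             }
>             oid = rinfo.oids[rinfo.next++];
>             rinfo.running_threads_nr++;
>             pthread_mutex_unlock(&rinfo.lock);
>
>             recover_object(oid);
>
>             pthread_mutex_lock(&rinfo.lock);
>             rinfo.done++;
>             rinfo.running_threads_nr--;
>             pthread_cond_broadcast(&rinfo.cond);
>             pthread_mutex_unlock(&rinfo.lock);
>         }
>         return NULL;
>     }
>
>     int main(void)
>     {
>         pthread_t tid[2 * NR_DISKS]; /* 2 * nr_disks workers, as in the patch */
>
>         rinfo.count = NR_OBJECTS;
>         for (uint64_t i = 0; i < NR_OBJECTS; i++)
>             rinfo.oids[i] = 0xdeadbeef00ULL + i;
>
>         for (int i = 0; i < 2 * NR_DISKS; i++)
>             pthread_create(&tid[i], NULL, recovery_worker, NULL);
>
>         /*
>          * A queued node/disk event must wait here until every in-flight
>          * worker has finished before the next recovery_info is switched in.
>          */
>         pthread_mutex_lock(&rinfo.lock);
>         while (rinfo.running_threads_nr > 0 || rinfo.done < rinfo.count)
>             pthread_cond_wait(&rinfo.cond, &rinfo.lock);
>         pthread_mutex_unlock(&rinfo.lock);
>
>         for (int i = 0; i < 2 * NR_DISKS; i++)
>             pthread_join(tid[i], NULL);
>
>         printf("recovered %" PRIu64 " objects\n", rinfo.done);
>         return 0;
>     }
>
> In sheep itself the next/done/running_threads_nr bookkeeping stays in the main
> thread via the work queue, so no locking is needed there; the mutex above only
> exists to make the sketch self-contained.
>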
> This patch passes "./check -g md -md" on my local box
On my box, at least tests 32 and 33 failed. I'm looking into the root cause
now, but this patch seems to be a little bit dangerous.
Thanks,
Hitoshi
>
> Signed-off-by: Liu Yuan <namei.unix at gmail.com>
> ---
> sheep/md.c | 9 ++++++--
> sheep/recovery.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++--------
> sheep/sheep.c | 2 +-
> sheep/sheep_priv.h | 1 +
> 4 files changed, 60 insertions(+), 12 deletions(-)
>
> diff --git a/sheep/md.c b/sheep/md.c
> index e535454..9a5e40c 100644
> --- a/sheep/md.c
> +++ b/sheep/md.c
> @@ -43,9 +43,9 @@ static struct md md = {
> .lock = SD_RW_LOCK_INITIALIZER,
> };
>
> -static inline int nr_online_disks(void)
> +static inline uint32_t nr_online_disks(void)
> {
> - int nr;
> + uint32_t nr;
>
> sd_read_lock(&md.lock);
> nr = md.nr_disks;
> @@ -677,3 +677,8 @@ uint64_t md_get_size(uint64_t *used)
>
> return fsize + *used;
> }
> +
> +uint32_t md_nr_disks(void)
> +{
> + return nr_online_disks();
> +}
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index c15297c..0cf9c16 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -54,6 +54,8 @@ struct recovery_info {
> uint32_t epoch;
> uint32_t tgt_epoch;
> uint64_t done;
> + uint64_t next;
> + uint32_t running_threads_nr;
>
> /*
> * true when automatic recovery is disabled
> @@ -507,6 +509,8 @@ static int do_recover_object(struct recovery_obj_work *row)
> {
> uint64_t oid = row->oid;
>
> + sd_debug("try recover object %"PRIx64, oid);
> +
> if (is_erasure_oid(oid))
> return recover_erasure_object(row);
> else
> @@ -597,7 +601,8 @@ main_fn bool oid_in_recovery(uint64_t oid)
> return false;
> }
>
> - if (rinfo->oids[rinfo->done] == oid) {
> + if (xlfind(&oid, rinfo->oids + rinfo->done,
> + rinfo->next - rinfo->done, oid_cmp)) {
> if (rinfo->suspended)
> break;
> /*
> @@ -666,12 +671,17 @@ static void free_recovery_info(struct recovery_info *rinfo)
> /* Return true if next recovery work is queued. */
> static inline bool run_next_rw(void)
> {
> - struct recovery_info *nrinfo = uatomic_xchg_ptr(&next_rinfo, NULL);
> + struct recovery_info *nrinfo = uatomic_read(&next_rinfo);
> struct recovery_info *cur = main_thread_get(current_rinfo);
>
> if (nrinfo == NULL)
> return false;
>
> + sd_debug("running threads nr %"PRIu32, cur->running_threads_nr);
> + if (cur->running_threads_nr > 1)
> + return true;
> +
> + nrinfo = uatomic_xchg_ptr(&next_rinfo, NULL);
> /*
> * When md recovery supersed the reweight or node recovery, we need to
> * notify completion.
> @@ -743,14 +753,14 @@ static inline bool oid_in_prio_oids(struct recovery_info *rinfo, uint64_t oid)
> /*
> * Schedule prio_oids to be recovered first in FIFO order
> *
> - * rw->done is index of the original next object to be recovered and also the
> - * number of objects already recovered.
> + * rw->next is index of the original next object to be recovered and also the
> + * number of objects already recovered and being recovered.
> * we just move rw->prio_oids in between:
> - * new_oids = [0..rw->done - 1] + [rw->prio_oids] + [rw->done]
> + * new_oids = [0..rw->next - 1] + [rw->prio_oids] + [rw->next]
> */
> static inline void finish_schedule_oids(struct recovery_info *rinfo)
> {
> - uint64_t i, nr_recovered = rinfo->done, new_idx;
> + uint64_t i, nr_recovered = rinfo->next, new_idx;
> uint64_t *new_oids;
>
> /* If I am the last oid, done */
> @@ -763,7 +773,7 @@ static inline void finish_schedule_oids(struct recovery_info *rinfo)
> rinfo->nr_prio_oids * sizeof(uint64_t));
> new_idx = nr_recovered + rinfo->nr_prio_oids;
>
> - for (i = rinfo->done; i < rinfo->count; i++) {
> + for (i = rinfo->next; i < rinfo->count; i++) {
> if (oid_in_prio_oids(rinfo, rinfo->oids[i]))
> continue;
> new_oids[new_idx++] = rinfo->oids[i];
> @@ -810,8 +820,14 @@ static void recover_next_object(struct recovery_info *rinfo)
> return;
> }
>
> + /* no more objects to be recovered */
> + if (rinfo->next >= rinfo->count)
> + return;
> +
> /* Try recover next object */
> queue_recovery_work(rinfo);
> + rinfo->next++;
> + rinfo->running_threads_nr++;
> }
>
> void resume_suspended_recovery(void)
> @@ -849,6 +865,14 @@ static void recover_object_main(struct work *work)
> }
>
> wakeup_requests_on_oid(row->oid);
> + /* ->oids[done, next] is out of order since finish order is random */
> + if (rinfo->oids[rinfo->done] != row->oid) {
> + uint64_t *p = xlfind(&row->oid, rinfo->oids + rinfo->done,
> + rinfo->next - rinfo->done, oid_cmp);
> +
> + *p = rinfo->oids[rinfo->done];
> + rinfo->oids[rinfo->done] = row->oid;
> + }
> rinfo->done++;
>
> sd_info("object %"PRIx64" is recovered (%"PRIu64"/%"PRIu64")", row->oid,
> @@ -861,6 +885,7 @@ static void recover_object_main(struct work *work)
>
> finish_recovery(rinfo);
> out:
> + rinfo->running_threads_nr--;
> free_recovery_obj_work(row);
> }
>
> @@ -872,6 +897,22 @@ static void finish_object_list(struct work *work)
> struct recovery_list_work,
> base);
> struct recovery_info *rinfo = main_thread_get(current_rinfo);
> + /*
> + * Rationale for multi-threaded recovery:
> + * 1. When one node is added, we find that all the VMs on the other
> + * nodes get noticeably affected until 50% of the data has been
> + * transferred to the new node.
> + * 2. For node failure, running VMs might not have problems, but
> + * speeding up the recovery process benefits VM IO, with fewer
> + * chances of writes being blocked, and also improves reliability.
> + * 3. Disk failure in a node is similar to adding a node. All the
> + * data on the broken disk will be recovered on other disks in this
> + * node. Speedy recovery not only improves data reliability but
> + * also causes less write blocking on the lost data.
> + *
> + * We choose md_nr_disks() * 2 threads for recovery, no rationale.
> + */
> + uint32_t nr_threads = md_nr_disks() * 2;
>
> rinfo->state = RW_RECOVER_OBJ;
> rinfo->count = rlw->count;
> @@ -887,7 +928,8 @@ static void finish_object_list(struct work *work)
> return;
> }
>
> - recover_next_object(rinfo);
> + for (uint32_t i = 0; i < nr_threads; i++)
> + recover_next_object(rinfo);
> return;
> }
>
> @@ -1076,7 +1118,7 @@ static void queue_recovery_work(struct recovery_info *rinfo)
> break;
> case RW_RECOVER_OBJ:
> row = xzalloc(sizeof(*row));
> - row->oid = rinfo->oids[rinfo->done];
> + row->oid = rinfo->oids[rinfo->next];
>
> rw = &row->base;
> rw->work.fn = recover_object_work;
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index 7ef3746..3164789 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -444,7 +444,7 @@ static int create_work_queues(void)
> sys->net_wqueue = create_work_queue("net", WQ_UNLIMITED);
> sys->gateway_wqueue = create_work_queue("gway", WQ_UNLIMITED);
> sys->io_wqueue = create_work_queue("io", WQ_UNLIMITED);
> - sys->recovery_wqueue = create_ordered_work_queue("rw");
> + sys->recovery_wqueue = create_work_queue("rw", WQ_UNLIMITED);
> sys->deletion_wqueue = create_ordered_work_queue("deletion");
> sys->block_wqueue = create_ordered_work_queue("block");
> sys->md_wqueue = create_ordered_work_queue("md");
> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
> index e8d688b..6140666 100644
> --- a/sheep/sheep_priv.h
> +++ b/sheep/sheep_priv.h
> @@ -484,6 +484,7 @@ uint32_t md_get_info(struct sd_md_info *info);
> int md_plug_disks(char *disks);
> int md_unplug_disks(char *disks);
> uint64_t md_get_size(uint64_t *used);
> +uint32_t md_nr_disks(void);
>
> /* http.c */
> #ifdef HAVE_HTTP
> --
> 1.8.1.2
>
> --
> sheepdog mailing list
> sheepdog at lists.wpkg.org
> http://lists.wpkg.org/mailman/listinfo/sheepdog