[sheepdog] [PATCH 4/8] make IO request to wait when recovery is in RW_INIT

Tue May 22 12:42:20 CEST 2012

On 05/22/2012 04:27 PM, Yibin Shen wrote:
> On Tue, May 22, 2012 at 10:51 AM, levin li<levin108 at gmail.com>  wrote:
>> From: levin li<xingke.lwp at taobao.com>
>>
>> When recovery is in the RW_INIT state, an IO request is
>> marked as recovering, and the sender then busy-retries.
>> We should make the request wait until recovery enters the RW_RUN
>> state, so we can determine whether the requested object is in
>> recovery, instead of just returning SD_RES_NEW_NODE_VER.
>>
>> Signed-off-by: levin li<xingke.lwp at taobao.com>
>> ---
>>   sheep/group.c      |    1 +
>>   sheep/recovery.c   |   25 +++++++++++++++++++++++--
>>   sheep/sdnet.c      |   11 ++++++++---
>>   sheep/sheep_priv.h |    2 ++
>>   4 files changed, 34 insertions(+), 5 deletions(-)
>>
>> diff --git a/sheep/group.c b/sheep/group.c
>> index 3266c38..63742a2 100644
>> --- a/sheep/group.c
>> +++ b/sheep/group.c
>> @@ -1379,6 +1379,7 @@ int create_cluster(int port, int64_t zone, int nr_vnodes)
>>         INIT_LIST_HEAD(&sys->request_queue);
>>         INIT_LIST_HEAD(&sys->event_queue);
>>         INIT_LIST_HEAD(&sys->wait_epoch_queue);
>> +       INIT_LIST_HEAD(&sys->wait_rw_queue);
>>
>>         ret = send_join_request(&sys->this_node);
>>         if (ret != 0)
>> diff --git a/sheep/recovery.c b/sheep/recovery.c
>> index 3b5caa3..0d05661 100644
>> --- a/sheep/recovery.c
>> +++ b/sheep/recovery.c
>> @@ -526,6 +526,13 @@ int node_in_recovery(void)
>>         return !!recovering_work;
>>   }
>>
>> +int is_recovery_init(void)
>> +{
>> +       struct recovery_work *rw = recovering_work;
>> +
>> +       return rw->state == RW_INIT;
>> +}
>> +
>>   int is_recovering_oid(uint64_t oid)
>>   {
>>         struct recovery_work *rw = recovering_work;
>> @@ -573,6 +580,19 @@ int is_recovering_oid(uint64_t oid)
>>         return 0;
>>   }
>>
>> +static void resume_wait_recovery_queue(void)
>> +{
>> +       struct request *req, *t;
>> +
>> +       list_for_each_entry_safe(req, t,&sys->wait_rw_queue,
>> +                                request_list) {
>> +               dprintf("resume wait oid %" PRIx64 "\n", req->local_oid);
>> +               list_del(&req->request_list);
>> +               list_add_tail(&req->request_list,&sys->request_queue);
>> +               process_request_event_queues();
>> +       }
>> +}
>> +
>>   static void do_recover_main(struct work *work)
>>   {
>>         struct recovery_work *rw = container_of(work, struct recovery_work, work);
>> @@ -582,9 +602,10 @@ static void do_recover_main(struct work *work)
>>
>>   again:
>>         if (rw->prior_count == 0) {
>> -               if (rw->state == RW_INIT)
>> +               if (rw->state == RW_INIT) {
>>                         rw->state = RW_RUN;
>> -               else if (!rw->retry)
>> +                       resume_wait_recovery_queue();
>> +               } else if (!rw->retry)
>>                         rw->done++;
>>         }
>>
>> diff --git a/sheep/sdnet.c b/sheep/sdnet.c
>> index e1334c8..565625e 100644
>> --- a/sheep/sdnet.c
>> +++ b/sheep/sdnet.c
>> @@ -227,9 +227,14 @@ static int check_request(struct request *req)
>>         if (is_recovering_oid(req->local_oid)) {
>>                 if (req->rq.flags&  SD_FLAG_CMD_IO_LOCAL) {
>>                         /* Sheep peer request */
>> -                       req->rp.result = SD_RES_NEW_NODE_VER;
>> -                       sys->nr_outstanding_io++;
>> -                       req->work.done(&req->work);
>> +                       if (is_recovery_init())
>> +                               list_add_tail(&req->request_list,
>> +&sys->wait_rw_queue);
> We know do_recovery_work() contains the fill_obj_list() operation.
> In a large-scale cluster,
> this function is really time-consuming. It does not seem like a good idea to
> block all I/O requests in the RW_INIT phase.

Yes, it takes a long time for a recovery work to get from RW_INIT into
RW_RUN in a large cluster. It is indeed a problem; we should find a better
way to solve it.

thanks,

levin

>> +                       else
>> +                               req->rp.result = SD_RES_NEW_NODE_VER;
>> +                               sys->nr_outstanding_io++;
>> +                               req->work.done(&req->work);
>> +                       }
>>                 } else {
>>                         /* Gateway request */
>>                         list_add_tail(&req->request_list,&sys->req_wait_for_obj_list);
>> diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
>> index 00c299a..5e804c4 100644
>> --- a/sheep/sheep_priv.h
>> +++ b/sheep/sheep_priv.h
>> @@ -137,6 +137,7 @@ struct cluster_info {
>>         struct list_head request_queue;
>>         struct list_head event_queue;
>>         struct list_head wait_epoch_queue;
>> +       struct list_head wait_rw_queue;
>>         struct event_struct *cur_cevent;
>>         int nr_outstanding_io;
>>         int nr_outstanding_reqs;
>> @@ -300,6 +301,7 @@ int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *);
>>   int start_recovery(uint32_t epoch);
>>   void resume_recovery_work(void);
>>   int is_recovering_oid(uint64_t oid);
>> +int is_recovery_init(void);
>>   int node_in_recovery(void);
>>
>>   int write_object(struct vnode_info *vnodes, uint32_t node_version,
>> --
>> 1.7.10
>>
>> --
>> sheepdog mailing list
>> sheepdog at lists.wpkg.org
>> http://lists.wpkg.org/mailman/listinfo/sheepdog
>
> ________________________________
>
> This email (including any attachments) is confidential and may be legally privileged. If you received this email in error, please delete it immediately and do not copy it or use it for any purpose or disclose its contents to any other person. Thank you.
>
> 本电邮(包括任何附件)可能含有机密资料并受法律保护。如您不是正确的收件人，请您立即删除本邮件。请不要将本电邮进行复制并用作任何其他用途、或透露本邮件之内容。谢谢。