On 05/24/2012 11:37 AM, levin li wrote: > From: levin li <xingke.lwp at taobao.com> > > If requests comes with epoch newer than system epoch, then > we shouldn't just make it done with result SD_RES_NEW_NODE_VER, > if so, the sender would busy retrying this request, which may > casue CPU too busy to process other request. > > We push the requests with new epoch into a wait_epoch_queue to > make it wait for epoch consistency, after epoch changes we wake > up these requests in the queue, which avoids busy retrying. > > Signed-off-by: levin li <xingke.lwp at taobao.com> > --- > sheep/group.c | 1 + > sheep/recovery.c | 2 ++ > sheep/sdnet.c | 28 ++++++++++++++++++++++++---- > sheep/sheep_priv.h | 2 ++ > 4 files changed, 29 insertions(+), 4 deletions(-) > > diff --git a/sheep/group.c b/sheep/group.c > index e2632ac..8673b7a 100644 > --- a/sheep/group.c > +++ b/sheep/group.c > @@ -1381,6 +1381,7 @@ int create_cluster(int port, int64_t zone, int nr_vnodes) > > INIT_LIST_HEAD(&sys->request_queue); > INIT_LIST_HEAD(&sys->event_queue); > + INIT_LIST_HEAD(&sys->wait_rw_queue); > > ret = send_join_request(&sys->this_node); > if (ret != 0) > diff --git a/sheep/recovery.c b/sheep/recovery.c > index f341fc6..de4bc62 100644 > --- a/sheep/recovery.c > +++ b/sheep/recovery.c > @@ -692,5 +692,7 @@ int start_recovery(uint32_t epoch) > queue_work(sys->recovery_wqueue, &rw->work); > } > > + resume_wait_epoch_requests(); > + > return 0; > } > diff --git a/sheep/sdnet.c b/sheep/sdnet.c > index 5e9cb3b..cd3311e 100644 > --- a/sheep/sdnet.c > +++ b/sheep/sdnet.c > @@ -174,14 +174,23 @@ static int check_epoch(struct request *req) > int ret = SD_RES_SUCCESS; > > if (before(req_epoch, sys->epoch)) { > - ret = SD_RES_OLD_NODE_VER; > eprintf("old node version %u, %u, %x\n", > sys->epoch, req_epoch, opcode); > + /* make gateway to retry. */ better rewording as /* ask sender to retry */ > + req->rp.result = SD_RES_OLD_NODE_VER; > + req->rp.epoch = sys->epoch; > + req->work.done(&req->work); > + ret = req->rp.result; > } else if (after(req_epoch, sys->epoch)) { > - ret = SD_RES_NEW_NODE_VER; > eprintf("new node version %u, %u, %x\n", > sys->epoch, req_epoch, opcode); > + > + /* wait for epoch consistency. */ better rewording as /* put on local wait queue, waiting for local epoch to be lifted */ > + req->rp.result = SD_RES_NEW_NODE_VER; > + list_add_tail(&req->request_list, &sys->wait_rw_queue); > + ret = req->rp.result; > } > + > return ret; > } > Seems that unfold check_epoch() into check_request() will clarify the logic a bit. And also, recovery requests will also visit this code, your comments only consider gateway requests. Well, recovery code has its own retry mechanism. > @@ -201,9 +210,7 @@ static int check_request(struct request *req) > else { > int ret = check_epoch(req); > if (ret != SD_RES_SUCCESS) { > - req->rp.result = ret; > sys->nr_outstanding_io++; > - req->work.done(&req->work); > return -1; > } > } > @@ -251,6 +258,19 @@ void resume_pending_requests(void) > process_request_event_queues(); > } > > +void resume_wait_epoch_requests(void) > +{ > + struct request *req, *t; > + > + list_for_each_entry_safe(req, t, &sys->wait_rw_queue, > + request_list) { > + > + list_del(&req->request_list); > + list_add_tail(&req->request_list, &sys->request_queue); > + process_request_event_queues(); > + } > +} How about drag process_request_event_queues() out of the loop? > + > static void queue_request(struct request *req) > { > struct sd_req *hdr = &req->rq; > diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h > index 3d5a964..ba29a2a 100644 > --- a/sheep/sheep_priv.h > +++ b/sheep/sheep_priv.h > @@ -137,6 +137,7 @@ struct cluster_info { > > struct list_head request_queue; > struct list_head event_queue; > + struct list_head wait_rw_queue; > struct event_struct *cur_cevent; > int nr_outstanding_io; > int nr_outstanding_reqs; > @@ -267,6 +268,7 @@ int get_nr_copies(struct vnode_info *vnode_info); > int is_access_to_busy_objects(uint64_t oid); > > void resume_pending_requests(void); > +void resume_wait_epoch_requests(void); > > int create_cluster(int port, int64_t zone, int nr_vnodes); > int leave_cluster(void); |