On 05/21/2012 12:11 PM, levin li wrote:
> From: levin li <xingke.lwp at taobao.com>
>
> Every node has the same sd_node order in its epoch, so in
> fill_obj_list() every node starts from the same node when requesting
> the object list, which may overload that node.
>
> Indeed, we hit this problem with 960 nodes in our cluster: during
> fill_obj_list(), some nodes get 'too many requests' in
> client_rx_handler(). So I changed fill_obj_list() to start from a
> random node to balance the load.
>
> Signed-off-by: levin li <xingke.lwp at taobao.com>
> ---
>  sheep/recovery.c |   12 +++++++++++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
>
> diff --git a/sheep/recovery.c b/sheep/recovery.c
> index e31f226..55bb122 100644
> --- a/sheep/recovery.c
> +++ b/sheep/recovery.c
> @@ -699,6 +699,8 @@ static int fill_obj_list(struct recovery_work *rw)
>  	int retry_cnt;
>  	struct sd_node *cur = rw->cur_nodes;
>  	int cur_nr = rw->cur_nr_nodes;
> +	int start = random() % cur_nr;
> +	int end = cur_nr;
>
>  	buf = malloc(buf_size);
>  	if (!buf) {
> @@ -706,7 +708,9 @@ static int fill_obj_list(struct recovery_work *rw)
>  		rw->retry = 1;
>  		return -1;
>  	}
> -	for (i = 0; i < cur_nr; i++) {
> +
> +again:
> +	for (i = start; i < end; i++) {
>  		int buf_nr;
>  		struct sd_node *node = cur + i;
>
> @@ -738,6 +742,12 @@ static int fill_obj_list(struct recovery_work *rw)
>  		rw->count = merge_objlist(rw->oids, rw->count, (uint64_t *)buf, buf_nr);
>  	}
>
> +	if (start != 0 && !next_rw) {
> +		end = start;
> +		start = 0;
> +		goto again;
> +	}
> +
>  	dprintf("%d\n", rw->count);
>  	free(buf);
>  	return 0;

Applied, thanks.

Yuan