From: levin li <xingke.lwp at taobao.com>

Every node has the same sd_node order in its epoch, so in fill_obj_list()
every node starts requesting the object list from the same node, which may
overload that node.  We actually hit this problem with 960 nodes in our
cluster: during the fill_obj_list() phase, some nodes got 'too many requests'
in client_rx_handler().  So I changed fill_obj_list() to start from a random
node to balance the load.

Signed-off-by: levin li <xingke.lwp at taobao.com>
---
 sheep/recovery.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index e31f226..55bb122 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -699,6 +699,8 @@ static int fill_obj_list(struct recovery_work *rw)
 	int retry_cnt;
 	struct sd_node *cur = rw->cur_nodes;
 	int cur_nr = rw->cur_nr_nodes;
+	int start = random() % cur_nr;
+	int end = cur_nr;
 
 	buf = malloc(buf_size);
 	if (!buf) {
@@ -706,7 +708,9 @@
 		rw->retry = 1;
 		return -1;
 	}
-	for (i = 0; i < cur_nr; i++) {
+
+again:
+	for (i = start; i < end; i++) {
 		int buf_nr;
 		struct sd_node *node = cur + i;
 
@@ -738,6 +742,12 @@
 		rw->count = merge_objlist(rw->oids, rw->count,
 					  (uint64_t *)buf, buf_nr);
 	}
+	if (start != 0 && !next_rw) {
+		end = start;
+		start = 0;
+		goto again;
+	}
+
 	dprintf("%d\n", rw->count);
 	free(buf);
 	return 0;
-- 
1.7.10
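
For context, below is a minimal, self-contained sketch of the random-start,
wrap-around iteration pattern the patch introduces: start at a random index,
walk to the end of the node array, then wrap around to cover the skipped
head.  This is not the sheepdog code itself; visit_node() and NR_NODES are
made-up placeholders standing in for the per-node request and cur_nr.

/*
 * Sketch only: each node picks a different starting index, so requests
 * are spread across the cluster instead of all hitting node 0 first.
 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define NR_NODES 8			/* stands in for cur_nr */

static void visit_node(int idx)
{
	printf("requesting object list from node %d\n", idx);
}

int main(void)
{
	int start, end, i;

	srandom((unsigned int)time(NULL));
	start = random() % NR_NODES;	/* random starting node */
	end = NR_NODES;
again:
	for (i = start; i < end; i++)
		visit_node(i);
	if (start != 0) {		/* wrap around: now cover [0, start) */
		end = start;
		start = 0;
		goto again;
	}
	return 0;
}

Each node is still visited exactly once; only the order changes, which is
what spreads the load during recovery.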