[sheepdog] [PATCH 1/8] use rbtree to replace the priority scheduler in recovery

levin li levin108 at gmail.com
Tue May 22 04:51:01 CEST 2012


From: levin li <xingke.lwp at taobao.com>

When an IO request arrives for an object that is still being
recovered, we insert that object into a priority rbtree. During
recovery, sheep recovers the objects in this rbtree first, which
is more efficient than the previous scheduler in is_recovering_oid().
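
The idea in a nutshell, as a stand-alone sketch (not part of the patch:
names are illustrative and a plain unbalanced binary search tree stands
in for the rbtree used below). An incoming request moves the object id
from the pending array into an ordered "prior" tree and zeroes its array
slot; the recovery loop then drains the tree before walking the rest of
the array:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct prior_node {
	uint64_t oid;
	struct prior_node *left, *right;
};

/* insert oid unless it is already present */
static void prior_insert(struct prior_node **root, uint64_t oid)
{
	while (*root) {
		if (oid < (*root)->oid)
			root = &(*root)->left;
		else if (oid > (*root)->oid)
			root = &(*root)->right;
		else
			return;
	}
	*root = calloc(1, sizeof(**root));
	if (*root)
		(*root)->oid = oid;
}

/* remove and return the smallest oid, or 0 when the tree is empty */
static uint64_t prior_pop_min(struct prior_node **root)
{
	struct prior_node *victim;
	uint64_t oid;

	if (!*root)
		return 0;
	while ((*root)->left)
		root = &(*root)->left;
	victim = *root;
	oid = victim->oid;
	*root = victim->right;
	free(victim);
	return oid;
}

int main(void)
{
	uint64_t oids[] = { 10, 20, 30, 40 };	/* pending recovery list */
	struct prior_node *prior = NULL;
	size_t done = 0, count = 4;
	uint64_t oid;

	/* an IO request hits oid 30: move it into the prior tree */
	prior_insert(&prior, 30);
	oids[2] = 0;				/* mark its array slot as handled */

	/* recovery loop: drain the prior tree before the plain array */
	while ((oid = prior_pop_min(&prior)) || done < count) {
		if (!oid) {
			oid = oids[done++];
			if (!oid)
				continue;	/* already handled via the tree */
		}
		printf("recovering %" PRIx64 "\n", oid);
	}
	return 0;
}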

Signed-off-by: levin li <xingke.lwp at taobao.com>
---
 sheep/recovery.c   |  149 +++++++++++++++++++++++++++++++++++++++-------------
 sheep/sdnet.c      |    2 +-
 sheep/sheep_priv.h |    2 +-
 3 files changed, 114 insertions(+), 39 deletions(-)

diff --git a/sheep/recovery.c b/sheep/recovery.c
index 55bb122..8c083c7 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -37,6 +37,11 @@ struct recovery_work {
 	int count;
 	uint64_t *oids;
 
+	uint64_t oid_to_recovery;
+
+	struct rb_root prior_tree;
+	int prior_count;
+
 	int old_nr_nodes;
 	struct sd_node old_nodes[SD_MAX_NODES];
 	int cur_nr_nodes;
@@ -47,9 +52,61 @@ struct recovery_work {
 	struct sd_vnode cur_vnodes[SD_MAX_VNODES];
 };
 
+
+struct object_rb_entry {
+	uint64_t oid;
+	struct rb_node node;
+};
+
 static struct recovery_work *next_rw;
 static struct recovery_work *recovering_work;
 
+static struct object_rb_entry *prior_rb_tree_insert(struct rb_root *root,
+			struct object_rb_entry *new)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct object_rb_entry *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct object_rb_entry, node);
+
+		if (new->oid < entry->oid)
+			p = &(*p)->rb_left;
+		else if (new->oid > entry->oid)
+			p = &(*p)->rb_right;
+		else
+			return entry; /* already has this entry */
+	}
+	rb_link_node(&new->node, parent, p);
+	rb_insert_color(&new->node, root);
+
+	return NULL; /* inserted successfully */
+}
+
+static int prior_rb_tree_search(struct rb_root *root,
+				uint64_t oid)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct object_rb_entry *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct object_rb_entry, node);
+
+		if (oid < entry->oid)
+			p = &(*p)->rb_left;
+		else if (oid > entry->oid)
+			p = &(*p)->rb_right;
+		else
+			return 1; /* the object is already in the tree */
+	}
+
+	return 0;
+}
+
 static int obj_cmp(const void *oid1, const void *oid2)
 {
 	const uint64_t hval1 = fnv_64a_buf((void *)oid1, sizeof(uint64_t), FNV1A_64_INIT);
@@ -310,7 +367,7 @@ static void rollback_old_cur(struct sd_vnode *old, int *old_nr, int *old_copies,
 static int do_recover_object(struct recovery_work *rw, int copy_idx)
 {
 	struct sd_vnode *old, *cur;
-	uint64_t oid = rw->oids[rw->done];
+	uint64_t oid = rw->oid_to_recovery;
 	int old_nr = rw->old_nr_vnodes, cur_nr = rw->cur_nr_vnodes;
 	uint32_t epoch = rw->epoch, tgt_epoch = rw->epoch - 1;
 	struct sd_vnode *tgt_entry;
@@ -469,10 +526,8 @@ int node_in_recovery(void)
 	return !!recovering_work;
 }
 
-int is_recoverying_oid(uint64_t oid)
+int is_recovering_oid(uint64_t oid)
 {
-	uint64_t hval = fnv_64a_buf(&oid, sizeof(uint64_t), FNV1A_64_INIT);
-	uint64_t min_hval;
 	struct recovery_work *rw = recovering_work;
 	int i;
 
@@ -482,8 +537,6 @@ int is_recoverying_oid(uint64_t oid)
 	if (!rw)
 		return 0; /* there is no thread working for object recovery */
 
-	min_hval = fnv_64a_buf(&rw->oids[rw->done + rw->nr_blocking], sizeof(uint64_t), FNV1A_64_INIT);
-
 	if (before(rw->epoch, sys->epoch))
 		return 1;
 
@@ -495,31 +548,27 @@ int is_recoverying_oid(uint64_t oid)
 		return 0;
 	}
 
-	/* the first 'rw->nr_blocking' objects were already scheduled to be done earlier */
-	for (i = 0; i < rw->nr_blocking; i++)
-		if (rw->oids[rw->done + i] == oid)
-			return 1;
-
-	if (min_hval <= hval) {
-		uint64_t *p;
-		p = bsearch(&oid, rw->oids + rw->done + rw->nr_blocking,
-			    rw->count - rw->done - rw->nr_blocking, sizeof(oid), obj_cmp);
-		if (p) {
-			dprintf("recover the object %" PRIx64 " first\n", oid);
-			if (rw->nr_blocking == 0)
-				rw->nr_blocking = 1; /* the first oid may be processed now */
-			if (p > rw->oids + rw->done + rw->nr_blocking) {
-				/* this object should be recovered earlier */
-				memmove(rw->oids + rw->done + rw->nr_blocking + 1,
-					rw->oids + rw->done + rw->nr_blocking,
-					sizeof(uint64_t) * (p - (rw->oids + rw->done + rw->nr_blocking)));
-				rw->oids[rw->done + rw->nr_blocking] = oid;
-				rw->nr_blocking++;
-			}
-			return 1;
+	if (prior_rb_tree_search(&rw->prior_tree, oid)) {
+		dprintf("object %" PRIx64 " is already in the prior tree\n", oid);
+		return 1;
+	}
+
+	for (i = rw->done; i < rw->count; i++) {
+		if (rw->oids[i] == oid) {
+			struct object_rb_entry *entry;
+			entry = zalloc(sizeof(*entry));
+			entry->oid = oid;
+			rb_init_node(&entry->node);
+			prior_rb_tree_insert(&rw->prior_tree, entry);
+			rw->prior_count++;
+			rw->oids[i] = 0;
+			break;
 		}
 	}
 
+	if (i < rw->count)
+		return 1;
+
 	dprintf("the object %" PRIx64 " is not found\n", oid);
 	return 0;
 }
@@ -527,17 +576,35 @@ int is_recoverying_oid(uint64_t oid)
 static void do_recover_main(struct work *work)
 {
 	struct recovery_work *rw = container_of(work, struct recovery_work, work);
-	uint64_t oid;
+	struct object_rb_entry *entry;
+	struct rb_node *node;
+	uint64_t oid = 0;
 
-	if (rw->state == RW_INIT)
-		rw->state = RW_RUN;
-	else if (!rw->retry) {
-		rw->done++;
-		if (rw->nr_blocking > 0)
-			rw->nr_blocking--;
+again:
+	if (rw->prior_count == 0) {
+		if (rw->state == RW_INIT)
+			rw->state = RW_RUN;
+		else if (!rw->retry)
+			rw->done++;
+	}
+
+	if (rw->prior_count > 0) {
+		node = rb_first(&rw->prior_tree);
+		entry = rb_entry(node, struct object_rb_entry, node);
+		oid = entry->oid;
+		rb_erase(node, &rw->prior_tree);
+		free(entry);
+	} else if (rw->done < rw->count) {
+		oid = rw->oids[rw->done];
+		if (!oid) {
+			dprintf("oid is zero\n");
+			/* the object was moved to the prior tree and has
+			 * already been recovered there, move to the next. */
+			goto again;
+		}
 	}
 
-	oid = rw->oids[rw->done];
+	rw->oid_to_recovery = oid;
 
 	if (rw->retry && !next_rw) {
 		rw->retry = 0;
@@ -548,7 +615,12 @@ static void do_recover_main(struct work *work)
 		return;
 	}
 
-	if (rw->done < rw->count && !next_rw) {
+	if ((rw->done < rw->count || rw->prior_count > 0) &&
+		!next_rw) {
+
+		if (rw->prior_count)
+			rw->prior_count--;
+
 		rw->work.fn = recover_object;
 
 		if (is_access_to_busy_objects(oid)) {
@@ -811,6 +883,9 @@ int start_recovery(uint32_t epoch)
 	rw->epoch = epoch;
 	rw->count = 0;
 
+	rw->prior_tree = RB_ROOT;
+	rw->prior_count = 0;
+
 	rw->work.fn = do_recovery_work;
 	rw->work.done = do_recover_main;
 
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index a13b3e3..c578e54 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -211,7 +211,7 @@ static int check_request(struct request *req)
 	if (!req->local_oid)
 		return 0;
 
-	if (is_recoverying_oid(req->local_oid)) {
+	if (is_recovering_oid(req->local_oid)) {
 		if (req->rq.flags & SD_FLAG_CMD_IO_LOCAL) {
 			/* Sheep peer request */
 			req->rp.result = SD_RES_NEW_NODE_VER;
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 21ee282..026e260 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -297,7 +297,7 @@ int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *);
 
 int start_recovery(uint32_t epoch);
 void resume_recovery_work(void);
-int is_recoverying_oid(uint64_t oid);
+int is_recovering_oid(uint64_t oid);
 int node_in_recovery(void);
 
 int write_object(struct vnode_info *vnodes, uint32_t node_version,
-- 
1.7.10