[sheepdog] [PATCH v4 5/6] sheep: cache vnode_info when doing recovery
Robin Dong
robin.k.dong at gmail.com
Tue May 20 10:40:33 CEST 2014
From: Robin Dong <sanbai at taobao.com>
When sheepdog is doing recovery on some low-performance machines, CPU usage is
very high. After using perf tools to check the performance hot spots in the
sheep daemon, we found out that the "alloc_vnode_info()" function costs a lot
of CPU cycles because the rollback_vnode_info() rebuilds the vnode_info
by calling alloc_vnode_info() too frequently.
The solution is to cache the result of alloc_vnode_info() for a specific 'epoch'
and 'nr_nodes' pair in the recovery context.
Signed-off-by: Robin Dong <sanbai at taobao.com>
---
sheep/group.c | 12 ++++++++++++
sheep/recovery.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------
sheep/sheep_priv.h | 4 ++++
3 files changed, 64 insertions(+), 6 deletions(-)
diff --git a/sheep/group.c b/sheep/group.c
index 63e9ab9..360771d 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -175,6 +175,18 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
return alloc_vnode_info(&nroot);
}
+int get_nodes_epoch(uint32_t epoch, struct vnode_info *cur_vinfo,
+ struct sd_node *nodes, int len)
+{
+ int nr_nodes;
+
+ nr_nodes = epoch_log_read(epoch, nodes, len);
+ if (nr_nodes < 0)
+ nr_nodes = epoch_log_read_remote(epoch, nodes, len,
+ NULL, cur_vinfo);
+ return nr_nodes;
+}
+
int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
void *data)
{
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 27eb6c8..75e4b93 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -17,6 +17,7 @@ struct recovery_work {
uint32_t epoch;
uint32_t tgt_epoch;
+ struct recovery_info *rinfo;
struct vnode_info *old_vinfo;
struct vnode_info *cur_vinfo;
@@ -71,6 +72,10 @@ struct recovery_info {
struct vnode_info *old_vinfo;
struct vnode_info *cur_vinfo;
+
+ int max_epoch;
+ struct vnode_info **vinfo_array;
+ struct sd_mutex vinfo_lock;
};
static struct recovery_info *next_rinfo;
@@ -95,23 +100,44 @@ static inline bool node_is_gateway_only(void)
return sys->this_node.nr_vnodes == 0;
}
+static inline int vinfo_idx(uint32_t epoch, int nr_nodes)
+{
+ return epoch * SD_MAX_NODES + nr_nodes;
+}
+
static struct vnode_info *rollback_vnode_info(uint32_t *epoch,
+ struct recovery_info *rinfo,
struct vnode_info *cur)
{
- struct vnode_info *vinfo;
+ struct sd_node nodes[SD_MAX_NODES];
+ int nr_nodes, idx;
+ struct rb_root nroot = RB_ROOT;
+
rollback:
*epoch -= 1;
if (*epoch < last_gathered_epoch)
return NULL;
- vinfo = get_vnode_info_epoch(*epoch, cur);
- if (!vinfo) {
+ nr_nodes = get_nodes_epoch(*epoch, cur, nodes, sizeof(nodes));
+ if (!nr_nodes) {
/* We rollback in case we don't get a valid epoch */
sd_alert("cannot get epoch %d", *epoch);
sd_alert("clients may see old data");
goto rollback;
}
- return vinfo;
+ idx = vinfo_idx(*epoch, nr_nodes);
+ /* double check */
+ if (rinfo->vinfo_array[idx] == NULL) {
+ sd_mutex_lock(&rinfo->vinfo_lock);
+ if (rinfo->vinfo_array[idx] == NULL) {
+ for (int i = 0; i < nr_nodes; i++)
+ rb_insert(&nroot, &nodes[i], rb, node_cmp);
+ rinfo->vinfo_array[idx] = alloc_vnode_info(&nroot);
+ }
+ sd_mutex_unlock(&rinfo->vinfo_lock);
+ }
+ refcount_inc(&(rinfo->vinfo_array[idx]->refcnt));
+ return rinfo->vinfo_array[idx];
}
/*
@@ -202,7 +228,8 @@ again:
break;
default:
rollback:
- new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
+ new_old = rollback_vnode_info(&tgt_epoch, rw->rinfo,
+ rw->cur_vinfo);
if (!new_old) {
sd_err("can not read %"PRIx64" idx %d", oid, idx);
free(buf);
@@ -385,7 +412,8 @@ again:
/* fall through */
default:
/* No luck, roll back to an older configuration and try again */
- new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
+ new_old = rollback_vnode_info(&tgt_epoch, rw->rinfo,
+ rw->cur_vinfo);
if (!new_old) {
sd_err("can not recover oid %"PRIx64, oid);
ret = -1;
@@ -669,10 +697,19 @@ static void free_recovery_obj_work(struct recovery_obj_work *row)
static void free_recovery_info(struct recovery_info *rinfo)
{
+ int idx;
+
put_vnode_info(rinfo->cur_vinfo);
put_vnode_info(rinfo->old_vinfo);
free(rinfo->oids);
free(rinfo->prio_oids);
+ for (int i = 0; i < rinfo->max_epoch; i++)
+ for (int j = 0; j < SD_MAX_NODES; j++) {
+ idx = vinfo_idx(i, j);
+ put_vnode_info(rinfo->vinfo_array[idx]);
+ }
+ free(rinfo->vinfo_array);
+ sd_destroy_mutex(&rinfo->vinfo_lock);
free(rinfo);
}
@@ -1069,6 +1106,10 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
rinfo->tgt_epoch = epoch_lifted ? sys->cinfo.epoch - 1 :
sys->cinfo.epoch;
rinfo->count = 0;
+ rinfo->max_epoch = sys->cinfo.epoch;
+ rinfo->vinfo_array = xzalloc(sizeof(struct vnode_info *) *
+ rinfo->max_epoch * SD_MAX_NODES);
+ sd_init_mutex(&rinfo->vinfo_lock);
if (epoch_lifted)
rinfo->notify_complete = true; /* Reweight or node recovery */
else
@@ -1136,6 +1177,7 @@ static void queue_recovery_work(struct recovery_info *rinfo)
rw->epoch = rinfo->epoch;
rw->tgt_epoch = rinfo->tgt_epoch;
+ rw->rinfo = rinfo;
rw->cur_vinfo = grab_vnode_info(rinfo->cur_vinfo);
rw->old_vinfo = grab_vnode_info(rinfo->old_vinfo);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 693171c..f405b75 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -346,12 +346,16 @@ int get_vdi_attr(struct sheepdog_vdi_attr *vattr, int data_len, uint32_t vid,
int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
void *data);
+void reset_vinfo_array(void);
struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
struct vnode_info *get_vnode_info(void);
void put_vnode_info(struct vnode_info *vinfo);
struct vnode_info *alloc_vnode_info(const struct rb_root *);
struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
struct vnode_info *cur_vinfo);
+int get_nodes_epoch(uint32_t epoch, struct vnode_info *cur_vinfo,
+ struct sd_node *nodes, int len);
+
void wait_get_vdis_done(void);
int get_nr_copies(struct vnode_info *vnode_info);
--
1.7.12.4
More information about the sheepdog
mailing list