[sheepdog] [PATCH v4 5/6] sheep: cache vnode_info when doing recovery
Robin Dong
robin.k.dong at gmail.com
Tue May 20 10:40:33 CEST 2014
From: Robin Dong <sanbai at taobao.com>
When sheepdog is doing recovery on some low-performance machines, CPU usage is
very high. After using perf tools to check the performance hot spots in the
sheep daemon, we found out that the "alloc_vnode_info()" function costs a lot
of CPU cycles because the rollback_vnode_info() rebuilds the vnode_info
by calling alloc_vnode_info() too frequently.
The solution is to cache the result of alloc_vnode_info() for a specific 'epoch'
and 'nr_nodes' pair in the recovery context.
Signed-off-by: Robin Dong <sanbai at taobao.com>
---
sheep/group.c | 12 ++++++++++++
sheep/recovery.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------
sheep/sheep_priv.h | 4 ++++
3 files changed, 64 insertions(+), 6 deletions(-)
diff --git a/sheep/group.c b/sheep/group.c
index 63e9ab9..360771d 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -175,6 +175,18 @@ struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
return alloc_vnode_info(&nroot);
}
+int get_nodes_epoch(uint32_t epoch, struct vnode_info *cur_vinfo,
+ struct sd_node *nodes, int len)
+{
+ int nr_nodes;
+
+ nr_nodes = epoch_log_read(epoch, nodes, len);
+ if (nr_nodes < 0)
+ nr_nodes = epoch_log_read_remote(epoch, nodes, len,
+ NULL, cur_vinfo);
+ return nr_nodes;
+}
+
int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
void *data)
{
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 27eb6c8..75e4b93 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -17,6 +17,7 @@ struct recovery_work {
uint32_t epoch;
uint32_t tgt_epoch;
+ struct recovery_info *rinfo;
struct vnode_info *old_vinfo;
struct vnode_info *cur_vinfo;
@@ -71,6 +72,10 @@ struct recovery_info {
struct vnode_info *old_vinfo;
struct vnode_info *cur_vinfo;
+
+ int max_epoch;
+ struct vnode_info **vinfo_array;
+ struct sd_mutex vinfo_lock;
};
static struct recovery_info *next_rinfo;
@@ -95,23 +100,44 @@ static inline bool node_is_gateway_only(void)
return sys->this_node.nr_vnodes == 0;
}
+static inline int vinfo_idx(uint32_t epoch, int nr_nodes)
+{
+ return epoch * SD_MAX_NODES + nr_nodes;
+}
+
static struct vnode_info *rollback_vnode_info(uint32_t *epoch,
+ struct recovery_info *rinfo,
struct vnode_info *cur)
{
- struct vnode_info *vinfo;
+ struct sd_node nodes[SD_MAX_NODES];
+ int nr_nodes, idx;
+ struct rb_root nroot = RB_ROOT;
+
rollback:
*epoch -= 1;
if (*epoch < last_gathered_epoch)
return NULL;
- vinfo = get_vnode_info_epoch(*epoch, cur);
- if (!vinfo) {
+ nr_nodes = get_nodes_epoch(*epoch, cur, nodes, sizeof(nodes));
+ if (!nr_nodes) {
/* We rollback in case we don't get a valid epoch */
sd_alert("cannot get epoch %d", *epoch);
sd_alert("clients may see old data");
goto rollback;
}
- return vinfo;
+ idx = vinfo_idx(*epoch, nr_nodes);
+ /* double check */
+ if (rinfo->vinfo_array[idx] == NULL) {
+ sd_mutex_lock(&rinfo->vinfo_lock);
+ if (rinfo->vinfo_array[idx] == NULL) {
+ for (int i = 0; i < nr_nodes; i++)
+ rb_insert(&nroot, &nodes[i], rb, node_cmp);
+ rinfo->vinfo_array[idx] = alloc_vnode_info(&nroot);
+ }
+ sd_mutex_unlock(&rinfo->vinfo_lock);
+ }
+ refcount_inc(&(rinfo->vinfo_array[idx]->refcnt));
+ return rinfo->vinfo_array[idx];
}
/*
@@ -202,7 +228,8 @@ again:
break;
default:
rollback:
- new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
+ new_old = rollback_vnode_info(&tgt_epoch, rw->rinfo,
+ rw->cur_vinfo);
if (!new_old) {
sd_err("can not read %"PRIx64" idx %d", oid, idx);
free(buf);
@@ -385,7 +412,8 @@ again:
/* fall through */
default:
/* No luck, roll back to an older configuration and try again */
- new_old = rollback_vnode_info(&tgt_epoch, rw->cur_vinfo);
+ new_old = rollback_vnode_info(&tgt_epoch, rw->rinfo,
+ rw->cur_vinfo);
if (!new_old) {
sd_err("can not recover oid %"PRIx64, oid);
ret = -1;
@@ -669,10 +697,19 @@ static void free_recovery_obj_work(struct recovery_obj_work *row)
static void free_recovery_info(struct recovery_info *rinfo)
{
+ int idx;
+
put_vnode_info(rinfo->cur_vinfo);
put_vnode_info(rinfo->old_vinfo);
free(rinfo->oids);
free(rinfo->prio_oids);
+ for (int i = 0; i < rinfo->max_epoch; i++)
+ for (int j = 0; j < SD_MAX_NODES; j++) {
+ idx = vinfo_idx(i, j);
+ put_vnode_info(rinfo->vinfo_array[idx]);
+ }
+ free(rinfo->vinfo_array);
+ sd_destroy_mutex(&rinfo->vinfo_lock);
free(rinfo);
}
@@ -1069,6 +1106,10 @@ int start_recovery(struct vnode_info *cur_vinfo, struct vnode_info *old_vinfo,
rinfo->tgt_epoch = epoch_lifted ? sys->cinfo.epoch - 1 :
sys->cinfo.epoch;
rinfo->count = 0;
+ rinfo->max_epoch = sys->cinfo.epoch;
+ rinfo->vinfo_array = xzalloc(sizeof(struct vnode_info *) *
+ rinfo->max_epoch * SD_MAX_NODES);
+ sd_init_mutex(&rinfo->vinfo_lock);
if (epoch_lifted)
rinfo->notify_complete = true; /* Reweight or node recovery */
else
@@ -1136,6 +1177,7 @@ static void queue_recovery_work(struct recovery_info *rinfo)
rw->epoch = rinfo->epoch;
rw->tgt_epoch = rinfo->tgt_epoch;
+ rw->rinfo = rinfo;
rw->cur_vinfo = grab_vnode_info(rinfo->cur_vinfo);
rw->old_vinfo = grab_vnode_info(rinfo->old_vinfo);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 693171c..f405b75 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -346,12 +346,16 @@ int get_vdi_attr(struct sheepdog_vdi_attr *vattr, int data_len, uint32_t vid,
int local_get_node_list(const struct sd_req *req, struct sd_rsp *rsp,
void *data);
+void reset_vinfo_array(void);
struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
struct vnode_info *get_vnode_info(void);
void put_vnode_info(struct vnode_info *vinfo);
struct vnode_info *alloc_vnode_info(const struct rb_root *);
struct vnode_info *get_vnode_info_epoch(uint32_t epoch,
struct vnode_info *cur_vinfo);
+int get_nodes_epoch(uint32_t epoch, struct vnode_info *cur_vinfo,
+ struct sd_node *nodes, int len);
+
void wait_get_vdis_done(void);
int get_nr_copies(struct vnode_info *vnode_info);
--
1.7.12.4
More information about the sheepdog
mailing list