[sheepdog] [PATCH v2 3/5] sheep: use rbtree to manage vnodes hash ring
Liu Yuan
namei.unix at gmail.com
Tue Sep 10 04:53:50 CEST 2013
Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
dog/dog.c | 6 +-
dog/dog.h | 4 +-
dog/vdi.c | 20 ++---
include/internal_proto.h | 1 -
include/sheep.h | 199 +++++++++++++++-------------------------------
sheep/gateway.c | 8 +-
sheep/group.c | 12 ++-
sheep/plain_store.c | 3 +-
sheep/recovery.c | 7 +-
sheep/request.c | 6 +-
10 files changed, 98 insertions(+), 168 deletions(-)
diff --git a/dog/dog.c b/dog/dog.c
index 9ec41c1..0009c92 100644
--- a/dog/dog.c
+++ b/dog/dog.c
@@ -49,8 +49,8 @@ static void usage(const struct command *commands, int status);
uint32_t sd_epoch;
struct sd_node sd_nodes[SD_MAX_NODES];
-struct sd_vnode sd_vnodes[SD_MAX_VNODES];
-int sd_nodes_nr, sd_vnodes_nr;
+int sd_nodes_nr;
+struct rb_root sd_vroot = RB_ROOT;
int update_node_list(int max_nodes)
{
@@ -92,7 +92,7 @@ int update_node_list(int max_nodes)
}
memcpy(sd_nodes, buf, size);
- sd_vnodes_nr = nodes_to_vnodes(sd_nodes, sd_nodes_nr, sd_vnodes);
+ nodes_to_vnodes(sd_nodes, sd_nodes_nr, &sd_vroot);
sd_epoch = hdr.epoch;
out:
if (buf)
diff --git a/dog/dog.h b/dog/dog.h
index e1c499e..9b17eb7 100644
--- a/dog/dog.h
+++ b/dog/dog.h
@@ -56,8 +56,8 @@ extern bool verbose;
extern uint32_t sd_epoch;
extern struct sd_node sd_nodes[SD_MAX_NODES];
-extern struct sd_vnode sd_vnodes[SD_MAX_VNODES];
-extern int sd_nodes_nr, sd_vnodes_nr;
+extern struct rb_root sd_vroot;
+extern int sd_nodes_nr;
bool is_current(const struct sd_inode *i);
char *strnumber(uint64_t _size);
diff --git a/dog/vdi.c b/dog/vdi.c
index 82821ac..c99d0ae 100644
--- a/dog/vdi.c
+++ b/dog/vdi.c
@@ -904,14 +904,12 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
int i, j, ret;
struct sd_req hdr;
struct sd_rsp *rsp = (struct sd_rsp *)&hdr;
- struct sd_vnode *vnodes;
const struct sd_vnode *vnode_buf[SD_MAX_COPIES];
struct epoch_log *logs;
- int vnodes_nr, nr_logs, log_length;
+ int nr_logs, log_length;
log_length = sd_epoch * sizeof(struct epoch_log);
logs = xmalloc(log_length);
- vnodes = xmalloc(sizeof(*vnodes) * SD_MAX_VNODES);
sd_init_req(&hdr, SD_OP_STAT_CLUSTER);
hdr.data_length = log_length;
@@ -927,6 +925,9 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
nr_logs = rsp->data_length / sizeof(struct epoch_log);
for (i = nr_logs - 1; i >= 0; i--) {
+ struct rb_root vroot;
+ struct sd_vnode *v;
+
printf("\nobj %"PRIx64" locations at epoch %d, copies = %d\n",
oid, logs[i].epoch, nr_copies);
printf("---------------------------------------------------\n");
@@ -943,22 +944,23 @@ static int do_track_object(uint64_t oid, uint8_t nr_copies)
}
continue;
}
- vnodes_nr = nodes_to_vnodes(logs[i].nodes,
- logs[i].nr_nodes, vnodes);
- oid_to_vnodes(vnodes, vnodes_nr, oid, nr_copies, vnode_buf);
+ nodes_to_vnodes(logs[i].nodes, logs[i].nr_nodes, &vroot);
+ oid_to_vnodes(oid, &vroot, nr_copies, vnode_buf);
for (j = 0; j < nr_copies; j++) {
const struct node_id *n = &vnode_buf[j]->node->nid;
printf("%s\n", addr_to_str(n->addr, n->port));
}
+ rb_for_each_entry(v, &vroot, rb) {
+ rb_erase(&v->rb, &vroot);
+ free(v);
+ }
}
free(logs);
- free(vnodes);
return EXIT_SUCCESS;
error:
free(logs);
- free(vnodes);
return EXIT_SYSFAIL;
}
@@ -1541,7 +1543,7 @@ static void queue_vdi_check_work(const struct sd_inode *inode, uint64_t oid,
info->done = done;
info->wq = wq;
- oid_to_vnodes(sd_vnodes, sd_vnodes_nr, oid, nr_copies, tgt_vnodes);
+ oid_to_vnodes(oid, &sd_vroot, nr_copies, tgt_vnodes);
for (int i = 0; i < nr_copies; i++) {
info->vcw[i].info = info;
info->vcw[i].vnode = tgt_vnodes[i];
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 8544c75..7d1df2c 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -29,7 +29,6 @@
#define SD_MAX_NODES 1024
#define SD_DEFAULT_VNODES 128
-#define SD_MAX_VNODES (SD_MAX_NODES * SD_DEFAULT_VNODES)
/*
* Operations with opcodes above 0x80 are considered part of the inter-sheep
diff --git a/include/sheep.h b/include/sheep.h
index bbb7721..6c4b601 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -1,5 +1,6 @@
/*
* Copyright (C) 2009-2011 Nippon Telegraph and Telephone Corporation.
+ * Copyright (C) 2012-2013 Taobao Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License version
@@ -17,19 +18,18 @@
#include "bitops.h"
#include "list.h"
#include "net.h"
+#include "rbtree.h"
struct sd_vnode {
+ struct rb_node rb;
const struct sd_node *node;
- uint64_t id;
+ uint64_t hash;
};
struct vnode_info {
- struct sd_vnode vnodes[SD_MAX_VNODES];
- int nr_vnodes;
-
+ struct rb_root vroot;
struct sd_node nodes[SD_MAX_NODES];
int nr_nodes;
-
int nr_zones;
refcnt_t refcnt;
};
@@ -56,133 +56,77 @@ static inline void sd_init_req(struct sd_req *req, uint8_t opcode)
req->proto_ver = opcode < 0x80 ? SD_PROTO_VER : SD_SHEEP_PROTO_VER;
}
-static inline int same_zone(const struct sd_vnode *e, int n1, int n2)
+static inline int same_zone(struct sd_vnode *v1, struct sd_vnode *v2)
{
- return e[n1].node->zone == e[n2].node->zone;
+ return v1->node->zone == v2->node->zone;
}
-/* Get the first vnode's index which is matching the OID */
-static inline int get_vnode_first_idx(const struct sd_vnode *entries,
- int nr_entries, uint64_t oid)
+static inline int vnode_cmp(const struct sd_vnode *node1,
+ const struct sd_vnode *node2)
{
- uint64_t id = sd_hash_oid(oid);
- int start, end, pos;
-
- assert(nr_entries > 0);
-
- start = 0;
- end = nr_entries - 1;
-
- if (id > entries[end].id || id < entries[start].id)
- return (end + 1) % nr_entries;
-
- for (;;) {
- pos = (end - start) / 2 + start;
- if (entries[pos].id < id) {
- if (entries[pos + 1].id >= id)
- return (pos + 1) % nr_entries;
- start = pos;
- } else
- end = pos;
- }
+ return intcmp(node1->hash, node2->hash);
}
-/* Get next vnode's index according to the PREV_IDXS */
-static inline int get_vnode_next_idx(const struct sd_vnode *entries,
- int nr_entries, int *prev_idxs,
- int nr_prev_idxs)
+/* If v1_hash < oid_hash <= v2_hash, then oid is resident on v2 */
+static inline struct sd_vnode *
+oid_to_first_vnode(uint64_t oid, struct rb_root *root)
{
- int i, idx, first_idx;
- bool found;
-
- first_idx = prev_idxs[0];
- idx = prev_idxs[nr_prev_idxs - 1];
- for (;;) {
- idx = (idx + 1) % nr_entries;
- if (unlikely(idx == first_idx))
- panic("can't find next new idx");
-
- for (found = false, i = 0; i < nr_prev_idxs; i++) {
- if (same_zone(entries, idx, prev_idxs[i])) {
- found = true;
- break;
- }
- }
-
- if (!found)
- return idx;
- }
+ struct sd_vnode dummy = {
+ .hash = sd_hash_oid(oid),
+ };
+ return rb_nsearch(root, &dummy, rb, vnode_cmp);
}
-/* Get the n'th vnode's index which is matching the OID */
-static inline int get_vnode_nth_idx(const struct sd_vnode *entries,
- int nr_entries, uint64_t oid, int nth)
+/* Replica are placed along the ring one by one with different zones */
+static inline void oid_to_vnodes(uint64_t oid, struct rb_root *root,
+ int nr_copies,
+ const struct sd_vnode **vnodes)
{
- int nr_idxs = 0, idxs[SD_MAX_COPIES];
-
- idxs[nr_idxs++] = get_vnode_first_idx(entries, nr_entries, oid);
-
- if (!nth)
- return idxs[nth];
-
- while (nr_idxs <= nth) {
- idxs[nr_idxs] = get_vnode_next_idx(entries, nr_entries,
- idxs, nr_idxs);
- nr_idxs++;
+ struct sd_vnode *next = oid_to_first_vnode(oid, root);
+
+ vnodes[0] = next;
+ for (int i = 1; i < nr_copies; i++) {
+next:
+ next = rb_entry(rb_next(&next->rb), struct sd_vnode, rb);
+ if (!next) /* Wrap around */
+ next = rb_entry(rb_first(root), struct sd_vnode, rb);
+ if (unlikely(next == vnodes[0]))
+ panic("can't find a valid vnode");
+ for (int j = 0; j < i; j++)
+ if (same_zone((struct sd_vnode *)vnodes[j], next))
+ goto next;
+ vnodes[i] = next;
}
-
- return idxs[nth];
}
-static inline const struct sd_vnode *oid_to_vnode(const struct sd_vnode *entries,
- int nr_entries, uint64_t oid,
- int copy_idx)
+static inline const struct sd_vnode *
+oid_to_vnode(uint64_t oid, struct rb_root *root, int copy_idx)
{
- int idx = get_vnode_nth_idx(entries, nr_entries, oid, copy_idx);
+ const struct sd_vnode *vnodes[SD_MAX_COPIES];
- return &entries[idx];
+ oid_to_vnodes(oid, root, copy_idx + 1, vnodes);
+
+ return vnodes[copy_idx];
}
-static inline const struct sd_node *oid_to_node(const struct sd_vnode *entries,
- int nr_entries, uint64_t oid,
- int copy_idx)
+static inline const struct sd_node *
+oid_to_node(uint64_t oid, struct rb_root *root, int copy_idx)
{
const struct sd_vnode *vnode;
- vnode = oid_to_vnode(entries, nr_entries, oid, copy_idx);
+ vnode = oid_to_vnode(oid, root, copy_idx);
return vnode->node;
}
-static inline void oid_to_vnodes(const struct sd_vnode *entries, int nr_entries,
- uint64_t oid, int nr_copies,
- const struct sd_vnode **vnodes)
-{
- int idx, idxs[SD_MAX_COPIES], i;
-
- if (nr_entries == 0)
- return;
-
- idx = get_vnode_first_idx(entries, nr_entries, oid);
- idxs[0] = idx;
- vnodes[0] = &entries[idx];
-
- for (i = 1; i < nr_copies; i++) {
- idx = get_vnode_next_idx(entries, nr_entries, idxs, i);
- idxs[i] = idx;
- vnodes[i] = &entries[idx];
- }
-}
-
-static inline void oid_to_nodes(const struct sd_vnode *entries, int nr_entries,
- uint64_t oid, int nr_copies,
+static inline void oid_to_nodes(uint64_t oid, struct rb_root *root,
+ int nr_copies,
const struct sd_node **nodes)
{
- int i;
const struct sd_vnode *vnodes[SD_MAX_COPIES];
- oid_to_vnodes(entries, nr_entries, oid, nr_copies, vnodes);
- for (i = 0; i < nr_copies; i++)
+ oid_to_vnodes(oid, root, nr_copies, vnodes);
+ for (int i = 0; i < nr_copies; i++)
nodes[i] = vnodes[i]->node;
}
@@ -273,39 +217,24 @@ static inline bool node_eq(const struct sd_node *a, const struct sd_node *b)
return node_cmp(a, b) == 0;
}
-static inline int vnode_cmp(const struct sd_vnode *node1,
- const struct sd_vnode *node2)
-{
- return intcmp(node1->id, node2->id);
-}
-
-static inline int node_to_vnodes(const struct sd_node *n,
- struct sd_vnode *vnodes)
+static inline void
+nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes, struct rb_root *root)
{
- int nr = 0;
- uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), io_addr));
-
- for (int i = 0; i < n->nr_vnodes; i++) {
- hval = sd_hash_next(hval);
- vnodes[nr].id = hval;
- vnodes[nr].node = n;
- nr++;
+ for (int j = 0; j < nr_nodes; j++) {
+ const struct sd_node *n = nodes + j;
+ uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid),
+ io_addr));
+
+ for (int i = 0; i < n->nr_vnodes; i++) {
+ struct sd_vnode *v = xmalloc(sizeof(*v));
+
+ hval = sd_hash_next(hval);
+ v->hash = hval;
+ v->node = n;
+ if (unlikely(rb_insert(root, v, rb, vnode_cmp)))
+ panic("vdisk hash collison");
+ }
}
-
- return nr;
-}
-
-static inline int nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes,
- struct sd_vnode *vnodes)
-{
- int nr_vnodes = 0;
-
- for (int i = 0; i < nr_nodes; i++)
- nr_vnodes += node_to_vnodes(nodes + i, vnodes + nr_vnodes);
-
- xqsort(vnodes, nr_vnodes, vnode_cmp);
-
- return nr_vnodes;
}
#define MAX_NODE_STR_LEN 256
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 33f04e0..1b0810b 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -43,8 +43,7 @@ int gateway_read_obj(struct request *req)
nr_copies = get_req_copy_number(req);
- oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
- nr_copies, obj_vnodes);
+ oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes);
for (i = 0; i < nr_copies; i++) {
v = obj_vnodes[i];
if (!vnode_is_local(v))
@@ -239,11 +238,10 @@ static int init_target_nodes(struct request *req, uint64_t oid,
const struct sd_node **target_nodes)
{
int nr_to_send;
- const struct vnode_info *vinfo = req->vinfo;
+ struct vnode_info *vinfo = req->vinfo;
nr_to_send = get_req_copy_number(req);
- oid_to_nodes(vinfo->vnodes, vinfo->nr_vnodes, oid, nr_to_send,
- target_nodes);
+ oid_to_nodes(oid, &vinfo->vroot, nr_to_send, target_nodes);
return nr_to_send;
}
diff --git a/sheep/group.c b/sheep/group.c
index e2ee266..378d084 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -101,8 +101,14 @@ main_fn struct vnode_info *get_vnode_info(void)
void put_vnode_info(struct vnode_info *vnode_info)
{
if (vnode_info) {
- if (refcount_dec(&vnode_info->refcnt) == 0)
+ if (refcount_dec(&vnode_info->refcnt) == 0) {
+ struct sd_vnode *v;
+ rb_for_each_entry(v, &vnode_info->vroot, rb) {
+ rb_erase(&v->rb, &vnode_info->vroot);
+ free(v);
+ }
free(vnode_info);
+ }
}
}
@@ -139,14 +145,14 @@ struct vnode_info *alloc_vnode_info(const struct sd_node *nodes,
vnode_info = xzalloc(sizeof(*vnode_info));
+ INIT_RB_ROOT(&vnode_info->vroot);
vnode_info->nr_nodes = nr_nodes;
memcpy(vnode_info->nodes, nodes, sizeof(*nodes) * nr_nodes);
xqsort(vnode_info->nodes, nr_nodes, node_cmp);
recalculate_vnodes(vnode_info->nodes, nr_nodes);
- vnode_info->nr_vnodes = nodes_to_vnodes(vnode_info->nodes, nr_nodes,
- vnode_info->vnodes);
+ nodes_to_vnodes(vnode_info->nodes, nr_nodes, &vnode_info->vroot);
vnode_info->nr_zones = get_zones_nr_from(nodes, nr_nodes);
refcount_set(&vnode_info->refcnt, 1);
return vnode_info;
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 7cba585..dc5e3d9 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -397,8 +397,7 @@ static bool oid_stale(uint64_t oid)
vinfo = get_vnode_info();
nr_copies = get_obj_copy_number(oid, vinfo->nr_zones);
- oid_to_vnodes(vinfo->vnodes, vinfo->nr_vnodes, oid,
- nr_copies, obj_vnodes);
+ oid_to_vnodes(oid, &vinfo->vroot, nr_copies, obj_vnodes);
for (i = 0; i < nr_copies; i++) {
v = obj_vnodes[i];
if (vnode_is_local(v)) {
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 6a02adf..da5e711 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -184,7 +184,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
for (int i = 0; i < nr_copies; i++) {
const struct sd_vnode *vnode;
- vnode = oid_to_vnode(old->vnodes, old->nr_vnodes, oid, i);
+ vnode = oid_to_vnode(oid, &old->vroot, i);
if (vnode_is_local(vnode)) {
start = i;
@@ -197,7 +197,7 @@ static int recover_object_from_replica(struct recovery_obj_work *row,
const struct sd_node *node;
int idx = (i + start) % nr_copies;
- node = oid_to_node(old->vnodes, old->nr_vnodes, oid, idx);
+ node = oid_to_node(oid, &old->vroot, idx);
if (invalid_node(node, row->base.cur_vinfo))
continue;
@@ -724,8 +724,7 @@ static void screen_object_list(struct recovery_list_work *rlw,
nr_objs = get_obj_copy_number(oids[i], rw->cur_vinfo->nr_zones);
- oid_to_vnodes(rw->cur_vinfo->vnodes, rw->cur_vinfo->nr_vnodes,
- oids[i], nr_objs, vnodes);
+ oid_to_vnodes(oids[i], &rw->cur_vinfo->vroot, nr_objs, vnodes);
for (j = 0; j < nr_objs; j++) {
if (!vnode_is_local(vnodes[j]))
continue;
diff --git a/sheep/request.c b/sheep/request.c
index 5bd917b..5113fca 100644
--- a/sheep/request.c
+++ b/sheep/request.c
@@ -28,9 +28,7 @@ static bool is_access_local(struct request *req, uint64_t oid)
int i;
nr_copies = get_req_copy_number(req);
- oid_to_vnodes(req->vinfo->vnodes, req->vinfo->nr_vnodes, oid,
- nr_copies, obj_vnodes);
-
+ oid_to_vnodes(oid, &req->vinfo->vroot, nr_copies, obj_vnodes);
for (i = 0; i < nr_copies; i++) {
if (vnode_is_local(obj_vnodes[i]))
return true;
@@ -310,7 +308,7 @@ static void queue_gateway_request(struct request *req)
return;
queue_work:
- if (req->vinfo->nr_vnodes == 0) {
+ if (RB_EMPTY_ROOT(&req->vinfo->vroot)) {
sd_err("there is no living nodes");
req->rp.result = SD_RES_HALT;
put_request(req);
--
1.7.9.5
More information about the sheepdog
mailing list