[sheepdog] [PATCH 3/7] sheep: use sd_hash instead of fnv_64a_buf

MORITA Kazutaka morita.kazutaka at gmail.com
Fri Aug 30 11:32:05 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

This patch uses sd_hash instead of fnv_64a_buf for better object
dispersion.

This also fixes the problem that we are using the number of disks for
calculating virtual disk ids.  Without this patch, virtual disk ids of
the existing disks will be changed for each we plug/unplug a disk; a
lot of object movement will happen.

Please note that this patch breaks backward compatibility because this
changes a hash function of our consistent hashing algorithm, which
means that object location will be also changed.  The upgrade code for
the existing users will be appeared before releasing the next sheepdog
release.

I've not changed some hash calculation which is used for getting
object file names (vdi object name and attribute object name).  It is
because changing them makes it very hard to implement the upgrade code
and forces us to modify qemu client codes too.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 include/sheep.h           |   54 ++++++++++++++++++++++-----------------------
 include/sheepdog_proto.h  |   25 ++++++++++++++++++---
 sheep/cluster/zookeeper.c |   13 ++++++-----
 sheep/md.c                |   37 +++++++++++++++++--------------
 sheep/ops.c               |    3 +--
 sheep/recovery.c          |    4 ++--
 sheep/vdi.c               |   26 ++++++++++++++--------
 7 files changed, 97 insertions(+), 65 deletions(-)

diff --git a/include/sheep.h b/include/sheep.h
index 68c852f..3d7db6a 100644
--- a/include/sheep.h
+++ b/include/sheep.h
@@ -67,7 +67,7 @@ static inline int same_zone(const struct sd_vnode *e, int n1, int n2)
 static inline int get_vnode_first_idx(const struct sd_vnode *entries,
 				      int nr_entries, uint64_t oid)
 {
-	uint64_t id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
+	uint64_t id = sd_hash_oid(oid);
 	int start, end, pos;
 
 	assert(nr_entries > 0);
@@ -278,37 +278,37 @@ static inline int vnode_cmp(const struct sd_vnode *node1,
 	return intcmp(node1->id, node2->id);
 }
 
-static inline int nodes_to_vnodes(struct sd_node *nodes, int nr,
-				  struct sd_vnode *vnodes)
+static inline int node_to_vnodes(const struct sd_node *nodes, int idx,
+				 struct sd_vnode *vnodes)
 {
-	struct sd_node *n = nodes;
-	int i, j, nr_vnodes = 0;
-	uint64_t hval;
-
-	while (nr--) {
-		hval = FNV1A_64_INIT;
-
-		for (i = 0; i < n->nr_vnodes; i++) {
-			if (vnodes) {
-				hval = fnv_64a_buf(&n->nid.port, sizeof(n->nid.port), hval);
-				for (j = ARRAY_SIZE(n->nid.addr) - 1; j >= 0; j--)
-					hval = fnv_64a_buf(&n->nid.addr[j], 1, hval);
-
-				vnodes[nr_vnodes].id = hval;
-				memcpy(vnodes[nr_vnodes].nid.addr, n->nid.addr, sizeof(n->nid.addr));
-				vnodes[nr_vnodes].nid.port = n->nid.port;
-				vnodes[nr_vnodes].node_idx = n - nodes;
-				vnodes[nr_vnodes].zone = n->zone;
-			}
+	int nr = 0;
+	const struct sd_node *n = nodes + idx;
+	uint64_t hval = sd_hash(&n->nid, offsetof(typeof(n->nid), io_addr));
 
-			nr_vnodes++;
-		}
+	for (int i = 0; i < n->nr_vnodes; i++) {
+		hval = sd_hash_next(hval);
 
-		n++;
+		vnodes[nr].id = hval;
+		memcpy(vnodes[nr].nid.addr, n->nid.addr, sizeof(n->nid.addr));
+		vnodes[nr].nid.port = n->nid.port;
+		vnodes[nr].node_idx = idx;
+		vnodes[nr].zone = n->zone;
+
+		nr++;
 	}
 
-	if (vnodes)
-		xqsort(vnodes, nr_vnodes, vnode_cmp);
+	return nr;
+}
+
+static inline int nodes_to_vnodes(const struct sd_node *nodes, int nr_nodes,
+				  struct sd_vnode *vnodes)
+{
+	int nr_vnodes = 0;
+
+	for (int i = 0; i < nr_nodes; i++)
+		nr_vnodes += node_to_vnodes(nodes, i, vnodes + nr_vnodes);
+
+	xqsort(vnodes, nr_vnodes, vnode_cmp);
 
 	return nr_vnodes;
 }
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index c194c8c..aa9d6c2 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -14,6 +14,7 @@
 #include <inttypes.h>
 #include <stdint.h>
 #include <stdbool.h>
+#include <string.h>
 #include <linux/limits.h>
 
 #include "compiler.h"
@@ -284,11 +285,29 @@ static inline uint64_t sd_hash_next(uint64_t hval)
 	return fnv_64a_64(hval, hval);
 }
 
-static inline uint64_t hash_64(uint64_t val, unsigned int bits)
+/*
+ * Create a hash value from an object id.  The result is same as sd_hash(&oid,
+ * sizeof(oid)) but this function is a bit faster.
+ */
+static inline uint64_t sd_hash_oid(uint64_t oid)
+{
+	return sd_hash_64(oid);
+}
+
+/*
+ * Create a hash value from a vdi name.  We cannot use sd_hash_buf for this
+ * purpose because of backward compatibility.
+ */
+static inline uint32_t sd_hash_vdi(const char *name)
 {
-	uint64_t hash = fnv_64a_buf(&val, sizeof(uint64_t), FNV1A_64_INIT);
+	uint64_t hval = fnv_64a_buf(name, strlen(name), FNV1A_64_INIT);
+
+	return (uint32_t)(hval & (SD_NR_VDIS - 1));
+}
 
-	return hash & ((1 << bits) - 1);
+static inline uint64_t hash_64(uint64_t val, unsigned int bits)
+{
+	return sd_hash_64(val) >> (64 - bits);
 }
 
 static inline bool is_data_obj_writeable(const struct sd_inode *inode,
diff --git a/sheep/cluster/zookeeper.c b/sheep/cluster/zookeeper.c
index 3972ba0..238c114 100644
--- a/sheep/cluster/zookeeper.c
+++ b/sheep/cluster/zookeeper.c
@@ -473,12 +473,15 @@ static int zk_queue_init(void)
 static uint64_t get_uniq_id(void)
 {
 	static int seq;
-	uint64_t id, n = uatomic_add_return(&seq, 1);
-
-	id = fnv_64a_buf(&this_node, sizeof(this_node), FNV1A_64_INIT);
-	id = fnv_64a_buf(&n, sizeof(n), id);
+	struct {
+		uint64_t n;
+		struct zk_node node;
+	} id = {
+		.n = uatomic_add_return(&seq, 1),
+		.node = this_node,
+	};
 
-	return id;
+	return sd_hash(&id, sizeof(id));
 }
 
 static int add_event(enum zk_event_type type, struct zk_node *znode, void *buf,
diff --git a/sheep/md.c b/sheep/md.c
index dd2bba5..d51a7c6 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -47,7 +47,7 @@ static inline int nr_online_disks(void)
 
 static struct vdisk *oid_to_vdisk_from(struct vdisk *vds, int nr, uint64_t oid)
 {
-	uint64_t id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
+	uint64_t id = sd_hash_oid(oid);
 	int start, end, pos;
 
 	start = 0;
@@ -72,28 +72,31 @@ static int vdisk_cmp(const struct vdisk *d1, const struct vdisk *d2)
 	return intcmp(d1->id, d2->id);
 }
 
-static inline int disks_to_vdisks(struct disk *ds, int nmds, struct vdisk *vds)
+static inline int disk_to_vdisks(const struct disk *ds, int idx, struct vdisk *vds)
 {
-	struct disk *d_iter = ds;
-	int i, j, nr_vdisks = 0;
-	uint64_t hval;
+	int nr = 0;
+	const struct disk *d = ds + idx;
+	uint64_t hval = sd_hash(d->path, strlen(d->path));
 
-	while (nmds--) {
-		hval = FNV1A_64_INIT;
+	for (int i = 0; i < d->nr_vdisks; i++) {
+		hval = sd_hash_next(hval);
 
-		for (i = 0; i < d_iter->nr_vdisks; i++) {
-			hval = fnv_64a_buf(&nmds, sizeof(nmds), hval);
-			for (j = strlen(d_iter->path) - 1; j >= 0; j--)
-				hval = fnv_64a_buf(&d_iter->path[j], 1, hval);
+		vds[nr].id = hval;
+		vds[nr].idx = idx;
 
-			vds[nr_vdisks].id = hval;
-			vds[nr_vdisks].idx = d_iter - ds;
+		nr++;
+	}
 
-			nr_vdisks++;
-		}
+	return nr;
+}
+
+static inline int disks_to_vdisks(const struct disk *ds, int nmds, struct vdisk *vds)
+{
+	int nr_vdisks = 0;
+
+	for (int i = 0; i < nmds; i++)
+		nr_vdisks += disk_to_vdisks(ds, i, vds + nr_vdisks);
 
-		d_iter++;
-	}
 	xqsort(vds, nr_vdisks, vdisk_cmp);
 
 	return nr_vdisks;
diff --git a/sheep/ops.c b/sheep/ops.c
index f2d0a4d..a3a2e99 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -312,8 +312,7 @@ static int cluster_get_vdi_attr(struct request *req)
 	 * the current VDI id can change if we take a snapshot,
 	 * so we use the hash value of the VDI name as the VDI id
 	 */
-	vid = fnv_64a_buf(vattr->name, strlen(vattr->name), FNV1A_64_INIT);
-	vid &= SD_NR_VDIS - 1;
+	vid = sd_hash_vdi(vattr->name);
 	ret = get_vdi_attr(req->data, hdr->data_length,
 			   vid, &attrid, info.create_time,
 			   !!(hdr->flags & SD_FLAG_CMD_CREAT),
diff --git a/sheep/recovery.c b/sheep/recovery.c
index 6faaeab..a5f5c43 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -82,8 +82,8 @@ static size_t list_buffer_size = DEFAULT_LIST_BUFFER_SIZE;
 
 static int obj_cmp(const uint64_t *oid1, const uint64_t *oid2)
 {
-	const uint64_t hval1 = fnv_64a_buf(oid1, sizeof(*oid1), FNV1A_64_INIT);
-	const uint64_t hval2 = fnv_64a_buf(oid2, sizeof(*oid2), FNV1A_64_INIT);
+	const uint64_t hval1 = sd_hash_oid(*oid1);
+	const uint64_t hval2 = sd_hash_oid(*oid2);
 
 	return intcmp(hval1, hval2);
 }
diff --git a/sheep/vdi.c b/sheep/vdi.c
index d5df668..a46d572 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -307,8 +307,7 @@ out:
 static int get_vdi_bitmap_range(const char *name, unsigned long *left,
 				unsigned long *right)
 {
-	*left = fnv_64a_buf(name, strlen(name), FNV1A_64_INIT) &
-		(SD_NR_VDIS - 1);
+	*left = sd_hash_vdi(name);
 	*right = find_next_zero_bit(sys->vdi_inuse, SD_NR_VDIS, *left);
 	if (*left == *right)
 		return SD_RES_NO_VDI;
@@ -845,23 +844,32 @@ err:
 	return ret;
 }
 
+/* Calculate a vdi attribute id from sheepdog_vdi_attr. */
+static uint32_t hash_vdi_attr(const struct sheepdog_vdi_attr *attr)
+{
+	uint64_t hval;
+
+	/* We cannot use sd_hash for backward compatibility. */
+	hval = fnv_64a_buf(attr->name, sizeof(attr->name), FNV1A_64_INIT);
+	hval = fnv_64a_buf(attr->tag, sizeof(attr->tag), hval);
+	hval = fnv_64a_buf(&attr->snap_id, sizeof(attr->snap_id), hval);
+	hval = fnv_64a_buf(attr->key, sizeof(attr->key), hval);
+
+	return (uint32_t)(hval & ((UINT64_C(1) << VDI_SPACE_SHIFT) - 1));
+}
+
 int get_vdi_attr(struct sheepdog_vdi_attr *vattr, int data_len,
 		 uint32_t vid, uint32_t *attrid, uint64_t create_time,
 		 bool wr, bool excl, bool delete)
 {
 	struct sheepdog_vdi_attr tmp_attr;
-	uint64_t oid, hval;
+	uint64_t oid;
 	uint32_t end;
 	int ret;
 
 	vattr->ctime = create_time;
 
-	/* we cannot include value_len for calculating the hash value */
-	hval = fnv_64a_buf(vattr->name, sizeof(vattr->name), FNV1A_64_INIT);
-	hval = fnv_64a_buf(vattr->tag, sizeof(vattr->tag), hval);
-	hval = fnv_64a_buf(&vattr->snap_id, sizeof(vattr->snap_id), hval);
-	hval = fnv_64a_buf(vattr->key, sizeof(vattr->key), hval);
-	*attrid = hval & ((UINT64_C(1) << VDI_SPACE_SHIFT) - 1);
+	*attrid = hash_vdi_attr(vattr);
 
 	end = *attrid - 1;
 	while (*attrid != end) {
-- 
1.7.9.5




More information about the sheepdog mailing list