[sheepdog] [PATCH v2 1/7] md: add dummy md proper
Liu Yuan
namei.unix at gmail.com
Mon Mar 11 09:19:27 CET 2013
From: Liu Yuan <tailai.ly at taobao.com>
This adds the dummy object mapping code, which uses consistent hashing, for md-like
support, which aims to manage multiple disks in one node.
This patch mainly adds the build support for md.c and modifies some initialization
code in the sheep core to prepare for the later patches in this series.
Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
include/util.h | 1 +
lib/util.c | 10 +++
sheep/Makefile.am | 4 +-
sheep/farm/farm.c | 9 ---
sheep/md.c | 180 ++++++++++++++++++++++++++++++++++++++++++++++++++++
sheep/sheep.c | 12 ++--
sheep/sheep_priv.h | 8 ++-
sheep/store.c | 59 ++++++++++++-----
8 files changed, 249 insertions(+), 34 deletions(-)
create mode 100644 sheep/md.c
diff --git a/include/util.h b/include/util.h
index 23d5678..3307d7e 100644
--- a/include/util.h
+++ b/include/util.h
@@ -88,6 +88,7 @@ bool is_numeric(const char *p);
int install_sighandler(int signum, void (*handler)(int), bool once);
int install_crash_handler(void (*handler)(int));
pid_t gettid(void);
+bool is_xattr_enabled(const char *path);
void trim_zero_sectors(void *buf, uint64_t *offset, uint32_t *len);
void untrim_zero_sectors(void *buf, uint64_t offset, uint32_t len,
diff --git a/lib/util.c b/lib/util.c
index 6908bb0..41396da 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -21,6 +21,7 @@
#include <assert.h>
#include <ctype.h>
#include <signal.h>
+#include <sys/xattr.h>
#include "util.h"
#include "logger.h"
@@ -381,3 +382,12 @@ pid_t gettid(void)
{
return syscall(SYS_gettid);
}
+
+/*
+ * Probe whether extended attributes can be used on @path.
+ *
+ * A throw-away attribute is read; only an ENOTSUP failure is taken to mean
+ * "not supported".  Any other outcome (success or a different errno such as
+ * a missing attribute) counts as enabled.
+ */
+bool is_xattr_enabled(const char *path)
+{
+	int probe;
+
+	if (getxattr(path, "user.dummy", &probe, sizeof(probe)) != -1)
+		return true;
+
+	return errno != ENOTSUP;
+}
diff --git a/sheep/Makefile.am b/sheep/Makefile.am
index a1f564c..7f87616 100644
--- a/sheep/Makefile.am
+++ b/sheep/Makefile.am
@@ -27,7 +27,7 @@ sbin_PROGRAMS = sheep
sheep_SOURCES = sheep.c group.c request.c gateway.c store.c vdi.c work.c \
journal.c ops.c recovery.c cluster/local.c \
object_cache.c object_list_cache.c sockfd_cache.c \
- plain_store.c config.c migrate.c journal_file.c
+ plain_store.c config.c migrate.c journal_file.c md.c
if BUILD_COROSYNC
sheep_SOURCES += cluster/corosync.c
@@ -62,4 +62,4 @@ check-syntax:
$(COMPILE) -fsyntax-only $(CHK_SOURCES)
check-style:
- @$(CHECK_STYLE) $(sheep_SOURCES) $(noinst_HEADERS)
\ No newline at end of file
+ @$(CHECK_STYLE) $(sheep_SOURCES) $(noinst_HEADERS)
diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index b45b53f..1943dc4 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -99,15 +99,6 @@ out:
return ret;
}
-static bool is_xattr_enabled(const char *path)
-{
- int ret, dummy;
-
- ret = getxattr(path, "user.dummy", &dummy, sizeof(dummy));
-
- return !(ret == -1 && errno == ENOTSUP);
-}
-
static int farm_init(const char *p)
{
sd_dprintf("use farm store driver");
diff --git a/sheep/md.c b/sheep/md.c
new file mode 100644
index 0000000..1aad8f0
--- /dev/null
+++ b/sheep/md.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright (C) 2013 Taobao Inc.
+ *
+ * Liu Yuan <namei.unix at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/limits.h>
+#include <sys/types.h>
+#include <sys/statvfs.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <errno.h>
+#include <math.h>
+#include <sys/xattr.h>
+
+#include "sheep_priv.h"
+
+/* vdisk count given to a disk whose space equals the average (see calculate_vdisks()) */
+#define SD_DEFAULT_RAID_VDISKS 128
+/* Number of slots in md_disks[] */
+#define SD_MAX_DISK 64 /* FIXME remove roof and make it dynamic */
+/* Number of slots in md_vds[] */
+#define SD_MAX_VDISK (SD_MAX_DISK * SD_DEFAULT_RAID_VDISKS)
+
+/* One disk directory managed by md; md_disks[] is the table of all of them */
+struct disk {
+ char path[PATH_MAX]; /* directory path, copied in by md_init_disk() */
+ uint16_t nr_vdisks; /* vdisk count, computed by calculate_vdisks() */
+ uint64_t space; /* value returned by init_path_space() for this path */
+} md_disks[SD_MAX_DISK];
+
+/* One virtual disk entry; disks_to_vdisks() fills md_vds[] and sorts it by id */
+struct vdisk {
+ uint16_t idx; /* index of the owning disk within the disk array */
+ uint64_t id; /* FNV-1a hash value used as sort and lookup key */
+} md_vds[SD_MAX_VDISK];
+
+/* Number of md_disks[] entries in use; incremented by md_init_disk() */
+static int md_nr_disks;
+/* Number of md_vds[] entries in use; set by md_init_space() */
+static int md_nr_vds;
+
+/*
+ * Map an object id to the vdisk responsible for it.
+ *
+ * @vds must hold @nr entries sorted by ascending id (see vdisk_cmp()).  The
+ * oid is hashed with FNV-1a and the first vdisk whose id is not smaller than
+ * the hash is returned; a hash larger than every id wraps around to the
+ * first entry.
+ *
+ * Returns NULL when the table is empty.
+ */
+static struct vdisk *oid_to_vdisk_from(struct vdisk *vds, int nr, uint64_t oid)
+{
+	uint64_t id;
+	int start, end, pos;
+
+	if (nr <= 0)
+		return NULL;
+
+	id = fnv_64a_buf(&oid, sizeof(oid), FNV1A_64_INIT);
+	start = 0;
+	end = nr - 1;
+
+	/* Past the last entry: wrap around to the first one */
+	if (id > vds[end].id)
+		return &vds[start];
+
+	/*
+	 * Lower-bound binary search.  vds[end].id >= id holds throughout and
+	 * the range shrinks on every step, so the loop terminates even when
+	 * the hash is exactly equal to an existing vdisk id.
+	 */
+	while (start < end) {
+		pos = start + (end - start) / 2;
+		if (vds[pos].id < id)
+			start = pos + 1;
+		else
+			end = pos;
+	}
+
+	return &vds[start];
+}
+
+/*
+ * qsort() comparator that orders struct vdisk entries by ascending id.
+ *
+ * The ids are compared rather than subtracted because they are 64-bit and
+ * their difference would not fit the int return value.
+ */
+static int vdisk_cmp(const void *a, const void *b)
+{
+	const uint64_t lhs = ((const struct vdisk *)a)->id;
+	const uint64_t rhs = ((const struct vdisk *)b)->id;
+
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/*
+ * Expand the @nmds disks in @ds into vdisk entries stored in @vds.
+ *
+ * Each disk contributes nr_vdisks entries.  The ids of one disk form a hash
+ * chain: starting from FNV1A_64_INIT, every step folds the current countdown
+ * value of @nmds and then the bytes of the disk path, last byte first, into
+ * the previous id.  The resulting table is sorted by id.
+ *
+ * No bound is checked here; the caller must provide room in @vds for the sum
+ * of all nr_vdisks.  Returns the number of entries written.
+ */
+static inline int disks_to_vdisks(struct disk *ds, int nmds, struct vdisk *vds)
+{
+	struct disk *disk;
+	uint64_t hval;
+	size_t c;
+	int v, count = 0;
+
+	for (disk = ds; nmds-- != 0; disk++) {
+		/* Every disk starts its own hash chain */
+		hval = FNV1A_64_INIT;
+
+		for (v = 0; v < disk->nr_vdisks; v++) {
+			/* nmds now holds the number of disks still to come */
+			hval = fnv_64a_buf(&nmds, sizeof(nmds), hval);
+			for (c = strlen(disk->path); c-- > 0; )
+				hval = fnv_64a_buf(&disk->path[c], 1, hval);
+
+			vds[count].id = hval;
+			vds[count].idx = disk - ds;
+			count++;
+		}
+	}
+
+	qsort(vds, count, sizeof(*vds), vdisk_cmp);
+	return count;
+}
+
+/* Look up the vdisk for @oid in the global md_vds[] table (md_nr_vds entries) */
+static inline struct vdisk *oid_to_vdisk(uint64_t oid)
+{
+ return oid_to_vdisk_from(md_vds, md_nr_vds, oid);
+}
+
+/*
+ * Register the directory @path as one more disk managed by md.
+ *
+ * The directory is created if it does not exist yet and its path is stored
+ * in the next free md_disks[] slot.  Panics when the table is full, when the
+ * path does not fit the slot, or when the directory cannot be created.
+ *
+ * Returns 0 on success.
+ */
+int md_init_disk(char *path)
+{
+	size_t len = strlen(path);
+
+	if (md_nr_disks >= SD_MAX_DISK)
+		panic("too many disks, at most %d are supported", SD_MAX_DISK);
+	if (len >= PATH_MAX)
+		panic("insanely long disk path %s", path);
+
+	if (mkdir(path, def_dmode) < 0 && errno != EEXIST)
+		panic("%s, %m", path);
+
+	/*
+	 * Copy only the string itself: @path may point into a buffer that is
+	 * much shorter than PATH_MAX.
+	 */
+	memcpy(md_disks[md_nr_disks].path, path, len + 1);
+	md_nr_disks++;
+	sd_dprintf("%s added to md, nr %d", path, md_nr_disks);
+	return 0;
+}
+
+/*
+ * Give each of the @nr_disks entries in @disks a vdisk count proportional
+ * to its space.
+ *
+ * A disk holding exactly the average of @total gets SD_DEFAULT_RAID_VDISKS
+ * vdisks; the others are scaled linearly and rounded to the nearest integer.
+ * @nr_disks must not be zero.
+ */
+static inline void calculate_vdisks(struct disk *disks, int nr_disks,
+				    uint64_t total)
+{
+	uint64_t avg_size = total / nr_disks;
+	float factor;
+	int i;
+
+	for (i = 0; i < nr_disks; i++) {
+		/*
+		 * A zero average would make the ratio NaN or infinite, which
+		 * cannot be converted to a vdisk count; weight all disks
+		 * equally in that case.
+		 */
+		if (avg_size == 0)
+			factor = 1.0f;
+		else
+			factor = (float)disks[i].space / (float)avg_size;
+
+		disks[i].nr_vdisks = rintf(SD_DEFAULT_RAID_VDISKS * factor);
+		sd_dprintf("%s has %d vdisks, free space %" PRIu64,
+			   disks[i].path, disks[i].nr_vdisks,
+			   disks[i].space);
+	}
+}
+
+#define RDNAME "user.md.size"
+#define RDSIZE sizeof(uint64_t)
+
+/*
+ * Return the space value recorded for the disk directory @path.
+ *
+ * The value lives in the RDNAME extended attribute of the directory.  If the
+ * attribute exists it is returned as is; if it is missing, the current free
+ * space of the filesystem (f_frsize * f_bfree) is computed, stored in the
+ * attribute and returned.
+ *
+ * Panics on any other failure, including an attribute of unexpected length.
+ */
+static uint64_t init_path_space(char *path)
+{
+	struct statvfs fs;
+	uint64_t size;
+	ssize_t ret;
+
+	ret = getxattr(path, RDNAME, &size, RDSIZE);
+	if (ret == (ssize_t)RDSIZE)
+		return size;
+	/* A shorter attribute would leave part of size uninitialised */
+	if (ret >= 0)
+		panic("%s, unexpected length of xattr " RDNAME, path);
+	if (errno != ENODATA)
+		panic("%s, %m", path);
+
+	/* First use of this directory: record its free space */
+	if (statvfs(path, &fs) < 0)
+		panic("get disk %s space failed %m", path);
+	size = (uint64_t)fs.f_frsize * fs.f_bfree;
+	if (setxattr(path, RDNAME, &size, RDSIZE, 0) < 0)
+		panic("%s, %m", path);
+	return size;
+}
+
+/*
+ * Initialise the space bookkeeping of every registered disk and build the
+ * vdisk table from it.
+ *
+ * Panics if a disk directory does not support extended attributes.
+ * Returns the sum of the space of all disks, or 0 when no disk has been
+ * registered.
+ */
+uint64_t md_init_space(void)
+{
+	uint64_t sum = 0;
+	struct disk *d;
+
+	if (md_nr_disks == 0)
+		return 0;
+
+	for (d = md_disks; d < md_disks + md_nr_disks; d++) {
+		if (!is_xattr_enabled(d->path))
+			panic("multi-disk support need xattr feature");
+		d->space = init_path_space(d->path);
+		sum += d->space;
+	}
+
+	calculate_vdisks(md_disks, md_nr_disks, sum);
+	md_nr_vds = disks_to_vdisks(md_disks, md_nr_disks, md_vds);
+
+	return sum;
+}
diff --git a/sheep/sheep.c b/sheep/sheep.c
index 39fe766..a72a1ef 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -391,7 +391,8 @@ int main(int argc, char **argv)
int ch, longindex, ret, port = SD_LISTEN_PORT, io_port = SD_LISTEN_PORT;
int log_level = SDOG_INFO, nr_vnodes = SD_DEFAULT_VNODES;
const char *dirp = DEFAULT_OBJECT_DIR, *short_options;
- char *dir, *p, *pid_file = NULL, *bindaddr = NULL, path[PATH_MAX];
+ char *dir, *p, *pid_file = NULL, *bindaddr = NULL, path[PATH_MAX],
+ *argp = NULL;
bool is_daemon = true, to_stdout = false, explicit_addr = false;
int64_t zone = -1, free_space = 0;
struct cluster_driver *cdrv;
@@ -558,8 +559,10 @@ int main(int argc, char **argv)
sys->disk_space = 0;
}
- if (optind != argc)
- dirp = argv[optind];
+ if (optind != argc) {
+ argp = strdup(argv[optind]);
+ dirp = strtok(argv[optind], ",");
+ }
ret = init_base_path(dirp);
if (ret)
@@ -587,7 +590,8 @@ int main(int argc, char **argv)
if (ret)
exit(1);
- ret = init_global_pathnames(dir);
+ ret = init_global_pathnames(dir, argp);
+ free(argp);
if (ret)
exit(1);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index a4d8f03..d6fcc58 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -221,9 +221,8 @@ int create_listen_port(char *bindaddr, int port);
int init_unix_domain_socket(const char *dir);
int init_store_driver(bool is_gateway);
-int init_global_pathnames(const char *d);
+int init_global_pathnames(const char *d, char *);
int init_base_path(const char *dir);
-int init_obj_path(const char *d);
int init_disk_space(const char *d);
int fill_vdi_copy_list(void *data);
@@ -414,4 +413,9 @@ bool sheep_need_retry(uint32_t epoch);
/* journal_file.c */
int journal_file_init(const char *path, size_t size, bool skip);
int journal_file_write(uint64_t oid, const char *buf, size_t size, off_t, bool);
+
+/* md.c */
+int md_init_disk(char *path);
+uint64_t md_init_space(void);
+
#endif
diff --git a/sheep/store.c b/sheep/store.c
index 2e88c0d..5cf6ec4 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -205,23 +205,39 @@ int init_base_path(const char *d)
#define OBJ_PATH "/obj/"
-int init_obj_path(const char *base_path)
+/*
+ * farm needs extra HEX_LEN + 3 chars to store snapshot objects.
+ * HEX_LEN + 3 = '/' + hex(2) + '/' + hex(38) + '\0'
+ */
+static inline int check_path_len(const char *path)
{
- int len;
-
- len = strlen(base_path);
- /* farm needs extra HEX_LEN + 3 chars to store snapshot objects.
- * HEX_LEN + 3 = '/' + hex(2) + '/' + hex(38) + '\0'
- */
+ int len = strlen(path);
if (len + HEX_LEN + 3 > PATH_MAX) {
- sd_eprintf("insanely long object directory %s", base_path);
+ sd_eprintf("insanely long object directory %s", path);
return -1;
}
+ return 0;
+}
+
+/*
+ * Set up the global obj_path ("<base_path>" OBJ_PATH) and hand every extra
+ * directory listed in @argp to md.
+ *
+ * @argp is a writable, comma separated list of directories, or NULL when
+ * there is nothing to parse.  Its first component is skipped; each of the
+ * remaining ones is passed to md_init_disk().  The string is modified by
+ * strtok().
+ *
+ * Returns -1 if the base path is too long or a disk cannot be registered,
+ * otherwise the result of init_path() on obj_path.
+ */
+static int init_obj_path(const char *base_path, char *argp)
+{
+	char *p;
+	int len;
+
+	if (check_path_len(base_path) < 0)
+		return -1;
+
 	len = strlen(base_path) + strlen(OBJ_PATH) + 1;
 	obj_path = xzalloc(len);
 	snprintf(obj_path, len, "%s" OBJ_PATH, base_path);
+	/*
+	 * strtok(NULL, ...) without a preceding call on a real string is
+	 * undefined, so skip the md setup when there is no list to parse.
+	 */
+	if (argp) {
+		/* Eat up the first component */
+		strtok(argp, ",");
+		while ((p = strtok(NULL, ",")))
+			if (md_init_disk(p) < 0)
+				return -1;
+	}
+
 	return init_path(obj_path, NULL);
 }
@@ -305,31 +321,40 @@ int init_store_driver(bool is_gateway)
int init_disk_space(const char *base_path)
{
int ret = SD_RES_SUCCESS;
- uint64_t space_size = 0;
+ uint64_t space_size = 0, mds;
struct statvfs fs;
if (sys->gateway_only)
goto out;
+ /* We need to init md even if we don't need to update the space */
+ mds = md_init_space();
+
+ /* If it is restarted */
ret = get_cluster_space(&space_size);
if (space_size != 0) {
sys->disk_space = space_size;
goto out;
}
+ /* User has specified the space at startup */
if (sys->disk_space) {
ret = set_cluster_space(sys->disk_space);
goto out;
}
- ret = statvfs(base_path, &fs);
- if (ret < 0) {
- sd_dprintf("get disk space failed %m");
- ret = SD_RES_EIO;
- goto out;
+ if (mds) {
+ sys->disk_space = mds;
+ } else {
+ ret = statvfs(base_path, &fs);
+ if (ret < 0) {
+ sd_dprintf("get disk space failed %m");
+ ret = SD_RES_EIO;
+ goto out;
+ }
+ sys->disk_space = (uint64_t)fs.f_frsize * fs.f_bfree;
}
- sys->disk_space = (uint64_t)fs.f_frsize * fs.f_bfree;
ret = set_cluster_space(sys->disk_space);
out:
sd_dprintf("disk free space is %" PRIu64, sys->disk_space);
@@ -337,11 +362,11 @@ out:
}
/* Initilize all the global pathnames used internally */
-int init_global_pathnames(const char *d)
+int init_global_pathnames(const char *d, char *argp)
{
int ret;
- ret = init_obj_path(d);
+ ret = init_obj_path(d, argp);
if (ret)
return ret;
--
1.7.9.5
More information about the sheepdog
mailing list