[sheepdog] [PATCH v3 2/4] md: make vdisk management dynamic
Liu Yuan
namei.unix at gmail.com
Tue Sep 3 11:04:43 CEST 2013
This removes the roof limit of disks that one sheep can manage.
The vnode creation algorithm isn't changed, but the calculation of vdisk number
is changed:
we assign each vdisk 1G bytes, so, e.g, 160G will have 160 vdisks and 1T
will have 1024 vdisks.
In this way, we only need to add/remove associated vdisks from vdisk ring by
plug/unplug physical disks.
vdisks are now managed by rbtree instead of a fixed array.
Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
sheep/md.c | 361 ++++++++++++++++++++++++----------------------------
sheep/sheep_priv.h | 2 +-
2 files changed, 164 insertions(+), 199 deletions(-)
diff --git a/sheep/md.c b/sheep/md.c
index d51a7c6..d9cd757 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -13,98 +13,86 @@
#include "sheep_priv.h"
-#define MD_DEFAULT_VDISKS 128
-#define MD_MAX_VDISK (MD_MAX_DISK * MD_DEFAULT_VDISKS)
+#define MD_VDISK_SIZE ((uint64_t)1*1024*1024*1024) /* 1G */
+
+#define NONE_EXIST_PATH "/all/disks/are/broken/,ps/əʌo7/!"
struct disk {
+ struct list_node list;
char path[PATH_MAX];
- uint16_t nr_vdisks;
uint64_t space;
};
struct vdisk {
- uint16_t idx;
- uint64_t id;
+ struct rb_node rb;
+ struct disk *disk;
+ uint64_t hash;
};
-static struct disk md_disks[MD_MAX_DISK];
-static struct vdisk md_vds[MD_MAX_VDISK];
+struct md {
+ struct rb_root vroot;
+ struct list_head disk_list;
+ struct sd_lock lock;
+ uint64_t space;
+ uint32_t nr_disks;
+};
-static struct sd_lock md_lock = SD_LOCK_INITIALIZER;
-static int md_nr_disks; /* Protected by md_lock */
-static int md_nr_vds;
+static struct md md = {
+ .vroot = RB_ROOT,
+ .disk_list = LIST_HEAD_INIT(md.disk_list),
+ .lock = SD_LOCK_INITIALIZER,
+};
static inline int nr_online_disks(void)
{
int nr;
- sd_read_lock(&md_lock);
- nr = md_nr_disks;
- sd_unlock(&md_lock);
+ sd_read_lock(&md.lock);
+ nr = md.nr_disks;
+ sd_unlock(&md.lock);
return nr;
}
-static struct vdisk *oid_to_vdisk_from(struct vdisk *vds, int nr, uint64_t oid)
+static inline int vdisk_number(struct disk *disk)
{
- uint64_t id = sd_hash_oid(oid);
- int start, end, pos;
-
- start = 0;
- end = nr - 1;
-
- if (id > vds[end].id || id < vds[start].id)
- return &vds[start];
-
- for (;;) {
- pos = (end - start) / 2 + start;
- if (vds[pos].id < id) {
- if (vds[pos + 1].id >= id)
- return &vds[pos + 1];
- start = pos;
- } else
- end = pos;
- }
+ return DIV_ROUND_UP(disk->space, MD_VDISK_SIZE);
}
static int vdisk_cmp(const struct vdisk *d1, const struct vdisk *d2)
{
- return intcmp(d1->id, d2->id);
+ return intcmp(d1->hash, d2->hash);
}
-static inline int disk_to_vdisks(const struct disk *ds, int idx, struct vdisk *vds)
+static struct vdisk *vdisk_insert(struct vdisk *new)
{
- int nr = 0;
- const struct disk *d = ds + idx;
- uint64_t hval = sd_hash(d->path, strlen(d->path));
-
- for (int i = 0; i < d->nr_vdisks; i++) {
- hval = sd_hash_next(hval);
-
- vds[nr].id = hval;
- vds[nr].idx = idx;
-
- nr++;
- }
-
- return nr;
+ return rb_insert(&md.vroot, new, rb, vdisk_cmp);
}
-static inline int disks_to_vdisks(const struct disk *ds, int nmds, struct vdisk *vds)
+/* If v1_hash < oid <= v2_hash, then oid is resident in v2 */
+static struct vdisk *oid_to_vdisk(uint64_t oid)
{
- int nr_vdisks = 0;
-
- for (int i = 0; i < nmds; i++)
- nr_vdisks += disk_to_vdisks(ds, i, vds + nr_vdisks);
+ struct vdisk dummy = {
+ .hash = sd_hash_oid(oid),
+ };
- xqsort(vds, nr_vdisks, vdisk_cmp);
-
- return nr_vdisks;
+ return rb_find(&md.vroot, &dummy, rb, vdisk_cmp);
}
-static inline struct vdisk *oid_to_vdisk(uint64_t oid)
+static void create_vdisks(struct disk *disk)
{
- return oid_to_vdisk_from(md_vds, md_nr_vds, oid);
+ uint64_t hval = sd_hash(disk->path, strlen(disk->path));
+ int nr = vdisk_number(disk);
+
+ for (int i = 0; i < nr; i++) {
+ struct vdisk *v = xmalloc(sizeof(*v));
+
+ hval = sd_hash_next(hval);
+ v->hash = hval;
+ v->disk = disk;
+ if (unlikely(vdisk_insert(v)))
+ panic("vdisk hash collison");
+ }
}
static inline void trim_last_slash(char *path)
@@ -114,56 +102,19 @@ static inline void trim_last_slash(char *path)
path[strlen(path) - 1] = '\0';
}
-static int path_to_disk_idx(char *path)
+static struct disk *path_to_disk(char *path)
{
- int i;
+ struct disk *disk;
trim_last_slash(path);
- for (i = 0; i < md_nr_disks; i++)
- if (strcmp(md_disks[i].path, path) == 0)
- return i;
-
- return -1;
-}
-
-bool md_add_disk(char *path)
-{
- if (path_to_disk_idx(path) != -1) {
- sd_err("duplicate path %s", path);
- return false;
+ list_for_each_entry(disk, &md.disk_list, list) {
+ if (strcmp(disk->path, path) == 0)
+ return disk;
}
- if (xmkdir(path, sd_def_dmode) < 0) {
- sd_err("can't mkdir for %s, %m", path);
- return false;
- }
-
- md_nr_disks++;
-
- pstrcpy(md_disks[md_nr_disks - 1].path, PATH_MAX, path);
- sd_info("%s, nr %d", md_disks[md_nr_disks - 1].path, md_nr_disks);
- return true;
-}
-
-static inline void calculate_vdisks(struct disk *disks, int nr_disks,
- uint64_t total)
-{
- uint64_t avg_size = total / nr_disks;
- float factor;
- int i;
-
- for (i = 0; i < nr_disks; i++) {
- factor = (float)disks[i].space / (float)avg_size;
- md_disks[i].nr_vdisks = rintf(MD_DEFAULT_VDISKS * factor);
- sd_debug("%s has %d vdisks, free space %" PRIu64,
- md_disks[i].path, md_disks[i].nr_vdisks,
- md_disks[i].space);
- }
+ return NULL;
}
-#define MDNAME "user.md.size"
-#define MDSIZE sizeof(uint64_t)
-
static int get_total_object_size(uint64_t oid, char *wd, uint32_t epoch,
void *total)
{
@@ -272,6 +223,8 @@ static uint64_t init_path_space(char *path)
goto broken_path;
}
+#define MDNAME "user.md.size"
+#define MDSIZE sizeof(uint64_t)
if (getxattr(path, MDNAME, &size, MDSIZE) < 0) {
if (errno == ENODATA) {
goto create;
@@ -295,81 +248,94 @@ broken_path:
return 0;
}
-static inline void md_remove_disk(int idx)
+/* We don't need lock at init stage */
+bool md_add_disk(char *path)
{
- int i;
+ struct disk *new;
- sd_info("%s from multi-disk array", md_disks[idx].path);
- /*
- * We need to keep last disk path to generate EIO when all disks are
- * broken
- */
- for (i = idx; i < md_nr_disks - 1; i++)
- md_disks[i] = md_disks[i + 1];
+ if (path_to_disk(path)) {
+ sd_err("duplicate path %s", path);
+ return false;
+ }
+
+ if (xmkdir(path, sd_def_dmode) < 0) {
+ sd_err("can't mkdir for %s, %m", path);
+ return false;
+ }
+
+ new = xmalloc(sizeof(*new));
+ pstrcpy(new->path, PATH_MAX, path);
+ new->space = init_path_space(new->path);
+ if (!new->space) {
+ free(new);
+ return false;
+ }
- md_nr_disks--;
+ create_vdisks(new);
+ list_add(&new->list, &md.disk_list);
+ md.space += new->space;
+ md.nr_disks++;
+
+ sd_info("%s, vdisk nr %d, total disk %d", new->path, vdisk_number(new),
+ md.nr_disks);
+ return true;
}
-uint64_t md_init_space(void)
+static inline void md_remove_disk(struct disk *disk)
{
- uint64_t total;
- int i;
-
-reinit:
- if (!md_nr_disks)
- return 0;
- total = 0;
+ struct vdisk *v;
- for (i = 0; i < md_nr_disks; i++) {
- md_disks[i].space = init_path_space(md_disks[i].path);
- if (!md_disks[i].space) {
- md_remove_disk(i);
- goto reinit;
- }
- total += md_disks[i].space;
+ sd_info("%s from multi-disk array", disk->path);
+ list_del(&disk->list);
+ md.nr_disks--;
+ rb_for_each_entry(v, &md.vroot, rb) {
+ if (v->disk == disk)
+ rb_erase(&v->rb, &md.vroot);
}
- calculate_vdisks(md_disks, md_nr_disks, total);
- md_nr_vds = disks_to_vdisks(md_disks, md_nr_disks, md_vds);
+ free(disk);
+}
- return total;
+uint64_t md_init_space(void)
+{
+ return md.space;
}
-char *md_get_object_path(uint64_t oid)
+static const char *md_get_object_path_nolock(uint64_t oid)
{
struct vdisk *vd;
- char *p;
- sd_read_lock(&md_lock);
- vd = oid_to_vdisk(oid);
- p = md_disks[vd->idx].path;
- sd_unlock(&md_lock);
- sd_debug("%d, %s", vd->idx, p);
+ if (md.nr_disks == 0)
+ return NONE_EXIST_PATH; /* To generate EIO */
- return p;
+ vd = oid_to_vdisk(oid);
+ return vd->disk->path;
}
-static char *md_get_object_path_nolock(uint64_t oid)
+const char *md_get_object_path(uint64_t oid)
{
- struct vdisk *vd;
+ const char *p;
- vd = oid_to_vdisk(oid);
- return md_disks[vd->idx].path;
+ sd_read_lock(&md.lock);
+ p = md_get_object_path_nolock(oid);
+ sd_unlock(&md.lock);
+
+ return p;
}
int for_each_object_in_wd(int (*func)(uint64_t oid, char *path, uint32_t epoch,
void *arg),
bool cleanup, void *arg)
{
- int i, ret = SD_RES_SUCCESS;
+ int ret = SD_RES_SUCCESS;
+ struct disk *disk;
- sd_read_lock(&md_lock);
- for (i = 0; i < md_nr_disks; i++) {
- ret = for_each_object_in_path(md_disks[i].path, func,
- cleanup, arg);
+ sd_read_lock(&md.lock);
+ list_for_each_entry(disk, &md.disk_list, list) {
+ ret = for_each_object_in_path(disk->path, func, cleanup, arg);
if (ret != SD_RES_SUCCESS)
break;
}
- sd_unlock(&md_lock);
+ sd_unlock(&md.lock);
return ret;
}
@@ -377,33 +343,35 @@ int for_each_object_in_stale(int (*func)(uint64_t oid, char *path,
uint32_t epoch, void *arg),
void *arg)
{
- int i, ret = SD_RES_SUCCESS;
+ int ret = SD_RES_SUCCESS;
char path[PATH_MAX];
+ struct disk *disk;
- sd_read_lock(&md_lock);
- for (i = 0; i < md_nr_disks; i++) {
- snprintf(path, sizeof(path), "%s/.stale", md_disks[i].path);
+ sd_read_lock(&md.lock);
+ list_for_each_entry(disk, &md.disk_list, list) {
+ snprintf(path, sizeof(path), "%s/.stale", disk->path);
sd_err("%s", path);
ret = for_each_object_in_path(path, func, false, arg);
if (ret != SD_RES_SUCCESS)
break;
}
- sd_unlock(&md_lock);
+ sd_unlock(&md.lock);
return ret;
}
int for_each_obj_path(int (*func)(char *path))
{
- int i, ret = SD_RES_SUCCESS;
+ int ret = SD_RES_SUCCESS;
+ struct disk *disk;
- sd_read_lock(&md_lock);
- for (i = 0; i < md_nr_disks; i++) {
- ret = func(md_disks[i].path);
+ sd_read_lock(&md.lock);
+ list_for_each_entry(disk, &md.disk_list, list) {
+ ret = func(disk->path);
if (ret != SD_RES_SUCCESS)
break;
}
- sd_unlock(&md_lock);
+ sd_unlock(&md.lock);
return ret;
}
@@ -423,18 +391,18 @@ static inline void kick_recover(void)
static void md_do_recover(struct work *work)
{
struct md_work *mw = container_of(work, struct md_work, work);
- int idx, nr = 0;
+ struct disk *disk;
+ int nr = 0;
- sd_write_lock(&md_lock);
- idx = path_to_disk_idx(mw->path);
- if (idx < 0)
+ sd_write_lock(&md.lock);
+ disk = path_to_disk(mw->path);
+ if (!disk)
/* Just ignore the duplicate EIO of the same path */
goto out;
- md_remove_disk(idx);
- md_init_space();
- nr = md_nr_disks;
+ md_remove_disk(disk);
+ nr = md.nr_disks;
out:
- sd_unlock(&md_lock);
+ sd_unlock(&md.lock);
if (nr > 0)
kick_recover();
@@ -549,15 +517,16 @@ static int md_check_and_move(uint64_t oid, uint32_t epoch, char *path)
static int scan_wd(uint64_t oid, uint32_t epoch)
{
- int i, ret = SD_RES_EIO;
+ int ret = SD_RES_EIO;
+ struct disk *disk;
- sd_read_lock(&md_lock);
- for (i = 0; i < md_nr_disks; i++) {
- ret = md_check_and_move(oid, epoch, md_disks[i].path);
+ sd_read_lock(&md.lock);
+ list_for_each_entry(disk, &md.disk_list, list) {
+ ret = md_check_and_move(oid, epoch, disk->path);
if (ret == SD_RES_SUCCESS)
break;
}
- sd_unlock(&md_lock);
+ sd_unlock(&md.lock);
return ret;
}
@@ -597,40 +566,42 @@ int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path)
uint32_t md_get_info(struct sd_md_info *info)
{
uint32_t ret = sizeof(*info);
- int i;
+ struct disk *disk;
+ int i = 0;
memset(info, 0, ret);
- sd_read_lock(&md_lock);
- for (i = 0; i < md_nr_disks; i++) {
+ sd_read_lock(&md.lock);
+ list_for_each_entry(disk, &md.disk_list, list) {
info->disk[i].idx = i;
- pstrcpy(info->disk[i].path, PATH_MAX, md_disks[i].path);
+ pstrcpy(info->disk[i].path, PATH_MAX, disk->path);
/* FIXME: better handling failure case. */
info->disk[i].free = get_path_free_size(info->disk[i].path,
&info->disk[i].used);
+ i++;
}
- info->nr = md_nr_disks;
- sd_unlock(&md_lock);
+ info->nr = md.nr_disks;
+ sd_unlock(&md.lock);
return ret;
}
static inline void md_del_disk(char *path)
{
- int idx = path_to_disk_idx(path);
+ struct disk *disk = path_to_disk(path);
- if (idx < 0) {
+ if (!disk) {
sd_err("invalid path %s", path);
return;
}
- md_remove_disk(idx);
+ md_remove_disk(disk);
}
static int do_plug_unplug(char *disks, bool plug)
{
char *path;
- int old_nr, cur_nr = 0, ret = SD_RES_UNKNOWN;
+ int old_nr, ret = SD_RES_UNKNOWN;
- sd_write_lock(&md_lock);
- old_nr = md_nr_disks;
+ sd_write_lock(&md.lock);
+ old_nr = md.nr_disks;
path = strtok(disks, ",");
do {
if (plug) {
@@ -642,22 +613,14 @@ static int do_plug_unplug(char *disks, bool plug)
} while ((path = strtok(NULL, ",")));
/* If no disks change, bail out */
- if (old_nr == md_nr_disks)
+ if (old_nr == md.nr_disks)
goto out;
- md_init_space();
- cur_nr = md_nr_disks;
-
ret = SD_RES_SUCCESS;
out:
- sd_unlock(&md_lock);
+ sd_unlock(&md.lock);
- /*
- * We have to kick recover aggressively because there is possibility
- * that nr of disks are removed during md_init_space() happens to equal
- * nr of disks we added.
- */
- if (cur_nr > 0 && ret == SD_RES_SUCCESS)
+ if (ret == SD_RES_SUCCESS)
kick_recover();
return ret;
@@ -676,12 +639,14 @@ int md_unplug_disks(char *disks)
uint64_t md_get_size(uint64_t *used)
{
uint64_t fsize = 0;
- *used = 0;
+ struct disk *disk;
- sd_read_lock(&md_lock);
- for (int i = 0; i < md_nr_disks; i++)
- fsize += get_path_free_size(md_disks[i].path, used);
- sd_unlock(&md_lock);
+ *used = 0;
+ sd_read_lock(&md.lock);
+ list_for_each_entry(disk, &md.disk_list, list) {
+ fsize += get_path_free_size(disk->path, used);
+ }
+ sd_unlock(&md.lock);
return fsize + *used;
}
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 8029f48..0feffef 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -439,7 +439,7 @@ int journal_remove_object(uint64_t oid);
/* md.c */
bool md_add_disk(char *path);
uint64_t md_init_space(void);
-char *md_get_object_path(uint64_t oid);
+const char *md_get_object_path(uint64_t oid);
int md_handle_eio(char *);
bool md_exist(uint64_t oid);
int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path);
--
1.7.9.5
More information about the sheepdog
mailing list