[sheepdog] [PATCH v4 4/7] md: add hot-plug and hot-unplug support
Liu Yuan
namei.unix at gmail.com
Wed Apr 3 17:02:59 CEST 2013
From: Liu Yuan <tailai.ly at taobao.com>
We allow group plug, group unplug, and disk failures during (un)plugging.
Also add a disk information function for collie.
Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
collie/collie.c | 2 +-
include/internal_proto.h | 16 +++
include/sheepdog_proto.h | 2 +
sheep/md.c | 260 ++++++++++++++++++++++++++++++++--------------
sheep/ops.c | 45 ++++++++
sheep/sheep_priv.h | 5 +-
sheep/store.c | 3 +-
7 files changed, 250 insertions(+), 83 deletions(-)
diff --git a/collie/collie.c b/collie/collie.c
index 08c78eb..19085b4 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -19,7 +19,7 @@
#include "util.h"
static const char program_name[] = "collie";
-const char *sdhost = "localhost";
+const char *sdhost = "127.0.0.1";
int sdport = SD_LISTEN_PORT;
bool highlight = true;
bool raw_output;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 6f1fdb3..c43855b 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -69,6 +69,9 @@
#define SD_OP_FLUSH_PEER 0xAE
#define SD_OP_NOTIFY_VDI_ADD 0xAF
#define SD_OP_DELETE_CACHE 0xB0
+#define SD_OP_MD_INFO 0xB1
+#define SD_OP_MD_PLUG 0xB2
+#define SD_OP_MD_UNPLUG 0xB3
/* internal flags for hdr.flags, must be above 0x80 */
#define SD_FLAG_CMD_RECOVERY 0x0080
@@ -229,4 +232,17 @@ struct vdi_op_message {
uint8_t data[0];
};
+struct md_info {
+ int idx;
+ uint64_t size;
+ uint64_t used;
+ char path[PATH_MAX];
+};
+
+#define MD_MAX_DISK 64 /* FIXME remove roof and make it dynamic */
+struct sd_md_info {
+ struct md_info disk[MD_MAX_DISK];
+ int nr;
+};
+
#endif /* __INTERNAL_PROTO_H__ */
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index fe3738b..94baede 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -13,6 +13,8 @@
#include <inttypes.h>
#include <stdint.h>
+#include <linux/limits.h>
+
#include "util.h"
#define SD_PROTO_VER 0x02
diff --git a/sheep/md.c b/sheep/md.c
index 821a391..4da23c4 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -21,11 +21,12 @@
#include <sys/xattr.h>
#include <dirent.h>
#include <pthread.h>
+#include <string.h>
#include "sheep_priv.h"
+#include "util.h"
#define MD_DEFAULT_VDISKS 128
-#define MD_MAX_DISK 64 /* FIXME remove roof and make it dynamic */
#define MD_MAX_VDISK (MD_MAX_DISK * MD_DEFAULT_VDISKS)
struct disk {
@@ -123,20 +124,38 @@ static inline struct vdisk *oid_to_vdisk(uint64_t oid)
return oid_to_vdisk_from(md_vds, md_nr_vds, oid);
}
-int md_init_disk(char *path)
+static int path_to_disk_idx(char *path)
{
+ int i;
+
+ for (i = 0; i < md_nr_disks; i++)
+ if (strcmp(md_disks[i].path, path) == 0)
+ return i;
+
+ return -1;
+}
+
+void md_add_disk(char *path)
+{
+ if (path_to_disk_idx(path) != -1) {
+ sd_eprintf("duplicate path %s", path);
+ return;
+ }
+
+ if (xmkdir(path, def_dmode) < 0) {
+ sd_eprintf("can't mkdir for %s, %m", path);
+ return;
+ }
+
md_nr_disks++;
- if (xmkdir(path, def_dmode) < 0)
- panic("%s, %m", path);
pstrcpy(md_disks[md_nr_disks - 1].path, PATH_MAX, path);
- sd_iprintf("%s added to md, nr %d", md_disks[md_nr_disks - 1].path,
+ sd_iprintf("%s, nr %d", md_disks[md_nr_disks - 1].path,
md_nr_disks);
- return 0;
}
static inline void calculate_vdisks(struct disk *disks, int nr_disks,
- uint64_t total)
+ uint64_t total)
{
uint64_t avg_size = total / nr_disks;
float factor;
@@ -154,6 +173,79 @@ static inline void calculate_vdisks(struct disk *disks, int nr_disks,
#define MDNAME "user.md.size"
#define MDSIZE sizeof(uint64_t)
+static int get_total_object_size(uint64_t oid, char *ignore, void *total)
+{
+ uint64_t *t = total;
+ *t += get_objsize(oid);
+
+ return SD_RES_SUCCESS;
+}
+
+/* If cleanup is true, temporary objects will be removed */
+static int for_each_object_in_path(char *path,
+ int (*func)(uint64_t, char *, void *),
+ bool cleanup, void *arg)
+{
+ DIR *dir;
+ struct dirent *d;
+ uint64_t oid;
+ int ret = SD_RES_SUCCESS;
+ char p[PATH_MAX];
+
+ dir = opendir(path);
+ if (!dir) {
+ sd_eprintf("failed to open %s, %m", path);
+ return SD_RES_EIO;
+ }
+
+ while ((d = readdir(dir))) {
+ if (!strncmp(d->d_name, ".", 1))
+ continue;
+
+ oid = strtoull(d->d_name, NULL, 16);
+ if (oid == 0 || oid == ULLONG_MAX)
+ continue;
+
+ /* don't call callback against temporary objects */
+ if (strlen(d->d_name) == 20 &&
+ strcmp(d->d_name + 16, ".tmp") == 0) {
+ if (cleanup) {
+ snprintf(p, PATH_MAX, "%s/%016"PRIx64".tmp",
+ path, oid);
+ sd_dprintf("remove tmp object %s", p);
+ unlink(p);
+ }
+ continue;
+ }
+
+ ret = func(oid, path, arg);
+ if (ret != SD_RES_SUCCESS)
+ break;
+ }
+ closedir(dir);
+ return ret;
+}
+
+static uint64_t get_path_size(char *path, uint64_t *used)
+{
+ struct statvfs fs;
+ uint64_t size;
+
+ if (statvfs(path, &fs) < 0) {
+ sd_eprintf("get disk %s space failed %m", path);
+ return 0;
+ }
+ size = (int64_t)fs.f_frsize * fs.f_bfree;
+
+ if (!used)
+ goto out;
+ if (for_each_object_in_path(path, get_total_object_size, false, used)
+ != SD_RES_SUCCESS)
+ return 0;
+out:
+ return size;
+}
+
/*
* If path is broken during initilization or not support xattr return 0. We can
* safely use 0 to represent failure case because 0 space path can be
@@ -161,7 +253,6 @@ static inline void calculate_vdisks(struct disk *disks, int nr_disks,
*/
static uint64_t init_path_space(char *path)
{
- struct statvfs fs;
uint64_t size;
if (!is_xattr_enabled(path)) {
@@ -180,11 +271,9 @@ static uint64_t init_path_space(char *path)
return size;
create:
- if (statvfs(path, &fs) < 0) {
- sd_eprintf("get disk %s space failed %m", path);
+ size = get_path_size(path, NULL);
+ if (!size)
goto broken_path;
- }
- size = (int64_t)fs.f_frsize * fs.f_bfree;
if (setxattr(path, MDNAME, &size, MDSIZE, 0) < 0) {
sd_eprintf("%s, %m", path);
goto broken_path;
@@ -229,7 +318,8 @@ reinit:
}
calculate_vdisks(md_disks, md_nr_disks, total);
md_nr_vds = disks_to_vdisks(md_disks, md_nr_disks, md_vds);
- sys->enable_md = true;
+ if (!sys->enable_md)
+ sys->enable_md = true;
return total;
}
@@ -259,51 +349,6 @@ static char *get_object_path_nolock(uint64_t oid)
return md_disks[vd->idx].path;
}
-/* If cleanup is true, temporary objects will be removed */
-static int for_each_object_in_path(char *path,
- int (*func)(uint64_t, char *, void *),
- bool cleanup, void *arg)
-{
- DIR *dir;
- struct dirent *d;
- uint64_t oid;
- int ret = SD_RES_SUCCESS;
- char p[PATH_MAX];
-
- dir = opendir(path);
- if (!dir) {
- sd_eprintf("failed to open %s, %m", path);
- return SD_RES_EIO;
- }
-
- while ((d = readdir(dir))) {
- if (!strncmp(d->d_name, ".", 1))
- continue;
-
- oid = strtoull(d->d_name, NULL, 16);
- if (oid == 0 || oid == ULLONG_MAX)
- continue;
-
- /* don't call callback against temporary objects */
- if (strlen(d->d_name) == 20 &&
- strcmp(d->d_name + 16, ".tmp") == 0) {
- if (cleanup) {
- snprintf(p, PATH_MAX, "%s/%016"PRIx64".tmp",
- path, oid);
- sd_dprintf("remove tmp object %s", p);
- unlink(p);
- }
- continue;
- }
-
- ret = func(oid, path, arg);
- if (ret != SD_RES_SUCCESS)
- break;
- }
- closedir(dir);
- return ret;
-}
-
int for_each_object_in_wd(int (*func)(uint64_t oid, char *path, void *arg),
bool cleanup, void *arg)
{
@@ -345,17 +390,6 @@ struct md_work {
char path[PATH_MAX];
};
-static int path_to_disk_idx(char *path)
-{
- int i;
-
- for (i = 0; i < md_nr_disks; i++)
- if (strcmp(md_disks[i].path, path) == 0)
- return i;
-
- return -1;
-}
-
static inline void kick_recover(void)
{
struct vnode_info *vinfo = get_vnode_info();
@@ -364,15 +398,6 @@ static inline void kick_recover(void)
put_vnode_info(vinfo);
}
-static void unplug_disk(int idx)
-{
-
- remove_disk(idx);
- sys->disk_space = md_init_space();
- if (md_nr_disks > 0)
- kick_recover();
-}
-
static void md_do_recover(struct work *work)
{
struct md_work *mw = container_of(work, struct md_work, work);
@@ -383,7 +408,10 @@ static void md_do_recover(struct work *work)
if (idx < 0)
/* Just ignore the duplicate EIO of the same path */
goto out;
- unplug_disk(idx);
+ remove_disk(idx);
+ sys->disk_space = md_init_space();
+ if (md_nr_disks > 0)
+ kick_recover();
out:
pthread_rwlock_unlock(&md_lock);
free(mw);
@@ -500,3 +528,77 @@ int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path)
return SD_RES_NO_OBJ;
}
+
+uint32_t md_get_info(struct sd_md_info *info)
+{
+ uint32_t ret = sizeof(*info);
+ int i;
+
+ memset(info, 0, ret);
+ pthread_rwlock_rdlock(&md_lock);
+ for (i = 0; i < md_nr_disks; i++) {
+ info->disk[i].idx = i;
+ pstrcpy(info->disk[i].path, PATH_MAX, md_disks[i].path);
+ /* FIXME: better handling failure case. */
+ info->disk[i].size = get_path_size(info->disk[i].path,
+ &info->disk[i].used);
+ }
+ info->nr = md_nr_disks;
+ pthread_rwlock_unlock(&md_lock);
+ return ret;
+}
+
+static inline void md_del_disk(char *path)
+{
+ int idx = path_to_disk_idx(path);
+
+ if (idx < 0) {
+ sd_eprintf("invalid path %s", path);
+ return;
+ }
+ remove_disk(idx);
+}
+
+static int do_plug_unplug(char *disks, bool plug)
+{
+ char *path;
+ int old_nr, ret = SD_RES_UNKNOWN;
+
+ pthread_rwlock_wrlock(&md_lock);
+ old_nr = md_nr_disks;
+ path = strtok(disks, ",");
+ do {
+ if (plug)
+ md_add_disk(path);
+ else
+ md_del_disk(path);
+ } while ((path = strtok(NULL, ",")));
+
+ /* If no disks change, bail out */
+ if (old_nr == md_nr_disks)
+ goto out;
+
+ sys->disk_space = md_init_space();
+ /*
+ * We have to kick recovery aggressively because the number of disks
+ * removed during md_init_space() may happen to equal the number of
+ * disks we added.
+ */
+ if (md_nr_disks > 0)
+ kick_recover();
+
+ ret = SD_RES_SUCCESS;
+out:
+ pthread_rwlock_unlock(&md_lock);
+ return ret;
+}
+
+int md_plug_disks(char *disks)
+{
+ return do_plug_unplug(disks, true);
+}
+
+int md_unplug_disks(char *disks)
+{
+ return do_plug_unplug(disks, false);
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index 0d10220..35e2823 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -666,6 +666,33 @@ static int local_set_cache_size(const struct sd_req *req, struct sd_rsp *rsp,
return SD_RES_SUCCESS;
}
+static int local_md_info(struct request *request)
+{
+ struct sd_rsp *rsp = &request->rp;
+ struct sd_req *req = &request->rq;
+
+ assert(req->data_length == sizeof(struct sd_md_info));
+ rsp->data_length = md_get_info((struct sd_md_info *)request->data);
+
+ return rsp->data_length ? SD_RES_SUCCESS : SD_RES_UNKNOWN;
+}
+
+static int local_md_plug(const struct sd_req *req, struct sd_rsp *rsp,
+ void *data)
+{
+ char *disks = (char *)data;
+
+ return md_plug_disks(disks);
+}
+
+static int local_md_unplug(const struct sd_req *req, struct sd_rsp *rsp,
+ void *data)
+{
+ char *disks = (char *)data;
+
+ return md_unplug_disks(disks);
+}
+
static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
void *data)
{
@@ -1109,6 +1136,24 @@ static struct sd_op_template sd_ops[] = {
.process_main = local_set_cache_size,
},
+ [SD_OP_MD_INFO] = {
+ .name = "MD_INFO",
+ .type = SD_OP_TYPE_LOCAL,
+ .process_work = local_md_info,
+ },
+
+ [SD_OP_MD_PLUG] = {
+ .name = "MD_PLUG_DISKS",
+ .type = SD_OP_TYPE_LOCAL,
+ .process_main = local_md_plug,
+ },
+
+ [SD_OP_MD_UNPLUG] = {
+ .name = "MD_UNPLUG_DISKS",
+ .type = SD_OP_TYPE_LOCAL,
+ .process_main = local_md_unplug,
+ },
+
/* gateway I/O operations */
[SD_OP_CREATE_AND_WRITE_OBJ] = {
.name = "CREATE_AND_WRITE_OBJ",
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 4267efd..a067347 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -411,11 +411,14 @@ int journal_file_init(const char *path, size_t size, bool skip);
int journal_file_write(uint64_t oid, const char *buf, size_t size, off_t, bool);
/* md.c */
-int md_init_disk(char *path);
+void md_add_disk(char *path);
uint64_t md_init_space(void);
char *get_object_path(uint64_t oid);
int md_handle_eio(char *);
bool md_exist(uint64_t oid);
int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path);
+uint32_t md_get_info(struct sd_md_info *info);
+int md_plug_disks(char *disks);
+int md_unplug_disks(char *disks);
#endif
diff --git a/sheep/store.c b/sheep/store.c
index 76250b3..af9da9e 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -267,8 +267,7 @@ static int init_obj_path(const char *base_path, char *argp)
/* Eat up the first component */
strtok(argp, ",");
while ((p = strtok(NULL, ",")))
- if (md_init_disk(p) < 0)
- return -1;
+ md_add_disk(p);
return init_path(obj_path, NULL);
}
--
1.7.9.5
More information about the sheepdog
mailing list