[sheepdog] [PATCH 4/6] md: add hot-plug and hot-unplug support

Liu Yuan namei.unix at gmail.com
Thu Mar 28 16:38:50 CET 2013


From: Liu Yuan <tailai.ly at taobao.com>

We allow group plug, group unplug, and disk failures during (un)plugging.

Also add a disk information function for collie.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 collie/collie.c          |    2 +-
 include/internal_proto.h |   16 +++
 include/sheepdog_proto.h |    2 +
 sheep/md.c               |  263 ++++++++++++++++++++++++++++++++--------------
 sheep/ops.c              |   45 ++++++++
 sheep/sheep_priv.h       |    5 +-
 sheep/store.c            |    3 +-
 7 files changed, 253 insertions(+), 83 deletions(-)

diff --git a/collie/collie.c b/collie/collie.c
index 08c78eb..19085b4 100644
--- a/collie/collie.c
+++ b/collie/collie.c
@@ -19,7 +19,7 @@
 #include "util.h"
 
 static const char program_name[] = "collie";
-const char *sdhost = "localhost";
+const char *sdhost = "127.0.0.1";
 int sdport = SD_LISTEN_PORT;
 bool highlight = true;
 bool raw_output;
diff --git a/include/internal_proto.h b/include/internal_proto.h
index 6f1fdb3..c43855b 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -69,6 +69,9 @@
 #define SD_OP_FLUSH_PEER 0xAE
 #define SD_OP_NOTIFY_VDI_ADD  0xAF
 #define SD_OP_DELETE_CACHE    0xB0
+#define SD_OP_MD_INFO   0xB1
+#define SD_OP_MD_PLUG   0xB2
+#define SD_OP_MD_UNPLUG 0xB3
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
@@ -229,4 +232,17 @@ struct vdi_op_message {
 	uint8_t data[0];
 };
 
+struct md_info {
+	int idx;
+	uint64_t size;
+	uint64_t used;
+	char path[PATH_MAX];
+};
+
+#define MD_MAX_DISK 64 /* FIXME remove roof and make it dynamic */
+struct sd_md_info {
+	struct md_info disk[MD_MAX_DISK];
+	int nr;
+};
+
 #endif /* __INTERNAL_PROTO_H__ */
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index fe3738b..94baede 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -13,6 +13,8 @@
 
 #include <inttypes.h>
 #include <stdint.h>
+#include <linux/limits.h>
+
 #include "util.h"
 
 #define SD_PROTO_VER 0x02
diff --git a/sheep/md.c b/sheep/md.c
index 821a391..124f2ba 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -21,11 +21,12 @@
 #include <sys/xattr.h>
 #include <dirent.h>
 #include <pthread.h>
+#include <string.h>
 
 #include "sheep_priv.h"
+#include "util.h"
 
 #define MD_DEFAULT_VDISKS 128
-#define MD_MAX_DISK 64 /* FIXME remove roof and make it dynamic */
 #define MD_MAX_VDISK (MD_MAX_DISK * MD_DEFAULT_VDISKS)
 
 struct disk {
@@ -123,20 +124,33 @@ static inline struct vdisk *oid_to_vdisk(uint64_t oid)
 	return oid_to_vdisk_from(md_vds, md_nr_vds, oid);
 }
 
-int md_init_disk(char *path)
+static int path_to_disk_idx(char *path)
 {
+	int i;
+
+	for (i = 0; i < md_nr_disks; i++)
+		if (strcmp(md_disks[i].path, path) == 0)
+			return i;
+
+	return -1;
+}
+
+void md_add_disk(char *path)
+{
+	if (path_to_disk_idx(path) != -1) {
+		sd_eprintf("duplicate path %s", path);
+		return;
+	}
+
 	md_nr_disks++;
 
-	if (xmkdir(path, def_dmode) < 0)
-			panic("%s, %m", path);
 	pstrcpy(md_disks[md_nr_disks - 1].path, PATH_MAX, path);
-	sd_iprintf("%s added to md, nr %d", md_disks[md_nr_disks - 1].path,
+	sd_iprintf("%s, nr %d", md_disks[md_nr_disks - 1].path,
 		   md_nr_disks);
-	return 0;
 }
 
 static inline void calculate_vdisks(struct disk *disks, int nr_disks,
-			     uint64_t total)
+				    uint64_t total)
 {
 	uint64_t avg_size = total / nr_disks;
 	float factor;
@@ -154,6 +168,79 @@ static inline void calculate_vdisks(struct disk *disks, int nr_disks,
 #define MDNAME	"user.md.size"
 #define MDSIZE	sizeof(uint64_t)
 
+static int get_total_object_size(uint64_t oid, char *ignore, void *total)
+{
+	uint64_t *t = total;
+	*t += get_objsize(oid);
+
+	return SD_RES_SUCCESS;
+}
+
+/* If cleanup is true, temporary objects will be removed */
+static int for_each_object_in_path(char *path,
+				   int (*func)(uint64_t, char *, void *),
+				   bool cleanup, void *arg)
+{
+	DIR *dir;
+	struct dirent *d;
+	uint64_t oid;
+	int ret = SD_RES_SUCCESS;
+	char p[PATH_MAX];
+
+	dir = opendir(path);
+	if (!dir) {
+		sd_eprintf("failed to open %s, %m", path);
+		return SD_RES_EIO;
+	}
+
+	while ((d = readdir(dir))) {
+		if (!strncmp(d->d_name, ".", 1))
+			continue;
+
+		oid = strtoull(d->d_name, NULL, 16);
+		if (oid == 0 || oid == ULLONG_MAX)
+			continue;
+
+		/* don't call callback against temporary objects */
+		if (strlen(d->d_name) == 20 &&
+		    strcmp(d->d_name + 16, ".tmp") == 0) {
+			if (cleanup) {
+				snprintf(p, PATH_MAX, "%s/%016"PRIx64".tmp",
+					 path, oid);
+				sd_dprintf("remove tmp object %s", p);
+				unlink(p);
+			}
+			continue;
+		}
+
+		ret = func(oid, path, arg);
+		if (ret != SD_RES_SUCCESS)
+			break;
+	}
+	closedir(dir);
+	return ret;
+}
+
+static uint64_t get_path_size(char *path, uint64_t *used)
+{
+	struct statvfs fs;
+	uint64_t size;
+
+	if (statvfs(path, &fs) < 0) {
+		sd_eprintf("get disk %s space failed %m", path);
+		return 0;
+	}
+	size = (int64_t)fs.f_frsize * fs.f_bfree;
+
+	if (!used)
+		goto out;
+	if (for_each_object_in_path(path, get_total_object_size, false, used)
+	    != SD_RES_SUCCESS)
+		return 0;
+out:
+	return size;
+}
+
 /*
  * If path is broken during initilization or not support xattr return 0. We can
  * safely use 0 to represent failure case  because 0 space path can be
@@ -161,9 +248,13 @@ static inline void calculate_vdisks(struct disk *disks, int nr_disks,
  */
 static uint64_t init_path_space(char *path)
 {
-	struct statvfs fs;
 	uint64_t size;
 
+	if (xmkdir(path, def_dmode) < 0) {
+		sd_eprintf("%s, %m", path);
+		goto broken_path;
+	}
+
 	if (!is_xattr_enabled(path)) {
 		sd_iprintf("multi-disk support need xattr feature");
 		goto broken_path;
@@ -180,11 +271,9 @@ static uint64_t init_path_space(char *path)
 
 	return size;
 create:
-	if (statvfs(path, &fs) < 0) {
-		sd_eprintf("get disk %s space failed %m", path);
+	size = get_path_size(path, NULL);
+	if (!size)
 		goto broken_path;
-	}
-	size = (int64_t)fs.f_frsize * fs.f_bfree;
 	if (setxattr(path, MDNAME, &size, MDSIZE, 0) < 0) {
 		sd_eprintf("%s, %m", path);
 		goto broken_path;
@@ -229,7 +318,8 @@ reinit:
 	}
 	calculate_vdisks(md_disks, md_nr_disks, total);
 	md_nr_vds = disks_to_vdisks(md_disks, md_nr_disks, md_vds);
-	sys->enable_md = true;
+	if (!sys->enable_md)
+		sys->enable_md = true;
 
 	return total;
 }
@@ -259,51 +349,6 @@ static char *get_object_path_nolock(uint64_t oid)
 	return md_disks[vd->idx].path;
 }
 
-/* If cleanup is true, temporary objects will be removed */
-static int for_each_object_in_path(char *path,
-				   int (*func)(uint64_t, char *, void *),
-				   bool cleanup, void *arg)
-{
-	DIR *dir;
-	struct dirent *d;
-	uint64_t oid;
-	int ret = SD_RES_SUCCESS;
-	char p[PATH_MAX];
-
-	dir = opendir(path);
-	if (!dir) {
-		sd_eprintf("failed to open %s, %m", path);
-		return SD_RES_EIO;
-	}
-
-	while ((d = readdir(dir))) {
-		if (!strncmp(d->d_name, ".", 1))
-			continue;
-
-		oid = strtoull(d->d_name, NULL, 16);
-		if (oid == 0 || oid == ULLONG_MAX)
-			continue;
-
-		/* don't call callback against temporary objects */
-		if (strlen(d->d_name) == 20 &&
-		    strcmp(d->d_name + 16, ".tmp") == 0) {
-			if (cleanup) {
-				snprintf(p, PATH_MAX, "%s/%016"PRIx64".tmp",
-					 path, oid);
-				sd_dprintf("remove tmp object %s", p);
-				unlink(p);
-			}
-			continue;
-		}
-
-		ret = func(oid, path, arg);
-		if (ret != SD_RES_SUCCESS)
-			break;
-	}
-	closedir(dir);
-	return ret;
-}
-
 int for_each_object_in_wd(int (*func)(uint64_t oid, char *path, void *arg),
 			  bool cleanup, void *arg)
 {
@@ -345,17 +390,6 @@ struct md_work {
 	char path[PATH_MAX];
 };
 
-static int path_to_disk_idx(char *path)
-{
-	int i;
-
-	for (i = 0; i < md_nr_disks; i++)
-		if (strcmp(md_disks[i].path, path) == 0)
-			return i;
-
-	return -1;
-}
-
 static inline void kick_recover(void)
 {
 	struct vnode_info *vinfo = get_vnode_info();
@@ -364,15 +398,6 @@ static inline void kick_recover(void)
 	put_vnode_info(vinfo);
 }
 
-static void unplug_disk(int idx)
-{
-
-	remove_disk(idx);
-	sys->disk_space = md_init_space();
-	if (md_nr_disks > 0)
-		kick_recover();
-}
-
 static void md_do_recover(struct work *work)
 {
 	struct md_work *mw = container_of(work, struct md_work, work);
@@ -383,7 +408,10 @@ static void md_do_recover(struct work *work)
 	if (idx < 0)
 		/* Just ignore the duplicate EIO of the same path */
 		goto out;
-	unplug_disk(idx);
+	remove_disk(idx);
+	sys->disk_space = md_init_space();
+	if (md_nr_disks > 0)
+		kick_recover();
 out:
 	pthread_rwlock_unlock(&md_lock);
 	free(mw);
@@ -500,3 +528,80 @@ int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path)
 
 	return SD_RES_NO_OBJ;
 }
+
+uint32_t md_get_info(struct sd_md_info *info)
+{
+	uint32_t ret = sizeof(*info);
+	int i;
+
+	memset(info, 0, ret);
+	pthread_rwlock_rdlock(&md_lock);
+	for (i = 0; i < md_nr_disks; i++) {
+		info->disk[i].idx = i;
+		pstrcpy(info->disk[i].path, PATH_MAX, md_disks[i].path);
+		info->disk[i].size = get_path_size(info->disk[i].path,
+						   &info->disk[i].used);
+		if (!info->disk[i].size) {
+			ret = 0;
+			break;
+		}
+	}
+	info->nr = md_nr_disks;
+	pthread_rwlock_unlock(&md_lock);
+	return ret;
+}
+
+static inline void md_del_disk(char *path)
+{
+	int idx = path_to_disk_idx(path);
+
+	if (idx < 0) {
+		sd_eprintf("invalid path %s", path);
+		return;
+	}
+	remove_disk(idx);
+}
+
+static int do_plug_unplug(char *disks, bool plug)
+{
+	char *path;
+	int old_nr, ret = SD_RES_UNKNOWN;
+
+	pthread_rwlock_wrlock(&md_lock);
+	old_nr = md_nr_disks;
+	path = strtok(disks, ",");
+	do {
+		if (plug)
+			md_add_disk(path);
+		else
+			md_del_disk(path);
+	} while ((path = strtok(NULL, ",")));
+
+	/* If no disks change, bail out */
+	if (old_nr == md_nr_disks)
+		goto out;
+
+	sys->disk_space = md_init_space();
+	/*
+	 * We have to kick recovery aggressively because the number of disks
+	 * removed during md_init_space() might happen to equal the number of
+	 * disks we added.
+	 */
+	if (md_nr_disks > 0)
+		kick_recover();
+
+	ret = SD_RES_SUCCESS;
+out:
+	pthread_rwlock_unlock(&md_lock);
+	return ret;
+}
+
+int md_plug_disks(char *disks)
+{
+	return do_plug_unplug(disks, true);
+}
+
+int md_unplug_disks(char *disks)
+{
+	return do_plug_unplug(disks, false);
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index 8cba70d..3839437 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -667,6 +667,33 @@ static int local_set_cache_size(const struct sd_req *req, struct sd_rsp *rsp,
 	return SD_RES_SUCCESS;
 }
 
+static int local_md_info(struct request *request)
+{
+	struct sd_rsp *rsp = &request->rp;
+	struct sd_req *req = &request->rq;
+
+	assert(req->data_length == sizeof(struct sd_md_info));
+	rsp->data_length = md_get_info((struct sd_md_info *)request->data);
+
+	return rsp->data_length ? SD_RES_SUCCESS : SD_RES_UNKNOWN;
+}
+
+static int local_md_plug(const struct sd_req *req, struct sd_rsp *rsp,
+			 void *data)
+{
+	char *disks = (char *)data;
+
+	return md_plug_disks(disks);
+}
+
+static int local_md_unplug(const struct sd_req *req, struct sd_rsp *rsp,
+			   void *data)
+{
+	char *disks = (char *)data;
+
+	return md_unplug_disks(disks);
+}
+
 static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
 			   void *data)
 {
@@ -1110,6 +1137,24 @@ static struct sd_op_template sd_ops[] = {
 		.process_main = local_set_cache_size,
 	},
 
+	[SD_OP_MD_INFO] = {
+		.name = "MD_INFO",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_work = local_md_info,
+	},
+
+	[SD_OP_MD_PLUG] = {
+		.name = "MD_PLUG_DISKS",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_main = local_md_plug,
+	},
+
+	[SD_OP_MD_UNPLUG] = {
+		.name = "MD_UNPLUG_DISKS",
+		.type = SD_OP_TYPE_LOCAL,
+		.process_main = local_md_unplug,
+	},
+
 	/* gateway I/O operations */
 	[SD_OP_CREATE_AND_WRITE_OBJ] = {
 		.name = "CREATE_AND_WRITE_OBJ",
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 652fd3a..098a7bb 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -417,11 +417,14 @@ int journal_file_init(const char *path, size_t size, bool skip);
 int journal_file_write(uint64_t oid, const char *buf, size_t size, off_t, bool);
 
 /* md.c */
-int md_init_disk(char *path);
+void md_add_disk(char *path);
 uint64_t md_init_space(void);
 char *get_object_path(uint64_t oid);
 int md_handle_eio(char *);
 bool md_exist(uint64_t oid);
 int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path);
+uint32_t md_get_info(struct sd_md_info *info);
+int md_plug_disks(char *disks);
+int md_unplug_disks(char *disks);
 
 #endif
diff --git a/sheep/store.c b/sheep/store.c
index 58303fa..cbf24dc 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -269,8 +269,7 @@ static int init_obj_path(const char *base_path, char *argp)
 	/* Eat up the first component */
 	strtok(argp, ",");
 	while ((p = strtok(NULL, ",")))
-		if (md_init_disk(p) < 0)
-			return -1;
+		md_add_disk(p);
 
 	return init_path(obj_path, NULL);
 }
-- 
1.7.9.5




More information about the sheepdog mailing list