[sheepdog] [PATCH] sheep: add new storage type "split"

Saeki Masaki saeki.masaki at po.ntts.co.jp
Mon Dec 22 09:22:05 CET 2014


This patch is experimental, so please review it with caution.

Currently sheepdog stores all objects in a single directory such as "/var/lib/sheepdog/obj".
This makes it difficult to handle a massive number of files as the cluster volume grows.

In particular, inode objects hold special information about VDIs,
so it is preferable to keep them in a separate directory.

The new storage type is named "split".
It separates inode objects and data objects into several directories.

How to use:
Specify the --store option when formatting the cluster with dog cluster format:

dog cluster format --store split

Signed-off-by: Masaki Saeki <saeki.masaki at po.ntts.co.jp>
---
 sheep/md.c          |   32 +++++++++++
 sheep/plain_store.c |  150 ++++++++++++++++++++++++++++++++++++++++++++++++---
 sheep/sheep_priv.h  |    1 +
 3 files changed, 175 insertions(+), 8 deletions(-)

diff --git a/sheep/md.c b/sheep/md.c
index a90fdb9..e3d1b0f 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -201,6 +201,14 @@ static int for_each_object_in_path(const char *path,
 		if (unlikely(!strncmp(d->d_name, ".", 1)))
 			continue;
 
+		/* check sub directory "00-ff" and "inode" for split store */
+		if (check_store_name("split")) {
+			if (strlen(d->d_name) == 2)
+				continue;
+			if (unlikely(!strncmp(d->d_name, "inode", 5)))
+				continue;
+		}
+
 		sd_debug("%s, %s", path, d->d_name);
 		oid = strtoull(d->d_name, NULL, 16);
 		if (oid == 0 || oid == ULLONG_MAX)
@@ -243,6 +251,8 @@ static uint64_t get_path_free_size(const char *path, uint64_t *used)
 {
 	struct statvfs fs;
 	uint64_t size;
+	int i;
+	char sub_path[PATH_MAX];
 
 	if (statvfs(path, &fs) < 0) {
 		sd_err("get disk %s space failed %m", path);
@@ -256,6 +266,25 @@ static uint64_t get_path_free_size(const char *path, uint64_t *used)
 				    NULL, used)
 	    != SD_RES_SUCCESS)
 		return 0;
+
+	/* search sub directory for split store */
+	if (check_store_name("split")) {
+		snprintf(sub_path, sizeof(sub_path), "%s/inode", path);
+		if (for_each_object_in_path(sub_path, get_total_object_size,
+					    false, NULL, used)
+		    != SD_RES_SUCCESS)
+			return 0;
+
+		for (i = 0 ; i < 256 ; i++) {
+			snprintf(sub_path, sizeof(sub_path), "%s/%02x",
+				 path, i);
+			if (for_each_object_in_path(sub_path,
+						    get_total_object_size,
+						    false, NULL, used)
+			    != SD_RES_SUCCESS)
+				return 0;
+		}
+	}
 out:
 	return size;
 }
@@ -811,6 +840,9 @@ static int do_plug_unplug(char *disks, bool plug)
 	const char *path;
 	int old_nr, ret = SD_RES_UNKNOWN;
 
+	if (check_store_name("split"))
+		return SD_RES_NO_SUPPORT;
+
 	sd_write_lock(&md.lock);
 	old_nr = md.nr_disks;
 	path = strtok(disks, ",");
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index cb90e31..bc130e2 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -44,15 +44,46 @@ static int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create)
 	return flags;
 }
 
+static inline int get_split_dir(uint64_t oid)
+{
+	return (int)((oid >> 32) & 0xff);	/* bits 32-39 pick dir 00-ff */
+}
+
+bool check_store_name(const char *name)
+{
+	return sd_store && !strcmp(sd_store->name, name); /* NULL-safe */
+}
+
 int get_store_path(uint64_t oid, uint8_t ec_index, char *path)
 {
 	if (is_erasure_oid(oid)) {
 		if (unlikely(ec_index >= SD_MAX_COPIES))
 			panic("invalid ec_index %d", ec_index);
+
+		/* only the split store has inode/ and 00-ff/ sub directories */
+		if (check_store_name("split")) {
+			if (is_vdi_obj(oid))
+				return snprintf(path, PATH_MAX,
+					"%s/inode/%016"PRIx64"_%d",
+					md_get_object_dir(oid), oid, ec_index);
+			return snprintf(path, PATH_MAX,
+					"%s/%02x/%016"PRIx64"_%d",
+					md_get_object_dir(oid),
+					get_split_dir(oid), oid, ec_index);
+		}
 		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
 				md_get_object_dir(oid), oid, ec_index);
 	}
 
+	/* only the split store has inode/ and 00-ff/ sub directories */
+	if (check_store_name("split")) {
+		if (is_vdi_obj(oid))
+			return snprintf(path, PATH_MAX, "%s/inode/%016" PRIx64,
+					md_get_object_dir(oid), oid);
+		return snprintf(path, PATH_MAX, "%s/%02x/%016" PRIx64,
+				md_get_object_dir(oid),
+				get_split_dir(oid), oid);
+	}
 	return snprintf(path, PATH_MAX, "%s/%016" PRIx64,
 			md_get_object_dir(oid), oid);
 }
@@ -62,8 +93,29 @@ static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path)
 	if (is_erasure_oid(oid)) {
 		if (unlikely(ec_index >= SD_MAX_COPIES))
 			panic("invalid ec_index %d", ec_index);
+		if (is_vdi_obj(oid) && check_store_name("split")) {
+			return snprintf(path, PATH_MAX,
+					"%s/inode/%016"PRIx64"_%d.tmp",
+					md_get_object_dir(oid), oid, ec_index);
+		} else {
+			return snprintf(path, PATH_MAX,
+					"%s/%02x/%016"PRIx64"_%d.tmp",
+					md_get_object_dir(oid),
+					get_split_dir(oid), oid, ec_index);
+		}
+
 		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp",
 				md_get_object_dir(oid), oid, ec_index);
+
+	}
+
+	if (is_vdi_obj(oid) && check_store_name("split")) {
+		return snprintf(path, PATH_MAX, "%s/inode/%016" PRIx64".tmp",
+				md_get_object_dir(oid), oid);
+	} else {
+		return snprintf(path, PATH_MAX, "%s/%02x/%016" PRIx64".tmp",
+				md_get_object_dir(oid),
+				get_split_dir(oid), oid);
 	}
 
 	return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp",
@@ -238,6 +290,33 @@ static int make_stale_dir(const char *path)
 	return SD_RES_SUCCESS;
 }
 
+/* Create the "inode" sub directory under @path; SD_RES_EIO on failure. */
+static int make_inode_dir(const char *path)
+{
+	char inode_dir[PATH_MAX];
+
+	snprintf(inode_dir, sizeof(inode_dir), "%s/inode", path);
+	if (xmkdir(inode_dir, sd_def_dmode) >= 0)
+		return SD_RES_SUCCESS;
+	sd_err("%s failed, %m", inode_dir);
+	return SD_RES_EIO;
+}
+
+static int make_split_dir(const char *path)
+{
+	char sub_dir[PATH_MAX];
+	int idx;
+
+	for (idx = 0; idx < 256; idx++) {	/* "00" .. "ff" */
+		snprintf(sub_dir, sizeof(sub_dir), "%s/%02x", path, idx);
+		if (xmkdir(sub_dir, sd_def_dmode) >= 0)
+			continue;
+		sd_err("%s failed, %m", sub_dir);
+		return SD_RES_EIO;
+	}
+	return SD_RES_SUCCESS;
+}
+
 static int purge_dir(const char *path)
 {
 	if (purge_directory(path) < 0)
@@ -318,7 +397,19 @@ int default_init(void)
 {
 	int ret;
 
-	sd_debug("use plain store driver");
+	if (check_store_name("split")) {
+		sd_debug("use split store driver");
+
+		ret = for_each_obj_path(make_inode_dir);
+		if (ret != SD_RES_SUCCESS)
+			return ret;
+		ret = for_each_obj_path(make_split_dir);
+		if (ret != SD_RES_SUCCESS)
+			return ret;
+	} else {
+		sd_debug("use plain store driver");
+	}
+
 	ret = for_each_obj_path(make_stale_dir);
 	if (ret != SD_RES_SUCCESS)
 		return ret;
@@ -497,7 +588,13 @@ int default_link(uint64_t oid, uint32_t tgt_epoch)
 	sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
 		 tgt_epoch);
 
-	snprintf(path, PATH_MAX, "%s/%016"PRIx64, md_get_object_dir(oid), oid);
+	if (is_vdi_obj(oid) && check_store_name("split")) {
+		snprintf(path, PATH_MAX, "%s/inode/%016"PRIx64,
+			 md_get_object_dir(oid), oid);
+	} else {
+		snprintf(path, PATH_MAX, "%s/%016"PRIx64,
+			 md_get_object_dir(oid), oid);
+	}
 	get_store_stale_path(oid, tgt_epoch, 0, stale_path);
 
 	if (link(stale_path, path) < 0) {
@@ -557,14 +654,32 @@ static int move_object_to_stale_dir(uint64_t oid, const char *wd,
 
 	/* ec_index from md.c is reliable so we can directly use it */
 	if (ec_index < SD_MAX_COPIES) {
-		snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
-			 md_get_object_dir(oid), oid, ec_index);
+		if (check_store_name("split")) {
+			snprintf(path, PATH_MAX, "%s/%02x/%016"PRIx64"_%d",
+				 md_get_object_dir(oid),
+				 get_split_dir(oid), oid, ec_index);
+		} else {
+			snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
+				 md_get_object_dir(oid), oid, ec_index);
+		}
 		snprintf(stale_path, PATH_MAX,
 			 "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
 			 md_get_object_dir(oid), oid, ec_index, tgt_epoch);
+
 	} else {
-		snprintf(path, PATH_MAX, "%s/%016" PRIx64,
-			 md_get_object_dir(oid), oid);
+		if (check_store_name("split")) {
+			if (is_vdi_obj(oid)) {
+				snprintf(path, PATH_MAX, "%s/inode/%016" PRIx64,
+					 md_get_object_dir(oid), oid);
+			} else {
+				snprintf(path, PATH_MAX, "%s/%02x/%016" PRIx64,
+					 md_get_object_dir(oid),
+					 get_split_dir(oid), oid);
+			}
+		} else {
+			snprintf(path, PATH_MAX, "%s/%016" PRIx64,
+				 md_get_object_dir(oid), oid);
+		}
 		snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
 			 md_get_object_dir(oid), oid, tgt_epoch);
 	}
@@ -662,8 +777,9 @@ static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
 			   size_t size)
 {
 	if (default_exist(oid, 0)) {
-		snprintf(path, PATH_MAX, "%s/%016"PRIx64,
-			 md_get_object_dir(oid), oid);
+		if (check_store_name("split"))
+			snprintf(path, PATH_MAX, "%s/inode/%016" PRIx64,
+				 md_get_object_dir(oid), oid);
 	} else {
 		get_store_stale_path(oid, epoch, 0, path);
 		if (access(path, F_OK) < 0) {
@@ -750,3 +866,21 @@ static struct store_driver plain_store = {
 };
 
 add_store_driver(plain_store);
+
+static struct store_driver split_store = {
+	.name = "split",	/* matched by check_store_name("split") */
+	.init = default_init,	/* creates inode/ and 00-ff/ for "split" */
+	.exist = default_exist,
+	.create_and_write = default_create_and_write,
+	.write = default_write,
+	.read = default_read,
+	.link = default_link,
+	.update_epoch = default_update_epoch,
+	.cleanup = default_cleanup,
+	.format = default_format,
+	.remove_object = default_remove_object,
+	.get_hash = default_get_hash,
+	.purge_obj = default_purge_obj,
+};
+
+add_store_driver(split_store);
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index f782044..60bbd0d 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -277,6 +277,7 @@ int for_each_object_in_stale(int (*func)(uint64_t oid, const char *path,
 int for_each_obj_path(int (*func)(const char *path));
 size_t get_store_objsize(uint64_t oid);
 int get_store_path(uint64_t oid, uint8_t ec_index, char *path);
+bool check_store_name(const char *name);
 
 extern struct list_head store_drivers;
 #define add_store_driver(driver)				\
-- 
1.7.1




More information about the sheepdog mailing list