[sheepdog] [PATCH stable-0.8 4/6] sheep: store ec_index in the pathname instead of in xattr

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Thu Mar 20 09:52:23 CET 2014


From: Liu Yuan <namei.unix at gmail.com>

This patch solves a performance issue in recovery: update_epoch() is called to
iterate over all the objects and move stale objects into the stale directories.
This is fast for replicated objects because oid_stale() doesn't access disks
at all, but for erasure-coded objects we have to read the xattr of each one,
which is considerably slow for massive data deployments. For example, it might
take more than one hour for 6T of data in a 6-node cluster. Since
update_epoch() is required to run in the main thread, this means we can't
process any requests during the first phase of recovery.

Instead of storing 'ec_index' in the xattr of the object, we store it directly
in the pathname of ec objects, e.g., xxxoidxxx_1 indicates that the object is
an ec object whose ec index is 1. This way, we

- simplify the read/write path a bit
- no need to read disks at all for update_epoch()

This greatly speeds up update_epoch() and also speeds up normal read/write a
bit.

With this patch, the disk layout for ec objects is changed as follows
(old --> new):

oid       --> oid_index       [data  object]
oid.epoch --> oid_index.epoch [stale object]

Reviewed-by: Robin Dong <sanbai at taobao.com>
Signed-off-by: Liu Yuan <namei.unix at gmail.com>
---
 sheep/journal.c     |   2 +-
 sheep/md.c          | 137 +++++++++++++++++++++++++++--------------
 sheep/ops.c         |   6 +-
 sheep/plain_store.c | 171 +++++++++++++++++++++-------------------------------
 sheep/recovery.c    |  10 ++-
 sheep/sheep_priv.h  |  40 +++++++++---
 6 files changed, 204 insertions(+), 162 deletions(-)

diff --git a/sheep/journal.c b/sheep/journal.c
index 1365d14..57502b6 100644
--- a/sheep/journal.c
+++ b/sheep/journal.c
@@ -142,7 +142,7 @@ static int replay_journal_entry(struct journal_descriptor *jd)
 	char *p = (char *)jd;
 
 	snprintf(path, PATH_MAX, "%s/%016"PRIx64,
-		 md_get_object_path(jd->oid), jd->oid);
+		 md_get_object_dir(jd->oid), jd->oid);
 
 	if (jd->flag == JF_REMOVE_OBJ) {
 		sd_info("%s (remove)", path);
diff --git a/sheep/md.c b/sheep/md.c
index dfce49a..6da5828 100644
--- a/sheep/md.c
+++ b/sheep/md.c
@@ -143,7 +143,7 @@ static struct disk *path_to_disk(const char *path)
 }
 
 static int get_total_object_size(uint64_t oid, const char *wd, uint32_t epoch,
-				 void *total)
+				 uint8_t ec_index, void *total)
 {
 	uint64_t *t = total;
 	struct stat s;
@@ -158,17 +158,30 @@ static int get_total_object_size(uint64_t oid, const char *wd, uint32_t epoch,
 	return SD_RES_SUCCESS;
 }
 
+static int64_t find_string_integer(const char *str, const char *delimiter)
+{
+	char *pos = strstr(str, delimiter), *p;
+	int64_t ret;
+
+	ret = strtoll(pos + 1, &p, 10);
+	if (ret == LLONG_MAX || p == pos + 1) {
+		sd_err("%s strtoul failed, delimiter %s, %m", str, delimiter);
+		return -1;
+	}
+
+	return ret;
+}
+
 /* If cleanup is true, temporary objects will be removed */
 static int for_each_object_in_path(const char *path,
 				   int (*func)(uint64_t, const char *, uint32_t,
-					       void *),
+					       uint8_t, void *),
 				   bool cleanup, void *arg)
 {
 	DIR *dir;
 	struct dirent *d;
 	uint64_t oid;
 	int ret = SD_RES_SUCCESS;
-	char p[PATH_MAX];
 
 	dir = opendir(path);
 	if (unlikely(!dir)) {
@@ -178,36 +191,39 @@ static int for_each_object_in_path(const char *path,
 
 	while ((d = readdir(dir))) {
 		uint32_t epoch = 0;
+		uint8_t ec_index = SD_MAX_COPIES;
 
+		/* skip ".", ".." and ".stale" */
 		if (unlikely(!strncmp(d->d_name, ".", 1)))
 			continue;
 
+		sd_debug("%s, %s", path, d->d_name);
 		oid = strtoull(d->d_name, NULL, 16);
 		if (oid == 0 || oid == ULLONG_MAX)
 			continue;
 
 		/* don't call callback against temporary objects */
-		if (strlen(d->d_name) == 20 &&
-		    strcmp(d->d_name + 16, ".tmp") == 0) {
+		if (is_tmp_dentry(d->d_name)) {
 			if (cleanup) {
-				snprintf(p, PATH_MAX, "%s/%016"PRIx64".tmp",
-					 path, oid);
-				sd_debug("remove tmp object %s", p);
-				unlink(p);
+				sd_debug("remove tmp object %s", d->d_name);
+				unlink(d->d_name);
 			}
 			continue;
 		}
 
-		if (strlen(d->d_name) > 17 && d->d_name[16] == '.') {
-			epoch = strtoul(d->d_name + 17, NULL, 10);
-			if (epoch == 0 || epoch == ULONG_MAX) {
-				sd_info("%s ignored, strtoul failed %m",
-					d->d_name);
+		if (is_stale_dentry(d->d_name)) {
+			epoch = find_string_integer(d->d_name, ".");
+			if (epoch < 0)
 				continue;
-			}
 		}
 
-		ret = func(oid, path, epoch, arg);
+		if (is_ec_dentry(d->d_name)) {
+			ec_index = find_string_integer(d->d_name, "_");
+			if (ec_index < 0)
+				continue;
+		}
+
+		ret = func(oid, path, epoch, ec_index, arg);
 		if (ret != SD_RES_SUCCESS)
 			break;
 	}
@@ -332,7 +348,7 @@ uint64_t md_init_space(void)
 	return md.space;
 }
 
-static const char *md_get_object_path_nolock(uint64_t oid)
+static const char *md_get_object_dir_nolock(uint64_t oid)
 {
 	const struct vdisk *vd;
 
@@ -343,12 +359,12 @@ static const char *md_get_object_path_nolock(uint64_t oid)
 	return vd->disk->path;
 }
 
-const char *md_get_object_path(uint64_t oid)
+const char *md_get_object_dir(uint64_t oid)
 {
 	const char *p;
 
 	sd_read_lock(&md.lock);
-	p = md_get_object_path_nolock(oid);
+	p = md_get_object_dir_nolock(oid);
 	sd_rw_unlock(&md.lock);
 
 	return p;
@@ -356,7 +372,7 @@ const char *md_get_object_path(uint64_t oid)
 
 struct process_path_arg {
 	const char *path;
-	int (*func)(uint64_t oid, const char *path, uint32_t epoch, void *arg);
+	int (*func)(uint64_t oid, const char *, uint32_t, uint8_t, void *arg);
 	bool cleanup;
 	void *opaque;
 	int result;
@@ -376,7 +392,8 @@ static void *thread_process_path(void *arg)
 }
 
 int for_each_object_in_wd(int (*func)(uint64_t oid, const char *path,
-				      uint32_t epoch, void *arg),
+				      uint32_t epoch, uint8_t ec_index,
+				      void *arg),
 			  bool cleanup, void *arg)
 {
 	int ret = SD_RES_SUCCESS;
@@ -437,7 +454,7 @@ int for_each_object_in_wd(int (*func)(uint64_t oid, const char *path,
 }
 
 int for_each_object_in_stale(int (*func)(uint64_t oid, const char *path,
-					 uint32_t epoch, void *arg),
+					 uint32_t epoch, uint8_t, void *arg),
 			     void *arg)
 {
 	int ret = SD_RES_SUCCESS;
@@ -533,19 +550,37 @@ static inline bool md_access(const char *path)
 	return true;
 }
 
-static int get_old_new_path(uint64_t oid, uint32_t epoch, const char *path,
-			    char *old, size_t old_size, char *new,
-			    size_t new_size)
+static int get_old_new_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
+			    const char *path, char *old, char *new)
 {
 	if (!epoch) {
-		snprintf(old, old_size, "%s/%016" PRIx64, path, oid);
-		snprintf(new, new_size, "%s/%016" PRIx64,
-			 md_get_object_path_nolock(oid), oid);
+		if (!is_erasure_oid(oid)) {
+			snprintf(old, PATH_MAX, "%s/%016" PRIx64, path, oid);
+			snprintf(new, PATH_MAX, "%s/%016" PRIx64,
+				 md_get_object_dir_nolock(oid), oid);
+		} else {
+			snprintf(old, PATH_MAX, "%s/%016" PRIx64"_%d", path,
+				 oid, ec_index);
+			snprintf(new, PATH_MAX, "%s/%016" PRIx64"_%d",
+				 md_get_object_dir_nolock(oid), oid, ec_index);
+		}
 	} else {
-		snprintf(old, old_size, "%s/.stale/%016"PRIx64".%"PRIu32, path,
-			 oid, epoch);
-		snprintf(new, new_size, "%s/.stale/%016"PRIx64".%"PRIu32,
-			 md_get_object_path_nolock(oid), oid, epoch);
+		if (!is_erasure_oid(oid)) {
+			snprintf(old, PATH_MAX,
+				 "%s/.stale/%016"PRIx64".%"PRIu32, path,
+				 oid, epoch);
+			snprintf(new, PATH_MAX,
+				 "%s/.stale/%016"PRIx64".%"PRIu32,
+				 md_get_object_dir_nolock(oid), oid, epoch);
+		} else {
+			snprintf(old, PATH_MAX,
+				 "%s/.stale/%016"PRIx64"_%d.%"PRIu32, path,
+				 oid, ec_index, epoch);
+			snprintf(new, PATH_MAX,
+				 "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
+				 md_get_object_dir_nolock(oid),
+				 oid, ec_index ,epoch);
+		}
 	}
 
 	if (!md_access(old))
@@ -587,12 +622,12 @@ out:
 	return ret;
 }
 
-static int md_check_and_move(uint64_t oid, uint32_t epoch, const char *path)
+static int md_check_and_move(uint64_t oid, uint32_t epoch, uint8_t ec_index,
+			     const char *path)
 {
 	char old[PATH_MAX], new[PATH_MAX];
 
-	if (get_old_new_path(oid, epoch, path, old, sizeof(old), new,
-			     sizeof(new)) < 0)
+	if (get_old_new_path(oid, epoch, ec_index, path, old, new) < 0)
 		return SD_RES_EIO;
 	/*
 	 * Recovery thread and main thread might try to recover the same object.
@@ -613,14 +648,14 @@ static int md_check_and_move(uint64_t oid, uint32_t epoch, const char *path)
 	return SD_RES_SUCCESS;
 }
 
-static int scan_wd(uint64_t oid, uint32_t epoch)
+static int scan_wd(uint64_t oid, uint32_t epoch, uint8_t ec_index)
 {
 	int ret = SD_RES_EIO;
 	const struct disk *disk;
 
 	sd_read_lock(&md.lock);
 	rb_for_each_entry(disk, &md.root, rb) {
-		ret = md_check_and_move(oid, epoch, disk->path);
+		ret = md_check_and_move(oid, epoch, ec_index, disk->path);
 		if (ret == SD_RES_SUCCESS)
 			break;
 	}
@@ -628,12 +663,11 @@ static int scan_wd(uint64_t oid, uint32_t epoch)
 	return ret;
 }
 
-bool md_exist(uint64_t oid)
+bool md_exist(uint64_t oid, uint8_t ec_index)
 {
 	char path[PATH_MAX];
 
-	snprintf(path, PATH_MAX, "%s/%016" PRIx64, md_get_object_path(oid),
-		 oid);
+	get_store_path(oid, ec_index, path);
 	if (md_access(path))
 		return true;
 	/*
@@ -641,21 +675,32 @@ bool md_exist(uint64_t oid)
 	 * track to locate the objects for multiple disk failure. Simply do
 	 * hard iteration simplify the code a lot.
 	 */
-	if (scan_wd(oid, 0) == SD_RES_SUCCESS)
+	if (scan_wd(oid, 0, ec_index) == SD_RES_SUCCESS)
 		return true;
 
 	return false;
 }
 
-int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path, size_t size)
+int md_get_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
+		      char *path)
 {
-	snprintf(path, size, "%s/.stale/%016"PRIx64".%"PRIu32,
-		 md_get_object_path(oid), oid, epoch);
+	if (unlikely(!epoch))
+		panic("invalid 0 epoch");
+
+	if (is_erasure_oid(oid)) {
+		if (unlikely(ec_index >= SD_MAX_COPIES))
+			panic("invalid ec index %d", ec_index);
+
+		snprintf(path, PATH_MAX, "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
+			 md_get_object_dir(oid), oid, ec_index, epoch);
+	} else
+		snprintf(path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
+			 md_get_object_dir(oid), oid, epoch);
+
 	if (md_access(path))
 		return SD_RES_SUCCESS;
 
-	assert(epoch);
-	if (scan_wd(oid, epoch) == SD_RES_SUCCESS)
+	if (scan_wd(oid, epoch, ec_index) == SD_RES_SUCCESS)
 		return SD_RES_SUCCESS;
 
 	return SD_RES_NO_OBJ;
diff --git a/sheep/ops.c b/sheep/ops.c
index 6aa547e..fb3f95c 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -923,10 +923,11 @@ static int local_kill_node(const struct sd_req *req, struct sd_rsp *rsp,
 static int peer_remove_obj(struct request *req)
 {
 	uint64_t oid = req->rq.obj.oid;
+	uint8_t ec_index = req->rq.obj.ec_index;
 
 	objlist_cache_remove(oid);
 
-	return sd_store->remove_object(oid);
+	return sd_store->remove_object(oid, ec_index);
 }
 
 int peer_read_obj(struct request *req)
@@ -946,6 +947,7 @@ int peer_read_obj(struct request *req)
 	iocb.length = hdr->data_length;
 	iocb.offset = hdr->obj.offset;
 	iocb.ec_index = hdr->obj.ec_index;
+	iocb.copy_policy = hdr->obj.copy_policy;
 	ret = sd_store->read(hdr->obj.oid, &iocb);
 	if (ret != SD_RES_SUCCESS)
 		goto out;
@@ -965,6 +967,8 @@ static int peer_write_obj(struct request *req)
 	iocb.buf = req->data;
 	iocb.length = hdr->data_length;
 	iocb.offset = hdr->obj.offset;
+	iocb.ec_index = hdr->obj.ec_index;
+	iocb.copy_policy = hdr->obj.copy_policy;
 
 	return sd_store->write(oid, &iocb);
 }
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index 716de33..9338c86 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -15,26 +15,6 @@
 
 #define sector_algined(x) ({ ((x) & (SECTOR_SIZE - 1)) == 0; })
 
-#define ECNAME "user.ec.index"
-#define ECSIZE sizeof(uint8_t)
-static int set_erasure_index(const char *path, uint8_t idx)
-{
-	if (setxattr(path, ECNAME, &idx, ECSIZE, 0) < 0) {
-		sd_err("failed to setxattr %s, %m", path);
-		return -1;
-	}
-	return 0;
-}
-
-static int get_erasure_index(const char *path, uint8_t *idx)
-{
-	if (getxattr(path, ECNAME, idx, ECSIZE) < 0) {
-		sd_err("failed to getxattr %s, %m", path);
-		return -1;
-	}
-	return 0;
-}
-
 static inline bool iocb_is_aligned(const struct siocb *iocb)
 {
 	return  sector_algined(iocb->offset) && sector_algined(iocb->length);
@@ -59,31 +39,45 @@ static int prepare_iocb(uint64_t oid, const struct siocb *iocb, bool create)
 	return flags;
 }
 
-static int get_obj_path(uint64_t oid, char *path, size_t size)
+int get_store_path(uint64_t oid, uint8_t ec_index, char *path)
 {
-	return snprintf(path, size, "%s/%016" PRIx64,
-			md_get_object_path(oid), oid);
+	if (is_erasure_oid(oid)) {
+		if (unlikely(ec_index >= SD_MAX_COPIES))
+			panic("invalid ec_index %d", ec_index);
+		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
+				md_get_object_dir(oid), oid, ec_index);
+	}
+
+	return snprintf(path, PATH_MAX, "%s/%016" PRIx64,
+			md_get_object_dir(oid), oid);
 }
 
-static int get_tmp_obj_path(uint64_t oid, char *path, size_t size)
+static int get_store_tmp_path(uint64_t oid, uint8_t ec_index, char *path)
 {
-	return snprintf(path, size, "%s/%016"PRIx64".tmp",
-			md_get_object_path(oid), oid);
+	if (is_erasure_oid(oid)) {
+		if (unlikely(ec_index >= SD_MAX_COPIES))
+			panic("invalid ec_index %d", ec_index);
+		return snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d.tmp",
+				md_get_object_dir(oid), oid, ec_index);
+	}
+
+	return snprintf(path, PATH_MAX, "%s/%016" PRIx64".tmp",
+			md_get_object_dir(oid), oid);
 }
 
-static int get_stale_obj_path(uint64_t oid, uint32_t epoch, char *path,
-			      size_t size)
+static int get_store_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index,
+				char *path)
 {
-	return md_get_stale_path(oid, epoch, path, size);
+	return md_get_stale_path(oid, epoch, ec_index, path);
 }
 
 /*
  * Check if oid is in this nodes (if oid is in the wrong place, it will be moved
  * to the correct one after this call in a MD setup.
  */
-bool default_exist(uint64_t oid)
+bool default_exist(uint64_t oid, uint8_t ec_index)
 {
-	return md_exist(oid);
+	return md_exist(oid, ec_index);
 }
 
 static int err_to_sderr(const char *path, uint64_t oid, int err)
@@ -95,7 +89,7 @@ static int err_to_sderr(const char *path, uint64_t oid, int err)
 	pstrcpy(p, sizeof(p), path);
 	dir = dirname(p);
 
-	sd_debug("%s", dir);
+	sd_debug("%s", path);
 	switch (err) {
 	case ENOENT:
 		if (stat(dir, &s) < 0) {
@@ -144,14 +138,14 @@ int default_write(uint64_t oid, const struct siocb *iocb)
 		sync();
 	}
 
-	get_obj_path(oid, path, sizeof(path));
+	get_store_path(oid, iocb->ec_index, path);
 
 	/*
 	 * Make sure oid is in the right place because oid might be misplaced
 	 * in a wrong place, due to 'shutdown/restart with less/more disks' or
 	 * any bugs. We need call err_to_sderr() to return EIO if disk is broken
 	 */
-	if (!default_exist(oid))
+	if (!default_exist(oid, iocb->ec_index))
 		return err_to_sderr(path, oid, ENOENT);
 
 	fd = open(path, flags, sd_def_fmode);
@@ -238,7 +232,8 @@ out:
 }
 
 static int init_objlist_and_vdi_bitmap(uint64_t oid, const char *wd,
-				       uint32_t epoch, void *arg)
+				       uint32_t epoch, uint8_t ec_index,
+				       void *arg)
 {
 	int ret;
 	objlist_cache_insert(oid);
@@ -267,11 +262,6 @@ int default_init(void)
 	return for_each_object_in_wd(init_objlist_and_vdi_bitmap, true, NULL);
 }
 
-static inline bool is_stale_path(const char *path)
-{
-	return !!strstr(path, "stale");
-}
-
 static int default_read_from_path(uint64_t oid, const char *path,
 				  const struct siocb *iocb)
 {
@@ -284,31 +274,15 @@ static int default_read_from_path(uint64_t oid, const char *path,
 	 * in a wrong place, due to 'shutdown/restart with less disks' or any
 	 * bugs. We need call err_to_sderr() to return EIO if disk is broken.
 	 *
-	 * For stale path, get_stale_obj_path() already does default_exist job.
+	 * For stale path, get_store_stale_path already does default_exist job.
 	 */
-	if (!is_stale_path(path) && !default_exist(oid))
+	if (!is_stale_path(path) && !default_exist(oid, iocb->ec_index))
 		return err_to_sderr(path, oid, ENOENT);
 
 	fd = open(path, flags);
-
 	if (fd < 0)
 		return err_to_sderr(path, oid, errno);
 
-	if (is_erasure_oid(oid) && iocb->ec_index <= SD_MAX_COPIES) {
-		uint8_t idx;
-
-		if (get_erasure_index(path, &idx) < 0) {
-			close(fd);
-			return err_to_sderr(path, oid, errno);
-		}
-		/* We pretend NO-OBJ to read old object in the stale dir */
-		if (idx != iocb->ec_index) {
-			sd_debug("ec_index %d != %d", iocb->ec_index, idx);
-			close(fd);
-			return SD_RES_NO_OBJ;
-		}
-	}
-
 	size = xpread(fd, iocb->buf, iocb->length, iocb->offset);
 	if (unlikely(size != iocb->length)) {
 		sd_err("failed to read object %"PRIx64", path=%s, offset=%"
@@ -325,7 +299,7 @@ int default_read(uint64_t oid, const struct siocb *iocb)
 	int ret;
 	char path[PATH_MAX];
 
-	get_obj_path(oid, path, sizeof(path));
+	get_store_path(oid, iocb->ec_index, path);
 	ret = default_read_from_path(oid, path, iocb);
 
 	/*
@@ -334,7 +308,7 @@ int default_read(uint64_t oid, const struct siocb *iocb)
 	 */
 	if (ret == SD_RES_NO_OBJ && iocb->epoch > 0 &&
 	    iocb->epoch < sys_epoch()) {
-		get_stale_obj_path(oid, iocb->epoch, path, sizeof(path));
+		get_store_stale_path(oid, iocb->epoch, iocb->ec_index, path);
 		ret = default_read_from_path(oid, path, iocb);
 	}
 
@@ -374,12 +348,11 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 	int flags = prepare_iocb(oid, iocb, true);
 	int ret, fd;
 	uint32_t len = iocb->length;
-	bool ec = is_erasure_oid(oid);
 	size_t obj_size;
 
 	sd_debug("%"PRIx64, oid);
-	get_obj_path(oid, path, sizeof(path));
-	get_tmp_obj_path(oid, tmp_path, sizeof(tmp_path));
+	get_store_path(oid, iocb->ec_index, path);
+	get_store_tmp_path(oid, iocb->ec_index, tmp_path);
 
 	if (uatomic_is_true(&sys->use_journal) &&
 	    journal_write_store(oid, iocb->buf, iocb->length,
@@ -409,15 +382,7 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 		return err_to_sderr(path, oid, errno);
 	}
 
-	if (ec) {
-		uint8_t policy = iocb->copy_policy ?:
-			get_vdi_copy_policy(oid_to_vid(oid));
-		int d;
-		ec_policy_to_dp(policy, &d, NULL);
-		obj_size = SD_DATA_OBJ_SIZE / d;
-	} else
-		obj_size = get_objsize(oid);
-
+	obj_size = get_store_objsize(oid);
 	ret = prealloc(fd, obj_size);
 	if (ret < 0) {
 		ret = err_to_sderr(path, oid, errno);
@@ -431,11 +396,6 @@ int default_create_and_write(uint64_t oid, const struct siocb *iocb)
 		goto out;
 	}
 
-	if (ec && set_erasure_index(tmp_path, iocb->ec_index) < 0) {
-		ret = err_to_sderr(tmp_path, oid, errno);
-		goto out;
-	}
-
 	ret = rename(tmp_path, path);
 	if (ret < 0) {
 		sd_err("failed to rename %s to %s: %m", tmp_path, path);
@@ -459,8 +419,8 @@ int default_link(uint64_t oid, uint32_t tgt_epoch)
 	sd_debug("try link %"PRIx64" from snapshot with epoch %d", oid,
 		 tgt_epoch);
 
-	get_obj_path(oid, path, sizeof(path));
-	get_stale_obj_path(oid, tgt_epoch, stale_path, sizeof(stale_path));
+	snprintf(path, PATH_MAX, "%s/%016"PRIx64, md_get_object_dir(oid), oid);
+	get_store_stale_path(oid, tgt_epoch, 0, stale_path);
 
 	if (link(stale_path, path) < 0) {
 		/*
@@ -485,7 +445,7 @@ out:
  * node(index gets changed even it has some other copy belongs to it) because
  * of hash ring changes, we consider it stale.
  */
-static bool oid_stale(uint64_t oid)
+static bool oid_stale(uint64_t oid, int ec_index)
 {
 	uint32_t i, nr_copies;
 	struct vnode_info *vinfo;
@@ -499,14 +459,8 @@ static bool oid_stale(uint64_t oid)
 	for (i = 0; i < nr_copies; i++) {
 		v = obj_vnodes[i];
 		if (vnode_is_local(v)) {
-			if (is_erasure_oid(oid)) {
-				char path[PATH_MAX];
-				uint8_t idx;
-
-				get_obj_path(oid, path, sizeof(path));
-				if (get_erasure_index(path, &idx) < 0)
-					break;
-				if (idx == i)
+			if (ec_index < SD_MAX_COPIES) {
+				if (i == ec_index)
 					ret = false;
 			} else {
 				ret = false;
@@ -520,14 +474,24 @@ static bool oid_stale(uint64_t oid)
 }
 
 static int move_object_to_stale_dir(uint64_t oid, const char *wd,
-				    uint32_t epoch, void *arg)
+				    uint32_t epoch, uint8_t ec_index, void *arg)
 {
 	char path[PATH_MAX], stale_path[PATH_MAX];
-	uint32_t tgt_epoch = *(int *)arg;
-
-	snprintf(path, PATH_MAX, "%s/%016" PRIx64, wd, oid);
-	snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32, wd,
-		 oid, tgt_epoch);
+	uint32_t tgt_epoch = *(uint32_t *)arg;
+
+	/* ec_index from md.c is reliable so we can directly use it */
+	if (ec_index < SD_MAX_COPIES) {
+		snprintf(path, PATH_MAX, "%s/%016"PRIx64"_%d",
+			 md_get_object_dir(oid), oid, ec_index);
+		snprintf(stale_path, PATH_MAX,
+			 "%s/.stale/%016"PRIx64"_%d.%"PRIu32,
+			 md_get_object_dir(oid), oid, ec_index, tgt_epoch);
+	} else {
+		snprintf(path, PATH_MAX, "%s/%016" PRIx64,
+			 md_get_object_dir(oid), oid);
+		snprintf(stale_path, PATH_MAX, "%s/.stale/%016"PRIx64".%"PRIu32,
+			 md_get_object_dir(oid), oid, tgt_epoch);
+	}
 
 	if (unlikely(rename(path, stale_path)) < 0) {
 		sd_err("failed to move stale object %" PRIX64 " to %s, %m", oid,
@@ -540,10 +504,10 @@ static int move_object_to_stale_dir(uint64_t oid, const char *wd,
 }
 
 static int check_stale_objects(uint64_t oid, const char *wd, uint32_t epoch,
-			       void *arg)
+			       uint8_t ec_index, void *arg)
 {
-	if (oid_stale(oid))
-		return move_object_to_stale_dir(oid, wd, 0, arg);
+	if (oid_stale(oid, ec_index))
+		return move_object_to_stale_dir(oid, wd, 0, ec_index, arg);
 
 	return SD_RES_SUCCESS;
 }
@@ -569,20 +533,20 @@ int default_format(void)
 	return SD_RES_SUCCESS;
 }
 
-int default_remove_object(uint64_t oid)
+int default_remove_object(uint64_t oid, uint8_t ec_index)
 {
 	char path[PATH_MAX];
 
 	if (uatomic_is_true(&sys->use_journal))
 		journal_remove_object(oid);
 
-	get_obj_path(oid, path, sizeof(path));
+	get_store_path(oid, ec_index, path);
 
 	if (unlink(path) < 0) {
 		if (errno == ENOENT)
 			return SD_RES_NO_OBJ;
 
-		sd_err("failed to remove object %"PRIx64", %m", oid);
+		sd_err("failed, %s, %m", path);
 		return SD_RES_EIO;
 	}
 
@@ -619,10 +583,11 @@ static int set_object_sha1(const char *path, const uint8_t *sha1)
 static int get_object_path(uint64_t oid, uint32_t epoch, char *path,
 			   size_t size)
 {
-	if (default_exist(oid)) {
-		get_obj_path(oid, path, size);
+	if (default_exist(oid, 0)) {
+		snprintf(path, PATH_MAX, "%s/%016"PRIx64,
+			 md_get_object_dir(oid), oid);
 	} else {
-		get_stale_obj_path(oid, epoch, path, size);
+		get_store_stale_path(oid, epoch, 0, path);
 		if (access(path, F_OK) < 0) {
 			if (errno == ENOENT)
 				return SD_RES_NO_OBJ;
diff --git a/sheep/recovery.c b/sheep/recovery.c
index defe59b..d8ba77b 100644
--- a/sheep/recovery.c
+++ b/sheep/recovery.c
@@ -448,6 +448,9 @@ static uint8_t local_node_copy_index(struct vnode_info *vinfo, uint64_t oid)
 {
 	int idx;
 
+	if (!is_erasure_oid(oid))
+		return 0; /* no need to proceed */
+
 	for (idx = 0; idx < vinfo->nr_zones; idx++) {
 		const struct sd_node *n = oid_to_node(oid, &vinfo->vroot, idx);
 		if (node_is_local(n))
@@ -521,9 +524,10 @@ static void recover_object_work(struct work *work)
 						     struct recovery_obj_work,
 						     base);
 	uint64_t oid = row->oid;
+	struct vnode_info *cur = rw->cur_vinfo;
 	int ret, epoch;
 
-	if (sd_store->exist(oid)) {
+	if (sd_store->exist(oid, local_node_copy_index(cur, oid))) {
 		sd_debug("the object is already recovered");
 		return;
 	}
@@ -572,11 +576,13 @@ static inline void prepare_schedule_oid(uint64_t oid)
 main_fn bool oid_in_recovery(uint64_t oid)
 {
 	struct recovery_info *rinfo = main_thread_get(current_rinfo);
+	struct vnode_info *cur;
 
 	if (!node_in_recovery())
 		return false;
 
-	if (sd_store->exist(oid)) {
+	cur = rinfo->cur_vinfo;
+	if (sd_store->exist(oid, local_node_copy_index(cur, oid))) {
 		sd_debug("the object %" PRIx64 " is already recoverd", oid);
 		return false;
 	}
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index eb31463..4f44b16 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -209,13 +209,13 @@ struct store_driver {
 	struct list_node list;
 	const char *name;
 	int (*init)(void);
-	bool (*exist)(uint64_t oid);
+	bool (*exist)(uint64_t oid, uint8_t ec_index);
 	/* create_and_write must be an atomic operation*/
 	int (*create_and_write)(uint64_t oid, const struct siocb *);
 	int (*write)(uint64_t oid, const struct siocb *);
 	int (*read)(uint64_t oid, const struct siocb *);
 	int (*format)(void);
-	int (*remove_object)(uint64_t oid);
+	int (*remove_object)(uint64_t oid, uint8_t ec_index);
 	int (*get_hash)(uint64_t oid, uint32_t epoch, uint8_t *sha1);
 	/* Operations in recovery */
 	int (*link)(uint64_t oid, uint32_t tgt_epoch);
@@ -229,7 +229,7 @@ struct store_driver {
 int peer_read_obj(struct request *req);
 
 int default_init(void);
-bool default_exist(uint64_t oid);
+bool default_exist(uint64_t oid, uint8_t ec_index);
 int default_create_and_write(uint64_t oid, const struct siocb *iocb);
 int default_write(uint64_t oid, const struct siocb *iocb);
 int default_read(uint64_t oid, const struct siocb *iocb);
@@ -237,16 +237,18 @@ int default_link(uint64_t oid, uint32_t tgt_epoch);
 int default_update_epoch(uint32_t epoch);
 int default_cleanup(void);
 int default_format(void);
-int default_remove_object(uint64_t oid);
+int default_remove_object(uint64_t oid, uint8_t ec_index);
 int default_get_hash(uint64_t oid, uint32_t epoch, uint8_t *sha1);
 int default_purge_obj(void);
-int for_each_object_in_wd(int (*func)(uint64_t, const char *, uint32_t, void *),
+int for_each_object_in_wd(int (*func)(uint64_t, const char *, uint32_t,
+				      uint8_t, void *),
 			  bool, void *);
 int for_each_object_in_stale(int (*func)(uint64_t oid, const char *path,
-					 uint32_t epoch, void *arg),
+					 uint32_t epoch, uint8_t, void *arg),
 			     void *arg);
 int for_each_obj_path(int (*func)(const char *path));
 size_t get_store_objsize(uint64_t oid);
+int get_store_path(uint64_t oid, uint8_t ec_index, char *path);
 
 extern struct list_head store_drivers;
 #define add_store_driver(driver)				\
@@ -477,15 +479,35 @@ int journal_remove_object(uint64_t oid);
 /* md.c */
 bool md_add_disk(const char *path, bool);
 uint64_t md_init_space(void);
-const char *md_get_object_path(uint64_t oid);
+const char *md_get_object_dir(uint64_t oid);
 int md_handle_eio(const char *);
-bool md_exist(uint64_t oid);
-int md_get_stale_path(uint64_t oid, uint32_t epoch, char *path, size_t size);
+bool md_exist(uint64_t oid, uint8_t ec_index);
+int md_get_stale_path(uint64_t oid, uint32_t epoch, uint8_t ec_index, char *);
 uint32_t md_get_info(struct sd_md_info *info);
 int md_plug_disks(char *disks);
 int md_unplug_disks(char *disks);
 uint64_t md_get_size(uint64_t *used);
 
+static inline bool is_stale_path(const char *path)
+{
+	return !!strstr(path, ".stale");
+}
+
+static inline bool is_stale_dentry(const char *dentry)
+{
+	return !!strstr(dentry, ".");
+}
+
+static inline bool is_tmp_dentry(const char *dentry)
+{
+	return !!strstr(dentry, ".tmp");
+}
+
+static inline bool is_ec_dentry(const char *dentry)
+{
+	return !!strstr(dentry, "_");
+}
+
 /* http.c */
 #ifdef HAVE_HTTP
 int http_init(const char *options);
-- 
1.8.1.2




More information about the sheepdog mailing list