[Sheepdog] [PATCH 2/2] object cache: cache create operation

Liu Yuan namei.unix at gmail.com
Fri Mar 30 05:18:31 CEST 2012


From: Liu Yuan <tailai.ly at taobao.com>

Currently, for the create operation, we write through the cache. This operation
can be slow and may return errors, especially during the node change phase.

It is cleaner to let the gateway talk only to the object cache, and rely on the
object cache layer to pull and push objects from/to the sheep cluster.

With all gateway requests operating on the object cache to boost IO performance,
we can do interesting tricks with Farm, such as data de-duplication, which trades
CPU cycles for disk space without hurting IO performance.

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 sheep/farm/farm.c    |    7 +++--
 sheep/object_cache.c |   65 ++++++++++++++++++++++++++++++++-----------------
 sheep/sheep_priv.h   |    5 +++-
 sheep/store.c        |   25 ++++++++++--------
 4 files changed, 64 insertions(+), 38 deletions(-)

diff --git a/sheep/farm/farm.c b/sheep/farm/farm.c
index 1e073f0..f1d87ad 100644
--- a/sheep/farm/farm.c
+++ b/sheep/farm/farm.c
@@ -127,13 +127,14 @@ static int err_to_sderr(uint64_t oid, int err)
 /*
  * Preallocate the whole object to get a better filesystem layout.
  */
-static int prealloc(int fd, uint32_t size)
+int prealloc(int fd, uint32_t size)
 {
 	int ret = fallocate(fd, 0, 0, size);
 	if (ret < 0) {
-		if (errno != ENOSYS && errno != EOPNOTSUPP)
+		if (errno != ENOSYS && errno != EOPNOTSUPP) {
+			dprintf("%m\n");
 			ret = SD_RES_SYSTEM_ERROR;
-		else
+		} else
 			ret = write_last_sector(fd, size);
 	} else
 		ret = SD_RES_SUCCESS;
diff --git a/sheep/object_cache.c b/sheep/object_cache.c
index b1b30d7..ef2c05c 100644
--- a/sheep/object_cache.c
+++ b/sheep/object_cache.c
@@ -137,20 +137,52 @@ out:
 	return cache;
 }
 
-int object_cache_lookup(struct object_cache *oc, uint32_t idx)
+static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx, int create)
+{
+	struct object_cache_entry *entry = xzalloc(sizeof(*entry));
+
+	entry->idx = idx;
+	pthread_mutex_lock(&oc->lock);
+	if (!dirty_tree_insert(&oc->dirty_rb, entry)) {
+		if (create)
+			entry->create = 1;
+		list_add(&entry->list, &oc->dirty_list);
+	} else
+		free(entry);
+	pthread_mutex_unlock(&oc->lock);
+}
+
+int object_cache_lookup(struct object_cache *oc, uint32_t idx, int create)
 {
 	struct strbuf buf;
-	int fd, ret = 0;
+	int fd, ret = 0, flags = def_open_flags;
 
 	strbuf_init(&buf, PATH_MAX);
 	strbuf_addstr(&buf, cache_dir);
 	strbuf_addf(&buf, "/%06"PRIx32"/%08"PRIx32, oc->vid, idx);
 
-	fd = open(buf.buf, def_open_flags, def_fmode);
+	if (create)
+		flags |= O_CREAT | O_TRUNC;
+
+	fd = open(buf.buf, flags, def_fmode);
 	if (fd < 0) {
 		ret = -1;
 		goto out;
 	}
+
+	if (create) {
+		unsigned data_length;
+		uint64_t oid = oc->oid;
+		if (is_vdi_obj(oid))
+			data_length = SD_INODE_SIZE;
+		else
+			data_length = SD_DATA_OBJ_SIZE;
+		ret = prealloc(fd, data_length);
+		if (ret != SD_RES_SUCCESS)
+			ret = -1;
+		else
+			add_to_dirty_tree_and_list(oc, idx, 1);
+	}
 	close(fd);
 out:
 	strbuf_release(&buf);
@@ -195,19 +227,6 @@ static int read_cache_object(uint32_t vid, uint32_t idx, void *buf, size_t count
 	return ret;
 }
 
-static void add_to_dirty_tree_and_list(struct object_cache *oc, uint32_t idx)
-{
-	struct object_cache_entry *entry = xzalloc(sizeof(*entry));
-
-	entry->idx = idx;
-	pthread_mutex_lock(&oc->lock);
-	if (!dirty_tree_insert(&oc->dirty_rb, entry))
-		list_add(&entry->list, &oc->dirty_list);
-	else
-		free(entry);
-	pthread_mutex_unlock(&oc->lock);
-}
-
 int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
 {
 	struct sd_obj_req *hdr = (struct sd_obj_req *)&req->rq;
@@ -219,7 +238,7 @@ int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *req)
 		ret = write_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
 		if (ret != SD_RES_SUCCESS)
 			goto out;
-		add_to_dirty_tree_and_list(oc, idx);
+		add_to_dirty_tree_and_list(oc, idx, 0);
 	} else {
 		ret = read_cache_object(oc->vid, idx, req->data, hdr->data_length, hdr->offset);
 		if (ret != SD_RES_SUCCESS)
@@ -353,7 +372,7 @@ static uint64_t idx_to_oid(uint32_t vid, uint32_t idx)
 		return vid_to_data_oid(vid, idx);
 }
 
-static int push_cache_object(uint32_t vid, uint32_t idx)
+static int push_cache_object(uint32_t vid, uint32_t idx, int create)
 {
 	struct request fake_req;
 	struct sd_obj_req *hdr = (struct sd_obj_req *)&fake_req.rq;
@@ -362,7 +381,7 @@ static int push_cache_object(uint32_t vid, uint32_t idx)
 	int ret = SD_RES_NO_MEM;
 	uint64_t oid = idx_to_oid(vid, idx);
 
-	dprintf("%"PRIx64"\n", oid);
+	dprintf("%"PRIx64", create %d\n", oid, create);
 
 	memset(&fake_req, 0, sizeof(fake_req));
 	if (is_vdi_obj(oid))
@@ -382,13 +401,13 @@ static int push_cache_object(uint32_t vid, uint32_t idx)
 
 	hdr->offset = 0;
 	hdr->data_length = data_length;
-	hdr->opcode = SD_OP_WRITE_OBJ;
+	hdr->opcode = create ? SD_OP_CREATE_AND_WRITE_OBJ : SD_OP_WRITE_OBJ;
 	hdr->flags = SD_FLAG_CMD_WRITE;
 	hdr->oid = oid;
 	hdr->copies = sys->nr_sobjs;
 	hdr->epoch = sys->epoch;
 	fake_req.data = buf;
-	fake_req.op = get_sd_op(SD_OP_WRITE_OBJ);
+	fake_req.op = get_sd_op(hdr->opcode);
 	fake_req.entry = sys->vnodes;
 	fake_req.nr_vnodes = sys->nr_vnodes;
 	fake_req.nr_zones = get_zones_nr_from(sys->nodes, sys->nr_vnodes);
@@ -414,7 +433,7 @@ int object_cache_push(struct object_cache *oc)
 		return SD_RES_SUCCESS;
 
 	list_for_each_entry_safe(entry, t, &oc->dirty_list, list) {
-		ret = push_cache_object(oc->vid, entry->idx);
+		ret = push_cache_object(oc->vid, entry->idx, entry->create);
 		if (ret != SD_RES_SUCCESS)
 			goto out;
 		pthread_mutex_lock(&oc->lock);
@@ -441,7 +460,7 @@ int object_is_cached(uint64_t oid)
 		return 0;
 
 	cache->oid = oid;
-	if (object_cache_lookup(cache, idx) < 0)
+	if (object_cache_lookup(cache, idx, 0) < 0)
 		return 0;
 	else
 		return 1; /* found it */
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index f2ce87e..1b82389 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -305,6 +305,8 @@ int get_sheep_fd(uint8_t *addr, uint16_t port, int node_idx, uint32_t epoch);
 
 int rmdir_r(char *dir_path);
 
+int prealloc(int fd, uint32_t size);
+
 /* Operations */
 
 struct sd_op_template *get_sd_op(uint8_t opcode);
@@ -414,10 +416,11 @@ struct object_cache_entry {
 	uint32_t idx;
 	struct rb_node rb;
 	struct list_head list;
+	int create;
 };
 
 struct object_cache *find_object_cache(uint32_t vid, int create);
-int object_cache_lookup(struct object_cache *oc, uint32_t index);
+int object_cache_lookup(struct object_cache *oc, uint32_t index, int create);
 int object_cache_rw(struct object_cache *oc, uint32_t idx, struct request *);
 int object_cache_pull(struct object_cache *oc, uint32_t index);
 int object_cache_push(struct object_cache *oc);
diff --git a/sheep/store.c b/sheep/store.c
index 4d84350..548621a 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -765,14 +765,18 @@ static int handle_gateway_request(struct request *req)
 	uint32_t vid = oid_to_vid(oid);
 	uint32_t idx = data_oid_to_idx(oid);
 	struct object_cache *cache;
-	int ret;
+	int ret, create = 0;
 
 	if (is_vdi_obj(oid))
 		idx |= 1 << CACHE_VDI_SHIFT;
 
 	cache = find_object_cache(vid, 1);
 	cache->oid = oid;
-	if (object_cache_lookup(cache, idx) < 0) {
+
+	if (hdr->opcode == SD_OP_CREATE_AND_WRITE_OBJ)
+		create = 1;
+
+	if (object_cache_lookup(cache, idx, create) < 0) {
 		ret = object_cache_pull(cache, idx);
 		if (ret != SD_RES_SUCCESS)
 			return ret;
@@ -795,11 +799,10 @@ static int bypass_object_cache(struct sd_obj_req *hdr)
 	if (!(hdr->flags & SD_FLAG_CMD_CACHE))
 		return 1;
 
-	/* For create, we skip the cache because of consistency check.
+	/*
 	 * For vmstate && vdi_attr object, we don't do caching
 	 */
-	if (hdr->opcode == SD_OP_CREATE_AND_WRITE_OBJ || is_vmstate_obj(oid)
-			|| is_vdi_attr_obj(oid))
+	if (is_vmstate_obj(oid) || is_vdi_attr_obj(oid))
 		return 1;
 	return 0;
 }
@@ -822,13 +825,13 @@ void do_io_request(struct work *work)
 	if (hdr->flags & SD_FLAG_CMD_IO_LOCAL) {
 		ret = do_local_io(req, epoch);
 	} else {
-		/* fix object consistency when we read the object for the first time */
-		if (req->check_consistency) {
-			ret = fix_object_consistency(req);
-			if (ret != SD_RES_SUCCESS)
-				goto out;
-		}
 		if (bypass_object_cache(hdr)) {
+			/* fix object consistency when we read the object for the first time */
+			if (req->check_consistency) {
+				ret = fix_object_consistency(req);
+				if (ret != SD_RES_SUCCESS)
+					goto out;
+			}
 			if (hdr->flags & SD_FLAG_CMD_WRITE)
 				ret = forward_write_obj_req(req);
 			else
-- 
1.7.8.2




More information about the sheepdog mailing list