[Sheepdog] [PATCH] sheep: cache virtual nodes to reduce memory usage

MORITA Kazutaka morita.kazutaka at lab.ntt.co.jp
Wed Nov 2 08:04:14 CET 2011


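Each request used to embed its own SD_MAX_VNODES-sized array of virtual
node entries, and the VDI helpers each malloc'ed a private copy of the
list.  Keep a single reference-counted copy of the vnode list per epoch
instead: get_ordered_sd_vnode_list() now returns a pointer into the
shared cache and takes a reference, and callers must drop it with
free_ordered_sd_vnode_list().

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>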
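---
Review note: get_ordered_sd_vnode_list() and free_ordered_sd_vnode_list()
update the cache list and refcnt without any locking, while the comment
in delete_one() says it is called in threads and not serialized with
cpg_event.  Please confirm that concurrent callers are safe.
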
 sheep/group.c      |   73 +++++++++++++++++++++++++++++++++++++++++++----
 sheep/sdnet.c      |    1 +
 sheep/sheep_priv.h |    7 ++--
 sheep/vdi.c        |   80 ++++++++++++++++++++++-----------------------------
 4 files changed, 107 insertions(+), 54 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index 0f0b5f5..bfdadbf 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -122,19 +122,80 @@ int get_zones_nr_from(struct sheepdog_node_list_entry *nodes, int nr_nodes)
 	return nr_zones;
 }
 
-void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
-			       int *nr_vnodes, int *nr_zones)
+struct vnodes_cache {
+	struct sheepdog_vnode_list_entry vnodes[SD_MAX_VNODES];
+	int nr_vnodes;
+	int nr_zones;
+	uint32_t epoch;
+
+	int refcnt;
+	struct list_head list;
+};
+
+int get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry **entries,
+			      int *nr_vnodes, int *nr_zones)
+{
+	static LIST_HEAD(vnodes_list);
+	struct vnodes_cache *cache;
+
+	list_for_each_entry(cache, &vnodes_list, list) {
+		if (cache->epoch == sys->epoch) {
+			*entries = cache->vnodes;
+			*nr_vnodes = cache->nr_vnodes;
+			*nr_zones = cache->nr_zones;
+			cache->refcnt++;
+
+			return SD_RES_SUCCESS;
+		}
+	}
+
+	cache = zalloc(sizeof(*cache));
+	if (!cache) {
+		eprintf("oom\n");
+		*entries = NULL;
+		return SD_RES_NO_MEM;
+	}
+
+	cache->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
+	memcpy(cache->vnodes, sys->vnodes, sizeof(sys->vnodes[0]) * sys->nr_vnodes);
+	cache->nr_vnodes = sys->nr_vnodes;
+	cache->epoch = sys->epoch;
+	cache->refcnt++;
+
+	*entries = cache->vnodes;
+	*nr_vnodes = cache->nr_vnodes;
+	*nr_zones = cache->nr_zones;
+
+	list_add(&cache->list, &vnodes_list);
+
+	return SD_RES_SUCCESS;
+}
+
+void free_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries)
 {
-	*nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
+	struct vnodes_cache *cache;
 
-	memcpy(entries, sys->vnodes, sizeof(*entries) * sys->nr_vnodes);
+	if (!entries)
+		return;
 
-	*nr_vnodes = sys->nr_vnodes;
+	cache = container_of(entries, struct vnodes_cache, vnodes[0]);
+	if (--cache->refcnt == 0) {
+		list_del(&cache->list);
+		free(cache);
+	}
 }
 
 void setup_ordered_sd_vnode_list(struct request *req)
 {
-	get_ordered_sd_vnode_list(req->entry, &req->nr_vnodes, &req->nr_zones);
+	int res;
+
+	if (req->entry)
+		free_ordered_sd_vnode_list(req->entry);
+
+	res = get_ordered_sd_vnode_list(&req->entry, &req->nr_vnodes,
+					&req->nr_zones);
+	if (res != SD_RES_SUCCESS)
+		panic("unrecoverable error\n");
 }
 
 static void do_cluster_op(void *arg)
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index 6114132..21a61ac 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -284,6 +284,7 @@ static void free_request(struct request *req)
 	sys->outstanding_data_size -= req->data_length;
 
 	list_del(&req->r_siblings);
+	free_ordered_sd_vnode_list(req->entry);
 	free(req->data);
 	free(req);
 }
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index a7a278a..8a945ea 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -84,7 +84,7 @@ struct request {
 
 	uint64_t local_oid;
 
-	struct sheepdog_vnode_list_entry entry[SD_MAX_VNODES];
+	struct sheepdog_vnode_list_entry *entry;
 	int nr_vnodes;
 	int nr_zones;
 	int check_consistency;
@@ -180,8 +180,9 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 
 int get_zones_nr_from(struct sheepdog_node_list_entry *nodes, int nr_nodes);
 void setup_ordered_sd_vnode_list(struct request *req);
-void get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries,
-			       int *nr_vnodes, int *nr_zones);
+int get_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry **entries,
+			      int *nr_vnodes, int *nr_zones);
+void free_ordered_sd_vnode_list(struct sheepdog_vnode_list_entry *entries);
 int is_access_to_busy_objects(uint64_t oid);
 int is_access_local(struct sheepdog_vnode_list_entry *e, int nr_nodes,
 		    uint64_t oid, int copies);
diff --git a/sheep/vdi.c b/sheep/vdi.c
index b88c5e3..d783942 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -21,20 +21,13 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 			  uint32_t base_vid, uint32_t cur_vid, uint32_t copies,
 			  uint32_t snapid, int is_snapshot)
 {
-	struct sheepdog_vnode_list_entry *entries;
+	struct sheepdog_vnode_list_entry *entries = NULL;
 	/* we are not called concurrently */
 	struct sheepdog_inode *new = NULL, *base = NULL, *cur = NULL;
 	struct timeval tv;
 	int ret, nr_vnodes, nr_zones;
 	unsigned long block_size = SD_DATA_OBJ_SIZE;
 
-	entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
-	if (!entries) {
-		eprintf("oom\n");
-		ret = SD_RES_NO_MEM;
-		goto out;
-	}
-
 	new = zalloc(sizeof(*new));
 	if (!new) {
 		eprintf("oom\n");
@@ -60,7 +53,9 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 		}
 	}
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
+	ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones);
+	if (ret != SD_RES_SUCCESS)
+		goto out;
 
 	if (base_vid) {
 		ret = read_object(entries, nr_vnodes, nr_zones, epoch,
@@ -149,7 +144,7 @@ static int create_vdi_obj(uint32_t epoch, char *name, uint32_t new_vid, uint64_t
 	if (ret != 0)
 		ret = SD_RES_VDI_WRITE;
 out:
-	free(entries);
+	free_ordered_sd_vnode_list(entries);
 	free(new);
 	free(cur);
 	free(base);
@@ -161,21 +156,22 @@ static int find_first_vdi(uint32_t epoch, unsigned long start, unsigned long end
 			  unsigned long *deleted_nr, uint32_t *next_snap,
 			  unsigned int *nr_copies, uint64_t *ctime)
 {
-	struct sheepdog_vnode_list_entry *entries;
+	struct sheepdog_vnode_list_entry *entries = NULL;
 	struct sheepdog_inode *inode = NULL;
 	unsigned long i;
 	int nr_vnodes, nr_zones, nr_reqs;
 	int ret, vdi_found = 0;
 
-	entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
 	inode = malloc(SD_INODE_HEADER_SIZE);
-	if (!inode || !entries) {
+	if (!inode) {
 		eprintf("oom\n");
 		ret = SD_RES_NO_MEM;
 		goto out;
 	}
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
+	ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones);
+	if (ret != SD_RES_SUCCESS)
+		goto out;
 
 	nr_reqs = sys->nr_sobjs;
 	if (nr_reqs > nr_zones)
@@ -219,7 +215,7 @@ static int find_first_vdi(uint32_t epoch, unsigned long start, unsigned long end
 		ret = SD_RES_NO_VDI;
 out:
 	free(inode);
-	free(entries);
+	free_ordered_sd_vnode_list(entries);
 
 	return ret;
 }
@@ -347,13 +343,12 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 	uint32_t dummy0;
 	unsigned long dummy1, dummy2;
 	int ret;
-	struct sheepdog_vnode_list_entry *entries;
+	struct sheepdog_vnode_list_entry *entries = NULL;
 	int nr_vnodes, nr_zones, nr_reqs;
 	struct sheepdog_inode *inode = NULL;
 
 	inode = malloc(SD_INODE_HEADER_SIZE);
-	entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
-	if (!inode || !entries) {
+	if (!inode) {
 		eprintf("oom\n");
 		ret = SD_RES_NO_MEM;
 		goto out;
@@ -373,7 +368,10 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 	if (ret != SD_RES_SUCCESS)
 		goto out;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
+	ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones);
+	if (ret != SD_RES_SUCCESS)
+		goto out;
+
 	nr_reqs = sys->nr_sobjs;
 	if (nr_reqs > nr_zones)
 		nr_reqs = nr_zones;
@@ -399,7 +397,7 @@ int del_vdi(uint32_t epoch, char *data, int data_len, uint32_t *vid,
 	ret = start_deletion(*vid, epoch);
 out:
 	free(inode);
-	free(entries);
+	free_ordered_sd_vnode_list(entries);
 
 	return ret;
 }
@@ -434,16 +432,15 @@ static void delete_one(struct work *work, int idx)
 {
 	struct deletion_work *dw = container_of(work, struct deletion_work, work);
 	uint32_t vdi_id = *(((uint32_t *)dw->buf) + dw->count - dw->done - 1);
-	struct sheepdog_vnode_list_entry *entries;
+	struct sheepdog_vnode_list_entry *entries = NULL;
 	int nr_vnodes, nr_zones;
 	int ret, i;
 	struct sheepdog_inode *inode = NULL;
 
 	eprintf("%d %d, %16x\n", dw->done, dw->count, vdi_id);
 
-	entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
 	inode = malloc(sizeof(*inode));
-	if (!inode || !entries) {
+	if (!inode) {
 		eprintf("oom\n");
 		goto out;
 	}
@@ -453,7 +450,9 @@ static void delete_one(struct work *work, int idx)
 	 * is called in threads and not serialized with cpg_event so
 	 * we can't access to epoch and sd_node_list safely.
 	 */
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
+	ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones);
+	if (ret != SD_RES_SUCCESS)
+		goto out;
 
 	ret = read_object(entries, nr_vnodes, nr_zones, dw->epoch,
 			  vid_to_vdi_oid(vdi_id), (void *)inode, sizeof(*inode),
@@ -473,7 +472,7 @@ static void delete_one(struct work *work, int idx)
 			      inode->nr_copies);
 	}
 out:
-	free(entries);
+	free_ordered_sd_vnode_list(entries);
 	free(inode);
 }
 
@@ -586,16 +585,10 @@ out:
 int start_deletion(uint32_t vid, uint32_t epoch)
 {
 	struct deletion_work *dw = NULL;
-	struct sheepdog_vnode_list_entry *entries;
+	struct sheepdog_vnode_list_entry *entries = NULL;
 	int nr_vnodes, nr_zones, ret;
 	uint32_t root_vid;
 
-	entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
-	if (!entries) {
-		eprintf("oom\n");
-		ret = SD_RES_NO_MEM;
-		goto err;
-	}
 	dw = zalloc(sizeof(struct deletion_work));
 	if (!dw) {
 		ret = SD_RES_NO_MEM;
@@ -615,7 +608,9 @@ int start_deletion(uint32_t vid, uint32_t epoch)
 	dw->work.fn = delete_one;
 	dw->work.done = delete_one_done;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
+	ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones);
+	if (ret != SD_RES_SUCCESS)
+		goto err;
 
 	root_vid = get_vdi_root(entries, nr_vnodes, nr_zones, dw->epoch, dw->vid);
 	if (!root_vid) {
@@ -640,11 +635,11 @@ int start_deletion(uint32_t vid, uint32_t epoch)
 	list_add_tail(&dw->dw_siblings, &deletion_work_list);
 	queue_work(sys->deletion_wqueue, &dw->work);
 out:
-	free(entries);
+	free_ordered_sd_vnode_list(entries);
 
 	return SD_RES_SUCCESS;
 err:
-	free(entries);
+	free_ordered_sd_vnode_list(entries);
 	if (dw)
 		free(dw->buf);
 	free(dw);
@@ -656,25 +651,20 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 		 uint32_t vid, uint32_t *attrid, int copies, uint64_t ctime,
 		 int write, int excl, int delete)
 {
-	struct sheepdog_vnode_list_entry *entries;
+	struct sheepdog_vnode_list_entry *entries = NULL;
 	struct sheepdog_vdi_attr tmp_attr;
 	uint64_t oid;
 	uint32_t end;
 	int ret, nr_zones, nr_vnodes;
 	int value_len;
 
-	entries = malloc(sizeof(*entries) * SD_MAX_VNODES);
-	if (!entries) {
-		eprintf("oom\n");
-		ret = SD_RES_NO_MEM;
-		goto out;
-	}
-
 	value_len = data_len - SD_ATTR_HEADER_SIZE;
 
 	vattr->ctime = ctime;
 
-	get_ordered_sd_vnode_list(entries, &nr_vnodes, &nr_zones);
+	ret = get_ordered_sd_vnode_list(&entries, &nr_vnodes, &nr_zones);
+	if (ret != SD_RES_SUCCESS)
+		goto out;
 
 	*attrid = fnv_64a_buf(vattr, SD_ATTR_HEADER_SIZE, FNV1A_64_INIT);
 	*attrid &= (UINT64_C(1) << VDI_SPACE_SHIFT) - 1;
@@ -731,7 +721,7 @@ int get_vdi_attr(uint32_t epoch, struct sheepdog_vdi_attr *vattr, int data_len,
 	dprintf("there is no space for new vdis\n");
 	ret = SD_RES_FULL_VDI;
 out:
-	free(entries);
+	free_ordered_sd_vnode_list(entries);
 
 	return ret;
 }
-- 
1.7.2.5




More information about the sheepdog mailing list