[sheepdog] [PATCH v3 1/9] sheep: introduce generational reference counting for object reclaim

Hitoshi Mitake mitake.hitoshi at gmail.com
Sun Feb 23 06:28:20 CET 2014


Generational reference counting is an algorithm for reclaiming data
efficiently and without race conditions on a distributed system.  This
patch extends the vdi object structure to store generational reference
counts, and increments the counts when creating snapshots.

Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Cc: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 include/sheepdog_proto.h |  6 +++++
 sheep/vdi.c              | 57 ++++++++++++++++++++++++++++++++++++------------
 2 files changed, 49 insertions(+), 14 deletions(-)

diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 1d17e8f..39d57aa 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -211,6 +211,11 @@ struct sd_rsp {
 	};
 };
 
+struct generation_reference {
+	int32_t generation;
+	int32_t count;
+};
+
 struct sd_inode {
 	char name[SD_MAX_VDI_LEN];
 	char tag[SD_MAX_VDI_TAG_LEN];
@@ -229,6 +234,7 @@ struct sd_inode {
 	uint32_t child_vdi_id[MAX_CHILDREN];
 	uint32_t data_vdi_id[SD_INODE_DATA_INDEX];
 	uint32_t btree_counter;
+	struct generation_reference data_ref[SD_INODE_DATA_INDEX];
 };
 
 struct sd_index {
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 7af743e..38e1e9a 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -224,7 +224,8 @@ out:
 
 static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
 				    uint32_t new_snapid, uint32_t new_vid,
-				    uint32_t *data_vdi_id)
+				    uint32_t *data_vdi_id,
+				    struct generation_reference *data_ref)
 {
 	struct sd_inode *new = xzalloc(sizeof(*new));
 	unsigned long block_size = SD_DATA_OBJ_SIZE;
@@ -246,6 +247,11 @@ static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
 	else if (new->store_policy)
 		sd_inode_init(new->data_vdi_id, 1);
 
+	if (data_ref)
+		for (int i = 0; i < SD_INODE_DATA_INDEX; i++)
+			new->data_ref[i].generation =
+				data_ref[i].generation + 1;
+
 	return new;
 }
 
@@ -264,7 +270,8 @@ static int find_free_idx(uint32_t *vdi_id, size_t max_idx)
 static int create_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
 		      uint32_t new_vid)
 {
-	struct sd_inode *new = alloc_inode(iocb, new_snapid, new_vid, NULL);
+	struct sd_inode *new = alloc_inode(iocb, new_snapid, new_vid, NULL,
+					   NULL);
 	int ret;
 
 	sd_debug("%s: size %" PRIu64 ", new_vid %" PRIx32 ", copies %d, "
@@ -323,18 +330,20 @@ static int clone_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
 
 	/* TODO: multiple sd_write_object should be performed atomically */
 
-	/* update a base vdi */
-	ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)&new_vid,
-			      sizeof(new_vid),
-			      offsetof(struct sd_inode, child_vdi_id[idx]),
-			      false);
+	for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++)
+		base->data_ref[i].count++;
+
+	ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base->data_ref,
+			      sizeof(base->data_ref),
+			      offsetof(struct sd_inode, data_ref), false);
 	if (ret != SD_RES_SUCCESS) {
 		ret = SD_RES_BASE_VDI_WRITE;
 		goto out;
 	}
 
 	/* create a new vdi */
-	new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+	new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id,
+			  base->data_ref);
 	ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
 			      sizeof(*new), 0, true);
 	if (ret != SD_RES_SUCCESS)
@@ -390,15 +399,24 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
 	/* update a base vdi */
 	base->snap_ctime = iocb->time;
 	base->child_vdi_id[idx] = new_vid;
+
+	for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++) {
+		base->data_ref[i].generation++;
+		base->data_ref[i].count = 0;
+	}
 	ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base,
-			      SD_INODE_HEADER_SIZE, 0, false);
+			      sizeof(*base), 0, false);
 	if (ret != SD_RES_SUCCESS) {
 		ret = SD_RES_BASE_VDI_WRITE;
 		goto out;
 	}
 
 	/* create a new vdi */
-	new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+	new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id, NULL);
+	for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++) {
+		new->data_ref[i].generation = base->data_ref[i].generation - 1;
+		new->data_ref[i].count = base->data_ref[i].count + 1;
+	}
 	ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
 			      sizeof(*new), 0, true);
 	if (ret != SD_RES_SUCCESS)
@@ -456,10 +474,20 @@ static int rebase_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
 
 	/* TODO: multiple sd_write_object should be performed atomically */
 
+       ret = sd_write_object(vid_to_vdi_oid(cur_vid), (char *)&iocb->time,
+                             sizeof(iocb->time),
+                             offsetof(struct sd_inode, snap_ctime), false);
+	if (ret != SD_RES_SUCCESS) {
+		ret = SD_RES_VDI_WRITE;
+		goto out;
+	}
+
+	for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++)
+		base->data_ref[i].count++;
 	/* update current working vdi */
-	ret = sd_write_object(vid_to_vdi_oid(cur_vid), (char *)&iocb->time,
-			      sizeof(iocb->time),
-			      offsetof(struct sd_inode, snap_ctime), false);
+	ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base->data_ref,
+			      sizeof(base->data_ref),
+			      offsetof(struct sd_inode, data_ref), false);
 	if (ret != SD_RES_SUCCESS) {
 		ret = SD_RES_VDI_WRITE;
 		goto out;
@@ -476,7 +504,8 @@ static int rebase_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
 	}
 
 	/* create a new vdi */
-	new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+	new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id,
+			  base->data_ref);
 	ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
 			      sizeof(*new), 0, true);
 	if (ret != SD_RES_SUCCESS)
-- 
1.8.3.2




More information about the sheepdog mailing list