[sheepdog] [PATCH v4 1/9] sheep: introduce generational reference counting for object reclaim
Hitoshi Mitake
mitake.hitoshi at gmail.com
Sun Feb 23 16:13:50 CET 2014
Generational reference counting is an algorithm for reclaiming data
efficiently and without race conditions in a distributed system. This
patch extends the vdi object structure to store generational reference
counts, and increments the counts when creating snapshots.
Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Cc: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
v4:
- fix a bug in snapshot_vdi() that stored an invalid number of references
include/sheepdog_proto.h | 6 +++++
sheep/vdi.c | 59 ++++++++++++++++++++++++++++++++++++------------
2 files changed, 51 insertions(+), 14 deletions(-)
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 1d17e8f..39d57aa 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -211,6 +211,11 @@ struct sd_rsp {
};
};
+struct generation_reference {
+ int32_t generation;
+ int32_t count;
+};
+
struct sd_inode {
char name[SD_MAX_VDI_LEN];
char tag[SD_MAX_VDI_TAG_LEN];
@@ -229,6 +234,7 @@ struct sd_inode {
uint32_t child_vdi_id[MAX_CHILDREN];
uint32_t data_vdi_id[SD_INODE_DATA_INDEX];
uint32_t btree_counter;
+ struct generation_reference data_ref[SD_INODE_DATA_INDEX];
};
struct sd_index {
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 7af743e..751ed9f 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -224,7 +224,8 @@ out:
static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
uint32_t new_snapid, uint32_t new_vid,
- uint32_t *data_vdi_id)
+ uint32_t *data_vdi_id,
+ struct generation_reference *data_ref)
{
struct sd_inode *new = xzalloc(sizeof(*new));
unsigned long block_size = SD_DATA_OBJ_SIZE;
@@ -246,6 +247,11 @@ static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
else if (new->store_policy)
sd_inode_init(new->data_vdi_id, 1);
+ if (data_ref)
+ for (int i = 0; i < SD_INODE_DATA_INDEX; i++)
+ new->data_ref[i].generation =
+ data_ref[i].generation + 1;
+
return new;
}
@@ -264,7 +270,8 @@ static int find_free_idx(uint32_t *vdi_id, size_t max_idx)
static int create_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
uint32_t new_vid)
{
- struct sd_inode *new = alloc_inode(iocb, new_snapid, new_vid, NULL);
+ struct sd_inode *new = alloc_inode(iocb, new_snapid, new_vid, NULL,
+ NULL);
int ret;
sd_debug("%s: size %" PRIu64 ", new_vid %" PRIx32 ", copies %d, "
@@ -323,18 +330,20 @@ static int clone_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
/* TODO: multiple sd_write_object should be performed atomically */
- /* update a base vdi */
- ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)&new_vid,
- sizeof(new_vid),
- offsetof(struct sd_inode, child_vdi_id[idx]),
- false);
+ for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++)
+ base->data_ref[i].count++;
+
+ ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base->data_ref,
+ sizeof(base->data_ref),
+ offsetof(struct sd_inode, data_ref), false);
if (ret != SD_RES_SUCCESS) {
ret = SD_RES_BASE_VDI_WRITE;
goto out;
}
/* create a new vdi */
- new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+ new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id,
+ base->data_ref);
ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
sizeof(*new), 0, true);
if (ret != SD_RES_SUCCESS)
@@ -367,6 +376,7 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
{
struct sd_inode *new = NULL, *base = xzalloc(sizeof(*base));
int ret, idx;
+ uint32_t saved_count[SD_INODE_DATA_INDEX];
sd_debug("%s: size %" PRIu64 ", vid %" PRIx32 ", base %" PRIx32 ", "
"copies %d, snapid %" PRIu32, iocb->name, iocb->size, new_vid,
@@ -390,15 +400,25 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
/* update a base vdi */
base->snap_ctime = iocb->time;
base->child_vdi_id[idx] = new_vid;
+
+ for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++) {
+ base->data_ref[i].generation++;
+ saved_count[i] = base->data_ref[i].count;
+ base->data_ref[i].count = 0;
+ }
ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base,
- SD_INODE_HEADER_SIZE, 0, false);
+ sizeof(*base), 0, false);
if (ret != SD_RES_SUCCESS) {
ret = SD_RES_BASE_VDI_WRITE;
goto out;
}
/* create a new vdi */
- new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+ new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id, NULL);
+ for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++) {
+ new->data_ref[i].generation = base->data_ref[i].generation - 1;
+ new->data_ref[i].count = saved_count[i] + 1;
+ }
ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
sizeof(*new), 0, true);
if (ret != SD_RES_SUCCESS)
@@ -456,10 +476,20 @@ static int rebase_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
/* TODO: multiple sd_write_object should be performed atomically */
+ ret = sd_write_object(vid_to_vdi_oid(cur_vid), (char *)&iocb->time,
+ sizeof(iocb->time),
+ offsetof(struct sd_inode, snap_ctime), false);
+ if (ret != SD_RES_SUCCESS) {
+ ret = SD_RES_VDI_WRITE;
+ goto out;
+ }
+
+ for (int i = 0; i < ARRAY_SIZE(base->data_ref); i++)
+ base->data_ref[i].count++;
/* update current working vdi */
- ret = sd_write_object(vid_to_vdi_oid(cur_vid), (char *)&iocb->time,
- sizeof(iocb->time),
- offsetof(struct sd_inode, snap_ctime), false);
+ ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base->data_ref,
+ sizeof(base->data_ref),
+ offsetof(struct sd_inode, data_ref), false);
if (ret != SD_RES_SUCCESS) {
ret = SD_RES_VDI_WRITE;
goto out;
@@ -476,7 +506,8 @@ static int rebase_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
}
/* create a new vdi */
- new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+ new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id,
+ base->data_ref);
ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
sizeof(*new), 0, true);
if (ret != SD_RES_SUCCESS)
--
1.8.3.2
More information about the sheepdog
mailing list