[sheepdog] [PATCH v5 04/14] sheep: introduce generational reference counting for object reclaim
Hitoshi Mitake
mitake.hitoshi at lab.ntt.co.jp
Tue Mar 4 06:42:48 CET 2014
From: Hitoshi Mitake <mitake.hitoshi at gmail.com>
Generational reference counting is an algorithm for reclaiming data
efficiently and without race conditions in a distributed system. This
patch extends the VDI object structure to store generational reference
counts, and increments the counts when creating snapshots.
Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Cc: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
v5:
- update store version and break compatibility explicitly
- rename data_ref -> gref
v4:
- fix a bug in snapshot_vdi() that stored an invalid number of references
include/sheepdog_proto.h | 6 +++++
sheep/config.c | 2 +-
sheep/migrate.c | 8 +++++++
sheep/vdi.c | 58 +++++++++++++++++++++++++++++++++++-----------
4 files changed, 59 insertions(+), 15 deletions(-)
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 9361bad..9937497 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -212,6 +212,11 @@ struct sd_rsp {
};
};
+struct generation_reference {
+ int32_t generation;
+ int32_t count;
+};
+
struct sd_inode {
char name[SD_MAX_VDI_LEN];
char tag[SD_MAX_VDI_TAG_LEN];
@@ -230,6 +235,7 @@ struct sd_inode {
uint32_t child_vdi_id[MAX_CHILDREN];
uint32_t data_vdi_id[SD_INODE_DATA_INDEX];
uint32_t btree_counter;
+ struct generation_reference gref[SD_INODE_DATA_INDEX];
};
struct sd_index {
diff --git a/sheep/config.c b/sheep/config.c
index 9a0217b..beb6509 100644
--- a/sheep/config.c
+++ b/sheep/config.c
@@ -11,7 +11,7 @@
#include "sheep_priv.h"
-#define SD_FORMAT_VERSION 0x0004
+#define SD_FORMAT_VERSION 0x0005
#define SD_CONFIG_SIZE 40
static struct sheepdog_config {
diff --git a/sheep/migrate.c b/sheep/migrate.c
index 0d69fa1..d46eaf7 100644
--- a/sheep/migrate.c
+++ b/sheep/migrate.c
@@ -496,6 +496,12 @@ static int migrate_from_v3_to_v4(void)
return 0;
}
+static int migrate_from_v4_to_v5(void)
+{
+ panic("not implemented yet");
+ return -1;
+}
+
static int (*migrate[])(void) = {
migrate_from_v0_to_v1, /* from 0.4.0 or 0.5.0 to 0.5.1 */
migrate_from_v1_to_v2, /* from 0.5.x to 0.6.0 */
@@ -506,6 +512,8 @@ static int (*migrate[])(void) = {
* produced by the commit 79706e07a068
*/
migrate_from_v3_to_v4,
+
+ migrate_from_v4_to_v5, /* from v0.8.x to v0.9.y */
};
int sd_migrate_store(int from, int to)
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 7af743e..e3e63d8 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -224,7 +224,8 @@ out:
static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
uint32_t new_snapid, uint32_t new_vid,
- uint32_t *data_vdi_id)
+ uint32_t *data_vdi_id,
+ struct generation_reference *gref)
{
struct sd_inode *new = xzalloc(sizeof(*new));
unsigned long block_size = SD_DATA_OBJ_SIZE;
@@ -246,6 +247,10 @@ static struct sd_inode *alloc_inode(const struct vdi_iocb *iocb,
else if (new->store_policy)
sd_inode_init(new->data_vdi_id, 1);
+ if (gref)
+ for (int i = 0; i < SD_INODE_DATA_INDEX; i++)
+ new->gref[i].generation = gref[i].generation + 1;
+
return new;
}
@@ -264,7 +269,8 @@ static int find_free_idx(uint32_t *vdi_id, size_t max_idx)
static int create_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
uint32_t new_vid)
{
- struct sd_inode *new = alloc_inode(iocb, new_snapid, new_vid, NULL);
+ struct sd_inode *new = alloc_inode(iocb, new_snapid, new_vid, NULL,
+ NULL);
int ret;
sd_debug("%s: size %" PRIu64 ", new_vid %" PRIx32 ", copies %d, "
@@ -323,18 +329,20 @@ static int clone_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
/* TODO: multiple sd_write_object should be performed atomically */
- /* update a base vdi */
- ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)&new_vid,
- sizeof(new_vid),
- offsetof(struct sd_inode, child_vdi_id[idx]),
- false);
+ for (int i = 0; i < ARRAY_SIZE(base->gref); i++)
+ base->gref[i].count++;
+
+ ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base->gref,
+ sizeof(base->gref),
+ offsetof(struct sd_inode, gref), false);
if (ret != SD_RES_SUCCESS) {
ret = SD_RES_BASE_VDI_WRITE;
goto out;
}
/* create a new vdi */
- new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+ new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id,
+ base->gref);
ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
sizeof(*new), 0, true);
if (ret != SD_RES_SUCCESS)
@@ -367,6 +375,7 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
{
struct sd_inode *new = NULL, *base = xzalloc(sizeof(*base));
int ret, idx;
+ uint32_t saved_count[SD_INODE_DATA_INDEX];
sd_debug("%s: size %" PRIu64 ", vid %" PRIx32 ", base %" PRIx32 ", "
"copies %d, snapid %" PRIu32, iocb->name, iocb->size, new_vid,
@@ -390,15 +399,25 @@ static int snapshot_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
/* update a base vdi */
base->snap_ctime = iocb->time;
base->child_vdi_id[idx] = new_vid;
+
+ for (int i = 0; i < ARRAY_SIZE(base->gref); i++) {
+ base->gref[i].generation++;
+ saved_count[i] = base->gref[i].count;
+ base->gref[i].count = 0;
+ }
ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base,
- SD_INODE_HEADER_SIZE, 0, false);
+ sizeof(*base), 0, false);
if (ret != SD_RES_SUCCESS) {
ret = SD_RES_BASE_VDI_WRITE;
goto out;
}
/* create a new vdi */
- new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+ new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id, NULL);
+ for (int i = 0; i < ARRAY_SIZE(base->gref); i++) {
+ new->gref[i].generation = base->gref[i].generation - 1;
+ new->gref[i].count = saved_count[i] + 1;
+ }
ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
sizeof(*new), 0, true);
if (ret != SD_RES_SUCCESS)
@@ -456,10 +475,20 @@ static int rebase_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
/* TODO: multiple sd_write_object should be performed atomically */
+ ret = sd_write_object(vid_to_vdi_oid(cur_vid), (char *)&iocb->time,
+ sizeof(iocb->time),
+ offsetof(struct sd_inode, snap_ctime), false);
+ if (ret != SD_RES_SUCCESS) {
+ ret = SD_RES_VDI_WRITE;
+ goto out;
+ }
+
+ for (int i = 0; i < ARRAY_SIZE(base->gref); i++)
+ base->gref[i].count++;
/* update current working vdi */
- ret = sd_write_object(vid_to_vdi_oid(cur_vid), (char *)&iocb->time,
- sizeof(iocb->time),
- offsetof(struct sd_inode, snap_ctime), false);
+ ret = sd_write_object(vid_to_vdi_oid(base_vid), (char *)base->gref,
+ sizeof(base->gref),
+ offsetof(struct sd_inode, gref), false);
if (ret != SD_RES_SUCCESS) {
ret = SD_RES_VDI_WRITE;
goto out;
@@ -476,7 +505,8 @@ static int rebase_vdi(const struct vdi_iocb *iocb, uint32_t new_snapid,
}
/* create a new vdi */
- new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id);
+ new = alloc_inode(iocb, new_snapid, new_vid, base->data_vdi_id,
+ base->gref);
ret = sd_write_object(vid_to_vdi_oid(new_vid), (char *)new,
sizeof(*new), 0, true);
if (ret != SD_RES_SUCCESS)
--
1.7.10.4
More information about the sheepdog
mailing list