[sheepdog] [PATCH v6 05/17] sheep: introduce ledger objects
Hitoshi Mitake
mitake.hitoshi at gmail.com
Wed Mar 5 16:28:46 CET 2014
This introduces ledger objects, which keep track of the number of
outstanding references of each generation. Sheep decrements a
generational reference count with a gateway request SD_OP_DECREF_OBJ,
and reclaims objects when no generational references remain.
Cc: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
Cc: Valerio Pachera <sirio81 at gmail.com>
Cc: Alessandro Bolgia <alessandro at extensys.it>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
v5:
- use sd_mutex
include/internal_proto.h | 2 ++
include/sheepdog_proto.h | 28 ++++++++++++++-
sheep/gateway.c | 6 ++++
sheep/ops.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++
sheep/sheep_priv.h | 5 +++
sheep/store.c | 25 ++++++++++++++
6 files changed, 155 insertions(+), 1 deletion(-)
diff --git a/include/internal_proto.h b/include/internal_proto.h
index ace4ac5..5bcfacb 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -100,6 +100,8 @@
#define SD_OP_NFS_CREATE 0xBB
#define SD_OP_NFS_DELETE 0xBC
#define SD_OP_EXIST 0xBD
+#define SD_OP_DECREF_OBJ 0xBE
+#define SD_OP_DECREF_PEER 0xBF
/* internal flags for hdr.flags, must be above 0x80 */
#define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index 9937497..c3b935f 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -93,6 +93,7 @@
#define VMSTATE_BIT (UINT64_C(1) << 62)
#define VDI_ATTR_BIT (UINT64_C(1) << 61)
#define VDI_BTREE_BIT (UINT64_C(1) << 60)
+#define LEDGER_BIT (UINT64_C(1) << 59)
#define OLD_MAX_DATA_OBJS (1ULL << 20)
#define MAX_DATA_OBJS (1ULL << 32)
#define MAX_CHILDREN 1024U
@@ -112,6 +113,7 @@
#define SD_INODE_DATA_INDEX_SIZE (sizeof(uint32_t) * SD_INODE_DATA_INDEX)
#define SD_INODE_HEADER_SIZE offsetof(struct sd_inode, data_vdi_id)
#define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr))
+#define SD_LEDGER_OBJ_SIZE (UINT64_C(1) << 22)
#define CURRENT_VDI_ID 0
#define STORE_LEN 16
@@ -165,6 +167,11 @@ struct sd_req {
/* others mean true */
uint8_t copy_policy;
} vdi_state;
+ struct {
+ uint64_t oid;
+ uint32_t generation;
+ uint32_t count;
+ } ref;
uint32_t __pad[8];
};
@@ -402,10 +409,16 @@ static inline bool is_vdi_btree_obj(uint64_t oid)
return !!(oid & VDI_BTREE_BIT);
}
+/* Tell whether LEDGER_BIT is set in oid, i.e. whether oid names a ledger object. */
+static inline bool is_ledger_object(uint64_t oid)
+{
+	return (oid & LEDGER_BIT) != 0;
+}
+
static inline bool is_data_obj(uint64_t oid)
{
return !is_vdi_obj(oid) && !is_vmstate_obj(oid) &&
- !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid);
+ !is_vdi_attr_obj(oid) && !is_vdi_btree_obj(oid) &&
+ !is_ledger_object(oid);
}
static inline size_t count_data_objs(const struct sd_inode *inode)
@@ -424,6 +437,9 @@ static inline size_t get_objsize(uint64_t oid)
if (is_vdi_btree_obj(oid))
return SD_INODE_DATA_INDEX_SIZE;
+ if (is_ledger_object(oid))
+ return SD_LEDGER_OBJ_SIZE;
+
return SD_DATA_OBJ_SIZE;
}
@@ -474,4 +490,14 @@ static inline __attribute__((used)) void __sd_proto_build_bug_ons(void)
BUILD_BUG_ON(sizeof(struct sd_rsp) != SD_RSP_SIZE);
}
+/* Clear LEDGER_BIT from a ledger object ID to get the ID of the data object. */
+static inline uint64_t ledger_oid_to_data_oid(uint64_t oid)
+{
+	return oid & ~LEDGER_BIT;
+}
+
+/* Set LEDGER_BIT on a data object ID to get the ID of its ledger object. */
+static inline uint64_t data_oid_to_ledger_oid(uint64_t oid)
+{
+	return oid | LEDGER_BIT;
+}
+
#endif
diff --git a/sheep/gateway.c b/sheep/gateway.c
index aea4495..073e650 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -182,6 +182,7 @@ out:
bool is_erasure_oid(uint64_t oid)
{
return !is_vdi_obj(oid) && !is_vdi_btree_obj(oid) &&
+ !is_ledger_object(oid) &&
get_vdi_copy_policy(oid_to_vid(oid)) > 0;
}
@@ -624,3 +625,8 @@ int gateway_remove_obj(struct request *req)
{
return gateway_forward_request(req);
}
+
+/*
+ * Gateway-side handler registered for SD_OP_DECREF_OBJ: the request is
+ * passed on unchanged through gateway_forward_request() and that call's
+ * result is returned to the caller.
+ */
+int gateway_decref_object(struct request *req)
+{
+	int ret = gateway_forward_request(req);
+
+	return ret;
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index ca00a18..deb11fb 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1054,6 +1054,83 @@ static inline int local_nfs_delete(struct request *req)
#endif
+/*
+ * Peer-side handler registered for SD_OP_DECREF_PEER.
+ *
+ * hdr->ref.oid names a ledger object, which is handled here as an array of
+ * uint32_t counters indexed by generation.  The request decrements the
+ * counter of hdr->ref.generation by one and adds hdr->ref.count to the
+ * counter of the following generation.  A ledger that does not exist yet
+ * is treated as all zeroes except for ledger[0] == 1.
+ *
+ * When every counter is zero after the update, the ledger object (if it
+ * was stored) is removed and then the data object it belongs to is removed
+ * with sd_remove_object().  Otherwise the updated ledger is written back.
+ *
+ * Returns SD_RES_INVALID_PARMS when the generation does not fit in a ledger
+ * object; otherwise the result of the last store / sd_remove_object() call.
+ */
+int peer_decref_object(struct request *req)
+{
+	struct sd_req *hdr = &req->rq;
+	int ret;
+	uint32_t epoch = hdr->epoch;
+	uint64_t ledger_oid = hdr->ref.oid;
+	uint64_t data_oid = ledger_oid_to_data_oid(ledger_oid);
+	uint32_t generation = hdr->ref.generation;
+	uint32_t count = hdr->ref.count;
+	uint32_t *ledger = NULL;
+	const size_t nr_entries = SD_LEDGER_OBJ_SIZE / sizeof(*ledger);
+	size_t i;
+	bool exist = false, locked = false, empty = true;
+	static struct sd_mutex lock = SD_MUTEX_INITIALIZER;
+
+	sd_debug("%" PRIx64 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32,
+		 ledger_oid, epoch, generation, count);
+
+	/*
+	 * The generation comes from the request header.  Both
+	 * ledger[generation] and ledger[generation + 1] are written below,
+	 * so reject anything that would index past the ledger buffer.
+	 */
+	if (generation >= nr_entries - 1) {
+		sd_err("invalid generation %" PRIu32 " for ledger %" PRIx64,
+		       generation, ledger_oid);
+		return SD_RES_INVALID_PARMS;
+	}
+
+	ledger = xvalloc(SD_LEDGER_OBJ_SIZE);
+	memset(ledger, 0, SD_LEDGER_OBJ_SIZE);
+
+	struct siocb iocb = {
+		.epoch = epoch,
+		.buf = ledger,
+		.length = SD_LEDGER_OBJ_SIZE,
+	};
+
+	/* we don't allow concurrent updates to the ledger objects */
+	sd_mutex_lock(&lock);
+	locked = true;
+
+	ret = sd_store->read(ledger_oid, &iocb);
+	switch (ret) {
+	case SD_RES_SUCCESS:
+		exist = true;
+		break;
+	case SD_RES_NO_OBJ:
+		/* initialize ledger */
+		ledger[0] = 1;
+		break;
+	default:
+		goto out;
+	}
+
+	ledger[generation]--;
+	ledger[generation + 1] += count;
+
+	/* scan the counters in place instead of comparing with a zero buffer */
+	for (i = 0; i < nr_entries; i++) {
+		if (ledger[i] != 0) {
+			empty = false;
+			break;
+		}
+	}
+
+	if (empty) {
+		/* reclaim object */
+		if (exist) {
+			ret = sd_store->remove_object(ledger_oid, -1);
+			if (ret != SD_RES_SUCCESS) {
+				sd_err("error %s", sd_strerror(ret));
+				goto out;
+			}
+		}
+		/* the ledger is gone; the lock is released before removing the data object */
+		sd_mutex_unlock(&lock);
+		locked = false;
+
+		ret = sd_remove_object(data_oid);
+		if (ret != SD_RES_SUCCESS) {
+			sd_err("error %s", sd_strerror(ret));
+			goto out;
+		}
+	} else {
+		/* update ledger */
+		if (exist)
+			ret = sd_store->write(ledger_oid, &iocb);
+		else
+			ret = sd_store->create_and_write(ledger_oid, &iocb);
+	}
+out:
+	if (locked)
+		sd_mutex_unlock(&lock);
+	free(ledger);
+	return ret;
+}
+
static struct sd_op_template sd_ops[] = {
/* cluster operations */
@@ -1397,6 +1474,12 @@ static struct sd_op_template sd_ops[] = {
.process_work = gateway_remove_obj,
},
+ [SD_OP_DECREF_OBJ] = {
+ .name = "DECREF_OBJ",
+ .type = SD_OP_TYPE_GATEWAY,
+ .process_work = gateway_decref_object,
+ },
+
/* peer I/O operations */
[SD_OP_CREATE_AND_WRITE_PEER] = {
.name = "CREATE_AND_WRITE_PEER",
@@ -1421,6 +1504,12 @@ static struct sd_op_template sd_ops[] = {
.type = SD_OP_TYPE_PEER,
.process_work = peer_remove_obj,
},
+
+ [SD_OP_DECREF_PEER] = {
+ .name = "DECREF_PEER",
+ .type = SD_OP_TYPE_PEER,
+ .process_work = peer_decref_object,
+ },
};
const struct sd_op_template *get_sd_op(uint8_t opcode)
@@ -1509,6 +1598,7 @@ static int map_table[] = {
[SD_OP_READ_OBJ] = SD_OP_READ_PEER,
[SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
[SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
+ [SD_OP_DECREF_OBJ] = SD_OP_DECREF_PEER,
};
int gateway_to_peer_opcode(int opcode)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 3737f5a..31026d2 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -226,6 +226,7 @@ struct store_driver {
/* backend store */
int peer_read_obj(struct request *req);
+int peer_decref_object(struct request *req);
int default_init(void);
bool default_exist(uint64_t oid, uint8_t ec_index);
@@ -380,6 +381,8 @@ int sd_read_object(uint64_t oid, char *data, unsigned int datalen,
uint64_t offset);
int sd_remove_object(uint64_t oid);
int sd_discard_object(uint64_t oid);
+int sd_dec_object_refcnt(uint64_t data_oid, uint32_t generation,
+ uint32_t refcnt);
struct request_iocb *local_req_init(void);
int exec_local_req(struct sd_req *rq, void *data);
@@ -435,6 +438,8 @@ int gateway_read_obj(struct request *req);
int gateway_write_obj(struct request *req);
int gateway_create_and_write_obj(struct request *req);
int gateway_remove_obj(struct request *req);
+int gateway_decref_object(struct request *req);
+
bool is_erasure_oid(uint64_t oid);
uint8_t local_ec_index(struct vnode_info *vinfo, uint64_t oid);
diff --git a/sheep/store.c b/sheep/store.c
index 2d5aa32..722b55e 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -468,3 +468,28 @@ int sd_discard_object(uint64_t oid)
return ret;
}
+
+/*
+ * Issue a local SD_OP_DECREF_OBJ request for the ledger object of data_oid.
+ *
+ * The request carries the ledger object ID (data_oid with LEDGER_BIT set),
+ * the generation and refcnt as hdr.ref.count.  As a shortcut, when both
+ * generation and refcnt are zero no request is sent and the data object is
+ * removed directly with sd_remove_object().
+ *
+ * Returns the result of sd_remove_object() or exec_local_req(); a failed
+ * request is also logged.
+ */
+int sd_dec_object_refcnt(uint64_t data_oid, uint32_t generation,
+			 uint32_t refcnt)
+{
+	struct sd_req hdr;
+	int ret;
+	uint64_t ledger_oid = data_oid_to_ledger_oid(data_oid);
+
+	/* both values are uint32_t, so print them with the unsigned macros */
+	sd_debug("%" PRIu32 ", %" PRIu32, generation, refcnt);
+
+	if (generation == 0 && refcnt == 0)
+		return sd_remove_object(data_oid);
+
+	sd_init_req(&hdr, SD_OP_DECREF_OBJ);
+	hdr.ref.oid = ledger_oid;
+	hdr.ref.generation = generation;
+	hdr.ref.count = refcnt;
+
+	ret = exec_local_req(&hdr, NULL);
+	if (ret != SD_RES_SUCCESS)
+		sd_err("failed to decrement reference %" PRIx64 ", %s",
+		       ledger_oid, sd_strerror(ret));
+
+	return ret;
+}
--
1.8.3.2
More information about the sheepdog
mailing list