[sheepdog] [PATCH v2 04/11] sheep: introduce ledger objects

MORITA Kazutaka morita.kazutaka at gmail.com
Tue Jun 18 19:14:24 CEST 2013


From: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>

This introduces ledger objects, which keep track of the number of
outstanding references of each generation.  Sheep decrements a
generational reference count with the gateway request SD_OP_DECREF_OBJ,
and reclaims an object when no generational references to it remain.

Signed-off-by: MORITA Kazutaka <morita.kazutaka at lab.ntt.co.jp>
---
 include/internal_proto.h |    2 ++
 include/sheepdog_proto.h |   27 +++++++++++++-
 sheep/gateway.c          |    5 +++
 sheep/ops.c              |   90 ++++++++++++++++++++++++++++++++++++++++++++++
 sheep/sheep_priv.h       |    3 ++
 sheep/store.c            |   24 +++++++++++++
 6 files changed, 150 insertions(+), 1 deletion(-)

diff --git a/include/internal_proto.h b/include/internal_proto.h
index 24d5519..08e0e28 100644
--- a/include/internal_proto.h
+++ b/include/internal_proto.h
@@ -75,6 +75,8 @@
 #define SD_OP_GET_HASH       0xB4
 #define SD_OP_REWEIGHT       0xB5
 #define SD_OP_UPDATE_SIZE    0xB6
+#define SD_OP_DECREF_OBJ     0xB7
+#define SD_OP_DECREF_PEER    0xB8
 
 /* internal flags for hdr.flags, must be above 0x80 */
 #define SD_FLAG_CMD_RECOVERY 0x0080
diff --git a/include/sheepdog_proto.h b/include/sheepdog_proto.h
index bb0f253..3340f2b 100644
--- a/include/sheepdog_proto.h
+++ b/include/sheepdog_proto.h
@@ -89,6 +89,7 @@
 #define VDI_BIT (UINT64_C(1) << 63)
 #define VMSTATE_BIT (UINT64_C(1) << 62)
 #define VDI_ATTR_BIT (UINT64_C(1) << 61)
+#define LEDGER_BIT (UINT64_C(1) << 60)
 #define MAX_DATA_OBJS (1ULL << 20)
 #define MAX_CHILDREN 1024U
 #define SD_MAX_VDI_LEN 256U
@@ -103,6 +104,7 @@
 #define SD_INODE_SIZE (sizeof(struct sd_inode))
 #define SD_INODE_HEADER_SIZE offsetof(struct sd_inode, data_vdi_id)
 #define SD_ATTR_OBJ_SIZE (sizeof(struct sheepdog_vdi_attr))
+#define SD_LEDGER_OBJ_SIZE (UINT64_C(1) << 22)
 #define CURRENT_VDI_ID 0
 
 #define STORE_LEN 16
@@ -146,6 +148,11 @@ struct sd_req {
 			uint8_t		set_bitmap; /* 0 means false */
 						    /* others mean true */
 		} vdi_state;
+		struct {
+			uint64_t	oid;
+			uint32_t	generation;
+			uint32_t	count;
+		} ref;
 
 		uint32_t		__pad[8];
 	};
@@ -270,10 +277,15 @@ static inline bool is_vdi_attr_obj(uint64_t oid)
 	return !!(oid & VDI_ATTR_BIT);
 }
 
+static inline bool is_ledger_object(uint64_t oid)
+{
+	return (oid & LEDGER_BIT) != 0; /* true iff LEDGER_BIT is set in oid */
+}
+
 static inline bool is_data_obj(uint64_t oid)
 {
 	return !is_vdi_obj(oid) && !is_vmstate_obj(oid) &&
-		!is_vdi_attr_obj(oid);
+		!is_vdi_attr_obj(oid) && !is_ledger_object(oid);
 }
 
 static inline size_t get_objsize(uint64_t oid)
@@ -284,6 +296,9 @@ static inline size_t get_objsize(uint64_t oid)
 	if (is_vdi_attr_obj(oid))
 		return SD_ATTR_OBJ_SIZE;
 
+	if (is_ledger_object(oid))
+		return SD_LEDGER_OBJ_SIZE;
+
 	return SD_DATA_OBJ_SIZE;
 }
 
@@ -324,4 +339,14 @@ static inline __attribute__((used)) void __sd_proto_build_bug_ons(void)
 	BUILD_BUG_ON(sizeof(struct sd_rsp) != SD_RSP_SIZE);
 }
 
+static inline uint64_t ledger_oid_to_data_oid(uint64_t oid)
+{
+	return oid & ~LEDGER_BIT; /* same oid with LEDGER_BIT cleared */
+}
+
+static inline uint64_t data_oid_to_ledger_oid(uint64_t oid)
+{
+	return oid | LEDGER_BIT; /* same oid with LEDGER_BIT set */
+}
+
 #endif
diff --git a/sheep/gateway.c b/sheep/gateway.c
index 2496709..08c58d2 100644
--- a/sheep/gateway.c
+++ b/sheep/gateway.c
@@ -350,3 +350,8 @@ int gateway_remove_obj(struct request *req)
 {
 	return gateway_forward_request(req);
 }
+
+int gateway_decref_object(struct request *req)
+{
+	return gateway_forward_request(req); /* no local work, only forward */
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index ff06d81..6a39f1e 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -1025,6 +1025,83 @@ out:
 	return ret;
 }
 
+/* Apply one DECREF_PEER request to the ledger object hdr->ref.oid. */
+int peer_decref_object(struct request *req)
+{
+	struct sd_req *hdr = &req->rq;
+	int ret = SD_RES_INVALID_PARMS;
+	uint32_t epoch = hdr->epoch;
+	uint64_t ledger_oid = hdr->ref.oid;
+	uint64_t data_oid = ledger_oid_to_data_oid(ledger_oid);
+	uint32_t generation = hdr->ref.generation;
+	uint32_t count = hdr->ref.count;
+	uint32_t *ledger = NULL, *zero = xzalloc(SD_LEDGER_OBJ_SIZE);
+	bool exist = false, locked = false;
+	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
+
+	sd_dprintf("%" PRIx64 ", %" PRIu32 ", %" PRIu32 ", %" PRIu32,
+		   ledger_oid, epoch, generation, count);
+
+	ledger = xvalloc(SD_LEDGER_OBJ_SIZE);
+	memset(ledger, 0, SD_LEDGER_OBJ_SIZE);
+
+	struct siocb iocb = {
+		.epoch = epoch,
+		.buf = ledger,
+		.length = SD_LEDGER_OBJ_SIZE,
+	};
+
+	/* reject a generation whose slots would fall outside the ledger */
+	if (generation >= SD_LEDGER_OBJ_SIZE / sizeof(*ledger) - 1)
+		goto out;
+
+	/* we don't allow concurrent updates to the ledger objects */
+	pthread_mutex_lock(&lock);
+	locked = true;
+
+	ret = sd_store->read(ledger_oid, &iocb);
+	switch (ret) {
+	case SD_RES_SUCCESS:
+		exist = true;
+		break;
+	case SD_RES_NO_OBJ:
+		ledger[0] = 1; /* no ledger stored yet: slot 0 starts at 1 */
+		break;
+	default:
+		goto out;
+	}
+
+	ledger[generation]--;
+	ledger[generation + 1] += count;
+
+	if (memcmp(ledger, zero, SD_LEDGER_OBJ_SIZE) == 0) {
+		/* reclaim object */
+		if (exist) {
+			ret = sd_store->remove_object(ledger_oid);
+			if (ret != SD_RES_SUCCESS) {
+				sd_eprintf("error %s", sd_strerror(ret));
+				goto out;
+			}
+		}
+		pthread_mutex_unlock(&lock); /* data object is removed unlocked */
+		locked = false;
+		ret = remove_object(data_oid);
+		if (ret != SD_RES_SUCCESS)
+			sd_eprintf("error %s", sd_strerror(ret));
+	} else { /* update ledger */
+		if (exist)
+			ret = sd_store->write(ledger_oid, &iocb);
+		else
+			ret = sd_store->create_and_write(ledger_oid, &iocb);
+	}
+out:
+	if (locked)
+		pthread_mutex_unlock(&lock);
+	free(ledger);
+	free(zero);
+	return ret;
+}
+
 static struct sd_op_template sd_ops[] = {
 
 	/* cluster operations */
@@ -1294,6 +1371,12 @@ static struct sd_op_template sd_ops[] = {
 		.process_work = gateway_remove_obj,
 	},
 
+	[SD_OP_DECREF_OBJ] = {
+		.name = "DECREF_OBJ",
+		.type = SD_OP_TYPE_GATEWAY,
+		.process_work = gateway_decref_object,
+	},
+
 	/* peer I/O operations */
 	[SD_OP_CREATE_AND_WRITE_PEER] = {
 		.name = "CREATE_AND_WRITE_PEER",
@@ -1318,6 +1401,12 @@ static struct sd_op_template sd_ops[] = {
 		.type = SD_OP_TYPE_PEER,
 		.process_work = peer_remove_obj,
 	},
+
+	[SD_OP_DECREF_PEER] = {
+		.name = "DECREF_PEER",
+		.type = SD_OP_TYPE_PEER,
+		.process_work = peer_decref_object,
+	},
 };
 
 const struct sd_op_template *get_sd_op(uint8_t opcode)
@@ -1404,6 +1493,7 @@ static int map_table[] = {
 	[SD_OP_READ_OBJ] = SD_OP_READ_PEER,
 	[SD_OP_WRITE_OBJ] = SD_OP_WRITE_PEER,
 	[SD_OP_REMOVE_OBJ] = SD_OP_REMOVE_PEER,
+	[SD_OP_DECREF_OBJ] = SD_OP_DECREF_PEER,
 };
 
 int gateway_to_peer_opcode(int opcode)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index cdf8b7a..07d878c 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -310,6 +310,7 @@ int write_object(uint64_t oid, char *data, unsigned int datalen,
 int read_object(uint64_t oid, char *data, unsigned int datalen,
 		uint64_t offset);
 int remove_object(uint64_t oid);
+int dec_object_refcnt(uint64_t data_oid, uint32_t generation, uint32_t refcnt);
 
 int exec_local_req(struct sd_req *rq, void *data);
 void local_req_init(void);
@@ -353,12 +354,14 @@ int gateway_read_obj(struct request *req);
 int gateway_write_obj(struct request *req);
 int gateway_create_and_write_obj(struct request *req);
 int gateway_remove_obj(struct request *req);
+int gateway_decref_object(struct request *req);
 
 /* backend store */
 int peer_read_obj(struct request *req);
 int peer_write_obj(struct request *req);
 int peer_create_and_write_obj(struct request *req);
 int peer_remove_obj(struct request *req);
+int peer_decref_object(struct request *req);
 
 /* object_cache */
 
diff --git a/sheep/store.c b/sheep/store.c
index a804d0d..1a7ee1e 100644
--- a/sheep/store.c
+++ b/sheep/store.c
@@ -467,3 +467,27 @@ int remove_object(uint64_t oid)
 
 	return ret;
 }
+
+/* Decrement a generational reference of data_oid via SD_OP_DECREF_OBJ. */
+int dec_object_refcnt(uint64_t data_oid, uint32_t generation, uint32_t refcnt)
+{
+	struct sd_req hdr;
+	int ret;
+	uint64_t ledger_oid = data_oid_to_ledger_oid(data_oid);
+
+	sd_dprintf("%" PRIu32 ", %" PRIu32, generation, refcnt);
+	/* generation 0 with count 0: remove the object, no request is sent */
+	if (generation == 0 && refcnt == 0)
+		return remove_object(data_oid);
+
+	sd_init_req(&hdr, SD_OP_DECREF_OBJ);
+	hdr.ref.oid = ledger_oid;
+	hdr.ref.generation = generation;
+	hdr.ref.count = refcnt;
+
+	ret = exec_local_req(&hdr, NULL);
+	if (ret != SD_RES_SUCCESS)
+		sd_eprintf("failed to decrement reference %" PRIx64 ", %s",
+			   ledger_oid, sd_strerror(ret));
+	return ret;
+}
-- 
1.7.9.5




More information about the sheepdog mailing list