[sheepdog] [PATCH v3 3/4] object list cache: reclaim the cache when receiving a deletion event

levin li levin108 at gmail.com
Thu Jul 19 04:19:10 CEST 2012


From: levin li <xingke.lwp at taobao.com>

Before reclaiming the cache entries that belong to the VDI just deleted,
we should test whether the VDI still exists: after one node deletes it,
and before the notification reaches all the nodes, another node may issue
a VDI creation event that reuses the same VDI id, in which case we should
not reclaim the cached entries.

Signed-off-by: levin li <xingke.lwp at taobao.com>
---
 sheep/object_list_cache.c |   65 +++++++++++++++++++++++++++++++++++++++++++++
 sheep/ops.c               |   14 ++++++++++
 sheep/sheep_priv.h        |    2 ++
 sheep/vdi.c               |   27 +++++++++++++++++++
 4 files changed, 108 insertions(+)
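
A side note on the pattern this patch implements: reclaim the cached
entries of a deleted VDI only when its id has not been reused in the
meantime.  The standalone sketch below illustrates that guard; it is not
sheep code, and vid_exists(), oid_to_vid() and the list layout here are
assumptions made up only for this note (the real implementation is
objlist_deletion_work() and vdi_exist() in the diff).

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

struct cache_entry {
	uint64_t oid;
	struct cache_entry *next;
};

static struct cache_entry *cache_head;
static pthread_rwlock_t cache_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Illustrative oid layout only: the VDI id in the upper 32 bits. */
static uint32_t oid_to_vid(uint64_t oid)
{
	return (uint32_t)(oid >> 32);
}

/* Stub for this sketch; sheep reads the inode object instead (vdi_exist()). */
static int vid_exists(uint32_t vid)
{
	(void)vid;
	return 0;
}

static void reclaim_vid(uint32_t vid)
{
	struct cache_entry **p, *e;

	/* The id may already have been reused by a VDI created on another
	 * node before the deletion notification arrived; the cached
	 * entries then belong to the new VDI and must be kept. */
	if (vid_exists(vid)) {
		fprintf(stderr, "VDI %" PRIx32 " is in use again, keeping cache\n",
			vid);
		return;
	}

	pthread_rwlock_wrlock(&cache_lock);
	for (p = &cache_head; (e = *p) != NULL; ) {
		if (oid_to_vid(e->oid) == vid) {
			*p = e->next;	/* unlink and free the stale entry */
			free(e);
		} else {
			p = &e->next;
		}
	}
	pthread_rwlock_unlock(&cache_lock);
}

int main(void)
{
	struct cache_entry *e = malloc(sizeof(*e));

	e->oid = ((uint64_t)0xab << 32) | 1;	/* one object of VDI 0xab */
	e->next = NULL;
	cache_head = e;

	reclaim_vid(0xab);
	printf("cache %s\n", cache_head ? "kept" : "reclaimed");
	return 0;
}

As in objlist_deletion_work() below, the purge runs under the write lock
so that any path unlinking entries excludes concurrent readers of the cache.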

diff --git a/sheep/object_list_cache.c b/sheep/object_list_cache.c
index 39e8d49..df94dce 100644
--- a/sheep/object_list_cache.c
+++ b/sheep/object_list_cache.c
@@ -37,6 +37,11 @@ struct objlist_cache {
 	pthread_rwlock_t lock;
 };
 
+struct objlist_deletion_work {
+	uint32_t vid;
+	struct work work;
+};
+
 struct objlist_cache obj_list_cache = {
 	.tree_version	= 1,
 	.root		= RB_ROOT,
@@ -167,3 +172,63 @@ out:
 	pthread_rwlock_unlock(&obj_list_cache.lock);
 	return SD_RES_SUCCESS;
 }
+
+static void objlist_deletion_work(struct work *work)
+{
+	struct objlist_deletion_work *ow =
+		container_of(work, struct objlist_deletion_work, work);
+	struct objlist_cache_entry *entry, *t;
+	uint32_t vid = ow->vid, entry_vid;
+
+	/* Before reclaiming the cache belonging to the VDI just deleted,
+	 * we should test whether the VDI still exists: after one node
+	 * deletes it, and before the notification reaches all the nodes,
+	 * another node may issue a VDI creation event that reuses the
+	 * same VDI id, in which case we should not reclaim the cached entry.
+	 */
+	if (vdi_exist(vid)) {
+		eprintf("VDI (%" PRIx32 ") is in use again, not reclaiming its cache\n",
+			vid);
+		return;
+	}
+
+	pthread_rwlock_wrlock(&obj_list_cache.lock);
+	list_for_each_entry_safe(entry, t, &obj_list_cache.entry_list, list) {
+		entry_vid = oid_to_vid(entry->oid);
+		if (entry_vid != vid)
+			continue;
+		dprintf("delete object entry %" PRIx64 "\n", entry->oid);
+		list_del(&entry->list);
+		rb_erase(&entry->node, &obj_list_cache.root);
+		free(entry);
+	}
+	pthread_rwlock_unlock(&obj_list_cache.lock);
+}
+
+static void objlist_deletion_done(struct work *work)
+{
+	struct objlist_deletion_work *ow =
+		container_of(work, struct objlist_deletion_work, work);
+	free(ow);
+}
+
+/*
+ * During recovery, some objects may be migrated from one node to a
+ * new one, and we cannot remove their object list cache entries at
+ * that point because doing so may cause recovery to fail.  After
+ * recovery the entries can no longer be located correctly, so
+ * objlist_cache_remove() fails to delete them; this function does
+ * the cleanup work on all nodes instead.
+ */
+int objlist_cache_cleanup(uint32_t vid)
+{
+	struct objlist_deletion_work *ow;
+
+	ow = xzalloc(sizeof(*ow));
+	ow->vid = vid;
+	ow->work.fn = objlist_deletion_work;
+	ow->work.done = objlist_deletion_done;
+	queue_work(sys->deletion_wqueue, &ow->work);
+
+	return SD_RES_SUCCESS;
+}
diff --git a/sheep/ops.c b/sheep/ops.c
index 27dbdfa..6802aea 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -487,6 +487,14 @@ static int cluster_cleanup(const struct sd_req *req, struct sd_rsp *rsp,
 	return ret;
 }
 
+static int cluster_notify_vdi_del(const struct sd_req *req, struct sd_rsp *rsp,
+				  void *data)
+{
+	uint32_t vid = *(uint32_t *)data;
+
+	return objlist_cache_cleanup(vid);
+}
+
 static int cluster_restore(const struct sd_req *req, struct sd_rsp *rsp,
 			   void *data)
 {
@@ -836,6 +844,12 @@ static struct sd_op_template sd_ops[] = {
 		.process_main = cluster_cleanup,
 	},
 
+	[SD_OP_NOTIFY_VDI_DEL] = {
+		.type = SD_OP_TYPE_CLUSTER,
+		.force = 1,
+		.process_main = cluster_notify_vdi_del,
+	},
+
 	/* local operations */
 	[SD_OP_GET_STORE_LIST] = {
 		.name = "GET_STORE_LIST",
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 7d5700c..11efcb7 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -195,6 +195,7 @@ int create_listen_port(int port, void *data);
 int init_store(const char *dir, int enable_write_cache);
 int init_base_path(const char *dir);
 
+int vdi_exist(uint32_t vid);
 int add_vdi(char *data, int data_len, uint64_t size, uint32_t *new_vid,
 	    uint32_t base_vid, int is_snapshot, unsigned int *nr_copies);
 
@@ -257,6 +258,7 @@ uint32_t get_latest_epoch(void);
 int set_cluster_ctime(uint64_t ctime);
 uint64_t get_cluster_ctime(void);
 int get_obj_list(const struct sd_list_req *, struct sd_list_rsp *, void *);
+int objlist_cache_cleanup(uint32_t vid);
 
 int start_recovery(struct vnode_info *cur_vnodes,
 	struct vnode_info *old_vnodes);
diff --git a/sheep/vdi.c b/sheep/vdi.c
index bcb3df1..c9e070e 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -15,6 +15,33 @@
 #include "sheepdog_proto.h"
 #include "sheep_priv.h"
 
+int vdi_exist(uint32_t vid)
+{
+	struct sheepdog_inode *inode;
+	int ret = 1;
+
+	inode = zalloc(sizeof(*inode));
+	if (!inode) {
+		ret = 0;
+		goto out;
+	}
+
+	ret = read_object(vid_to_vdi_oid(vid), (char *)inode,
+			  sizeof(*inode), 0);
+	if (ret != SD_RES_SUCCESS) {
+		eprintf("failed to read vdi inode (%" PRIx32 ")\n", vid);
+		ret = 0;
+		goto out;
+	}
+
+	ret = 1;
+	if (*inode->name == '\0')
+		ret = 0;
+
+out:
+	free(inode);
+	return ret;
+}
 
 /* TODO: should be performed atomically */
 static int create_vdi_obj(char *name, uint32_t new_vid, uint64_t size,
-- 
1.7.10
