[sheepdog] [PATCH v2 4/5] sheep: garbage collect needless VIDs

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Tue Jan 13 02:37:44 CET 2015


Current sheepdog never recycles VIDs. This causes problems such as
VID space exhaustion and too many garbage inode objects.

Keeping deleted inode objects is required because living inodes
(snapshots or clones) can point to objects of the deleted inodes. So
once every member of a VDI family is deleted, it is safe to remove the
deleted inode objects.

Cc: Saeki Masaki <saeki.masaki at po.ntts.co.jp>
Cc: Yuka Kawasaki <kawasaki.yuka at po.ntts.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 sheep/ops.c        |   1 +
 sheep/sheep_priv.h |   1 +
 sheep/vdi.c        | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 112 insertions(+)

diff --git a/sheep/ops.c b/sheep/ops.c
index eec8ce3..bbb41ee 100644
--- a/sheep/ops.c
+++ b/sheep/ops.c
@@ -198,6 +198,7 @@ static int post_cluster_del_vdi(const struct sd_req *req, struct sd_rsp *rsp,
 	if (ret == SD_RES_SUCCESS) {
 		atomic_set_bit(vid, sys->vdi_deleted);
 		vdi_mark_deleted(vid);
+		run_vid_gc(vid);
 	}
 
 	if (!sys->enable_object_cache)
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 2b32b4e..46d0ba9 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -364,6 +364,7 @@ void invalidate_other_nodes(uint32_t vid);
 int inode_coherence_update(uint32_t vid, bool validate,
 			   const struct node_id *sender);
 void remove_node_from_participants(const struct node_id *left);
+void run_vid_gc(uint32_t vid);
 
 extern int ec_max_data_strip;
 
diff --git a/sheep/vdi.c b/sheep/vdi.c
index 9bf6b23..8114fb5 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -89,6 +89,7 @@ static void update_vdi_family(uint32_t parent_vid,
 
 		INIT_LIST_NODE(&new->roots_list);
 		INIT_LIST_HEAD(&new->child_list_head);
+		INIT_LIST_NODE(&new->child_list_node);
 
 		list_add_tail(&new->roots_list, &vdi_family_roots);
 
@@ -102,6 +103,7 @@ static void update_vdi_family(uint32_t parent_vid,
 	new->entry = entry;
 	entry->family_member = new;
 
+	INIT_LIST_NODE(&new->roots_list);
 	INIT_LIST_HEAD(&new->child_list_head);
 	INIT_LIST_NODE(&new->child_list_node);
 
@@ -149,6 +151,30 @@ ret:
 	sd_mutex_unlock(&vdi_family_mutex);
 }
 
+static main_fn struct vdi_family_member *lookup_root(struct vdi_family_member
+						     *member)
+{
+	if (!member->parent)	/* no parent: this member is the root */
+		return member;
+
+	return lookup_root(member->parent);	/* otherwise keep climbing */
+}
+
+static main_fn bool is_all_members_deleted(struct vdi_family_member *member)
+{
+	struct vdi_family_member *child;
+
+	if (!member->entry->deleted)	/* member is not marked deleted */
+		return false;
+
+	list_for_each_entry(child, &member->child_list_head, child_list_node) {
+		if (!is_all_members_deleted(child)) /* checks the subtree */
+			return false;
+	}
+
+	return true;	/* member and all its descendants are marked deleted */
+}
+
 /*
  * ec_max_data_strip represent max number of data strips in the cluster. When
  * nr_zones < it, we don't purge the stale objects because for erasure coding,
@@ -1879,12 +1905,39 @@ out:
 	return ret;
 }
 
+static void clean_family(struct vdi_family_member *member)
+{
+	struct vdi_family_member *child;
+
+	list_for_each_entry(child, &member->child_list_head, child_list_node) {
+		clean_family(child);	/* descendants are handled first */
+	}
+
+	if (list_linked(&member->child_list_node))
+		list_del(&member->child_list_node); /* leave parent's list */
+
+	if (!list_linked(&member->roots_list))
+		free(member);	/* members on roots_list are not freed here */
+}
+
 void clean_vdi_state(void)
 {
+	struct vdi_family_member *member;
+
 	sd_write_lock(&vdi_state_lock);
 	rb_destroy(&vdi_state_root, struct vdi_state_entry, node);
 	INIT_RB_ROOT(&vdi_state_root);
 	sd_rw_unlock(&vdi_state_lock);
+
+	sd_mutex_lock(&vdi_family_mutex);
+	/* NOTE(review): assumes list_for_each_entry tolerates removal */
+	list_for_each_entry(member, &vdi_family_roots, roots_list) {
+		clean_family(member);	/* tears down the root's subtree */
+		list_del(&member->roots_list);
+		free(member);	/* clean_family() leaves linked roots to us */
+	}
+
+	sd_mutex_unlock(&vdi_family_mutex);
 }
 
 int sd_delete_vdi(const char *name)
@@ -2045,3 +2098,60 @@ main_fn void free_vdi_state_snapshot(int epoch)
 
 	panic("invalid free request for vdi state snapshot, epoch: %d", epoch);
 }
+
+static main_fn void do_vid_gc(struct vdi_family_member *member)
+{
+	struct vdi_state_entry *entry = member->entry;
+	uint32_t vid = entry->vid;
+	uint64_t oid = vid_to_vdi_oid(vid);
+	struct vdi_family_member *child;
+
+	rb_erase(&entry->node, &vdi_state_root);
+	free(entry);	/* vid and oid were copied above for later use */
+	/* NOTE(review): children are freed while iterating -- confirm safe */
+	list_for_each_entry(child, &member->child_list_head, child_list_node) {
+		do_vid_gc(child);	/* collect every descendant as well */
+	}
+
+	if (list_linked(&member->roots_list))
+		list_del(&member->roots_list);
+
+	free(member);
+
+	if (sd_store && sd_store->exist(oid, -1))
+		/* TODO: gc other objects */
+		sd_store->remove_object(oid, -1);
+
+	atomic_clear_bit(vid, sys->vdi_inuse);	/* vid: no longer in use */
+	atomic_clear_bit(vid, sys->vdi_deleted);	/* ... nor deleted */
+}
+
+main_fn void run_vid_gc(uint32_t vid)
+{
+	struct vdi_state_entry *entry;
+	struct vdi_family_member *member, *root;
+
+	sd_write_lock(&vdi_state_lock);	/* both locks are held until out: */
+	sd_mutex_lock(&vdi_family_mutex);
+	entry = vdi_state_search(&vdi_state_root, vid);
+	if (!entry) {
+		sd_alert("vid %"PRIx32" doesn't have its entry", vid);
+		goto out;
+	}
+
+	member = entry->family_member;	/* NOTE(review): assumed non-NULL */
+	root = lookup_root(member);	/* the check below covers the family */
+
+	if (is_all_members_deleted(root)) {
+		sd_info("all members of the family (root: %"PRIx32
+			") are deleted", root->vid);
+		do_vid_gc(root);	/* frees the whole family */
+	} else
+		sd_info("not all members of the family (root: %"PRIx32
+			") are deleted", root->vid);
+
+out:
+	sd_mutex_unlock(&vdi_family_mutex);
+	sd_rw_unlock(&vdi_state_lock);
+
+}
-- 
1.9.1




More information about the sheepdog mailing list