[sheepdog] [PATCH RFT 2/4] sheep: construct data structures for representing family tree

Hitoshi Mitake mitake.hitoshi at lab.ntt.co.jp
Mon Dec 15 10:36:58 CET 2014


This patch adds new data structres which represent VDI family tree. It
is different from the existing vdi_state_entry tree. The purpose of
these new data structres is efficient tracking of family relation
between VDIs. It is required for garbage collecting needless VIDs.

Cc: Saeki Masaki <saeki.masaki at po.ntts.co.jp>
Cc: Yuka Kawasaki <kawasaki.yuka at po.ntts.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi at lab.ntt.co.jp>
---
 sheep/group.c       |   7 +--
 sheep/plain_store.c |   2 +-
 sheep/sheep_priv.h  |   2 +
 sheep/vdi.c         | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 4 files changed, 148 insertions(+), 9 deletions(-)

diff --git a/sheep/group.c b/sheep/group.c
index f07ffb8..06a3bd8 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -509,9 +509,10 @@ retry:
 		atomic_set_bit(vs[i].vid, sys->vdi_inuse);
 		if (vs[i].deleted)
 			atomic_set_bit(vs[i].vid, sys->vdi_deleted);
-		add_vdi_state(vs[i].vid, vs[i].nr_copies, vs[i].snapshot,
-			      vs[i].copy_policy, vs[i].block_size_shift,
-			      vs[i].parent_vid);
+		add_vdi_state_unordered(vs[i].vid, vs[i].nr_copies,
+					vs[i].snapshot, vs[i].copy_policy,
+					vs[i].block_size_shift,
+					vs[i].parent_vid);
 	}
 out:
 	free(vs);
diff --git a/sheep/plain_store.c b/sheep/plain_store.c
index a7b0864..92f9a14 100644
--- a/sheep/plain_store.c
+++ b/sheep/plain_store.c
@@ -281,7 +281,7 @@ static int init_vdi_state(uint64_t oid, const char *wd, uint32_t epoch)
 		       "wat %s", oid, epoch, wd);
 		goto out;
 	}
-	add_vdi_state(oid_to_vid(oid), inode->nr_copies,
+	add_vdi_state_unordered(oid_to_vid(oid), inode->nr_copies,
 		      vdi_is_snapshot(inode), inode->copy_policy,
 		      inode->block_size_shift, inode->parent_vdi_id);
 
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 5f9c534..bc45f49 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -333,6 +333,8 @@ int get_obj_copy_number(uint64_t oid, int nr_zones);
 int get_req_copy_number(struct request *req);
 int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
 		  uint8_t, uint8_t block_size_shift, uint32_t parent_vid);
+int add_vdi_state_unordered(uint32_t vid, int nr_copies, bool snapshot,
+		  uint8_t, uint8_t block_size_shift, uint32_t parent_vid);
 int vdi_exist(uint32_t vid);
 int vdi_create(const struct vdi_iocb *iocb, uint32_t *new_vid);
 int vdi_snapshot(const struct vdi_iocb *iocb, uint32_t *new_vid);
diff --git a/sheep/vdi.c b/sheep/vdi.c
index fea7a32..a0cc0a7 100644
--- a/sheep/vdi.c
+++ b/sheep/vdi.c
@@ -30,11 +30,125 @@ struct vdi_state_entry {
 	int nr_participants;
 	enum shared_lock_state participants_state[SD_MAX_COPIES];
 	struct node_id participants[SD_MAX_COPIES];
+
+	struct vdi_family_member *family_member;
 };
 
 static struct rb_root vdi_state_root = RB_ROOT;
 static struct sd_rw_lock vdi_state_lock = SD_RW_LOCK_INITIALIZER;
 
+struct vdi_family_member {
+	uint32_t vid, parent_vid;
+	struct vdi_family_member *parent;
+	struct vdi_state_entry *entry;
+
+	struct list_node roots_list; /* only used for root VDIs */
+
+	struct list_head child_list_head;
+	struct list_node child_list_node;
+};
+
+static LIST_HEAD(vdi_family_roots);
+static LIST_HEAD(vdi_family_temporal_orphans);
+static struct sd_mutex vdi_family_mutex = SD_MUTEX_INITIALIZER;
+
+static struct vdi_family_member *lookup_vdi_family_member(uint32_t vid,
+				  struct vdi_family_member *family_member)
+{
+	/* FIXME: more effective data structure */
+	struct vdi_family_member *vdi;
+
+	if (family_member->vid == vid)
+		return family_member;
+
+	list_for_each_entry(vdi, &family_member->child_list_head,
+			    child_list_node) {
+		struct vdi_family_member *ret;
+
+		ret = lookup_vdi_family_member(vid, vdi);
+		if (ret)
+			return ret;
+	}
+
+	return NULL;
+}
+
+static void update_vdi_family(uint32_t parent_vid,
+			      struct vdi_state_entry *entry, bool unordered)
+{
+	uint32_t vid = entry->vid;
+	struct vdi_family_member *new, *vdi, *parent;
+
+	sd_mutex_lock(&vdi_family_mutex);
+
+	if (!parent_vid) {
+		new = xzalloc(sizeof(*new));
+		new->vid = vid;
+		new->entry = entry;
+		entry->family_member = new;
+
+		INIT_LIST_NODE(&new->roots_list);
+		INIT_LIST_HEAD(&new->child_list_head);
+
+		list_add_tail(&new->roots_list, &vdi_family_roots);
+
+		sd_debug("new vid %"PRIx32 " is added as a root VDI", vid);
+		goto out;
+	}
+
+	new = xzalloc(sizeof(*new));
+	new->vid = vid;
+	new->parent_vid = parent_vid;
+	new->entry = entry;
+	entry->family_member = new;
+
+	INIT_LIST_HEAD(&new->child_list_head);
+	INIT_LIST_NODE(&new->child_list_node);
+
+	list_for_each_entry(vdi, &vdi_family_roots, roots_list) {
+		parent = lookup_vdi_family_member(parent_vid, vdi);
+		if (parent) {
+			new->parent = parent;
+			goto found;
+		}
+	}
+
+	if (unordered) {
+		sd_debug("parent of VID: %"PRIx32" (%"PRIx32") is not"
+			 " initialized yet", vid, parent_vid);
+
+		list_add_tail(&new->child_list_node,
+			      &vdi_family_temporal_orphans);
+		goto ret;
+	}
+
+	panic("parent VID: %"PRIx32" not found", parent_vid);
+
+found:
+	list_add_tail(&new->child_list_node, &parent->child_list_head);
+	sd_debug("new child vid %"PRIx32" is added to parent %"PRIx32
+		 " correctly", vid, parent_vid);
+
+out:
+	/* correct children from orphan list */
+
+	list_for_each_entry(vdi, &vdi_family_temporal_orphans,
+			    child_list_node) {
+		if (vdi->parent_vid != vid)
+			continue;
+
+		list_del(&vdi->child_list_node);
+		list_add_tail(&vdi->child_list_node, &new->child_list_head);
+		vdi->parent = new;
+
+		sd_debug("new vid %"PRIx32" rescued orphan vid %"PRIx32"",
+			 vid, vdi->vid);
+	}
+
+ret:
+	sd_mutex_unlock(&vdi_family_mutex);
+}
+
 /*
  * ec_max_data_strip represent max number of data strips in the cluster. When
  * nr_zones < it, we don't purge the stale objects because for erasure coding,
@@ -189,10 +303,12 @@ int get_req_copy_number(struct request *req)
 	return nr_copies;
 }
 
-int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
-		  uint8_t cp, uint8_t block_size_shift, uint32_t parent_vid)
+static int do_add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
+			    uint8_t cp, uint8_t block_size_shift,
+			    uint32_t parent_vid, bool unordered)
 {
 	struct vdi_state_entry *entry, *old;
+	bool already_exists = false;
 
 	entry = xzalloc(sizeof(*entry));
 	entry->vid = vid;
@@ -216,8 +332,8 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
 		sd_mutex_unlock(&m);
 	}
 
-	sd_debug("%" PRIx32 ", %d, %d, %"PRIu8,
-		 vid, nr_copies, cp, block_size_shift);
+	sd_debug("%" PRIx32 ", %d, %d, %"PRIu8", %"PRIx32,
+		 vid, nr_copies, cp, block_size_shift, parent_vid);
 
 	sd_write_lock(&vdi_state_lock);
 	old = vdi_state_insert(&vdi_state_root, entry);
@@ -238,13 +354,32 @@ int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
 
 			entry->parent_vid = parent_vid;
 		}
+
+		already_exists = true;
 	}
 
+	if (!already_exists)
+		update_vdi_family(parent_vid, entry, unordered);
+
 	sd_rw_unlock(&vdi_state_lock);
 
 	return SD_RES_SUCCESS;
 }
 
+int add_vdi_state(uint32_t vid, int nr_copies, bool snapshot,
+		  uint8_t cp, uint8_t block_size_shift, uint32_t parent_vid)
+{
+	return do_add_vdi_state(vid, nr_copies, snapshot, cp, block_size_shift,
+				parent_vid, false);
+}
+
+int add_vdi_state_unordered(uint32_t vid, int nr_copies, bool snapshot,
+		  uint8_t cp, uint8_t block_size_shift, uint32_t parent_vid)
+{
+	return do_add_vdi_state(vid, nr_copies, snapshot, cp, block_size_shift,
+				parent_vid, true);
+}
+
 int fill_vdi_state_list(const struct sd_req *hdr,
 			struct sd_rsp *rsp, void *data)
 {
@@ -1389,7 +1524,8 @@ int vdi_create(const struct vdi_iocb *iocb, uint32_t *new_vid)
 	if (info.snapid == 0)
 		info.snapid = 1;
 	*new_vid = info.free_bit;
-	ret = notify_vdi_add(*new_vid, iocb->nr_copies, info.vid,
+	ret = notify_vdi_add(*new_vid, iocb->nr_copies,
+			     iocb->base_vid == 0 ? info.vid : iocb->base_vid,
 			     iocb->copy_policy, iocb->block_size_shift);
 	if (ret != SD_RES_SUCCESS)
 		return ret;
-- 
1.8.3.2




More information about the sheepdog mailing list