[Sheepdog] [PACH, RFC] sheep: use atomic for the vnode_info refcount outside the main thread

Christoph Hellwig hch at infradead.org
Mon May 7 17:33:49 CEST 2012


This is the minimally tested patch to use atomics to refcount
the vnode_info structure.  Because the GCC built-in are too ugly to live
I added a minimal Linux Kernel style atomic.h wrapper around them.  this
was thrown together really quickly, so careful review is welcome, as
are pointers to readily available userspace atomic_t implementations.

Note that we still never grab the first reference to vnode_info
from a worker thread, and always put references from the main thread,
which simplifies the concurrency issues greatly.


---
 include/atomic.h   |   31 +++++++++++++++++++++++++++++++
 sheep/farm/trunk.c |    2 +-
 sheep/group.c      |   50 +++++++++++++++++++++++++++++++++++++++++---------
 sheep/ops.c        |   10 ++++++----
 sheep/sdnet.c      |   10 ++++++----
 sheep/sheep_priv.h |   19 ++++---------------
 sheep/vdi.c        |   31 +++++++++++++------------------
 7 files changed, 102 insertions(+), 51 deletions(-)

Index: sheepdog/include/atomic.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ sheepdog/include/atomic.h	2012-05-07 17:13:46.781406382 +0200
@@ -0,0 +1,31 @@
+#ifndef _SHEEPDOG_ATOMIC_H
+#define _SHEEPDOG_ATOMIC_H
+
+#include <stdbool.h>
+
+typedef int atomic_t;
+
+static inline void atomic_init(atomic_t *v, int val)
+{
+	*v = val;
+	__sync_synchronize();
+}
+
+static inline int atomic_read(atomic_t *v)
+{
+	return *v;
+}
+
+static inline void atomic_inc(atomic_t *v)
+{
+	__sync_add_and_fetch(v, 1);
+}
+
+static inline bool atomic_dec_and_test(atomic_t *v)
+{
+	if (__sync_sub_and_fetch(v, 1) == 0)
+		return true;
+	return false;
+}
+
+#endif /* _SHEEPDOG_ATOMIC_H */
Index: sheepdog/sheep/farm/trunk.c
===================================================================
--- sheepdog.orig/sheep/farm/trunk.c	2012-05-06 23:21:16.537321808 +0200
+++ sheepdog/sheep/farm/trunk.c	2012-05-07 17:10:15.005400960 +0200
@@ -240,7 +240,7 @@ static int oid_stale(uint64_t oid)
 	struct sd_vnode *v;
 	int ret = 1;
 
-	vnodes = get_vnode_info();
+	vnodes = get_current_vnode_info();
 	nr_copies = get_nr_copies(vnodes);
 	for (i = 0; i < nr_copies; i++) {
 		v = oid_to_vnode(vnodes, oid, i);
Index: sheepdog/sheep/group.c
===================================================================
--- sheepdog.orig/sheep/group.c	2012-05-07 10:59:08.769468543 +0200
+++ sheepdog/sheep/group.c	2012-05-07 17:17:23.273411925 +0200
@@ -9,6 +9,7 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 #include <assert.h>
+#include <atomic.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unistd.h>
@@ -134,6 +135,13 @@ int get_zones_nr_from(struct sd_node *no
 	return nr_zones;
 }
 
+struct vnode_info {
+	struct sd_vnode entries[SD_MAX_VNODES];
+	int nr_vnodes;
+	int nr_zones;
+	atomic_t refcnt;
+};
+
 /*
  * If we have less zones available than the desired redundancy we have to do
  * with nr_zones copies, sorry.
@@ -146,21 +154,44 @@ int get_max_nr_copies_from(struct sd_nod
 	return min((int)sys->nr_copies, get_zones_nr_from(nodes, nr_nodes));
 }
 
-struct vnode_info *get_vnode_info(void)
+/*
+ * Grab an additional reference to the passed in vnode info.
+ *
+ * The caller must already hold a reference to vnode_info, this function must
+ * only be used to grab an additional reference from code that wants the
+ * vnode information to outlive the request structure.
+ */
+struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info)
+{
+	assert(atomic_read(&vnode_info->refcnt) > 0);
+
+	atomic_inc(&vnode_info->refcnt);
+	return vnode_info;
+}
+
+/*
+ * Release a reference to the current vnode information.
+ *
+ * Must be called from the main thread.
+ */
+void put_vnode_info(struct vnode_info *vnode_info)
 {
 	assert(is_main_thread());
-	assert(current_vnode_info);
 
-	current_vnode_info->refcnt++;
-	return current_vnode_info;
+	if (atomic_dec_and_test(&vnode_info->refcnt))
+		free(vnode_info);
 }
 
-void put_vnode_info(struct vnode_info *vnodes)
+/*
+ * Get a reference to the currently active vnode information structure,
+ * this must only be called from the main thread.
+ */
+struct vnode_info *get_current_vnode_info(void)
 {
 	assert(is_main_thread());
+	assert(current_vnode_info);
 
-	if (vnodes && --vnodes->refcnt == 0)
-		free(vnodes);
+	return grab_vnode_info(current_vnode_info);
 }
 
 struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
@@ -187,9 +218,10 @@ static int update_vnode_info(void)
 	vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes,
 						vnode_info->entries);
 	vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
-	vnode_info->refcnt = 1;
+	atomic_init(&vnode_info->refcnt, 1);
 
-	put_vnode_info(current_vnode_info);
+	if (current_vnode_info)
+		put_vnode_info(current_vnode_info);
 	current_vnode_info = vnode_info;
 	return 0;
 }
Index: sheepdog/sheep/ops.c
===================================================================
--- sheepdog.orig/sheep/ops.c	2012-05-07 10:58:00.177466788 +0200
+++ sheepdog/sheep/ops.c	2012-05-07 17:17:01.573411373 +0200
@@ -63,7 +63,7 @@ struct sd_op_template {
 
 struct flush_work {
 	struct object_cache *cache;
-	struct vnode_info vnode_info;
+	struct vnode_info *vnode_info;
 	struct work work;
 };
 
@@ -582,7 +582,7 @@ static void flush_vdi_fn(struct work *wo
 	struct flush_work *fw = container_of(work, struct flush_work, work);
 
 	dprintf("flush vdi %"PRIx32"\n", fw->cache->vid);
-	if (object_cache_push(&fw->vnode_info, fw->cache) != SD_RES_SUCCESS)
+	if (object_cache_push(fw->vnode_info, fw->cache) != SD_RES_SUCCESS)
 		eprintf("failed to flush vdi %"PRIx32"\n", fw->cache->vid);
 }
 
@@ -590,6 +590,8 @@ static void flush_vdi_done(struct work *
 {
 	struct flush_work *fw = container_of(work, struct flush_work, work);
 	dprintf("flush vdi %"PRIx32" done\n", fw->cache->vid);
+
+	put_vnode_info(fw->vnode_info);
 	free(fw);
 }
 
@@ -609,8 +611,8 @@ static int local_flush_vdi(struct reques
 			fw->work.fn = flush_vdi_fn;
 			fw->work.done = flush_vdi_done;
 			fw->cache = cache;
-			memcpy(&fw->vnode_info, req->vnodes,
-				sizeof(fw->vnode_info));
+			fw->vnode_info = grab_vnode_info(req->vnodes);
+
 			queue_work(sys->flush_wqueue, &fw->work);
 		}
 	}
Index: sheepdog/sheep/sdnet.c
===================================================================
--- sheepdog.orig/sheep/sdnet.c	2012-05-07 10:54:39.669461654 +0200
+++ sheepdog/sheep/sdnet.c	2012-05-07 17:20:04.521416055 +0200
@@ -150,8 +150,9 @@ static void io_op_done(struct work *work
 retry:
 	req->rq.epoch = sys->epoch;
 
-	put_vnode_info(req->vnodes);
-	req->vnodes = get_vnode_info();
+	if (req->vnodes)
+		put_vnode_info(req->vnodes);
+	req->vnodes = get_current_vnode_info();
 	setup_access_to_local_objects(req);
 	list_add_tail(&req->cev.event_list, &sys->request_queue);
 
@@ -310,7 +311,7 @@ static void queue_request(struct request
 	 * called before we set up current_vnode_info
 	 */
 	if (!is_force_op(req->op))
-		req->vnodes = get_vnode_info();
+		req->vnodes = get_current_vnode_info();
 
 	if (is_io_op(req->op)) {
 		req->work.fn = do_io_request;
@@ -376,7 +377,8 @@ static void free_request(struct request
 	sys->outstanding_data_size -= req->data_length;
 
 	list_del(&req->r_siblings);
-	put_vnode_info(req->vnodes);
+	if (req->vnodes)
+		put_vnode_info(req->vnodes);
 	free(req->data);
 	free(req);
 }
Index: sheepdog/sheep/sheep_priv.h
===================================================================
--- sheepdog.orig/sheep/sheep_priv.h	2012-05-07 10:59:08.769468543 +0200
+++ sheepdog/sheep/sheep_priv.h	2012-05-07 17:11:01.197402145 +0200
@@ -63,19 +63,7 @@ struct client_info {
 	int refcnt;
 };
 
-/*
- * This should be private in group.c, but the asynchronous VDI deletion
- * prevents this currently.
- */
-struct vnode_info {
-	struct sd_vnode entries[SD_MAX_VNODES];
-	int nr_vnodes;
-	int nr_zones;
-	int refcnt;
-};
-
-
-struct request;
+struct vnode_info;
 
 struct request {
 	struct event_struct cev;
@@ -264,8 +252,9 @@ int get_vdi_attr(struct vnode_info *vnod
 		int excl, int delete);
 
 int get_zones_nr_from(struct sd_node *nodes, int nr_nodes);
-struct vnode_info *get_vnode_info(void);
-void put_vnode_info(struct vnode_info *vnodes);
+struct vnode_info *grab_vnode_info(struct vnode_info *vnode_info);
+void put_vnode_info(struct vnode_info *vnode_info);
+struct vnode_info *get_current_vnode_info(void);
 
 struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
 		int copy_idx);
Index: sheepdog/sheep/vdi.c
===================================================================
--- sheepdog.orig/sheep/vdi.c	2012-05-07 10:56:03.861463810 +0200
+++ sheepdog/sheep/vdi.c	2012-05-07 17:15:54.865409669 +0200
@@ -354,7 +354,7 @@ struct deletion_work {
 	int count;
 	uint32_t *buf;
 
-	struct vnode_info vnode_info;
+	struct vnode_info *vnode_info;
 	int delete_error;
 };
 
@@ -372,9 +372,9 @@ static int delete_inode(struct deletion_
 		goto out;
 	}
 
-	nr_copies = get_nr_copies(&dw->vnode_info);
+	nr_copies = get_nr_copies(dw->vnode_info);
 
-	ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
+	ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
 			  (char *)inode, SD_INODE_HEADER_SIZE, 0, nr_copies);
 	if (ret != SD_RES_SUCCESS) {
 		ret = SD_RES_EIO;
@@ -386,7 +386,7 @@ static int delete_inode(struct deletion_
 	else
 		memset(inode->name, 0, sizeof(inode->name));
 
-	ret = write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
+	ret = write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(dw->vid),
 			   (char *)inode, SD_INODE_HEADER_SIZE, 0, 0,
 			   nr_copies, 0);
 	if (ret != 0) {
@@ -416,9 +416,9 @@ static void delete_one(struct work *work
 		goto out;
 	}
 
-	nr_copies = get_nr_copies(&dw->vnode_info);
+	nr_copies = get_nr_copies(dw->vnode_info);
 
-	ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
+	ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
 			  (void *)inode, sizeof(*inode),
 			  0, nr_copies);
 
@@ -437,7 +437,7 @@ static void delete_one(struct work *work
 			continue;
 		}
 
-		ret = remove_object(&dw->vnode_info, dw->epoch,
+		ret = remove_object(dw->vnode_info, dw->epoch,
 			      vid_to_data_oid(inode->data_vdi_id[i], i),
 			      nr_copies);
 
@@ -448,7 +448,7 @@ static void delete_one(struct work *work
 	}
 
 	if (dw->delete_error) {
-		write_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
+		write_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vdi_id),
 			     (void *)inode, sizeof(*inode), 0, 0, nr_copies, 0);
 	}
 
@@ -470,6 +470,7 @@ static void delete_one_done(struct work
 
 	list_del(&dw->dw_siblings);
 
+	put_vnode_info(dw->vnode_info);
 	free(dw->buf);
 	free(dw);
 
@@ -489,7 +490,7 @@ static int fill_vdi_list(struct deletion
 	uint32_t vid;
 	int nr_copies;
 
-	nr_copies = get_nr_copies(&dw->vnode_info);
+	nr_copies = get_nr_copies(dw->vnode_info);
 
 	inode = malloc(SD_INODE_HEADER_SIZE);
 	if (!inode) {
@@ -500,7 +501,7 @@ static int fill_vdi_list(struct deletion
 	dw->buf[dw->count++] = root_vid;
 again:
 	vid = dw->buf[done++];
-	ret = read_object(&dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid),
+	ret = read_object(dw->vnode_info, dw->epoch, vid_to_vdi_oid(vid),
 			  (char *)inode, SD_INODE_HEADER_SIZE, 0,
 			  nr_copies);
 
@@ -614,18 +615,12 @@ int del_vdi(struct vnode_info *vnode_inf
 	dw->count = 0;
 	dw->vid = *vid;
 	dw->epoch = epoch;
+	dw->vnode_info = grab_vnode_info(vnode_info);
 
 	dw->work.fn = delete_one;
 	dw->work.done = delete_one_done;
 
-	/*
-	 * Unfortunately we have to copy over the vnode_info array here,
-	 * as we can't grab additional references to it outside the main
-	 * thread.
-	 */
-	memcpy(&dw->vnode_info, vnode_info, sizeof(*vnode_info));
-
-	root_vid = get_vdi_root(&dw->vnode_info, dw->epoch, dw->vid);
+	root_vid = get_vdi_root(dw->vnode_info, dw->epoch, dw->vid);
 	if (!root_vid) {
 		ret = SD_RES_EIO;
 		goto out_free_buf;



More information about the sheepdog mailing list