[Sheepdog] [PATCH v3] Use RCU to avoid race on current_vnode_info

Yunkai Zhang yunkai.me at gmail.com
Mon May 7 07:59:58 CEST 2012


From: Yunkai Zhang <qiushu.zyk at taobao.com>

We have discussed this issue in maillist:
http://lists.wpkg.org/pipermail/sheepdog/2012-May/003315.html

This patch depends on a third-party usersapce RCU library, we
can get it from here: http://lttng.org/urcu.

Signed-off-by: Yunkai Zhang <qiushu.zyk at taobao.com>
---
 configure.ac       |    3 ++-
 sheep/Makefile.am  |    7 +++++--
 sheep/group.c      |   46 +++++++++++++++++++++++++++++++++-------------
 sheep/sdnet.c      |    5 +++--
 sheep/sheep_priv.h |    4 +---
 sheep/work.c       |    3 +++
 6 files changed, 47 insertions(+), 21 deletions(-)

diff --git a/configure.ac b/configure.ac
index 7a5eedf..b7af48f 100644
--- a/configure.ac
+++ b/configure.ac
@@ -77,6 +77,7 @@ AM_MISSING_PROG(AUTOM4TE, autom4te, $missing_dir)
 
 # Checks for libraries.
 AC_CHECK_LIB([socket], [socket])
+PKG_CHECK_MODULES([liburcu],[liburcu])
 
 # Checks for header files.
 AC_FUNC_ALLOCA
@@ -86,7 +87,7 @@ AC_HEADER_SYS_WAIT
 AC_CHECK_HEADERS([arpa/inet.h fcntl.h limits.h netdb.h netinet/in.h stdint.h \
 		  stdlib.h string.h sys/ioctl.h sys/param.h sys/socket.h \
 		  sys/time.h syslog.h unistd.h sys/types.h getopt.h malloc.h \
-		  sys/sockio.h utmpx.h])
+		  sys/sockio.h utmpx.h urcu.h])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_CONST
diff --git a/sheep/Makefile.am b/sheep/Makefile.am
index 7448ae1..bca365c 100644
--- a/sheep/Makefile.am
+++ b/sheep/Makefile.am
@@ -20,7 +20,8 @@ MAINTAINERCLEANFILES	= Makefile.in
 AM_CFLAGS		=
 
 INCLUDES		= -I$(top_builddir)/include -I$(top_srcdir)/include \
-			  $(libcpg_CFLAGS) $(libcfg_CFLAGS) $(libacrd_CFLAGS)
+			  $(libcpg_CFLAGS) $(libcfg_CFLAGS) $(libacrd_CFLAGS) \
+			  $(liburcu_CFLAGS)
 
 sbin_PROGRAMS		= sheep
 
@@ -47,7 +48,9 @@ sheep_SOURCES		+= trace/trace.c trace/mcount.S trace/stabs.c trace/graph.c
 endif
 
 sheep_LDADD	  	= ../lib/libsheepdog.a -lpthread \
-			  $(libcpg_LIBS) $(libcfg_LIBS) $(libacrd_LIBS) $(LIBS)
+			  $(libcpg_LIBS) $(libcfg_LIBS) $(libacrd_LIBS) $(LIBS) \
+			  $(liburcu_LIBS)
+
 sheep_DEPENDENCIES	= ../lib/libsheepdog.a
 
 
diff --git a/sheep/group.c b/sheep/group.c
index c7fd387..cd95ba7 100644
--- a/sheep/group.c
+++ b/sheep/group.c
@@ -13,6 +13,7 @@
 #include <stdlib.h>
 #include <unistd.h>
 #include <netdb.h>
+#include <urcu.h>
 #include <arpa/inet.h>
 #include <sys/time.h>
 #include <sys/epoll.h>
@@ -36,7 +37,7 @@ struct vnode_info {
 	struct sd_vnode entries[SD_MAX_VNODES];
 	int nr_vnodes;
 	int nr_zones;
-	int refcnt;
+	struct rcu_head rcu;
 };
 
 struct join_message {
@@ -104,7 +105,6 @@ struct work_leave {
 })
 
 static int event_running;
-static struct vnode_info *current_vnode_info;
 
 static size_t get_join_message_size(struct join_message *jm)
 {
@@ -155,15 +155,27 @@ int get_max_nr_copies_from(struct sd_node *nodes, int nr_nodes)
 
 struct vnode_info *get_vnode_info(void)
 {
-	assert(current_vnode_info);
-	current_vnode_info->refcnt++;
-	return current_vnode_info;
+	struct vnode_info *vnodes, *p;
+
+	assert(sys->current_vnode_info);
+
+	vnodes = zalloc(sizeof(*vnodes));
+	if (!vnodes)
+		panic("failed to allocate memory\n");
+
+	rcu_read_lock();
+
+	p = rcu_dereference(sys->current_vnode_info);
+	memcpy(vnodes, p, sizeof(*p));
+
+	rcu_read_unlock();
+
+	return vnodes;
 }
 
 void put_vnode_info(struct vnode_info *vnodes)
 {
-	if (vnodes && --vnodes->refcnt == 0)
-		free(vnodes);
+	free(vnodes);
 }
 
 struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
@@ -176,10 +188,16 @@ struct sd_vnode *oid_to_vnode(struct vnode_info *vnode_info, uint64_t oid,
 	return &vnode_info->entries[n];
 }
 
+static void vnode_info_reclaim(struct rcu_head *head)
+{
+	struct vnode_info *vnodes;
+	vnodes = container_of(head, struct vnode_info, rcu);
+	free(vnodes);
+}
 
 static int update_vnode_info(void)
 {
-	struct vnode_info *vnode_info;
+	struct vnode_info *vnode_info, *old_vnode_info;
 
 	vnode_info = zalloc(sizeof(*vnode_info));
 	if (!vnode_info) {
@@ -190,10 +208,12 @@ static int update_vnode_info(void)
 	vnode_info->nr_vnodes = nodes_to_vnodes(sys->nodes, sys->nr_nodes,
 						vnode_info->entries);
 	vnode_info->nr_zones = get_zones_nr_from(sys->nodes, sys->nr_nodes);
-	vnode_info->refcnt = 1;
 
-	put_vnode_info(current_vnode_info);
-	current_vnode_info = vnode_info;
+	old_vnode_info = sys->current_vnode_info;
+	rcu_assign_pointer(sys->current_vnode_info, vnode_info);
+	if (old_vnode_info)
+		call_rcu(&old_vnode_info->rcu, vnode_info_reclaim);
+
 	return 0;
 }
 
@@ -841,7 +861,7 @@ static void __sd_join_done(struct event_struct *cevent)
 	}
 
 	if (sys_stat_halt()) {
-		if (current_vnode_info->nr_zones >= sys->nr_copies)
+		if (sys->current_vnode_info->nr_zones >= sys->nr_copies)
 			sys_stat_set(SD_STATUS_OK);
 	}
 
@@ -871,7 +891,7 @@ static void __sd_leave_done(struct event_struct *cevent)
 		start_recovery(sys->epoch);
 
 	if (sys_can_halt()) {
-		if (current_vnode_info->nr_zones < sys->nr_copies)
+		if (sys->current_vnode_info->nr_zones < sys->nr_copies)
 			sys_stat_set(SD_STATUS_HALT);
 	}
 }
diff --git a/sheep/sdnet.c b/sheep/sdnet.c
index f59b1ff..c8c9bdb 100644
--- a/sheep/sdnet.c
+++ b/sheep/sdnet.c
@@ -303,7 +303,9 @@ static void queue_request(struct request *req)
 	 * called before we set up current_vnode_info
 	 */
 	if (!is_force_op(req->op))
-		req->vnodes = get_vnode_info();
+		/* We can reference current_vnode_info safely
+		 * because we are in main thread here */
+		req->vnodes = sys->current_vnode_info;
 
 	if (is_io_op(req->op)) {
 		req->work.fn = do_io_request;
@@ -379,7 +381,6 @@ static void free_request(struct request *req)
 	sys->outstanding_data_size -= req->data_length;
 
 	list_del(&req->r_siblings);
-	put_vnode_info(req->vnodes);
 	free(req->data);
 	free(req);
 }
diff --git a/sheep/sheep_priv.h b/sheep/sheep_priv.h
index 2275a93..c3a152b 100644
--- a/sheep/sheep_priv.h
+++ b/sheep/sheep_priv.h
@@ -123,9 +123,7 @@ struct cluster_info {
 	struct sd_node nodes[SD_MAX_NODES];
 	int nr_nodes;
 
-	/* this array contains a list of ordered virtual nodes */
-	struct sd_vnode vnodes[SD_MAX_VNODES];
-	int nr_vnodes;
+	struct vnode_info *current_vnode_info;
 
 	struct list_head pending_list;
 
diff --git a/sheep/work.c b/sheep/work.c
index 8564cb2..84ce3c2 100644
--- a/sheep/work.c
+++ b/sheep/work.c
@@ -19,6 +19,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <fcntl.h>
+#include <urcu.h>
 #include <stdlib.h>
 #include <syscall.h>
 #include <sys/types.h>
@@ -172,6 +173,7 @@ static void *worker_routine(void *arg)
 	/* started this thread */
 	pthread_mutex_unlock(&wi->startup_lock);
 
+	rcu_register_thread();
 	while (!(wi->q.wq_state & WQ_DEAD)) {
 
 		pthread_mutex_lock(&wi->pending_lock);
@@ -200,6 +202,7 @@ retest:
 		eventfd_write(efd, value);
 	}
 
+	rcu_unregister_thread();
 	pthread_exit(NULL);
 }
 
-- 
1.7.7.6




More information about the sheepdog mailing list