[Sheepdog] [PATCH v4 06/12] trace: low-level trace infrastructure proper

Liu Yuan namei.unix at gmail.com
Tue Apr 10 09:02:01 CEST 2012


From: Liu Yuan <tailai.ly at taobao.com>

We achieve this by dynamically patching the binary code at runtime,
making use of GCC's -pg instrumentation, which inserts a 5-byte
'call mcount' instruction into every function.

When the trace is
 turned off, the instruction is patched to a 5-byte NOP.
 turned on, the instruction is patched to a call into our trace
 functions, where we can hook callbacks for both function entry and exit.
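
At the byte level this looks roughly as follows (illustrative; NOP5 is
the Intel-recommended 5-byte NOP defined in mcount.S, and 0xe8 is the
x86 near-call opcode):

  trace off:  0f 1f 44 00 00   # 5-byte NOP (NOP5)
  trace on:   e8 xx xx xx xx   # call rel32, pointing at trace_caller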

Hooking both entry and exit is how the 'graph' tracer actually
calculates the time spent in each function.
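
A tracer plugs itself in through register_trace_function() (or the
register_tracer() convenience macro added to trace.h).  A minimal
sketch, where graph_entry is a made-up name (the real graph tracer
arrives later in this series):

  /* illustrative only: a do-nothing tracer hooked at function entry */
  static void graph_entry(unsigned long ip, unsigned long *parent_ip)
  {
          /* record a timestamp keyed by ip here; rewriting *parent_ip
           * is what lets a tracer hook the function's return path too */
  }

  register_tracer(graph_entry);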

Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
---
 include/list.h       |    1 +
 sheep/sheep.c        |    5 +
 sheep/trace/mcount.S |    4 +
 sheep/trace/trace.c  |  269 ++++++++++++++++++++++++++++++++++++++++++++++++++
 sheep/trace/trace.h  |   35 +++++++
 5 files changed, 314 insertions(+), 0 deletions(-)
 create mode 100644 sheep/trace/trace.c
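
For reviewers, an illustrative view of the runtime API this patch adds
(assuming a tracer has been registered beforehand):

  if (trace_enable() == SD_RES_SUCCESS) {
          /* the workload now runs with every mcount call site
           * patched to call trace_caller */
          trace_disable();        /* all sites are NOPed again */
  }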

diff --git a/include/list.h b/include/list.h
index 1c2dcac..38870e6 100644
--- a/include/list.h
+++ b/include/list.h
@@ -142,6 +142,7 @@ static inline int hlist_unhashed(const struct hlist_node *h)
 	return !h->pprev;
 }
 
+__attribute__((always_inline))
 static inline int hlist_empty(const struct hlist_head *h)
 {
 	return !h->first;
diff --git a/sheep/sheep.c b/sheep/sheep.c
index fd98044..89e2d37 100644
--- a/sheep/sheep.c
+++ b/sheep/sheep.c
@@ -21,6 +21,7 @@
 #include <sys/syslog.h>
 
 #include "sheep_priv.h"
+#include "trace/trace.h"
 
 #define EPOLL_SIZE 4096
 #define DEFAULT_OBJECT_DIR "/tmp"
@@ -229,6 +230,10 @@ int main(int argc, char **argv)
 	ret = init_signal();
 	if (ret)
 		exit(1);
+
+	ret = trace_init();
+	if (ret)
+		exit(1);
 	vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION);
 
 	while (!sys_stat_shutdown() || sys->nr_outstanding_reqs != 0)
diff --git a/sheep/trace/mcount.S b/sheep/trace/mcount.S
index c16e5ae..5f1e6b5 100644
--- a/sheep/trace/mcount.S
+++ b/sheep/trace/mcount.S
@@ -64,3 +64,7 @@ trace_call:
 .globl trace_stub
 trace_stub:
 	retq
+
+.globl NOP5
+NOP5:
+	.byte 0x0f,0x1f,0x44,0x00,0x00 # Intel-recommended 5-byte NOP
diff --git a/sheep/trace/trace.c b/sheep/trace/trace.c
new file mode 100644
index 0000000..225b715
--- /dev/null
+++ b/sheep/trace/trace.c
@@ -0,0 +1,269 @@
+#include <string.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <signal.h>
+
+#include "trace.h"
+#include "logger.h"
+#include "list.h"
+#include "work.h"
+#include "sheepdog_proto.h"
+
+#define TRACE_HASH_BITS       7
+#define TRACE_HASH_SIZE       (1 << TRACE_HASH_BITS)
+
+static struct hlist_head trace_hashtable[TRACE_HASH_SIZE];
+static LIST_HEAD(caller_list);
+static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static trace_func_t trace_func = trace_call;
+static int trace_count;
+
+pthread_cond_t trace_cond = PTHREAD_COND_INITIALIZER;
+pthread_mutex_t trace_mux = PTHREAD_MUTEX_INITIALIZER;
+
+union instruction {
+	unsigned char start[INSN_SIZE];
+	struct {
+		char opcode;
+		int offset;
+	} __attribute__((packed));
+};
+
+static notrace void suspend(int num)
+{
+	dprintf("worker thread %u going to suspend\n", (unsigned)pthread_self());
+
+	pthread_mutex_lock(&trace_mux);
+	trace_count--;
+	pthread_cond_wait(&trace_cond, &trace_mux);
+	pthread_mutex_unlock(&trace_mux);
+	dprintf("worker thread going to resume\n");
+}
+
+static inline int trace_hash(unsigned long ip)
+{
+	return hash_64(ip, TRACE_HASH_BITS);
+}
+
+static notrace unsigned char *get_new_call(unsigned long ip, unsigned long addr)
+{
+	static union instruction code;
+
+	code.opcode = 0xe8; /* opcode of call */
+	code.offset = (int)(addr - ip - INSN_SIZE);
+
+	return code.start;
+}
+
+static notrace void replace_call(unsigned long ip, unsigned long func)
+{
+	unsigned char *new;
+
+	new = get_new_call(ip, func);
+	memcpy((void *)ip, new, INSN_SIZE);
+}
+
+static inline void replace_mcount_call(unsigned long func)
+{
+	unsigned long ip = (unsigned long)mcount_call;
+
+	replace_call(ip, func);
+}
+
+static inline void replace_trace_call(unsigned long func)
+{
+	unsigned long ip = (unsigned long)trace_call;
+
+	replace_call(ip, func);
+}
+
+static notrace int make_text_writable(unsigned long ip)
+{
+	unsigned long start = ip & ~(getpagesize() - 1);
+
+	return mprotect((void *)start, getpagesize() + INSN_SIZE, PROT_READ | PROT_EXEC | PROT_WRITE);
+}
+
+notrace struct caller *trace_lookup_ip(unsigned long ip, int create)
+{
+	int h = trace_hash(ip);
+	struct hlist_head *head = trace_hashtable + h;
+	struct hlist_node *node;
+	struct ipinfo info;
+	struct caller *new = NULL;
+
+	pthread_mutex_lock(&trace_lock);
+	if (hlist_empty(head))
+		goto not_found;
+
+	hlist_for_each_entry(new, node, head, hash) {
+		if (new->mcount == ip)
+			goto out;
+	}
+not_found:
+	if (get_ipinfo(ip, &info) < 0) {
+		dprintf("ip: %lx not found\n", ip);
+		new = NULL;
+		goto out;
+	}
+	if (create) {
+		new = malloc(sizeof(*new));
+		if (!new) {
+			eprintf("out of memory\n");
+			goto out;
+		}
+		new->mcount = ip;
+		new->namelen = info.fn_namelen;
+		new->name = info.fn_name;
+		hlist_add_head(&new->hash, head);
+		list_add(&new->list, &caller_list);
+		dprintf("add %.*s\n", info.fn_namelen, info.fn_name);
+	} else {
+		dprintf("%.*s not found\n", info.fn_namelen, info.fn_name);
+		new = NULL;
+	}
+out:
+	pthread_mutex_unlock(&trace_lock);
+	return new;
+}
+
+/*
+ * NOP out all the mcount call sites that are supposed to be traced.
+ * Tracing is enabled later by patching these sites to call trace_caller,
+ * where we can override trace_call() with our own trace function.  This
+ * works because the function below records the IP of every 'call mcount'
+ * site inside the callers.
+ *
+ * IP points to the return address of the patched call.
+ */
+static notrace void do_trace_init(unsigned long ip)
+{
+
+	if (make_text_writable(ip) < 0)
+		return;
+
+	memcpy((void *)ip, NOP5, INSN_SIZE);
+	trace_lookup_ip(ip, 1);
+}
+
+notrace int register_trace_function(trace_func_t func)
+{
+	if (make_text_writable((unsigned long)trace_call) < 0)
+		return -1;
+
+	replace_trace_call((unsigned long)func);
+	trace_func = func;
+	return 0;
+}
+
+static notrace void suspend_worker_threads(void)
+{
+	struct worker_info *wi;
+	int i;
+	trace_count = total_nr_workers;
+	list_for_each_entry(wi, &worker_info_list, worker_info_siblings) {
+		for (i = 0; i < wi->nr_threads; i++)
+			if (pthread_kill(wi->worker_thread[i], SIGUSR2) != 0)
+				dprintf("%m\n");
+	}
+wait_for_worker_suspend:
+	pthread_mutex_lock(&trace_mux);
+	if (trace_count > 0) {
+		pthread_mutex_unlock(&trace_mux);
+		pthread_yield();
+		goto wait_for_worker_suspend;
+	}
+	pthread_mutex_unlock(&trace_mux);
+}
+
+static notrace void resume_worker_threads(void)
+{
+	pthread_mutex_lock(&trace_mux);
+	pthread_cond_broadcast(&trace_cond);
+	pthread_mutex_unlock(&trace_mux);
+}
+
+static notrace void patch_all_sites(unsigned long addr)
+{
+	struct caller *ca;
+	unsigned char *new;
+
+	pthread_mutex_lock(&trace_lock);
+	list_for_each_entry(ca, &caller_list, list) {
+		new = get_new_call(ca->mcount, addr);
+		memcpy((void *)ca->mcount, new, INSN_SIZE);
+	}
+	pthread_mutex_unlock(&trace_lock);
+}
+
+static notrace void nop_all_sites(void)
+{
+	struct caller *ca;
+
+	pthread_mutex_lock(&trace_lock);
+	list_for_each_entry(ca, &caller_list, list) {
+		memcpy((void *)ca->mcount, NOP5, INSN_SIZE);
+	}
+	pthread_mutex_unlock(&trace_lock);
+}
+
+notrace int trace_enable(void)
+{
+	if (trace_func == trace_call) {
+		dprintf("no tracer available\n");
+		return SD_RES_NO_TAG;
+	}
+
+	suspend_worker_threads();
+	patch_all_sites((unsigned long)trace_caller);
+	resume_worker_threads();
+	dprintf("patch tracer done\n");
+	return SD_RES_SUCCESS;
+}
+
+notrace int trace_disable(void)
+{
+	suspend_worker_threads();
+	nop_all_sites();
+	resume_worker_threads();
+	dprintf("patch nop done\n");
+	return SD_RES_SUCCESS;
+}
+
+int init_signal(void)
+{
+	struct sigaction act;
+
+	memset(&act, 0, sizeof(act));
+	act.sa_handler = suspend;
+	/* trace uses this signal to suspend the worker threads */
+	if (sigaction(SIGUSR2, &act, NULL) < 0) {
+		dprintf("%m\n");
+		return -1;
+	}
+	return 0;
+}
+
+notrace int trace_init(void)
+{
+	sigset_t block;
+
+	sigemptyset(&block);
+	sigaddset(&block, SIGUSR2);
+	if (pthread_sigmask(SIG_BLOCK, &block, NULL) != 0) {
+		dprintf("%m\n");
+		return -1;
+	}
+
+	if (make_text_writable((unsigned long)mcount_call) < 0) {
+		dprintf("%m\n");
+		return -1;
+	}
+
+	replace_mcount_call((unsigned long)do_trace_init);
+	dprintf("main thread %u\n", (unsigned)pthread_self());
+	dprintf("trace support enabled.\n");
+	return 0;
+}
diff --git a/sheep/trace/trace.h b/sheep/trace/trace.h
index 92154c6..5dcbca5 100644
--- a/sheep/trace/trace.h
+++ b/sheep/trace/trace.h
@@ -1,6 +1,12 @@
 #ifndef TRACE_H
 #define TRACE_H
 
+#define INSN_SIZE       5       /* call(1b) + offset(4b) = 5b */
+
+#ifndef __ASSEMBLY__
+#include <stdlib.h>
+
+#include "list.h"
 #include "util.h"
 
 struct ipinfo {
@@ -12,6 +18,16 @@ struct ipinfo {
         int fn_narg;                /* Number of function arguments */
 };
 
+struct caller {
+	struct list_head list;
+	struct hlist_node hash;
+	unsigned long mcount;
+	int namelen;
+	const char *name;
+};
+
+typedef void (*trace_func_t)(unsigned long ip, unsigned long *parent_ip);
+
 /* stabs.c */
 extern int get_ipinfo(unsigned long ip, struct ipinfo *info);
 
@@ -20,5 +36,24 @@ extern void mcount(void);
 extern void mcount_call(void);
 extern void trace_caller(void);
 extern void trace_call(unsigned long, unsigned long *);
+extern const unsigned char NOP5[];
+
+/* trace.c */
+extern pthread_cond_t trace_cond;
+extern pthread_mutex_t trace_mux;
+
+extern int trace_init(void);
+extern int register_trace_function(trace_func_t func);
+extern int trace_enable(void);
+extern int trace_disable(void);
+extern struct caller *trace_lookup_ip(unsigned long ip, int create);
+
+#define register_tracer(new)			\
+static void __attribute__((constructor))	\
+register_ ## new(void)				\
+{  						\
+	register_trace_function(new);		\
+}
 
+#endif /* __ASSEMBLY__ */
 #endif
-- 
1.7.8.2



