[Sheepdog] [PATCH v3 06/13] trace: low-level trace infrastructure proper

MORITA Kazutaka morita.kazutaka at gmail.com
Thu Apr 5 18:09:32 CEST 2012


At Thu,  1 Mar 2012 10:20:18 +0800,
Liu Yuan wrote:
> 
> From: Liu Yuan <tailai.ly at taobao.com>
> 
> 
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> ---
>  sheep/sheep.c        |    5 +
>  sheep/trace/mcount.S |    4 +
>  sheep/trace/trace.c  |  254 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  sheep/trace/trace.h  |   35 +++++++
>  4 files changed, 298 insertions(+), 0 deletions(-)
>  create mode 100644 sheep/trace/trace.c
> 
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index f76190a..2662bd8 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -21,6 +21,7 @@
>  #include <sys/syslog.h>
>  
>  #include "sheep_priv.h"
> +#include "trace/trace.h"
>  
>  #define EPOLL_SIZE 4096
>  #define DEFAULT_OBJECT_DIR "/tmp"
> @@ -222,6 +223,10 @@ int main(int argc, char **argv)
>  	ret = init_signal();
>  	if (ret)
>  		exit(1);
> +
> +	ret = trace_init();
> +	if (ret)
> +		exit(1);
>  	vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION);
>  
>  	while (!sys_stat_shutdown() || sys->nr_outstanding_reqs != 0)
> diff --git a/sheep/trace/mcount.S b/sheep/trace/mcount.S
> index c16e5ae..5f1e6b5 100644
> --- a/sheep/trace/mcount.S
> +++ b/sheep/trace/mcount.S
> @@ -64,3 +64,7 @@ trace_call:
>  .globl trace_stub
>  trace_stub:
>  	retq
> +
> +.globl NOP5
> +NOP5:
> +	.byte 0x0f,0x1f,0x44,0x00,0x00 # Intel recommended one for 5 bytes nops
> diff --git a/sheep/trace/trace.c b/sheep/trace/trace.c
> new file mode 100644
> index 0000000..463d667
> --- /dev/null
> +++ b/sheep/trace/trace.c
> @@ -0,0 +1,254 @@
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <unistd.h>
> +#include <pthread.h>
> +#include <signal.h>
> +
> +#include "trace.h"
> +#include "logger.h"
> +#include "list.h"
> +#include "work.h"
> +#include "sheepdog_proto.h"
> +
> +#define TRACE_HASH_BITS       7
> +#define TRACE_HASH_SIZE       (1 << TRACE_HASH_BITS)
> +
> +static struct hlist_head trace_hashtable[TRACE_HASH_SIZE];
> +static LIST_HEAD(caller_list);
> +static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
> +
> +static trace_func_t trace_func = trace_call;
> +static int trace_count;
> +
> +pthread_cond_t trace_cond = PTHREAD_COND_INITIALIZER;
> +pthread_mutex_t trace_mux = PTHREAD_MUTEX_INITIALIZER;
> +
> +union instruction {
> +	unsigned char start[INSN_SIZE];
> +	struct {
> +		char opcode;
> +		int offset;
> +	} __attribute__((packed));
> +};
> +
> +notrace void suspend(int num)
> +{
> +	dprintf("worker thread %u going to suspend\n", (int)pthread_self());
> +
> +	pthread_mutex_lock(&trace_mux);
> +	trace_count--;
> +	pthread_cond_wait(&trace_cond, &trace_mux);
> +	pthread_mutex_unlock(&trace_mux);
> +	dprintf("worker thread going to resume\n");
> +}
> +
> +static inline int trace_hash(unsigned long ip)
> +{
> +	return hash_64(ip, TRACE_HASH_BITS);
> +}
> +
> +static notrace unsigned char *get_new_call(unsigned long ip, unsigned long addr)
> +{
> +	static union instruction code;
> +
> +	code.opcode = 0xe8; /* opcode of call */
> +	code.offset = (int)(addr - ip - INSN_SIZE);
> +
> +	return code.start;
> +}
> +
> +static notrace void replace_call(unsigned long ip, unsigned long func)
> +{
> +	unsigned char *new;
> +
> +	new = get_new_call(ip, func);
> +	memcpy((void *)ip, new, INSN_SIZE);
> +}
> +
> +static inline void replace_mcount_call(unsigned long func)
> +{
> +	unsigned long ip = (unsigned long)mcount_call;
> +
> +	replace_call(ip, func);
> +}
> +
> +static inline void replace_trace_call(unsigned long func)
> +{
> +	unsigned long ip = (unsigned long)trace_call;
> +
> +	replace_call(ip, func);
> +}
> +
> +static notrace int make_text_writable(unsigned long ip)
> +{
> +	unsigned long start = ip & ~(getpagesize() - 1);
> +
> +	return mprotect((void *)start, INSN_SIZE, PROT_READ | PROT_EXEC | PROT_WRITE);

IIUC, the second argument should be "getpagesize() + INSN_SIZE".
mprotect() changes protection on whole pages, so passing INSN_SIZE
makes only the page containing 'start' writable; a call site in the
last INSN_SIZE - 1 bytes of that page spills its 5-byte instruction
into the next page, which stays read-only.
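
E.g. (untested sketch, just restating the suggestion above):

static notrace int make_text_writable(unsigned long ip)
{
	unsigned long start = ip & ~(getpagesize() - 1);

	/* mprotect() operates on whole pages; covering
	 * getpagesize() + INSN_SIZE bytes from the page start also
	 * catches a call site that crosses the page boundary. */
	return mprotect((void *)start, getpagesize() + INSN_SIZE,
			PROT_READ | PROT_EXEC | PROT_WRITE);
}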


> +}
> +
> +notrace struct caller *trace_lookup_ip(unsigned long ip, int create)
> +{
> +	int h = trace_hash(ip);
> +	struct hlist_head *head = trace_hashtable + h;
> +	struct hlist_node *node;
> +	struct ipinfo info;
> +	struct caller *new = NULL;
> +
> +	pthread_mutex_lock(&trace_lock);
> +	if (hlist_empty(head))

If hlist_empty() is not inlined, its mcount hook calls back into
trace_lookup_ip() while trace_lock is already held, and that results
in a deadlock.

How about preventing recursive calls into trace_lookup_ip()?

E.g.:

notrace struct caller *trace_lookup_ip(unsigned long ip, int create)
{
        int h;
        struct hlist_head *head;
        struct hlist_node *node;
        struct ipinfo info;
        struct caller *new = NULL;
        static __thread int looking_up_ip;

        if (looking_up_ip)
                return NULL;

        looking_up_ip = 1;

        pthread_mutex_lock(&trace_lock);

        ...

        pthread_mutex_unlock(&trace_lock);

        looking_up_ip = 0;
        return new;
}
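
(The __thread qualifier gives each thread its own looking_up_ip flag,
so the guard rejects only re-entry from the same thread and does not
serialize lookups across threads.)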


> +		goto not_found;
> +
> +	hlist_for_each_entry(new, node, head, hash) {
> +		if (new->mcount == ip)
> +			goto out;
> +	}
> +not_found:
> +	if (get_ipinfo(ip, &info) < 0) {
> +		dprintf("ip: %lx not found\n", ip);
> +		new = NULL;
> +		goto out;
> +	}
> +	if (create) {
> +		/* unlock to avoid deadlock */
> +		pthread_mutex_unlock(&trace_lock);

I think we don't need the unlock here if we prevent recursion; it was
only there because xzalloc() can re-enter trace_lookup_ip() through
its mcount hook, and the __thread guard makes that re-entry return
NULL immediately.
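
The creation path could then stay under trace_lock the whole time,
e.g.:

	if (create) {
		new = xzalloc(sizeof(*new));
		new->mcount = ip;
		new->namelen = info.fn_namelen;
		new->name = info.fn_name;
		hlist_add_head(&new->hash, head);
		list_add(&new->list, &caller_list);
		dprintf("add %.*s\n", info.fn_namelen, info.fn_name);
	}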


> +		new = xzalloc(sizeof(*new));
> +		pthread_mutex_lock(&trace_lock);
> +		new->mcount = ip;
> +		new->namelen = info.fn_namelen;
> +		new->name = info.fn_name;
> +		hlist_add_head(&new->hash, head);
> +		list_add(&new->list, &caller_list);
> +		dprintf("add %.*s\n", info.fn_namelen, info.fn_name);
> +	} else {
> +		dprintf("%.*s\n not found", info.fn_namelen, info.fn_name);
> +		new = NULL;
> +	}
> +out:
> +	pthread_mutex_unlock(&trace_lock);
> +	return new;
> +}
> +
> +/*
> + * Try to NOP all the mcount call sites that are supposed to be traced.
> + * Later we can enable it by asking these sites to point to trace_caller,
> + * where we can override trace_call() with our own trace function. We can
> + * do this, because below function record the IP of 'call mcount' inside the
> + * callers.
> + *
> + * IP points to the return address.
> + */
> +static notrace void do_trace_init(unsigned long ip)
> +{
> +
> +	if (make_text_writable(ip) < 0)
> +		return;
> +
> +	memcpy((void *)ip, NOP5, INSN_SIZE);
> +	trace_lookup_ip(ip, 1);
> +}
> +
> +notrace int register_trace_function(trace_func_t func)
> +{
> +	if (make_text_writable((unsigned long)trace_call) < 0)
> +		return -1;
> +
> +	replace_trace_call((unsigned long)func);
> +	trace_func = func;
> +	return 0;
> +}
> +
> +static notrace void suspend_worker_threads(void)
> +{
> +	struct worker_info *wi;
> +	int i;
> +	trace_count = total_nr_workers;
> +	list_for_each_entry(wi, &worker_info_list, worker_info_siblings) {
> +		for (i = 0; i < wi->nr_threads; i++)
> +			if (pthread_kill(wi->worker_thread[i], SIGUSR1) != 0)

SIGUSR1 is already used by the cluster driver 'local'.  Let's use
SIGUSR2.
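
I.e. something like the following; I assume the signal registration
that installs suspend() as the handler, which is not in this hunk,
gets the same change:

-			if (pthread_kill(wi->worker_thread[i], SIGUSR1) != 0)
+			if (pthread_kill(wi->worker_thread[i], SIGUSR2) != 0)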

> +				dprintf("%m\n");
> +	}
> +wait_for_worker_suspend:
> +	pthread_mutex_lock(&trace_mux);
> +	if (trace_count > 0) {
> +		pthread_mutex_unlock(&trace_mux);
> +		pthread_yield();
> +		goto wait_for_worker_suspend;
> +	}
> +	pthread_mutex_unlock(&trace_mux);
> +}
> +
> +static notrace void resume_worker_threads(void)
> +{
> +	pthread_mutex_lock(&trace_mux);
> +	pthread_cond_broadcast(&trace_cond);
> +	pthread_mutex_unlock(&trace_mux);
> +}
> +
> +static notrace void patch_all_sites(unsigned long addr)
> +{
> +	struct caller *ca;
> +	unsigned char *new;
> +
> +	pthread_mutex_lock(&trace_lock);
> +	list_for_each_entry(ca, &caller_list, list) {
> +		new = get_new_call(ca->mcount, addr);
> +		memcpy((void *)ca->mcount, new, INSN_SIZE);
> +	}

IIUC, this makes no change to functions that have never been called:
do_trace_init() records a call site only when its function first runs,
so sites that have never executed are absent from caller_list and are
skipped here.  Is this behavior what you would expect?

Thanks,

Kazutaka


