[Sheepdog] [PATCH v3 06/13] trace: low-level trace infrastructure proper

Liu Yuan namei.unix at gmail.com
Mon Apr 9 03:28:40 CEST 2012


On 04/06/2012 12:09 AM, MORITA Kazutaka wrote:

> At Thu,  1 Mar 2012 10:20:18 +0800,
> Liu Yuan wrote:
>>
>> From: Liu Yuan <tailai.ly at taobao.com>
>>
>>
>> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
>> ---
>>  sheep/sheep.c        |    5 +
>>  sheep/trace/mcount.S |    4 +
>>  sheep/trace/trace.c  |  254 ++++++++++++++++++++++++++++++++++++++++++++++++++
>>  sheep/trace/trace.h  |   35 +++++++
>>  4 files changed, 298 insertions(+), 0 deletions(-)
>>  create mode 100644 sheep/trace/trace.c
>>
>> diff --git a/sheep/sheep.c b/sheep/sheep.c
>> index f76190a..2662bd8 100644
>> --- a/sheep/sheep.c
>> +++ b/sheep/sheep.c
>> @@ -21,6 +21,7 @@
>>  #include <sys/syslog.h>
>>  
>>  #include "sheep_priv.h"
>> +#include "trace/trace.h"
>>  
>>  #define EPOLL_SIZE 4096
>>  #define DEFAULT_OBJECT_DIR "/tmp"
>> @@ -222,6 +223,10 @@ int main(int argc, char **argv)
>>  	ret = init_signal();
>>  	if (ret)
>>  		exit(1);
>> +
>> +	ret = trace_init();
>> +	if (ret)
>> +		exit(1);
>>  	vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION);
>>  
>>  	while (!sys_stat_shutdown() || sys->nr_outstanding_reqs != 0)
>> diff --git a/sheep/trace/mcount.S b/sheep/trace/mcount.S
>> index c16e5ae..5f1e6b5 100644
>> --- a/sheep/trace/mcount.S
>> +++ b/sheep/trace/mcount.S
>> @@ -64,3 +64,7 @@ trace_call:
>>  .globl trace_stub
>>  trace_stub:
>>  	retq
>> +
>> +.globl NOP5
>> +NOP5:
>> +	.byte 0x0f,0x1f,0x44,0x00,0x00 # Intel recommended one for 5 bytes nops
>> diff --git a/sheep/trace/trace.c b/sheep/trace/trace.c
>> new file mode 100644
>> index 0000000..463d667
>> --- /dev/null
>> +++ b/sheep/trace/trace.c
>> @@ -0,0 +1,254 @@
>> +#include <string.h>
>> +#include <sys/mman.h>
>> +#include <unistd.h>
>> +#include <pthread.h>
>> +#include <signal.h>
>> +
>> +#include "trace.h"
>> +#include "logger.h"
>> +#include "list.h"
>> +#include "work.h"
>> +#include "sheepdog_proto.h"
>> +
>> +#define TRACE_HASH_BITS       7
>> +#define TRACE_HASH_SIZE       (1 << TRACE_HASH_BITS)
>> +
>> +static struct hlist_head trace_hashtable[TRACE_HASH_SIZE];
>> +static LIST_HEAD(caller_list);
>> +static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
>> +
>> +static trace_func_t trace_func = trace_call;
>> +static int trace_count;
>> +
>> +pthread_cond_t trace_cond = PTHREAD_COND_INITIALIZER;
>> +pthread_mutex_t trace_mux = PTHREAD_MUTEX_INITIALIZER;
>> +
>> +union instruction {
>> +	unsigned char start[INSN_SIZE];
>> +	struct {
>> +		char opcode;
>> +		int offset;
>> +	} __attribute__((packed));
>> +};
>> +
>> +notrace void suspend(int num)
>> +{
>> +	dprintf("worker thread %u going to suspend\n", (int)pthread_self());
>> +
>> +	pthread_mutex_lock(&trace_mux);
>> +	trace_count--;
>> +	pthread_cond_wait(&trace_cond, &trace_mux);
>> +	pthread_mutex_unlock(&trace_mux);
>> +	dprintf("worker thread going to resume\n");
>> +}
>> +
>> +static inline int trace_hash(unsigned long ip)
>> +{
>> +	return hash_64(ip, TRACE_HASH_BITS);
>> +}
>> +
>> +static notrace unsigned char *get_new_call(unsigned long ip, unsigned long addr)
>> +{
>> +	static union instruction code;
>> +
>> +	code.opcode = 0xe8; /* opcode of call */
>> +	code.offset = (int)(addr - ip - INSN_SIZE);
>> +
>> +	return code.start;
>> +}
>> +
>> +static notrace void replace_call(unsigned long ip, unsigned long func)
>> +{
>> +	unsigned char *new;
>> +
>> +	new = get_new_call(ip, func);
>> +	memcpy((void *)ip, new, INSN_SIZE);
>> +}
>> +
>> +static inline void replace_mcount_call(unsigned long func)
>> +{
>> +	unsigned long ip = (unsigned long)mcount_call;
>> +
>> +	replace_call(ip, func);
>> +}
>> +
>> +static inline void replace_trace_call(unsigned long func)
>> +{
>> +	unsigned long ip = (unsigned long)trace_call;
>> +
>> +	replace_call(ip, func);
>> +}
>> +
>> +static notrace int make_text_writable(unsigned long ip)
>> +{
>> +	unsigned long start = ip & ~(getpagesize() - 1);
>> +
>> +	return mprotect((void *)start, INSN_SIZE, PROT_READ | PROT_EXEC | PROT_WRITE);
> 
> IIUC, the second argument should be "getpagesize() + INSN_SIZE".
> 


Oh, yes, good catch.
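
Something like this should do (untested sketch; the length is measured
from the page start, so mprotect() also covers the next page when the
patched instruction straddles a page boundary):

	static notrace int make_text_writable(unsigned long ip)
	{
		unsigned long start = ip & ~(getpagesize() - 1);
		size_t len = ip + INSN_SIZE - start;

		return mprotect((void *)start, len,
				PROT_READ | PROT_EXEC | PROT_WRITE);
	}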

> 
>> +}
>> +
>> +notrace struct caller *trace_lookup_ip(unsigned long ip, int create)
>> +{
>> +	int h = trace_hash(ip);
>> +	struct hlist_head *head = trace_hashtable + h;
>> +	struct hlist_node *node;
>> +	struct ipinfo info;
>> +	struct caller *new = NULL;
>> +
>> +	pthread_mutex_lock(&trace_lock);
>> +	if (hlist_empty(head))
> 
> hlist_empty() calls trace_lookup_ip() again if it is not inlined, and
> it results in a deadlock.
> 
> How about preventing a recursive call of trace_lookup_ip()?
> 
> E.g.:
> 
> notrace struct caller *trace_lookup_ip(unsigned long ip, int create)
> {
>         int h;
>         struct hlist_head *head;
>         struct hlist_node *node;
>         struct ipinfo info;
>         struct caller *new = NULL;
>         static __thread int looking_up_ip;
> 
>         if (looking_up_ip)
>                 return NULL;
> 
>         looking_up_ip = 1;
> 
>         pthread_mutex_lock(&trace_lock);
> 
>         ...
> 
>         pthread_mutex_unlock(&trace_lock);
> 
>         looking_up_ip = 0;
>         return new;
> }
>


Nope, this brings more problems than it resolves:

1) returning NULL while the entry actually exists will break the
current trace logic
2) a __thread variable does nothing to exclude concurrent access
3) a one-line function like hlist_empty() will be inlined by any sane
compiler that supports it.
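
If we really want to be defensive here, it would be cleaner to make
sure hlist_empty() itself can never recurse into the tracer, e.g. by
forcing it inline (sketch, assuming it stays the one-liner from
list.h), rather than special-casing trace_lookup_ip():

	static inline __attribute__((always_inline))
	int hlist_empty(const struct hlist_head *h)
	{
		return !h->first;
	}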

 
> 
>> +		goto not_found;
>> +
>> +	hlist_for_each_entry(new, node, head, hash) {
>> +		if (new->mcount == ip)
>> +			goto out;
>> +	}
>> +not_found:
>> +	if (get_ipinfo(ip, &info) < 0) {
>> +		dprintf("ip: %lx not found\n", ip);
>> +		new = NULL;
>> +		goto out;
>> +	}
>> +	if (create) {
>> +		/* unlock to avoid deadlock */
>> +		pthread_mutex_unlock(&trace_lock);
> 
> I think we don't need unlock here if we prevent recursion.
> 
> 
>> +		new = xzalloc(sizeof(*new));
>> +		pthread_mutex_lock(&trace_lock);
>> +		new->mcount = ip;
>> +		new->namelen = info.fn_namelen;
>> +		new->name = info.fn_name;
>> +		hlist_add_head(&new->hash, head);
>> +		list_add(&new->list, &caller_list);
>> +		dprintf("add %.*s\n", info.fn_namelen, info.fn_name);
>> +	} else {
>> +		dprintf("%.*s\n not found", info.fn_namelen, info.fn_name);
>> +		new = NULL;
>> +	}
>> +out:
>> +	pthread_mutex_unlock(&trace_lock);
>> +	return new;
>> +}
>> +
>> +/*
>> + * Try to NOP all the mcount call sites that are supposed to be traced.
>> + * Later we can enable it by asking these sites to point to trace_caller,
>> + * where we can override trace_call() with our own trace function. We can
>> + * do this, because below function record the IP of 'call mcount' inside the
>> + * callers.
>> + *
>> + * IP points to the return address.
>> + */
>> +static notrace void do_trace_init(unsigned long ip)
>> +{
>> +
>> +	if (make_text_writable(ip) < 0)
>> +		return;
>> +
>> +	memcpy((void *)ip, NOP5, INSN_SIZE);
>> +	trace_lookup_ip(ip, 1);
>> +}
>> +
>> +notrace int register_trace_function(trace_func_t func)
>> +{
>> +	if (make_text_writable((unsigned long)trace_call) < 0)
>> +		return -1;
>> +
>> +	replace_trace_call((unsigned long)func);
>> +	trace_func = func;
>> +	return 0;
>> +}
>> +
>> +static notrace void suspend_worker_threads(void)
>> +{
>> +	struct worker_info *wi;
>> +	int i;
>> +	trace_count = total_nr_workers;
>> +	list_for_each_entry(wi, &worker_info_list, worker_info_siblings) {
>> +		for (i = 0; i < wi->nr_threads; i++)
>> +			if (pthread_kill(wi->worker_thread[i], SIGUSR1) != 0)
> 
> SIGUSR1 is already used by the cluster driver 'local'.  Let's use
> SIGUSR2.
> 


Okay.
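
Something along these lines then (sketch only; I'm assuming the
handler hookup lives in trace_init()):

	struct sigaction act;

	memset(&act, 0, sizeof(act));
	act.sa_handler = suspend;
	sigaction(SIGUSR2, &act, NULL);

and the pthread_kill() in suspend_worker_threads() switched to
SIGUSR2 as well.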

>> +				dprintf("%m\n");
>> +	}
>> +wait_for_worker_suspend:
>> +	pthread_mutex_lock(&trace_mux);
>> +	if (trace_count > 0) {
>> +		pthread_mutex_unlock(&trace_mux);
>> +		pthread_yield();
>> +		goto wait_for_worker_suspend;
>> +	}
>> +	pthread_mutex_unlock(&trace_mux);
>> +}
>> +
>> +static notrace void resume_worker_threads(void)
>> +{
>> +	pthread_mutex_lock(&trace_mux);
>> +	pthread_cond_broadcast(&trace_cond);
>> +	pthread_mutex_unlock(&trace_mux);
>> +}
>> +
>> +static notrace void patch_all_sites(unsigned long addr)
>> +{
>> +	struct caller *ca;
>> +	unsigned char *new;
>> +
>> +	pthread_mutex_lock(&trace_lock);
>> +	list_for_each_entry(ca, &caller_list, list) {
>> +		new = get_new_call(ca->mcount, addr);
>> +		memcpy((void *)ca->mcount, new, INSN_SIZE);
>> +	}
> 
> IIUC, this doesn't make any change to the functions which have never
> been called.  Is this behavior what you would expect?
>


Yes. We need to track a function down before it can be traced. This
tracer is targeted at live-system tracing, and all the functions are
registered once the cluster has started up. So it is okay.

Thanks,
Yuan


