At Thu, 1 Mar 2012 10:20:18 +0800, Liu Yuan wrote: > > From: Liu Yuan <tailai.ly at taobao.com> > > > Signed-off-by: Liu Yuan <tailai.ly at taobao.com> > --- > sheep/sheep.c | 5 + > sheep/trace/mcount.S | 4 + > sheep/trace/trace.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++ > sheep/trace/trace.h | 35 +++++++ > 4 files changed, 298 insertions(+), 0 deletions(-) > create mode 100644 sheep/trace/trace.c > > diff --git a/sheep/sheep.c b/sheep/sheep.c > index f76190a..2662bd8 100644 > --- a/sheep/sheep.c > +++ b/sheep/sheep.c > @@ -21,6 +21,7 @@ > #include <sys/syslog.h> > > #include "sheep_priv.h" > +#include "trace/trace.h" > > #define EPOLL_SIZE 4096 > #define DEFAULT_OBJECT_DIR "/tmp" > @@ -222,6 +223,10 @@ int main(int argc, char **argv) > ret = init_signal(); > if (ret) > exit(1); > + > + ret = trace_init(); > + if (ret) > + exit(1); > vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION); > > while (!sys_stat_shutdown() || sys->nr_outstanding_reqs != 0) > diff --git a/sheep/trace/mcount.S b/sheep/trace/mcount.S > index c16e5ae..5f1e6b5 100644 > --- a/sheep/trace/mcount.S > +++ b/sheep/trace/mcount.S > @@ -64,3 +64,7 @@ trace_call: > .globl trace_stub > trace_stub: > retq > + > +.globl NOP5 > +NOP5: > + .byte 0x0f,0x1f,0x44,0x00,0x00 # Intel recommended one for 5 bytes nops > diff --git a/sheep/trace/trace.c b/sheep/trace/trace.c > new file mode 100644 > index 0000000..463d667 > --- /dev/null > +++ b/sheep/trace/trace.c > @@ -0,0 +1,254 @@ > +#include <string.h> > +#include <sys/mman.h> > +#include <unistd.h> > +#include <pthread.h> > +#include <signal.h> > + > +#include "trace.h" > +#include "logger.h" > +#include "list.h" > +#include "work.h" > +#include "sheepdog_proto.h" > + > +#define TRACE_HASH_BITS 7 > +#define TRACE_HASH_SIZE (1 << TRACE_HASH_BITS) > + > +static struct hlist_head trace_hashtable[TRACE_HASH_SIZE]; > +static LIST_HEAD(caller_list); > +static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER; > + > +static trace_func_t trace_func = trace_call; > +static int trace_count; > + > +pthread_cond_t trace_cond = PTHREAD_COND_INITIALIZER; > +pthread_mutex_t trace_mux = PTHREAD_MUTEX_INITIALIZER; > + > +union instruction { > + unsigned char start[INSN_SIZE]; > + struct { > + char opcode; > + int offset; > + } __attribute__((packed)); > +}; > + > +notrace void suspend(int num) > +{ > + dprintf("worker thread %u going to suspend\n", (int)pthread_self()); > + > + pthread_mutex_lock(&trace_mux); > + trace_count--; > + pthread_cond_wait(&trace_cond, &trace_mux); > + pthread_mutex_unlock(&trace_mux); > + dprintf("worker thread going to resume\n"); > +} > + > +static inline int trace_hash(unsigned long ip) > +{ > + return hash_64(ip, TRACE_HASH_BITS); > +} > + > +static notrace unsigned char *get_new_call(unsigned long ip, unsigned long addr) > +{ > + static union instruction code; > + > + code.opcode = 0xe8; /* opcode of call */ > + code.offset = (int)(addr - ip - INSN_SIZE); > + > + return code.start; > +} > + > +static notrace void replace_call(unsigned long ip, unsigned long func) > +{ > + unsigned char *new; > + > + new = get_new_call(ip, func); > + memcpy((void *)ip, new, INSN_SIZE); > +} > + > +static inline void replace_mcount_call(unsigned long func) > +{ > + unsigned long ip = (unsigned long)mcount_call; > + > + replace_call(ip, func); > +} > + > +static inline void replace_trace_call(unsigned long func) > +{ > + unsigned long ip = (unsigned long)trace_call; > + > + replace_call(ip, func); > +} > + > +static notrace int make_text_writable(unsigned long ip) > +{ > + unsigned long start = ip & ~(getpagesize() - 1); > + > + return mprotect((void *)start, INSN_SIZE, PROT_READ | PROT_EXEC | PROT_WRITE); IIUC, the second argument should be "getpagesize() + INSN_SIZE". > +} > + > +notrace struct caller *trace_lookup_ip(unsigned long ip, int create) > +{ > + int h = trace_hash(ip); > + struct hlist_head *head = trace_hashtable + h; > + struct hlist_node *node; > + struct ipinfo info; > + struct caller *new = NULL; > + > + pthread_mutex_lock(&trace_lock); > + if (hlist_empty(head)) hlist_empty() calls trace_lookup_ip() again if it is not inlined, and it results in a deadlock. How about preventing a recursive call of trace_lookup_ip()? E.g.: notrace struct caller *trace_lookup_ip(unsigned long ip, int create) { int h; struct hlist_head *head; struct hlist_node *node; struct ipinfo info; struct caller *new = NULL; static __thread int looking_up_ip; if (looking_up_ip) return NULL; looking_up_ip = 1; pthread_mutex_lock(&trace_lock); ... pthread_mutex_unlock(&trace_lock); looking_up_ip = 0; return new; } > + goto not_found; > + > + hlist_for_each_entry(new, node, head, hash) { > + if (new->mcount == ip) > + goto out; > + } > +not_found: > + if (get_ipinfo(ip, &info) < 0) { > + dprintf("ip: %lx not found\n", ip); > + new = NULL; > + goto out; > + } > + if (create) { > + /* unlock to avoid deadlock */ > + pthread_mutex_unlock(&trace_lock); I think we don't need unlock here if we prevent recursion. > + new = xzalloc(sizeof(*new)); > + pthread_mutex_lock(&trace_lock); > + new->mcount = ip; > + new->namelen = info.fn_namelen; > + new->name = info.fn_name; > + hlist_add_head(&new->hash, head); > + list_add(&new->list, &caller_list); > + dprintf("add %.*s\n", info.fn_namelen, info.fn_name); > + } else { > + dprintf("%.*s\n not found", info.fn_namelen, info.fn_name); > + new = NULL; > + } > +out: > + pthread_mutex_unlock(&trace_lock); > + return new; > +} > + > +/* > + * Try to NOP all the mcount call sites that are supposed to be traced. > + * Later we can enable it by asking these sites to point to trace_caller, > + * where we can override trace_call() with our own trace function. We can > + * do this, because below function record the IP of 'call mcount' inside the > + * callers. > + * > + * IP points to the return address. > + */ > +static notrace void do_trace_init(unsigned long ip) > +{ > + > + if (make_text_writable(ip) < 0) > + return; > + > + memcpy((void *)ip, NOP5, INSN_SIZE); > + trace_lookup_ip(ip, 1); > +} > + > +notrace int register_trace_function(trace_func_t func) > +{ > + if (make_text_writable((unsigned long)trace_call) < 0) > + return -1; > + > + replace_trace_call((unsigned long)func); > + trace_func = func; > + return 0; > +} > + > +static notrace void suspend_worker_threads(void) > +{ > + struct worker_info *wi; > + int i; > + trace_count = total_nr_workers; > + list_for_each_entry(wi, &worker_info_list, worker_info_siblings) { > + for (i = 0; i < wi->nr_threads; i++) > + if (pthread_kill(wi->worker_thread[i], SIGUSR1) != 0) SIGUSR1 is already used by the cluster driver 'local'. Let's use SIGUSR2. > + dprintf("%m\n"); > + } > +wait_for_worker_suspend: > + pthread_mutex_lock(&trace_mux); > + if (trace_count > 0) { > + pthread_mutex_unlock(&trace_mux); > + pthread_yield(); > + goto wait_for_worker_suspend; > + } > + pthread_mutex_unlock(&trace_mux); > +} > + > +static notrace void resume_worker_threads(void) > +{ > + pthread_mutex_lock(&trace_mux); > + pthread_cond_broadcast(&trace_cond); > + pthread_mutex_unlock(&trace_mux); > +} > + > +static notrace void patch_all_sites(unsigned long addr) > +{ > + struct caller *ca; > + unsigned char *new; > + > + pthread_mutex_lock(&trace_lock); > + list_for_each_entry(ca, &caller_list, list) { > + new = get_new_call(ca->mcount, addr); > + memcpy((void *)ca->mcount, new, INSN_SIZE); > + } IIUC, this doesn't make any change to the functions which have not ever called. Is this behaivior what you would expect? Thanks, Kazutaka |