[Sheepdog] [PATCH v3 06/13] trace: low-level trace infrastructure proper
MORITA Kazutaka
morita.kazutaka at gmail.com
Thu Apr 5 18:09:32 CEST 2012
At Thu, 1 Mar 2012 10:20:18 +0800,
Liu Yuan wrote:
>
> From: Liu Yuan <tailai.ly at taobao.com>
>
>
> Signed-off-by: Liu Yuan <tailai.ly at taobao.com>
> ---
> sheep/sheep.c | 5 +
> sheep/trace/mcount.S | 4 +
> sheep/trace/trace.c | 254 ++++++++++++++++++++++++++++++++++++++++++++++++++
> sheep/trace/trace.h | 35 +++++++
> 4 files changed, 298 insertions(+), 0 deletions(-)
> create mode 100644 sheep/trace/trace.c
>
> diff --git a/sheep/sheep.c b/sheep/sheep.c
> index f76190a..2662bd8 100644
> --- a/sheep/sheep.c
> +++ b/sheep/sheep.c
> @@ -21,6 +21,7 @@
> #include <sys/syslog.h>
>
> #include "sheep_priv.h"
> +#include "trace/trace.h"
>
> #define EPOLL_SIZE 4096
> #define DEFAULT_OBJECT_DIR "/tmp"
> @@ -222,6 +223,10 @@ int main(int argc, char **argv)
> ret = init_signal();
> if (ret)
> exit(1);
> +
> + ret = trace_init();
> + if (ret)
> + exit(1);
> vprintf(SDOG_NOTICE, "sheepdog daemon (version %s) started\n", PACKAGE_VERSION);
>
> while (!sys_stat_shutdown() || sys->nr_outstanding_reqs != 0)
> diff --git a/sheep/trace/mcount.S b/sheep/trace/mcount.S
> index c16e5ae..5f1e6b5 100644
> --- a/sheep/trace/mcount.S
> +++ b/sheep/trace/mcount.S
> @@ -64,3 +64,7 @@ trace_call:
> .globl trace_stub
> trace_stub:
> retq
> +
> +.globl NOP5
> +NOP5:
> + .byte 0x0f,0x1f,0x44,0x00,0x00 # Intel's recommended 5-byte NOP
> diff --git a/sheep/trace/trace.c b/sheep/trace/trace.c
> new file mode 100644
> index 0000000..463d667
> --- /dev/null
> +++ b/sheep/trace/trace.c
> @@ -0,0 +1,254 @@
> +#include <string.h>
> +#include <sys/mman.h>
> +#include <unistd.h>
> +#include <pthread.h>
> +#include <signal.h>
> +
> +#include "trace.h"
> +#include "logger.h"
> +#include "list.h"
> +#include "work.h"
> +#include "sheepdog_proto.h"
> +
> +#define TRACE_HASH_BITS 7
> +#define TRACE_HASH_SIZE (1 << TRACE_HASH_BITS)
> +
> +static struct hlist_head trace_hashtable[TRACE_HASH_SIZE];
> +static LIST_HEAD(caller_list);
> +static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
> +
> +static trace_func_t trace_func = trace_call;
> +static int trace_count;
> +
> +pthread_cond_t trace_cond = PTHREAD_COND_INITIALIZER;
> +pthread_mutex_t trace_mux = PTHREAD_MUTEX_INITIALIZER;
> +
> +union instruction {
> + unsigned char start[INSN_SIZE];
> + struct {
> + char opcode;
> + int offset;
> + } __attribute__((packed));
> +};
> +
> +notrace void suspend(int num)
> +{
> + dprintf("worker thread %u going to suspend\n", (int)pthread_self());
> +
> + pthread_mutex_lock(&trace_mux);
> + trace_count--;
> + pthread_cond_wait(&trace_cond, &trace_mux);
> + pthread_mutex_unlock(&trace_mux);
> + dprintf("worker thread going to resume\n");
> +}
> +
> +static inline int trace_hash(unsigned long ip)
> +{
> + return hash_64(ip, TRACE_HASH_BITS);
> +}
> +
> +static notrace unsigned char *get_new_call(unsigned long ip, unsigned long addr)
> +{
> + static union instruction code;
> +
> + code.opcode = 0xe8; /* opcode of call */
> + code.offset = (int)(addr - ip - INSN_SIZE);
> +
> + return code.start;
> +}
> +
> +static notrace void replace_call(unsigned long ip, unsigned long func)
> +{
> + unsigned char *new;
> +
> + new = get_new_call(ip, func);
> + memcpy((void *)ip, new, INSN_SIZE);
> +}
> +
> +static inline void replace_mcount_call(unsigned long func)
> +{
> + unsigned long ip = (unsigned long)mcount_call;
> +
> + replace_call(ip, func);
> +}
> +
> +static inline void replace_trace_call(unsigned long func)
> +{
> + unsigned long ip = (unsigned long)trace_call;
> +
> + replace_call(ip, func);
> +}
> +
> +static notrace int make_text_writable(unsigned long ip)
> +{
> + unsigned long start = ip & ~(getpagesize() - 1);
> +
> + return mprotect((void *)start, INSN_SIZE, PROT_READ | PROT_EXEC | PROT_WRITE);
IIUC, the second argument should be "getpagesize() + INSN_SIZE": with
just INSN_SIZE, mprotect() makes only the page at 'start' writable, but
the 5-byte instruction at ip may straddle into the next page.
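Something like this, for example (an untested sketch on top of your patch):

static notrace int make_text_writable(unsigned long ip)
{
	unsigned long start = ip & ~(getpagesize() - 1);

	/* cover the next page too, in case the 5-byte
	 * instruction at ip straddles a page boundary */
	return mprotect((void *)start, getpagesize() + INSN_SIZE,
			PROT_READ | PROT_EXEC | PROT_WRITE);
}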
> +}
> +
> +notrace struct caller *trace_lookup_ip(unsigned long ip, int create)
> +{
> + int h = trace_hash(ip);
> + struct hlist_head *head = trace_hashtable + h;
> + struct hlist_node *node;
> + struct ipinfo info;
> + struct caller *new = NULL;
> +
> + pthread_mutex_lock(&trace_lock);
> + if (hlist_empty(head))
If hlist_empty() is not inlined, its mcount hook ends up calling
trace_lookup_ip() again while trace_lock is held, and that results in
a deadlock.
How about preventing recursive calls of trace_lookup_ip()?
E.g.:
notrace struct caller *trace_lookup_ip(unsigned long ip, int create)
{
	int h;
	struct hlist_head *head;
	struct hlist_node *node;
	struct ipinfo info;
	struct caller *new = NULL;
	static __thread int looking_up_ip;

	if (looking_up_ip)
		return NULL;

	looking_up_ip = 1;
	pthread_mutex_lock(&trace_lock);
	...
	pthread_mutex_unlock(&trace_lock);
	looking_up_ip = 0;

	return new;
}
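(The guard needs to be __thread rather than a plain static, so that one
thread in the middle of a lookup doesn't make concurrent lookups in the
other worker threads silently return NULL.)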
> + goto not_found;
> +
> + hlist_for_each_entry(new, node, head, hash) {
> + if (new->mcount == ip)
> + goto out;
> + }
> +not_found:
> + if (get_ipinfo(ip, &info) < 0) {
> + dprintf("ip: %lx not found\n", ip);
> + new = NULL;
> + goto out;
> + }
> + if (create) {
> + /* unlock to avoid deadlock */
> + pthread_mutex_unlock(&trace_lock);
I think we don't need this unlock here if we prevent recursion: the
deadlock it guards against is presumably a traced allocation path
re-entering trace_lookup_ip(), and with the guard above such a nested
call simply returns NULL instead of blocking on trace_lock.
> + new = xzalloc(sizeof(*new));
> + pthread_mutex_lock(&trace_lock);
> + new->mcount = ip;
> + new->namelen = info.fn_namelen;
> + new->name = info.fn_name;
> + hlist_add_head(&new->hash, head);
> + list_add(&new->list, &caller_list);
> + dprintf("add %.*s\n", info.fn_namelen, info.fn_name);
> + } else {
> + dprintf("%.*s not found\n", info.fn_namelen, info.fn_name);
> + new = NULL;
> + }
> +out:
> + pthread_mutex_unlock(&trace_lock);
> + return new;
> +}
> +
> +/*
> + * Try to NOP all the mcount call sites that are supposed to be traced.
> + * Later we can enable it by asking these sites to point to trace_caller,
> + * where we can override trace_call() with our own trace function. We can
> + * do this, because below function record the IP of 'call mcount' inside the
> + * callers.
> + *
> + * IP points to the return address.
> + */
> +static notrace void do_trace_init(unsigned long ip)
> +{
> +
> + if (make_text_writable(ip) < 0)
> + return;
> +
> + memcpy((void *)ip, NOP5, INSN_SIZE);
> + trace_lookup_ip(ip, 1);
> +}
> +
> +notrace int register_trace_function(trace_func_t func)
> +{
> + if (make_text_writable((unsigned long)trace_call) < 0)
> + return -1;
> +
> + replace_trace_call((unsigned long)func);
> + trace_func = func;
> + return 0;
> +}
> +
> +static notrace void suspend_worker_threads(void)
> +{
> + struct worker_info *wi;
> + int i;
> + trace_count = total_nr_workers;
> + list_for_each_entry(wi, &worker_info_list, worker_info_siblings) {
> + for (i = 0; i < wi->nr_threads; i++)
> + if (pthread_kill(wi->worker_thread[i], SIGUSR1) != 0)
SIGUSR1 is already used by the cluster driver 'local'. Let's use
SIGUSR2.
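For example, trace_init() could install suspend() on SIGUSR2 roughly
like this (untested sketch; install_suspend_handler is just an
illustrative name):

static notrace int install_suspend_handler(void)
{
	struct sigaction act;

	memset(&act, 0, sizeof(act));
	sigemptyset(&act.sa_mask);
	act.sa_handler = suspend;	/* the handler from this patch */
	return sigaction(SIGUSR2, &act, NULL);
}

and then the pthread_kill() above would send SIGUSR2 instead.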
> + dprintf("%m\n");
> + }
> +wait_for_worker_suspend:
> + pthread_mutex_lock(&trace_mux);
> + if (trace_count > 0) {
> + pthread_mutex_unlock(&trace_mux);
> + pthread_yield();
> + goto wait_for_worker_suspend;
> + }
> + pthread_mutex_unlock(&trace_mux);
> +}
> +
> +static notrace void resume_worker_threads(void)
> +{
> + pthread_mutex_lock(&trace_mux);
> + pthread_cond_broadcast(&trace_cond);
> + pthread_mutex_unlock(&trace_mux);
> +}
> +
> +static notrace void patch_all_sites(unsigned long addr)
> +{
> + struct caller *ca;
> + unsigned char *new;
> +
> + pthread_mutex_lock(&trace_lock);
> + list_for_each_entry(ca, &caller_list, list) {
> + new = get_new_call(ca->mcount, addr);
> + memcpy((void *)ca->mcount, new, INSN_SIZE);
> + }
IIUC, this doesn't change the functions that have never been called,
because caller_list only contains the call sites whose mcount hooks
have already fired. Is this behavior what you would expect?
Thanks,
Kazutaka