2017-12-07

记录linux-tracing-workshop实验过程,第二部分共三篇。

8. Writing BPF Tools: setuidsnoop

本节试着写一个BPF来跟踪setuid系统调用。 我们可以使用trace来跟踪setuid:

            root@ubuntu1604:~/bcc/tools# ./trace.py 'sys_setuid "uid=0x%x", arg1' 'r::sys_setuid "rc=%d", retval'
            PID    TID    COMM         FUNC             -
            34913  34913  su           sys_setuid       uid=0x3e8
            34913  34913  su           sys_setuid       rc=0
            34932  34932  cron         sys_setuid       uid=0x0
            34932  34932  cron         sys_setuid       rc=0

也可以写一个独立的BPF工具,本节模仿killsnoop.py内容实现setuid的trace。

第一步,替换sys_kill为sys_setuid

            kprobe__sys_kill->kprobe__sys_setuid
            kretprobe__sys_kill->kretprobe__sys_setuid

第二步,修改函数签名

            int kprobe__sys_setuid(struct pt_regs *ctx, int tpid, int sig)-->
            int kprobe__sys_setuid(struct pt_regs *ctx, u32 uid)

第三步,修改数据结构,用setuid的参数替换kill的参数

            struct val_t {
            u64 pid;
            u32 uid;
            char comm[TASK_COMM_LEN];
            };

            struct data_t {
            u64 pid;
            u32 uid;
            int ret;
            char comm[TASK_COMM_LEN];
            };

            class Data(ct.Structure):
            _fields_ = [
                    ("pid", ct.c_ulonglong),
                    ("uid", ct.c_uint),
                    ("ret", ct.c_int),
                    ("comm", ct.c_char * TASK_COMM_LEN)
            ]

第四步,在kprobe和kretprobe修改相应的数据

            int kprobe__sys_kill(struct pt_regs *ctx, u32 uid)
            {
            u32 pid = bpf_get_current_pid_tgid();
            FILTER

            struct val_t val = {.pid = pid};
            if (bpf_get_current_comm(&val.comm, sizeof(val.comm)) == 0) {
                    val.uid = uid;
                    infotmp.update(&pid, &val);
            }

            return 0;
            };

            int kretprobe__sys_kill(struct pt_regs *ctx)
            {
                    struct data_t data = {};
                    struct val_t *valp;
                    u32 pid = bpf_get_current_pid_tgid();

                    valp = infotmp.lookup(&pid);
                    if (valp == 0) {
                            // missed entry
                            return 0;
            }

            bpf_probe_read(&data.comm, sizeof(data.comm), valp->comm);
            data.pid = pid;
            data.uid = valp->uid;
            data.ret = PT_REGS_RC(ctx);

            events.perf_submit(ctx, &data, sizeof(data));
            infotmp.delete(&pid);

            return 0;
            }

第五步,修改print的数据

            print("%-9s %-6s %-16s %-6s %s" % (
            "TIME", "PID", "COMM", "UID", "RESULT"))

            # process event
            def print_event(cpu, data, size):
            event = ct.cast(data, ct.POINTER(Data)).contents

            if (args.failed and (event.ret >= 0)):
                    return

            print("%-9s %-6d %-16s %-6d %d" % (strftime("%H:%M:%S"),
                    event.pid, event.comm.decode(), event.uid, event.ret))

效果

            root@ubuntu1604:~/bcc/tools# ./setuidsnoop.py 
            TIME      PID    COMM             UID    RESULT
            11:41:05  36919  su               1000   0
            11:45:01  36941  cron             0      0

原实验完整版

9. Writing BPF Tools: dbslower

该实验开发一个机遇USDT probe的BCC工具,用来监控数据库的query延迟和执行。

首先下载postgresql,使用–enable-dtraceb编译,使其支持USDT,运行:

            $ cd /usr/local/pgsql/bin
            $ ./initdb -D /tmp/pgdata
            $ ./pg_ctl -D /tmp/pgdata start

查看USDT probe点:

            test@ubuntu1604:/usr/local/pgsql/bin$ /usr/share/bcc/tools/tplist  -p $(pgrep -n postgres) | grep query
            /usr/local/pgsql/bin/postgres postgresql:query__parse__start
            /usr/local/pgsql/bin/postgres postgresql:query__parse__done
            /usr/local/pgsql/bin/postgres postgresql:query__rewrite__start
            /usr/local/pgsql/bin/postgres postgresql:query__rewrite__done
            /usr/local/pgsql/bin/postgres postgresql:query__plan__start
            /usr/local/pgsql/bin/postgres postgresql:query__plan__done
            /usr/local/pgsql/bin/postgres postgresql:query__start
            /usr/local/pgsql/bin/postgres postgresql:query__done
            /usr/local/pgsql/bin/postgres postgresql:query__execute__start
            /usr/local/pgsql/bin/postgres postgresql:query__execute__done

本实验关注query__start 和 query__done,query__start第一个参数就是query参数。

下面根据实验给的整体框架完成工具编写。

第一步:找到PostgreSQL的进程ID

            dbpid = int(subprocess.check_output("pgrep -n postgres".split()))

第二步:定义数据结构,包含PID, timestamp, duration, 以及 query文本

            struct temp_t {
                    u64 timestamp;
                    char *query;
            };

            struct data_t {
                    u64 pid;
                    u64 timestamp;
                    u64 duration;
                    char query[256];
            };

            BPF_HASH(temp, u64, struct temp_t);
            BPF_PERF_OUTPUT(events);

第三步:第一两个函数处理query__start 和 query__end

            int probe_query_start(struct pt_regs *ctx) {
                    struct temp_t tmp = {};
                    tmp.timestamp = bpf_ktime_get_ns();
                    bpf_usdt_readarg(1, ctx, &tmp.query);
                    u64 pid = bpf_get_current_pid_tgid();
                    temp.update(&pid, &tmp);
                    return 0;
            }

            int probe_query_end(struct pt_regs *ctx) {
                    struct temp_t *tempp;
                    u64 pid = bpf_get_current_pid_tgid();
                    tempp = temp.lookup(&pid);
                    if (!tempp) 
                            return 0;
                    u64 delta = bpf_ktime_get_ns() - tempp->timestamp;
                    if (delta >=""" + str(threshold_ns) + """) {
                            struct data_t data = {};
                            data.pid = pid >> 32;
                            data.timestamp = tempp->timestamp;
                            data.duration = delta;
                            bpf_probe_read(&data.query, sizeof(data.query), tempp->query);
                            events.perf_submit(ctx, &data, sizeof(data));
                    }
                    temp.delete(&pid);
                    return 0;
            };

第四步:使用enable_probe enable query__start和query__end

            usdt = USDT(pid=int(dbpid))
            usdt.enable_probe("query__start", "probe_query_start")
            usdt.enable_probe("query__done", "probe_query_end")

第五步:定义Python数据结构b表示输出

            class Data(ct.Structure):
            _fields_ = [
                    ("pid", ct.c_ulonglong),
                    ("timestamp", ct.c_ulonglong),
                    ("delta", ct.c_ulonglong),
                    ("query", ct.c_char * 256)
            ]

第六步:输出

            start = 0

            def print_event(cpu, data, size):
                    global start
                    event = ct.cast(data, ct.POINTER(Data)).contents
                    if start == 0:
                            start = event.timestamp
                    print("%-14.6f %-6d %8.3f %s" % (float(event.timestamp - start) / 1000000000,
                            event.pid, float(event.delta) / 1000000, event.query))

            print("Tracing database queries for PID %d slower than %d ms..." %
                    (dbpid, args.threshold))
            print("%-14s %-6s %8s %s" % ("TIME(s)", "PID", "MS", "QUERY"))

            bpf["events"].open_perf_buffer(print_event)

效果:

            root@ubuntu1604:~/bcc/tools# ./lqdbslower.py  postgres 0
            /virtual/main.c:45:15: warning: comparison of unsigned expression >= 0 is always true [-Wtautological-compare]
            if (delta >=0) {
                    ~~~~~ ^ ~
            1 warning generated.
            Tracing database queries for PID 50216 slower than 0 ms...
            TIME(s)        PID          MS QUERY
            0.000000       50216     1.806 INSERT INTO tbl(name, date) VALUES('aaa', '2013-12-22');
            7.150496       50216     0.227 select * from tbl

原实验的dbslower.py

10. Writing BPF Tools: Contention Stats and Stacks

该实验编写一个基于BCC的观察Linux锁的竞争状态的工具。

首先编译,运行程序:

            root@ubuntu1604:~/linux-tracing-workshop# gcc -g -fno-omit-frame-pointer -pthread  parprimes.c -o parprimes
            root@ubuntu1604:~/linux-tracing-workshop# ./parprimes 4 10000

lockstat.py查找TODO完成该工具。

// TODO Update tm_key fields with the mutex, tid, and stack id

    tm_key.tid = pid;
    tm_key.mtx = entry->mtx;
    tm_key.lock_stack_id = stack_id;

// TODO Call locks.lookup_or_init(…) and update the wait time and the enter count // of the entry in the locks data structure

    struct thread_mutex_val_t *existing_tm_val, new_tm_val = {};
    existing_tm_val = locks.lookup_or_init(&tm_key, &new_tm_val);
    existing_tm_val->wait_time_ns += wait_time;
    if (PT_REGS_RC(ctx) == 0) {
            existing_tm_val->enter_count += 1;
    }

// TODO Update the mutex_lock_hist histogram with the time we held the lock

    u64 slot = bpf_log2l(hold_time / 1000);
    mutex_lock_hist.increment(slot);

// TODO Similarly to the previous probe, attach the following probes: // uprobe in pthread_mutex_lock handled by probe_mutex_lock // uretprobe in pthread_mutex_lock handled by probe_mutex_lock_return // uprobe in pthread_mutex_unlock handled by probe_mutex_unlock

    bpf.attach_uprobe(name="pthread", sym="pthread_mutex_lock", fn_name="probe_mutex_lock", pid=pid)
    bpf.attach_uretprobe(name="pthread", sym="pthread_mutex_lock", fn_name="probe_mutex_lock_return", pid=pid)
    bpf.attach_uprobe(name="pthread", sym="pthread_mutex_unlock", fn_name="probe_mutex_unlock", pid=pid)

// TODO Print a nicely formatted line with the mutex description, wait time, // hold time, enter count, and stack (use print_stack)

    print("\tmutex %s ::: wait time %.2fus ::: hold time %.2fus ::: enter count %d" %
            (mutex_descr, v.wait_time_ns/1000.0, v.lock_time_ns/1000.0, v.enter_count))
    print_stack(bpf, pid, stacks, k.lock_stack_id)

效果:

            root@ubuntu1604:~/linux-tracing-workshop# python lockstat.py  $(pidof parprimes)
            init stack for mutex 7fff3dfa1fa0 (#1)
                            [unknown] (7f2eebaa85a0)
                            [unknown] (7f2eeb6f5830)
                            [unknown] (113e258d4c544155)

            thread 53243
                    mutex [unknown] ::: wait time 7.01us ::: hold time 5.56us ::: enter count 1
                            [unknown] (7f2eebcccb34)
                            [unknown] (7f2eeb70eff8)
                            [unknown] (7f2eeba9b060)

            thread 53246
                    mutex #1 ::: wait time 1655.31us ::: hold time 809.63us ::: enter count 369
                            [unknown] (4009f0)
                            [unknown] (400a44)
                            [unknown] (7f2eebaa66ba)

            thread 53247
                    mutex #1 ::: wait time 12850.63us ::: hold time 660.04us ::: enter count 302
                            [unknown] (4009f0)
                            [unknown] (400a44)
                            [unknown] (7f2eebaa66ba)

            thread 53248
                    mutex #1 ::: wait time 13290.15us ::: hold time 610.43us ::: enter count 281
                            [unknown] (4009f0)
                            [unknown] (400a44)
                            [unknown] (7f2eebaa66ba)

            thread 53249
                    mutex #1 ::: wait time 1282.58us ::: hold time 621.87us ::: enter count 279
                            [unknown] (4009f0)
                            [unknown] (400a44)
                            [unknown] (7f2eebaa66ba)

            wait time (us)      : count     distribution
            0 -> 1          : 0        |                                        |
            2 -> 3          : 0        |                                        |
            4 -> 7          : 1229     |****************************************|
            8 -> 15         : 1        |                                        |
            16 -> 31         : 0        |                                        |
            32 -> 63         : 0        |                                        |
            64 -> 127        : 0        |                                        |
            128 -> 255        : 0        |                                        |
            256 -> 511        : 0        |                                        |
            512 -> 1023       : 0        |                                        |
            1024 -> 2047       : 0        |                                        |
            2048 -> 4095       : 0        |                                        |
            4096 -> 8191       : 0        |                                        |
            8192 -> 16383      : 2        |                                        |
            hold time (us)      : count     distribution
            0 -> 1          : 0        |                                        |
            2 -> 3          : 1227     |****************************************|
            4 -> 7          : 4        |                                        |
            8 -> 15         : 1        |                                        |

原实验解答lockstat-solution



blog comments powered by Disqus