#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# aliworkslower    Trace high work sched latency.
#                  For Linux, uses BCC, eBPF.
#
# USAGE: aliworkslower [-h] [threshold]
#
# Copyright (c) 2019 Jeffle Xu, Alibaba, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")

from __future__ import print_function
from bcc import BPF
import argparse
import ctypes as ct

# arguments
#
# One optional positional, the latency threshold in microseconds, plus a hidden
# --ebpf switch that dumps the generated BPF C source and exits.
examples = """examples:
    ./aliworkslower            # trace work sched latency higher than 10000 us (default)
    ./aliworkslower 1000       # trace work sched latency higher than  1000 us
"""
parser = argparse.ArgumentParser(
    description="Trace high work sched latency",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("threshold", nargs="?", type=int, default=10000,
    help="threshold of Q2S latency, in microsecond")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)

args = parser.parse_args()
# The threshold is pasted as a literal into an unsigned 64-bit comparison in
# the BPF program.  A negative literal would be converted to a huge unsigned
# value there, so the tool would silently never report anything; reject it
# up front with a usage error instead.
if args.threshold < 0:
    parser.error("threshold must be a non-negative number of microseconds")
debug = 0   # set to 1 to print the generated BPF program before loading it

# define BPF program
#
# Three workqueue tracepoints cooperate through the `res` hash, keyed by the
# address of the work item:
#   workqueue_queue_work    -> remember when the work was queued
#   workqueue_activate_work -> remember when the work was activated
#   workqueue_execute_start -> compute the latencies, emit an event if the
#                              total exceeds the limit, and drop the entry
# The limit is a placeholder token in the comparison below; it is expected to
# be replaced with a nanosecond literal before the program is compiled.
bpf_text = """
#include <uapi/linux/ptrace.h>

// Timestamps (ns, from bpf_ktime_get_ns) recorded for one pending work item.
typedef struct work_timestamp {
    u64 time_queue_work;
    u64 time_activate_work;
} work_timestamp_t;

// Record handed to user space for each work item that was too slow to start.
typedef struct work_latency {
    u64 pid;    // pid of worker thread
    u64 func;   // function pointer of the work callback
    u64 q2a;    // queue_work to activate_work
    u64 a2s;    // activate_work to execute_start
} work_latency_t;

// Pending work items, keyed by the address of the work struct.
BPF_HASH(res, u64, work_timestamp_t);
BPF_PERF_OUTPUT(events);

TRACEPOINT_PROBE(workqueue, workqueue_queue_work)
{
    u64 work = (u64)args->work;
    // time_activate_work is zero-initialized until the activate event fires.
    work_timestamp_t stamp = {
        .time_queue_work =  bpf_ktime_get_ns(),
    };
    res.update(&work, &stamp);

    return 0;
}

TRACEPOINT_PROBE(workqueue, workqueue_activate_work)
{
    u64 work = (u64)args->work;
    work_timestamp_t *stampp = res.lookup(&work);

    if (stampp) {
        stampp->time_activate_work = bpf_ktime_get_ns();
    }

    return 0;
}

TRACEPOINT_PROBE(workqueue, workqueue_execute_start)
{
    u64 work = (u64)args->work;
    u64 now = bpf_ktime_get_ns();
    work_timestamp_t *stampp = res.lookup(&work);

    if (!stampp)
        return 0;

    // Copy the timestamps out and drop the entry right away.  Without the
    // delete, entries for work items that are never queued again stay in the
    // hash forever; once it is full, update() fails and new work items are
    // no longer tracked.
    u64 queued = stampp->time_queue_work;
    u64 activated = stampp->time_activate_work;
    res.delete(&work);

    // If no activate event was recorded the field is still zero, and the
    // unsigned subtraction below would wrap around.  Assumption: in that
    // case the whole delay is attributed to the activate-to-start phase.
    if (activated < queued)
        activated = queued;

    u64 delta = now - queued;
    if (delta > THRESHOLD) {
        work_latency_t latency = {
            .pid = (u32)bpf_get_current_pid_tgid(),
            .func = (u64)args->function,
            .q2a = activated - queued,
            .a2s = now - activated,
        };
        events.perf_submit(args, &latency, sizeof(work_latency_t));
    }

    return 0;
}
"""

# The command-line threshold is in microseconds while the BPF program compares
# nanosecond timestamps, hence the factor of 1000.
bpf_text = bpf_text.replace('THRESHOLD', str(args.threshold * 1000))

# Debugging aid: show the final C source (after substitution).  With --ebpf
# the source is printed and the tool quits without loading anything.
if args.ebpf or debug:
    print(bpf_text)
if args.ebpf:
    exit()


class Record(ct.Structure):
    """User-space mirror of the BPF program's ``work_latency_t`` event.

    Every member of the C struct is a ``u64``, so every field here must be
    exactly 64 bits wide.  ``c_ulong`` is only 32 bits on ILP32 platforms,
    which would shift all following fields; ``c_ulonglong`` is used throughout.
    """
    _fields_ = [("pid", ct.c_ulonglong),    # pid of the worker thread
                ("func", ct.c_ulonglong),   # address of the work callback
                ("q2a", ct.c_ulonglong),    # queue_work -> activate_work, ns
                ("a2s", ct.c_ulonglong)]    # activate_work -> execute_start, ns


def print_record(cpu, data, size):
    """Perf-buffer callback: print one line for a reported work item.

    Args:
        cpu: CPU the event was submitted from (unused).
        data: raw pointer to the event payload, interpreted as a ``Record``.
        size: payload size in bytes (unused).

    The latencies in the record are in nanoseconds and are printed in
    microseconds.
    """
    event = ct.cast(data, ct.POINTER(Record)).contents

    # The symbol lookup may hand back bytes; formatting bytes with %s on
    # Python 3 would print "b'name'" instead of the bare symbol name.
    sym = b.ksym(event.func)
    if isinstance(sym, bytes):
        sym = sym.decode("utf-8", "replace")

    # Floor division keeps the arithmetic in integers on Python 3 as well,
    # instead of going through a float that %d then truncates.
    print("%10s %20s %25d %25d %25d" % (event.pid,
                                        sym,
                                        (event.q2a + event.a2s) // 1000,
                                        event.q2a // 1000,
                                        event.a2s // 1000))

# Compile and load the BPF program; the tracepoint probes are attached as
# part of loading.
b = BPF(text=bpf_text)

print("Tracing high work sched latency... Hit Ctrl-C to end.")
print("%10s %20s %25s %25s %25s" % (
    "Worker Thread",
    "Work",
    "Total Q2S latency (us)",
    "Q2A latency (us)",
    "A2S latency (us)"))

# Every event submitted by the BPF side is handed to print_record.
b["events"].open_perf_buffer(print_record)


# Poll for events until the user interrupts with Ctrl-C.
while True:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()
