Gecko [ theprintave.ca ]

Name	Size	Permission	Date
bridge_loop.bt	4.91 KB	-rwxr-xr-x	2024-11-20 14:02
dpif_nl_exec_monitor.py	21.67 KB	-rwxr-xr-x	2024-11-20 14:02
kernel_delay.py	43 KB	-rwxr-xr-x	2024-11-20 14:02
kernel_delay.rst	29.64 KB	-rwxr-xr-x	2024-11-20 14:02
reval_monitor.py	29.42 KB	-rwxr-xr-x	2024-11-20 14:02
upcall_cost.py	64.38 KB	-rwxr-xr-x	2024-11-20 14:02
upcall_monitor.py	17.19 KB	-rwxr-xr-x	2024-11-20 14:02

Rename

#!/usr/bin/env python3
#
# Copyright (c) 2022,2023 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
#
# Script information:
# -------------------
# This script allows a developer to quickly identify if the issue at hand
# might be related to the kernel running out of resources or if it really is
# an Open vSwitch issue.
#
# For documentation see the kernel_delay.rst file.
#
#
# Dependencies:
# -------------
#  You need to install the BCC package for your specific platform or build it
#  yourself using the following instructions:
#    https://raw.githubusercontent.com/iovisor/bcc/master/INSTALL.md
#
#  Python needs the following additional packages installed:
#    - pytz
#    - psutil
#
#  You can either install your distribution specific package or use pip:
#    pip install pytz psutil
#
import argparse
import datetime
import os
import pytz
import psutil
import re
import sys
import time

import ctypes as ct

try:
    from bcc import BPF, USDT, USDTException
    from bcc.syscall import syscalls, syscall_name
except ModuleNotFoundError:
    print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!")
    sys.exit(os.EX_OSFILE)

from enum import IntEnum

#
# Actual eBPF source code
#
EBPF_SOURCE = """
#include <linux/irq.h>
#include <linux/sched.h>

#define MONITOR_PID <MONITOR_PID>

enum {
<EVENT_ENUM>
};

struct event_t {
    u64 ts;
    u32 tid;
    u32 id;

int user_stack_id;
    int kernel_stack_id;

u32 syscall;
    u64 entry_ts;

};

BPF_RINGBUF_OUTPUT(events, <BUFFER_PAGE_CNT>);
BPF_STACK_TRACE(stack_traces, <STACK_TRACE_SIZE>);
BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1);
BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1);

BPF_ARRAY(capture_on, u64, 1);
static inline bool capture_enabled(u64 pid_tgid) {
    int key = 0;
    u64 *ret;

if ((pid_tgid >> 32) != MONITOR_PID)
        return false;

ret = capture_on.lookup(&key);
    return ret && *ret == 1;
}

static inline bool capture_enabled__() {
    int key = 0;
    u64 *ret;

ret = capture_on.lookup(&key);
    return ret && *ret == 1;
}

static struct event_t *get_event(uint32_t id) {
    struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t));

if (!event) {
        dropcnt.increment(0);
        return NULL;
    }

event->id = id;
    event->ts = bpf_ktime_get_ns();
    event->tid = bpf_get_current_pid_tgid();

return event;
}

static int start_trigger() {
    int key = 0;
    u64 *val = capture_on.lookup(&key);

/* If the value is -1 we can't start as we are still processing the
     * results in userspace. */
    if (!val || *val != 0) {
        trigger_miss.increment(0);
        return 0;
    }

struct event_t *event = get_event(EVENT_START_TRIGGER);
    if (event) {
       events.ringbuf_submit(event, 0);
       *val = 1;
    } else {
        trigger_miss.increment(0);
    }
    return 0;
}

static int stop_trigger() {
    int key = 0;
    u64 *val = capture_on.lookup(&key);

if (!val || *val != 1)
        return 0;

struct event_t *event = get_event(EVENT_STOP_TRIGGER);

if (event)
       events.ringbuf_submit(event, 0);

if (val)
        *val = -1;

return 0;
}

<START_TRIGGER>
<STOP_TRIGGER>

/*
 * For the syscall monitor the following probes get installed.
 */
struct syscall_data_t {
    u64 count;
    u64 total_ns;
    u64 worst_ns;
};

struct syscall_data_key_t {
    u32 pid;
    u32 tid;
    u32 syscall;
};

BPF_HASH(syscall_start, u64, u64);
BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t);

TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
    u64 pid_tgid = bpf_get_current_pid_tgid();

if (!capture_enabled(pid_tgid))
       return 0;

u64 t = bpf_ktime_get_ns();
    syscall_start.update(&pid_tgid, &t);

return 0;
}

TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
    struct syscall_data_t *val, zero = {};
    struct syscall_data_key_t key;

u64 pid_tgid = bpf_get_current_pid_tgid();

if (!capture_enabled(pid_tgid))
       return 0;

key.pid = pid_tgid >> 32;
    key.tid = (u32)pid_tgid;
    key.syscall = args->id;

u64 *start_ns = syscall_start.lookup(&pid_tgid);

if (!start_ns)
        return 0;

val = syscall_data.lookup_or_try_init(&key, &zero);
    if (val) {
        u64 delta = bpf_ktime_get_ns() - *start_ns;
        val->count++;
        val->total_ns += delta;
        if (val->worst_ns == 0 || delta > val->worst_ns)
            val->worst_ns = delta;

if (<SYSCALL_TRACE_EVENTS>) {
            struct event_t *event = get_event(EVENT_SYSCALL);
            if (event) {
                event->syscall = args->id;
                event->entry_ts = *start_ns;
                if (<STACK_TRACE_ENABLED>) {
                    event->user_stack_id = stack_traces.get_stackid(
                        args, BPF_F_USER_STACK);
                    event->kernel_stack_id = stack_traces.get_stackid(
                        args, 0);
                }
                events.ringbuf_submit(event, 0);
            }
        }
    }
    return 0;
}

/*
 * For measuring the thread run time, we need the following.
 */
struct run_time_data_t {
    u64 count;
    u64 total_ns;
    u64 max_ns;
    u64 min_ns;
};

struct pid_tid_key_t {
    u32  pid;
    u32  tid;
};

BPF_HASH(run_start, u64, u64);
BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t);

static inline void thread_start_run(u64 pid_tgid, u64 ktime)
{
    run_start.update(&pid_tgid, &ktime);
}

static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime)
{
    u64 pid_tgid = (u64) tgid << 32 | pid;
    u64 *start_ns = run_start.lookup(&pid_tgid);

if (!start_ns || *start_ns == 0)
        return;

struct run_time_data_t *val, zero = {};
    struct pid_tid_key_t key = { .pid = tgid,
                                 .tid = pid };

val = run_data.lookup_or_try_init(&key, &zero);
    if (val) {
        u64 delta = ktime - *start_ns;
        val->count++;
        val->total_ns += delta;
        if (val->max_ns == 0 || delta > val->max_ns)
            val->max_ns = delta;
        if (val->min_ns == 0 || delta < val->min_ns)
            val->min_ns = delta;
    }
    *start_ns = 0;
}

/*
 * For measuring the thread-ready delay, we need the following.
 */
struct ready_data_t {
    u64 count;
    u64 total_ns;
    u64 worst_ns;
};

BPF_HASH(ready_start, u64, u64);
BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t);

static inline int sched_wakeup__(u32 pid, u32 tgid)
{
    u64 pid_tgid = (u64) tgid << 32 | pid;

if (!capture_enabled(pid_tgid))
        return 0;

u64 t = bpf_ktime_get_ns();
    ready_start.update(&pid_tgid, &t);
    return 0;
}

RAW_TRACEPOINT_PROBE(sched_wakeup)
{
    struct task_struct *t = (struct task_struct *)ctx->args[0];
    return sched_wakeup__(t->pid, t->tgid);
}

RAW_TRACEPOINT_PROBE(sched_wakeup_new)
{
    struct task_struct *t = (struct task_struct *)ctx->args[0];
    return sched_wakeup__(t->pid, t->tgid);
}

RAW_TRACEPOINT_PROBE(sched_switch)
{
    struct task_struct *prev = (struct task_struct *)ctx->args[1];
    struct task_struct *next= (struct task_struct *)ctx->args[2];
    u64 ktime = 0;

if (!capture_enabled__())
        return 0;

if (prev-><STATE_FIELD> == TASK_RUNNING && prev->tgid == MONITOR_PID)
        sched_wakeup__(prev->pid, prev->tgid);

if (prev->tgid == MONITOR_PID) {
        ktime = bpf_ktime_get_ns();
        thread_stop_run(prev->pid, prev->tgid, ktime);
    }

u64 pid_tgid = (u64)next->tgid << 32 | next->pid;

if (next->tgid != MONITOR_PID)
        return 0;

if (ktime == 0)
        ktime = bpf_ktime_get_ns();

u64 *start_ns = ready_start.lookup(&pid_tgid);

if (start_ns && *start_ns != 0) {

struct ready_data_t *val, zero = {};
        struct pid_tid_key_t key = { .pid = next->tgid,
                                     .tid = next->pid };

val = ready_data.lookup_or_try_init(&key, &zero);
        if (val) {
            u64 delta = ktime - *start_ns;
            val->count++;
            val->total_ns += delta;
            if (val->worst_ns == 0 || delta > val->worst_ns)
                val->worst_ns = delta;
        }
        *start_ns = 0;
    }

thread_start_run(pid_tgid, ktime);
    return 0;
}

/*
 * For measuring the hard irq time, we need the following.
 */
struct hardirq_start_data_t {
    u64  start_ns;
    char irq_name[32];
};

struct hardirq_data_t {
    u64 count;
    u64 total_ns;
    u64 worst_ns;
};

struct hardirq_data_key_t {
    u32 pid;
    u32 tid;
    char irq_name[32];
};

BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t);
BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t);

TRACEPOINT_PROBE(irq, irq_handler_entry)
{
    u64 pid_tgid = bpf_get_current_pid_tgid();

if (!capture_enabled(pid_tgid))
        return 0;

struct hardirq_start_data_t data = {};

data.start_ns = bpf_ktime_get_ns();
    TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name));
    hardirq_start.update(&pid_tgid, &data);
    return 0;
}

TRACEPOINT_PROBE(irq, irq_handler_exit)
{
    u64 pid_tgid = bpf_get_current_pid_tgid();

if (!capture_enabled(pid_tgid))
        return 0;

struct hardirq_start_data_t *data;
    data = hardirq_start.lookup(&pid_tgid);
    if (!data || data->start_ns == 0)
        return 0;

if (args->ret != IRQ_NONE) {
        struct hardirq_data_t *val, zero = {};
        struct hardirq_data_key_t key = { .pid = pid_tgid >> 32,
                                          .tid = (u32)pid_tgid };

bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name),
                              data->irq_name);
        val = hardirq_data.lookup_or_try_init(&key, &zero);
        if (val) {
            u64 delta = bpf_ktime_get_ns() - data->start_ns;
            val->count++;
            val->total_ns += delta;
            if (val->worst_ns == 0 || delta > val->worst_ns)
                val->worst_ns = delta;
        }
    }

data->start_ns = 0;
    return 0;
}

/*
 * For measuring the soft irq time, we need the following.
 */
struct softirq_start_data_t {
    u64 start_ns;
    u32 vec_nr;
};

struct softirq_data_t {
    u64 count;
    u64 total_ns;
    u64 worst_ns;
};

struct softirq_data_key_t {
    u32 pid;
    u32 tid;
    u32 vec_nr;
};

BPF_HASH(softirq_start, u64, struct softirq_start_data_t);
BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t);

TRACEPOINT_PROBE(irq, softirq_entry)
{
    u64 pid_tgid = bpf_get_current_pid_tgid();

if (!capture_enabled(pid_tgid))
        return 0;

struct softirq_start_data_t data = {};

data.start_ns = bpf_ktime_get_ns();
    data.vec_nr = args->vec;
    softirq_start.update(&pid_tgid, &data);
    return 0;
}

TRACEPOINT_PROBE(irq, softirq_exit)
{
    u64 pid_tgid = bpf_get_current_pid_tgid();

if (!capture_enabled(pid_tgid))
        return 0;

struct softirq_start_data_t *data;
    data = softirq_start.lookup(&pid_tgid);
    if (!data || data->start_ns == 0)
        return 0;

struct softirq_data_t *val, zero = {};
    struct softirq_data_key_t key = { .pid = pid_tgid >> 32,
                                      .tid = (u32)pid_tgid,
                                      .vec_nr = data->vec_nr};

val = softirq_data.lookup_or_try_init(&key, &zero);
    if (val) {
        u64 delta = bpf_ktime_get_ns() - data->start_ns;
        val->count++;
        val->total_ns += delta;
        if (val->worst_ns == 0 || delta > val->worst_ns)
            val->worst_ns = delta;
    }

data->start_ns = 0;
    return 0;
}
"""

#
# time_ns()
#
try:
    from time import time_ns
except ImportError:
    # For compatibility with Python <= v3.6.
    def time_ns():
        now = datetime.datetime.now()
        return int(now.timestamp() * 1e9)

#
# Probe class to use for the start/stop triggers
#
class Probe(object):
    '''
    The goal for this object is to support as many as possible
    probe/events as supported by BCC. See
       https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments
    '''
    def __init__(self, probe, pid=None):
        self.pid = pid
        self.text_probe = probe
        self._parse_text_probe()

def __str__(self):
        if self.probe_type == "usdt":
            return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
                                           self.usdt_provider, self.usdt_probe)
        elif self.probe_type == "trace":
            return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
                                           self.trace_system, self.trace_event)
        elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
            return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
                                        self.kprobe_function)
        elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
            return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
                                        self.uprobe_function)
        else:
            return "[{}] <{}:unknown probe>".format(self.text_probe,
                                                    self.probe_type)

def _raise(self, error):
        raise ValueError("[{}]; {}".format(self.text_probe, error))

def _verify_kprobe_probe(self):
        # Nothing to verify for now, just return.
        return

def _verify_trace_probe(self):
        # Nothing to verify for now, just return.
        return

def _verify_uprobe_probe(self):
        # Nothing to verify for now, just return.
        return

def _verify_usdt_probe(self):
        if not self.pid:
            self._raise("USDT probes need a valid PID.")

usdt = USDT(pid=self.pid)

for probe in usdt.enumerate_probes():
            if probe.provider.decode("utf-8") == self.usdt_provider and \
               probe.name.decode("utf-8") == self.usdt_probe:
                return

self._raise("Can't find UDST probe '{}:{}'".format(self.usdt_provider,
                                                           self.usdt_probe))

def _parse_text_probe(self):
        '''
        The text probe format is defined as follows:
          <probe_type>:<probe_specific>

Types:
          USDT:      u|usdt:<provider>:<probe>
          TRACE:     t|trace:<system>:<event>
          KPROBE:    k|kprobe:<kernel_function>
          KRETPROBE: kr|kretprobe:<kernel_function>
          UPROBE:    up|uprobe:<function>
          URETPROBE: ur|uretprobe:<function>
        '''
        args = self.text_probe.split(":")
        if len(args) <= 1:
            self._raise("Can't extract probe type.")

if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace",
                           "u", "usdt", "up", "uprobe", "ur", "uretprobe"]:
            self._raise("Invalid probe type '{}'".format(args[0]))

self.probe_type = "kprobe" if args[0] == "k" else args[0]
        self.probe_type = "kretprobe" if args[0] == "kr" else self.probe_type
        self.probe_type = "trace" if args[0] == "t" else self.probe_type
        self.probe_type = "usdt" if args[0] == "u" else self.probe_type
        self.probe_type = "uprobe" if args[0] == "up" else self.probe_type
        self.probe_type = "uretprobe" if args[0] == "ur" else self.probe_type

if self.probe_type == "usdt":
            if len(args) != 3:
                self._raise("Invalid number of arguments for USDT")

self.usdt_provider = args[1]
            self.usdt_probe = args[2]
            self._verify_usdt_probe()

elif self.probe_type == "trace":
            if len(args) != 3:
                self._raise("Invalid number of arguments for TRACE")

self.trace_system = args[1]
            self.trace_event = args[2]
            self._verify_trace_probe()

elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
            if len(args) != 2:
                self._raise("Invalid number of arguments for K(RET)PROBE")
            self.kprobe_function = args[1]
            self._verify_kprobe_probe()

elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
            if len(args) != 2:
                self._raise("Invalid number of arguments for U(RET)PROBE")
            self.uprobe_function = args[1]
            self._verify_uprobe_probe()

def _get_kprobe_c_code(self, function_name, function_content):
        #
        # The kprobe__* do not require a function name, so it's
        # ignored in the code generation.
        #
        return """
int {}__{}(struct pt_regs *ctx) {{
    {}
}}
""".format(self.probe_type, self.kprobe_function, function_content)

def _get_trace_c_code(self, function_name, function_content):
        #
        # The TRACEPOINT_PROBE() do not require a function name, so it's
        # ignored in the code generation.
        #
        return """
TRACEPOINT_PROBE({},{}) {{
    {}
}}
""".format(self.trace_system, self.trace_event, function_content)

def _get_uprobe_c_code(self, function_name, function_content):
        return """
int {}(struct pt_regs *ctx) {{
    {}
}}
""".format(function_name, function_content)

def _get_usdt_c_code(self, function_name, function_content):
        return """
int {}(struct pt_regs *ctx) {{
    {}
}}
""".format(function_name, function_content)

def get_c_code(self, function_name, function_content):
        if self.probe_type == "kprobe" or self.probe_type == "kretprobe":
            return self._get_kprobe_c_code(function_name, function_content)
        elif self.probe_type == "trace":
            return self._get_trace_c_code(function_name, function_content)
        elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
            return self._get_uprobe_c_code(function_name, function_content)
        elif self.probe_type == "usdt":
            return self._get_usdt_c_code(function_name, function_content)

return ""

def probe_name(self):
        if self.probe_type == "kprobe" or self.probe_type == "kretprobe":
            return "{}".format(self.kprobe_function)
        elif self.probe_type == "trace":
            return "{}:{}".format(self.trace_system,
                                  self.trace_event)
        elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
            return "{}".format(self.uprobe_function)
        elif self.probe_type == "usdt":
            return "{}:{}".format(self.usdt_provider,
                                  self.usdt_probe)

return ""

#
# event_to_dict()
#
def event_to_dict(event):
    return dict([(field, getattr(event, field))
                 for (field, _) in event._fields_
                 if isinstance(getattr(event, field), (int, bytes))])

#
# Event enum
#
Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER"],
                start=0)

#
# process_event()
#
def process_event(ctx, data, size):
    global start_trigger_ts
    global stop_trigger_ts

event = bpf["events"].event(data)
    if event.id == Event.SYSCALL:
        syscall_events.append({"tid": event.tid,
                               "ts_entry": event.entry_ts,
                               "ts_exit": event.ts,
                               "syscall": event.syscall,
                               "user_stack_id": event.user_stack_id,
                               "kernel_stack_id": event.kernel_stack_id})
    elif event.id == Event.START_TRIGGER:
        #
        # This event would have started the trigger already, so all we need to
        # do is record the start timestamp.
        #
        start_trigger_ts = event.ts

elif event.id == Event.STOP_TRIGGER:
        #
        # This event would have stopped the trigger already, so all we need to
        # do is record the start timestamp.
        stop_trigger_ts = event.ts

#
# next_power_of_two()
#
def next_power_of_two(val):
    np = 1
    while np < val:
        np *= 2
    return np

#
# unsigned_int()
#
def unsigned_int(value):
    try:
        value = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

if value < 0:
        raise argparse.ArgumentTypeError("must be positive")
    return value

#
# unsigned_nonzero_int()
#
def unsigned_nonzero_int(value):
    value = unsigned_int(value)
    if value == 0:
        raise argparse.ArgumentTypeError("must be nonzero")
    return value

#
# get_thread_name()
#
def get_thread_name(pid, tid):
    try:
        with open(f"/proc/{pid}/task/{tid}/comm", encoding="utf8") as f:
            return f.readline().strip("\n")
    except FileNotFoundError:
        pass

return f"<unknown:{pid}/{tid}>"

#
# get_vec_nr_name()
#
def get_vec_nr_name(vec_nr):
    known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
                    "tasklet", "sched", "hrtimer", "rcu"]

if vec_nr < 0 or vec_nr > len(known_vec_nr):
        return f"<unknown:{vec_nr}>"

return known_vec_nr[vec_nr]

#
# start/stop/reset capture
#
def start_capture():
    bpf["capture_on"][ct.c_int(0)] = ct.c_int(1)

def stop_capture(force=False):
    if force:
        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0xffff)
    else:
        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0)

def capture_running():
    return bpf["capture_on"][ct.c_int(0)].value == 1

def reset_capture():
    bpf["syscall_start"].clear()
    bpf["syscall_data"].clear()
    bpf["run_start"].clear()
    bpf["run_data"].clear()
    bpf["ready_start"].clear()
    bpf["ready_data"].clear()
    bpf["hardirq_start"].clear()
    bpf["hardirq_data"].clear()
    bpf["softirq_start"].clear()
    bpf["softirq_data"].clear()
    bpf["stack_traces"].clear()

#
# Display timestamp
#
def print_timestamp(msg):
    ltz = datetime.datetime.now()
    utc = ltz.astimezone(pytz.utc)
    time_string = "{} @{} ({} UTC)".format(
        msg, ltz.isoformat(), utc.strftime("%H:%M:%S"))
    print(time_string)

#
# process_results()
#
def process_results(syscall_events=None, trigger_delta=None):
    if trigger_delta:
        print_timestamp("# Triggered sample dump, stop-start delta {:,} ns".
                        format(trigger_delta))
    else:
        print_timestamp("# Sample dump")

#
    # First get a list of all threads we need to report on.
    #
    threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
                       if k.syscall != 0xffffffff}

threads_run = {k.tid for k, _ in bpf["run_data"].items()
                   if k.pid != 0xffffffff}

threads_ready = {k.tid for k, _ in bpf["ready_data"].items()
                     if k.pid != 0xffffffff}

threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items()
                       if k.pid != 0xffffffff}

threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items()
                       if k.pid != 0xffffffff}

threads = sorted(threads_syscall | threads_run | threads_ready |
                     threads_hardirq | threads_softirq,
                     key=lambda x: get_thread_name(options.pid, x))

#
    # Print header...
    #
    print("{:10} {:16} {}".format("TID", "THREAD", "<RESOURCE SPECIFIC>"))
    print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76))
    indent = 28 * " "

#
    # Print all events/statistics per threads.
    #
    poll_id = [k for k, v in syscalls.items() if v == b"poll"][0]
    for thread in threads:

if thread != threads[0]:
            print("")

#
        # SYSCALL_STATISTICS
        #
        print("{:10} {:16} {}\n{}{:20} {:>6}  {:>10}  {:>16}  {:>16}".format(
            thread, get_thread_name(options.pid, thread),
            "[SYSCALL STATISTICS]", indent,
            "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns"))

total_count = 0
        total_ns = 0
        for k, v in sorted(filter(lambda t: t[0].tid == thread,
                                  bpf["syscall_data"].items()),
                           key=lambda kv: -kv[1].total_ns):

print("{}{:20.20} {:6}  {:10}  {:16,}  {:16,}".format(
                indent, syscall_name(k.syscall).decode("utf-8"), k.syscall,
                v.count, v.total_ns, v.worst_ns))
            if k.syscall != poll_id:
                total_count += v.count
                total_ns += v.total_ns

if total_count > 0:
            print("{}{:20.20} {:6}  {:10}  {:16,}".format(
                indent, "TOTAL( - poll):", "", total_count, total_ns))

#
        # THREAD RUN STATISTICS
        #
        print("\n{:10} {:16} {}\n{}{:10}  {:>16}  {:>16}  {:>16}".format(
            "", "", "[THREAD RUN STATISTICS]", indent,
            "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns"))

for k, v in filter(lambda t: t[0].tid == thread,
                           bpf["run_data"].items()):

print("{}{:10}  {:16,}  {:16,}  {:16,}".format(
                indent, v.count, v.total_ns, v.min_ns, v.max_ns))

#
        # THREAD READY STATISTICS
        #
        print("\n{:10} {:16} {}\n{}{:10}  {:>16}  {:>16}".format(
            "", "", "[THREAD READY STATISTICS]", indent,
            "SCHED_CNT", "TOTAL ns", "MAX ns"))

for k, v in filter(lambda t: t[0].tid == thread,
                           bpf["ready_data"].items()):

print("{}{:10}  {:16,}  {:16,}".format(
                indent, v.count, v.total_ns, v.worst_ns))

#
        # HARD IRQ STATISTICS
        #
        total_ns = 0
        total_count = 0
        header_printed = False
        for k, v in sorted(filter(lambda t: t[0].tid == thread,
                                  bpf["hardirq_data"].items()),
                           key=lambda kv: -kv[1].total_ns):

if not header_printed:
                print("\n{:10} {:16} {}\n{}{:20}  {:>10}  {:>16}  {:>16}".
                      format("", "", "[HARD IRQ STATISTICS]", indent,
                             "NAME", "COUNT", "TOTAL ns", "MAX ns"))
                header_printed = True

print("{}{:20.20}  {:10}  {:16,}  {:16,}".format(
                indent, k.irq_name.decode("utf-8"),
                v.count, v.total_ns, v.worst_ns))

total_count += v.count
            total_ns += v.total_ns

if total_count > 0:
            print("{}{:20.20}  {:10}  {:16,}".format(
                indent, "TOTAL:", total_count, total_ns))

#
        # SOFT IRQ STATISTICS
        #
        total_ns = 0
        total_count = 0
        header_printed = False
        for k, v in sorted(filter(lambda t: t[0].tid == thread,
                                  bpf["softirq_data"].items()),
                           key=lambda kv: -kv[1].total_ns):

if not header_printed:
                print("\n{:10} {:16} {}\n"
                      "{}{:20} {:>7}  {:>10}  {:>16}  {:>16}".
                      format("", "", "[SOFT IRQ STATISTICS]", indent,
                             "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX ns"))
                header_printed = True

print("{}{:20.20} {:>7}  {:10}  {:16,}  {:16,}".format(
                indent, get_vec_nr_name(k.vec_nr), k.vec_nr,
                v.count, v.total_ns, v.worst_ns))

total_count += v.count
            total_ns += v.total_ns

if total_count > 0:
            print("{}{:20.20} {:7}  {:10}  {:16,}".format(
                indent, "TOTAL:", "", total_count, total_ns))

#
    # Print events
    #
    lost_stack_traces = 0
    if syscall_events:
        stack_traces = bpf.get_table("stack_traces")

print("\n\n# SYSCALL EVENTS:"
              "\n{}{:>19} {:>19} {:>10} {:16} {:>10}  {}".format(
                  2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM",
                  "DELTA (us)", "SYSCALL"))
        print("{}{:19} {:19} {:10} {:16} {:10}  {}".format(
            2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16,
            "-" * 10, "-" * 16))
        for event in syscall_events:
            print("{}{:19} {:19} {:10} {:16} {:10,}  {}".format(
                " " * 2,
                event["ts_entry"], event["ts_exit"], event["tid"],
                get_thread_name(options.pid, event["tid"]),
                int((event["ts_exit"] - event["ts_entry"]) / 1000),
                syscall_name(event["syscall"]).decode("utf-8")))
            #
            # Not sure where to put this, but I'll add some info on stack
            # traces here... Userspace stack traces are very limited due to
            # the fact that bcc does not support dwarf backtraces. As OVS
            # gets compiled without frame pointers we will not see much.
            # If however, OVS does get built with frame pointers, we should not
            # use the BPF_STACK_TRACE_BUILDID as it does not seem to handle
            # the debug symbols correctly. Also, note that for kernel
            # traces you should not use BPF_STACK_TRACE_BUILDID, so two
            # buffers are needed.
            #
            # Some info on manual dwarf walk support:
            #   https://github.com/iovisor/bcc/issues/3515
            #   https://github.com/iovisor/bcc/pull/4463
            #
            if options.stack_trace_size == 0:
                continue

if event["kernel_stack_id"] < 0 or event["user_stack_id"] < 0:
                lost_stack_traces += 1

kernel_stack = stack_traces.walk(event["kernel_stack_id"]) \
                if event["kernel_stack_id"] >= 0 else []
            user_stack = stack_traces.walk(event["user_stack_id"]) \
                if event["user_stack_id"] >= 0 else []

for addr in kernel_stack:
                print("{}{}".format(
                    " " * 10,
                    bpf.ksym(addr, show_module=True,
                             show_offset=True).decode("utf-8", "replace")))

for addr in user_stack:
                addr_str = bpf.sym(addr, options.pid, show_module=True,
                                   show_offset=True).decode("utf-8", "replace")

if addr_str == "[unknown]":
                    addr_str += " 0x{:x}".format(addr)

print("{}{}".format(" " * 10, addr_str))

#
    # Print any footer messages.
    #
    if lost_stack_traces > 0:
        print("\n#WARNING: We where not able to display {} stack traces!\n"
              "#         Consider increasing the stack trace size using\n"
              "#         the '--stack-trace-size' option.\n"
              "#         Note that this can also happen due to a stack id\n"
              "#         collision.".format(lost_stack_traces))

#
# main()
#
def main():
    #
    # Don't like these globals, but ctx passing does not seem to work with the
    # existing open_ring_buffer() API :(
    #
    global bpf
    global options
    global syscall_events
    global start_trigger_ts
    global stop_trigger_ts

start_trigger_ts = 0
    stop_trigger_ts = 0

#
    # Argument parsing
    #
    parser = argparse.ArgumentParser()

parser.add_argument("-D", "--debug",
                        help="Enable eBPF debugging",
                        type=int, const=0x3f, default=0, nargs="?")
    parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID",
                        help="ovs-vswitch's PID",
                        type=unsigned_int, default=None)
    parser.add_argument("-s", "--syscall-events", metavar="DURATION_NS",
                        help="Record syscall events that take longer than "
                        "DURATION_NS. Omit the duration value to record all "
                        "syscall events",
                        type=unsigned_int, const=0, default=None, nargs="?")
    parser.add_argument("--buffer-page-count",
                        help="Number of BPF ring buffer pages, default 1024",
                        type=unsigned_int, default=1024, metavar="NUMBER")
    parser.add_argument("--sample-count",
                        help="Number of sample runs, default 1",
                        type=unsigned_nonzero_int, default=1, metavar="RUNS")
    parser.add_argument("--sample-interval",
                        help="Delay between sample runs, default 0",
                        type=float, default=0, metavar="SECONDS")
    parser.add_argument("--sample-time",
                        help="Sample time, default 0.5 seconds",
                        type=float, default=0.5, metavar="SECONDS")
    parser.add_argument("--skip-syscall-poll-events",
                        help="Skip poll() syscalls with --syscall-events",
                        action="store_true")
    parser.add_argument("--stack-trace-size",
                        help="Number of unique stack traces that can be "
                        "recorded, default 4096. 0 to disable",
                        type=unsigned_int, default=4096)
    parser.add_argument("--start-trigger", metavar="TRIGGER",
                        help="Start trigger, see documentation for details",
                        type=str, default=None)
    parser.add_argument("--stop-trigger", metavar="TRIGGER",
                        help="Stop trigger, see documentation for details",
                        type=str, default=None)
    parser.add_argument("--trigger-delta", metavar="DURATION_NS",
                        help="Only report event when the trigger duration > "
                             "DURATION_NS, default 0 (all events)",
                        type=unsigned_int, const=0, default=0, nargs="?")

options = parser.parse_args()

#
    # Find the PID of the ovs-vswitchd daemon if not specified.
    #
    if not options.pid:
        for proc in psutil.process_iter():
            if "ovs-vswitchd" in proc.name():
                if options.pid:
                    print("ERROR: Multiple ovs-vswitchd daemons running, "
                          "use the -p option!")
                    sys.exit(os.EX_NOINPUT)

options.pid = proc.pid

#
    # Error checking on input parameters.
    #
    if not options.pid:
        print("ERROR: Failed to find ovs-vswitchd's PID!")
        sys.exit(os.EX_UNAVAILABLE)

options.buffer_page_count = next_power_of_two(options.buffer_page_count)

#
    # Make sure we are running as root, or else we can not attach the probes.
    #
    if os.geteuid() != 0:
        print("ERROR: We need to run as root to attached probes!")
        sys.exit(os.EX_NOPERM)

#
    # Setup any of the start stop triggers
    #
    if options.start_trigger is not None:
        try:
            start_trigger = Probe(options.start_trigger, pid=options.pid)
        except ValueError as e:
            print(f"ERROR: Invalid start trigger {str(e)}")
            sys.exit(os.EX_CONFIG)
    else:
        start_trigger = None

if options.stop_trigger is not None:
        try:
            stop_trigger = Probe(options.stop_trigger, pid=options.pid)
        except ValueError as e:
            print(f"ERROR: Invalid stop trigger {str(e)}")
            sys.exit(os.EX_CONFIG)
    else:
        stop_trigger = None

#
    # Attach probe to running process.
    #
    source = EBPF_SOURCE.replace("<EVENT_ENUM>", "\n".join(
        ["    EVENT_{} = {},".format(
            event.name, event.value) for event in Event]))
    source = source.replace("<BUFFER_PAGE_CNT>",
                            str(options.buffer_page_count))
    source = source.replace("<MONITOR_PID>", str(options.pid))

if BPF.kernel_struct_has_field(b"task_struct", b"state") == 1:
        source = source.replace("<STATE_FIELD>", "state")
    else:
        source = source.replace("<STATE_FIELD>", "__state")

poll_id = [k for k, v in syscalls.items() if v == b"poll"][0]
    if options.syscall_events is None:
        syscall_trace_events = "false"
    elif options.syscall_events == 0:
        if not options.skip_syscall_poll_events:
            syscall_trace_events = "true"
        else:
            syscall_trace_events = f"args->id != {poll_id}"
    else:
        syscall_trace_events = "delta > {}".format(options.syscall_events)
        if options.skip_syscall_poll_events:
            syscall_trace_events += f" && args->id != {poll_id}"

source = source.replace("<SYSCALL_TRACE_EVENTS>",
                            syscall_trace_events)

source = source.replace("<STACK_TRACE_SIZE>",
                            str(options.stack_trace_size))

source = source.replace("<STACK_TRACE_ENABLED>", "true"
                            if options.stack_trace_size > 0 else "false")

#
    # Handle start/stop probes
    #
    if start_trigger:
        source = source.replace("<START_TRIGGER>",
                                start_trigger.get_c_code(
                                    "start_trigger_probe",
                                    "return start_trigger();"))
    else:
        source = source.replace("<START_TRIGGER>", "")

if stop_trigger:
        source = source.replace("<STOP_TRIGGER>",
                                stop_trigger.get_c_code(
                                    "stop_trigger_probe",
                                    "return stop_trigger();"))
    else:
        source = source.replace("<STOP_TRIGGER>", "")

#
    # Setup usdt or other probes that need handling trough the BFP class.
    #
    usdt = USDT(pid=int(options.pid))
    try:
        if start_trigger and start_trigger.probe_type == "usdt":
            usdt.enable_probe(probe=start_trigger.probe_name(),
                              fn_name="start_trigger_probe")
        if stop_trigger and stop_trigger.probe_type == "usdt":
            usdt.enable_probe(probe=stop_trigger.probe_name(),
                              fn_name="stop_trigger_probe")

except USDTException as e:
        print("ERROR: {}".format(
            (re.sub("^", " " * 7, str(e), flags=re.MULTILINE)).strip().
            replace("--with-dtrace or --enable-dtrace",
                    "--enable-usdt-probes")))
        sys.exit(os.EX_OSERR)

bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug)

if start_trigger:
        try:
            if start_trigger.probe_type == "uprobe":
                bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
                                  sym=start_trigger.probe_name(),
                                  fn_name="start_trigger_probe",
                                  pid=options.pid)

if start_trigger.probe_type == "uretprobe":
                bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
                                     sym=start_trigger.probe_name(),
                                     fn_name="start_trigger_probe",
                                     pid=options.pid)
        except Exception as e:
            print("ERROR: Failed attaching uprobe start trigger "
                  f"'{start_trigger.probe_name()}';\n         {str(e)}")
            sys.exit(os.EX_OSERR)

if stop_trigger:
        try:
            if stop_trigger.probe_type == "uprobe":
                bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
                                  sym=stop_trigger.probe_name(),
                                  fn_name="stop_trigger_probe",
                                  pid=options.pid)

if stop_trigger.probe_type == "uretprobe":
                bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
                                     sym=stop_trigger.probe_name(),
                                     fn_name="stop_trigger_probe",
                                     pid=options.pid)
        except Exception as e:
            print("ERROR: Failed attaching uprobe stop trigger"
                  f"'{stop_trigger.probe_name()}';\n         {str(e)}")
            sys.exit(os.EX_OSERR)

#
    # If no triggers are configured use the delay configuration
    #
    bpf["events"].open_ring_buffer(process_event)

sample_count = 0
    while sample_count < options.sample_count:
        sample_count += 1
        syscall_events = []

if not options.start_trigger:
            print_timestamp("# Start sampling")
            start_capture()
            stop_time = -1 if options.stop_trigger else \
                time_ns() + options.sample_time * 1000000000
        else:
            # For start triggers the stop time depends on the start trigger
            # time, or depends on the stop trigger if configured.
            stop_time = -1 if options.stop_trigger else 0

while True:
            keyboard_interrupt = False
            try:
                last_start_ts = start_trigger_ts
                last_stop_ts = stop_trigger_ts

if stop_time > 0:
                    delay = int((stop_time - time_ns()) / 1000000)
                    if delay <= 0:
                        break
                else:
                    delay = -1

bpf.ring_buffer_poll(timeout=delay)

if stop_time <= 0 and last_start_ts != start_trigger_ts:
                    print_timestamp(
                        "# Start sampling (trigger@{})".format(
                            start_trigger_ts))

if not options.stop_trigger:
                        stop_time = time_ns() + \
                            options.sample_time * 1000000000

if last_stop_ts != stop_trigger_ts:
                    break

except KeyboardInterrupt:
                keyboard_interrupt = True
                break

if options.stop_trigger and not capture_running():
            print_timestamp("# Stop sampling (trigger@{})".format(
                stop_trigger_ts))
        else:
            print_timestamp("# Stop sampling")

if stop_trigger_ts != 0 and start_trigger_ts != 0:
            trigger_delta = stop_trigger_ts - start_trigger_ts
        else:
            trigger_delta = None

if not trigger_delta or trigger_delta >= options.trigger_delta:
            stop_capture(force=True)  # Prevent a new trigger to start.
            process_results(syscall_events=syscall_events,
                            trigger_delta=trigger_delta)
        elif trigger_delta:
            sample_count -= 1
            print_timestamp("# Sample dump skipped, delta {:,} ns".format(
                trigger_delta))

reset_capture()
        stop_capture()

if keyboard_interrupt:
            break

if options.sample_interval > 0:
            time.sleep(options.sample_interval)

#
    # Report lost events.
    #
    dropcnt = bpf.get_table("dropcnt")
    for k in dropcnt.keys():
        count = dropcnt.sum(k).value
        if k.value == 0 and count > 0:
            print("\n# WARNING: Not all events were captured, {} were "
                  "dropped!\n#          Increase the BPF ring buffer size "
                  "with the --buffer-page-count option.".format(count))

if options.sample_count > 1:
        trigger_miss = bpf.get_table("trigger_miss")
        for k in trigger_miss.keys():
            count = trigger_miss.sum(k).value
            if k.value == 0 and count > 0:
                print("\n# WARNING: Not all start triggers were successful. "
                      "{} were missed due to\n#          slow userspace "
                      "processing!".format(count))

#
# Start main() as the default entry point...
#
if __name__ == "__main__":
    main()