#+TITLE: Tracing On-CPU time for threads

As part of my Python tracer, I'd like to explore on-cpu time for
threads and visualize it more explicitly: using sys.settrace allows
me to see the actual functions being "executed" by the interpreter,
but not when they're actually running on it.

ftrace is a tracer built into the kernel that can allow me to capture
this information: as long as I use the right commands to enable it. 

Quick steps to set it up for function tracing
- Navigate to /sys/kernel/debug/tracing (might need to use root).
- Choose the function tracer with echo function > current_tracer.
- Make sure sched events are enabled with echo 1 >
  /sys/kernel/debug/tracing/events/sched/enable.
- Only intercept scheduling events by running  echo :mod:sched >
  set_ftrace_filter.
- Enable tracing  with echo 1 > tracing_on.
  
There are several events in the sched module: the ones I'm most
interested in are
- sched_switch Switching tasks on the active cpu
- sched_migrate_task Moving a task to another cpu
- sched_process_exec Self explanatory
- sched_process_fork Self explanatory
  
  To make it easier to do this, I modified a script from elinux to
  automate this process. sched events don't respect the pid filter, so
  I'm not bothering with exec to maintain a specific pid.
  
#+begin_src sh
#!/bin/sh

set -e -x

echo $$
echo $$ > /sys/kernel/tracing/set_ftrace_pid
echo function > /sys/kernel/tracing/current_tracer
echo :mod:sched > /sys/kernel/tracing/set_ftrace_filter
echo 1 > /sys/kernel/tracing/events/sched/enable
echo > /sys/kernel/tracing/trace
echo 1 > /sys/kernel/tracing/tracing_on
$*
cat /sys/kernel/tracing/trace > ftrace.txt
echo 0 > /sys/kernel/tracing/tracing_on
#+end_src

A sample trace over a few seconds as I wrote this post:
#+begin_example

# tracer: function
#
# entries-in-buffer/entries-written: 70888/70888   #P:4
#
#                              _-----=> irqs-off
#                             / _----=> need-resched
#                            | / _---=> hardirq/softirq
#                            || / _--=> preempt-depth
#                            ||| /     delay
#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
#              | |       |   ||||       |         |
    tmux: server-1613  [003] d..5 14306.111708: sched_wake_idle_without_ipi: cpu=1
    kworker/u8:3-25363 [001] d..4 14306.111729: sched_stat_runtime: comm=kworker/u8:3 pid=25363 runtime=14716 [ns] vruntime=3124141097686 [ns]
    tmux: server-1613  [003] d..2 14306.111733: sched_stat_runtime: comm=tmux: server pid=1613 runtime=351974 [ns] vruntime=3051734649 [ns]
    tmux: server-1613  [003] d..2 14306.111744: sched_stat_runtime: comm=kworker/u8:3 pid=25363 runtime=22808 [ns] vruntime=3124141120494 [ns]
          chrome-2223  [002] dN.2 14306.111751: sched_stat_runtime: comm=chrome pid=2223 runtime=374887 [ns] vruntime=736273717308 [ns]
              st-9107  [003] d..2 14306.111825: sched_stat_runtime: comm=st pid=9107 runtime=71610 [ns] vruntime=3063535983 [ns]
           <...>-239   [002] d.h2 14306.111912: sched_stat_runtime: comm=systemd-journal pid=239 runtime=163616 [ns] vruntime=146844212 [ns]
           <...>-22949 [000] d.h3 14306.111912: sched_stat_runtime: comm=bash pid=22949 runtime=618332 [ns] vruntime=1583492855 [ns]
           <...>-22949 [000] dN.2 14306.111937: sched_stat_runtime: comm=bash pid=22949 runtime=1426 [ns] vruntime=1583494281 [ns]
     kworker/1:1-27835 [001] d..2 14306.111951: sched_stat_runtime: comm=kworker/1:1 pid=27835 runtime=19081 [ns] vruntime=3124140995943 [ns]
     kworker/0:0-23401 [000] d..2 14306.111956: sched_stat_runtime: comm=kworker/0:0 pid=23401 runtime=9197 [ns] vruntime=3069426866395 [ns]
           <...>-239   [002] d..2 14306.112015: sched_stat_runtime: comm=systemd-journal pid=239 runtime=71494 [ns] vruntime=146915706 [ns]
     kworker/2:0-29251 [002] d..2 14306.112028: sched_stat_runtime: comm=kworker/2:0 pid=29251 runtime=13207 [ns] vruntime=3167685111600 [ns]
           <...>-22949 [000] d..4 14306.112359: sched_waking: comm=kworker/u8:3 pid=25363 prio=120 target_cpu=001
           <...>-22949 [000] d..5 14306.112364: sched_wake_idle_without_ipi: cpu=1
           <...>-22949 [000] d..5 14306.112366: sched_wakeup: comm=kworker/u8:3 pid=25363 prio=120 target_cpu=001
          <idle>-0     [001] d..2 14306.112367: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=kworker/u8:3 next_pid=25363 next_prio=120
    kworker/u8:3-25363 [001] d..3 14306.112370: sched_waking: comm=tmux: server pid=1613 prio=120 target_cpu=003
    kworker/u8:3-25363 [001] d..4 14306.112373: sched_wake_idle_without_ipi: cpu=3
    kworker/u8:3-25363 [001] d..4 14306.112373: sched_wakeup: comm=tmux: server pid=1613 prio=120 target_cpu=003
    kworker/u8:3-25363 [001] d..2 14306.112375: sched_stat_runtime: comm=kworker/u8:3 pid=25363 runtime=12983 [ns] vruntime=3124141133477 [ns]
          <idle>-0     [003] d..2 14306.112378: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=tmux: server next_pid=1613 next_prio=120
    kworker/u8:3-25363 [001] d..2 14306.112379: sched_switch: prev_comm=kworker/u8:3 prev_pid=25363 prev_prio=120 prev_state=I ==> next_comm=swapper/1 next_pid=0 next_prio=120
           <...>-22949 [000] d..4 14306.112460: sched_waking: comm=kworker/u8:3 pid=25363 prio=120 target_cpu=001
           <...>-22949 [000] d..5 14306.112464: sched_wake_idle_without_ipi: cpu=1
           <...>-22949 [000] d..5 14306.112465: sched_wakeup: comm=kworker/u8:3 pid=25363 prio=120 target_cpu=001
          <idle>-0     [001] d..2 14306.112468: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=kworker/u8:3 next_pid=25363 next_prio=120
    kworker/u8:3-25363 [001] d..2 14306.112472: sched_stat_runtime: comm=kworker/u8:3 pid=25363 runtime=9494 [ns] vruntime=3124141142971 [ns]
    kworker/u8:3-25363 [001] d..2 14306.112474: sched_switch: prev_comm=kworker/u8:3 prev_pid=25363 prev_prio=120 prev_state=I ==> next_comm=swapper/1 next_pid=0 next_prio=120
           <...>-22949 [000] d..2 14306.112476: sched_stat_runtime: comm=bash pid=22949 runtime=518620 [ns] vruntime=1584012901 [ns]
           <...>-22949 [000] d..2 14306.112480: sched_switch: prev_comm=bash prev_pid=22949 prev_prio=120 prev_state=S ==> next_comm=swapper/0 next_pid=0 next_prio=120
          chrome-2223  [002] d..2 14306.112705: sched_stat_runtime: comm=chrome pid=2223 runtime=668879 [ns] vruntime=736274386187 [ns]
          chrome-2223  [002] d..2 14306.112711: sched_switch: prev_comm=chrome prev_pid=2223 prev_prio=120 prev_state=S ==> next_comm=swapper/2 next_pid=0 next_prio=120
    tmux: server-1613  [003] d..4 14306.112941: sched_waking: comm=kworker/u8:3 pid=25363 prio=120 target_cpu=001
    tmux: server-1613  [003] d..5 14306.112949: sched_wake_idle_without_ipi: cpu=1
    tmux: server-1613  [003] d..5 14306.112950: sched_wakeup: comm=kworker/u8:3 pid=25363 prio=120 target_cpu=001
          <idle>-0     [001] d..2 14306.112956: sched_switch: prev_comm=swapper/1 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=kworker/u8:3 next_pid=25363 next_prio=120
    kworker/u8:3-25363 [001] d..3 14306.112962: sched_waking: comm=st pid=9107 prio=120 target_cpu=003
    kworker/u8:3-25363 [001] d..3 14306.112968: sched_migrate_task: comm=st pid=9107 prio=120 orig_cpu=3 dest_cpu=2
    kworker/u8:3-25363 [001] d..4 14306.112978: sched_wake_idle_without_ipi: cpu=2
    kworker/u8:3-25363 [001] d..4 14306.112979: sched_wakeup: comm=st pid=9107 prio=120 target_cpu=002
    tmux: server-1613  [003] d..2 14306.112980: sched_stat_runtime: comm=tmux: server pid=1613 runtime=605903 [ns] vruntime=3052340552 [ns]
    kworker/u8:3-25363 [001] d..2 14306.112983: sched_stat_runtime: comm=kworker/u8:3 pid=25363 runtime=37454 [ns] vruntime=3124141180425 [ns]
    kworker/u8:3-25363 [001] d..2 14306.112986: sched_switch: prev_comm=kworker/u8:3 prev_pid=25363 prev_prio=120 prev_state=I ==> next_comm=swapper/1 next_pid=0 next_prio=120
    tmux: server-1613  [003] d..2 14306.112986: sched_switch: prev_comm=tmux: server prev_pid=1613 prev_prio=120 prev_state=S ==> next_comm=swapper/3 next_pid=0 next_prio=120
          <idle>-0     [002] d..2 14306.113020: sched_switch: prev_comm=swapper/2 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=st next_pid=9107 next_prio=120
          <idle>-0     [000] d.s4 14306.113053: sched_waking: comm=systemd-journal pid=239 prio=120 target_cpu=002
          <idle>-0     [000] d.s4 14306.113059: sched_migrate_task: comm=systemd-journal pid=239 prio=120 orig_cpu=2 dest_cpu=0
          <idle>-0     [000] dNs5 14306.113070: sched_wakeup: comm=systemd-journal pid=239 prio=120 target_cpu=000
          <idle>-0     [000] d..2 14306.113093: sched_switch: prev_comm=swapper/0 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=systemd-journal next_pid=239 next_prio=120
              st-9107  [002] d..2 14306.113100: sched_stat_runtime: comm=st pid=9107 runtime=122133 [ns] vruntime=2793204668 [ns]
              st-9107  [002] d..2 14306.113107: sched_switch: prev_comm=st prev_pid=9107 prev_prio=120 prev_state=S ==> next_comm=swapper/2 next_pid=0 next_prio=120
           <...>-239   [000] d..4 14306.113114: sched_waking: comm=jbd2/sda3-8 pid=211 prio=120 target_cpu=003
           <...>-239   [000] d..5 14306.113124: sched_wake_idle_without_ipi: cpu=3
           <...>-239   [000] d..5 14306.113125: sched_wakeup: comm=jbd2/sda3-8 pid=211 prio=120 target_cpu=003
          <idle>-0     [003] d..2 14306.113127: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=jbd2/sda3-8 next_pid=211 next_prio=120
           <...>-239   [000] d..2 14306.113129: sched_stat_runtime: comm=systemd-journal pid=239 runtime=19441 [ns] vruntime=153979359 [ns]
           <...>-239   [000] d..2 14306.113131: sched_switch: prev_comm=systemd-journal prev_pid=239 prev_prio=120 prev_state=D ==> next_comm=swapper/0 next_pid=0 next_prio=120
           <...>-211   [003] d..2 14306.113244: sched_stat_runtime: comm=jbd2/sda3-8 pid=211 runtime=125557 [ns] vruntime=3129122506486 [ns]
           <...>-211   [003] d..2 14306.113248: sched_switch: prev_comm=jbd2/sda3-8 prev_pid=211 prev_prio=120 prev_state=D ==> next_comm=swapper/3 next_pid=0 next_prio=120
          <idle>-0     [000] d.s4 14306.113375: sched_waking: comm=jbd2/sda3-8 pid=211 prio=120 target_cpu=003
          <idle>-0     [000] d.s5 14306.113388: sched_wake_idle_without_ipi: cpu=3
          <idle>-0     [000] d.s5 14306.113389: sched_wakeup: comm=jbd2/sda3-8 pid=211 prio=120 target_cpu=003
          <idle>-0     [003] d..2 14306.113395: sched_switch: prev_comm=swapper/3 prev_pid=0 prev_prio=120 prev_state=R ==> next_comm=jbd2/sda3-8 next_pid=211 next_prio=120
           <...>-211   [003] d..4 14306.113432: sched_waking: comm=kworker/3:1H pid=206 prio=100 target_cpu=003
#+end_example

* References
- https://elinux.org/Ftrace
- https://github.com/torvalds/linux/blob/v5.4/include/trace/events/sched.h
- https://events.static.linuxfound.org/sites/events/files/slides/linuxconjapan-ftrace-2014.pdf