diff --git a/README.md b/README.md index 6cbdd59b5f60fbd2e5e68d12be61c9fa5ba7a70c..c8199572b4e8b56b9c0aad71e324fefca5db8e13 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ Examples: - tools/[btrfsdist](tools/btrfsdist.py): Summarize btrfs operation latency distribution as a histogram. [Examples](tools/btrfsdist_example.txt). - tools/[btrfsslower](tools/btrfsslower.py): Trace slow btrfs operations. [Examples](tools/btrfsslower_example.txt). - tools/[cachestat](tools/cachestat.py): Trace page cache hit/miss ratio. [Examples](tools/cachestat_example.txt). -- tools/[cpudist](tools/cpudist.py): Summarize on-CPU time per task as a histogram. [Examples](tools/cpudist_example.txt) +- tools/[cpudist](tools/cpudist.py): Summarize on- and off-CPU time per task as a histogram. [Examples](tools/cpudist_example.txt) - tools/[dcsnoop](tools/dcsnoop.py): Trace directory entry cache (dcache) lookups. [Examples](tools/dcsnoop_example.txt). - tools/[dcstat](tools/dcstat.py): Directory entry cache (dcache) stats. [Examples](tools/dcstat_example.txt). - tools/[execsnoop](tools/execsnoop.py): Trace new processes via exec() syscalls. [Examples](tools/execsnoop_example.txt). diff --git a/man/man8/cpudist.8 b/man/man8/cpudist.8 index cee5165a237cf65e1af1cceb857b75f28537872e..8d507aa79118390ee3a4585f4a1dbbb6e5952715 100644 --- a/man/man8/cpudist.8 +++ b/man/man8/cpudist.8 @@ -1,8 +1,8 @@ .TH cpudist 8 "2016-06-28" "USER COMMANDS" .SH NAME -cpudist \- On-CPU task time as a histogram. +cpudist \- On- and off-CPU task time as a histogram. .SH SYNOPSIS -.B cpudist [\-h] [\-T] [\-m] [\-P] [\-L] [\-p PID] [interval] [count] +.B cpudist [\-h] [-O] [\-T] [\-m] [\-P] [\-L] [\-p PID] [interval] [count] .SH DESCRIPTION This measures the time a task spends on the CPU before being descheduled, and shows the times as a histogram. Tasks that spend a very short time on the CPU @@ -10,6 +10,11 @@ can be indicative of excessive context-switches and poor workload distribution, and possibly point to a shared source of contention that keeps tasks switching in and out as it becomes available (such as a mutex). +Similarly, the tool can also measure the time a task spends off-CPU before it +is scheduled again. This can be helpful in identifying long blocking and I/O +operations, or alternatively very short descheduling times due to short-lived +locks or timers. + This tool uses in-kernel eBPF maps for storing timestamps and the histogram, for efficiency. Despite this, the overhead of this tool may become significant for some workloads: see the OVERHEAD section. @@ -27,6 +32,9 @@ CONFIG_BPF and bcc. \-h Print usage message. .TP +\-O +Measure off-CPU time instead of on-CPU time. +.TP \-T Include timestamps on output. .TP @@ -53,6 +61,10 @@ Summarize task on-CPU time as a histogram: # .B cpudist .TP +Summarize task off-CPU time as a histogram: +# +.B cpudist -O +.TP Print 1 second summaries, 10 times: # .B cpudist 1 10 diff --git a/tools/cpudist.py b/tools/cpudist.py index 560ad92f10c891ecae33b6e036f36db7a77d75bc..2df8172b7ad9e7cb28e939676c35c2891e2cd245 100755 --- a/tools/cpudist.py +++ b/tools/cpudist.py @@ -1,12 +1,12 @@ #!/usr/bin/python # @lint-avoid-python-3-compatibility-imports # -# cpudist Summarize on-CPU time per task as a histogram. +# cpudist Summarize on- and off-CPU time per task as a histogram. # -# USAGE: cpudist [-h] [-T] [-m] [-P] [-L] [-p PID] [interval] [count] +# USAGE: cpudist [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [interval] [count] # -# This measures the time a task spends on the CPU, and shows this time as a -# histogram, optionally per-process. +# This measures the time a task spends on or off the CPU, and shows this time +# as a histogram, optionally per-process. # # Copyright 2016 Sasha Goldshtein # Licensed under the Apache License, Version 2.0 (the "License") @@ -18,6 +18,7 @@ import argparse examples = """examples: cpudist # summarize on-CPU time as a histogram + cpudist -O # summarize off-CPU time as a histogram cpudist 1 10 # print 1 second summaries, 10 times cpudist -mT 1 # 1s summaries, milliseconds, and timestamps cpudist -P # show each PID separately @@ -27,6 +28,8 @@ parser = argparse.ArgumentParser( description="Summarize on-CPU time per task as a histogram.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=examples) +parser.add_argument("-O", "--offcpu", action="store_true", + help="measure off-CPU time") parser.add_argument("-T", "--timestamp", action="store_true", help="include timestamp on output") parser.add_argument("-m", "--milliseconds", action="store_true", @@ -52,60 +55,90 @@ bpf_text += tp.generate_decl() bpf_text += tp.generate_entry_probe() bpf_text += tp.generate_struct() +if not args.offcpu: + bpf_text += "#define ONCPU\n" + bpf_text += """ typedef struct pid_key { u64 id; u64 slot; } pid_key_t; -// We need to store the start time, which is when the thread got switched in, -// and the tgid for the pid because the sched_switch tracepoint doesn't provide -// that information. + BPF_HASH(start, u32, u64); BPF_HASH(tgid_for_pid, u32, u32); STORAGE +#define INVALID_TGID 0xffffffff + +static inline u32 get_tgid_if_missing(u32 tgid, u32 pid) +{ + if (tgid == INVALID_TGID) { + u32 *stored_tgid = tgid_for_pid.lookup(&pid); + if (stored_tgid != 0) + return *stored_tgid; + } + return tgid; +} + +static inline void store_start(u32 tgid, u32 pid, u64 ts) +{ + tgid = get_tgid_if_missing(tgid, pid); + + if (FILTER) + return; + + start.update(&pid, &ts); +} + +static inline void update_hist(u32 tgid, u32 pid, u64 ts) +{ + tgid = get_tgid_if_missing(tgid, pid); + + if (FILTER) + return; + + u64 *tsp = start.lookup(&pid); + if (tsp == 0) + return; + + u64 delta = ts - *tsp; + FACTOR + STORE +} + int sched_switch(struct pt_regs *ctx) { + u64 ts = bpf_ktime_get_ns(); u64 pid_tgid = bpf_get_current_pid_tgid(); + u32 tgid = pid_tgid >> 32, pid = pid_tgid; + // Keep a mapping of tgid for pid because when sched_switch hits, + // we only have the tgid information for the *current* pid, but not + // for the previous one. + tgid_for_pid.update(&pid, &tgid); + u64 *di = __trace_di.lookup(&pid_tgid); if (di == 0) return 0; struct sched_switch_trace_entry args = {}; bpf_probe_read(&args, sizeof(args), (void *)*di); - - u32 tgid, pid; - u64 ts = bpf_ktime_get_ns(); + // TODO: Store the comm as well if (args.prev_state == TASK_RUNNING) { - pid = args.prev_pid; - - u32 *stored_tgid = tgid_for_pid.lookup(&pid); - if (stored_tgid == 0) - goto BAIL; - tgid = *stored_tgid; - - if (FILTER) - goto BAIL; - - u64 *tsp = start.lookup(&pid); - if (tsp == 0) - goto BAIL; - - u64 delta = ts - *tsp; - FACTOR - STORE + u32 prev_pid = args.prev_pid; +#ifdef ONCPU + update_hist(INVALID_TGID, prev_pid, ts); +#else + store_start(INVALID_TGID, prev_pid, ts); +#endif } -BAIL: - tgid = pid_tgid >> 32; - pid = pid_tgid; - if (FILTER) - return 0; - - start.update(&pid, &ts); - tgid_for_pid.update(&pid, &tgid); +#ifdef ONCPU + store_start(tgid, pid, ts); +#else + update_hist(tgid, pid, ts); +#endif return 0; } @@ -144,7 +177,8 @@ b = BPF(text=bpf_text) Tracepoint.attach(b) b.attach_kprobe(event="perf_trace_sched_switch", fn_name="sched_switch") -print("Tracing on-CPU time... Hit Ctrl-C to end.") +print("Tracing %s-CPU time... Hit Ctrl-C to end." % + ("off" if args.offcpu else "on")) exiting = 0 if args.interval else 1 dist = b.get_table("dist") diff --git a/tools/cpudist_example.txt b/tools/cpudist_example.txt index 596c89b7a2348b9ac357dee761886942cd0177c4..7da435401974c84356628abe774b3df2602ca04b 100644 --- a/tools/cpudist_example.txt +++ b/tools/cpudist_example.txt @@ -6,6 +6,10 @@ that can indicate oversubscription (too many tasks for too few processors), overhead due to excessive context switching (e.g. a common shared lock for multiple threads), uneven workload distribution, too-granular tasks, and more. +Alternatively, the same options are available for summarizing task off-CPU +time, which helps understand how often threads are being descheduled and how +long they spend waiting for I/O, locks, timers, and other causes of suspension. + # ./cpudist.py Tracing on-CPU time... Hit Ctrl-C to end. ^C @@ -155,6 +159,47 @@ pid = 5068 This histogram was obtained while executing `dd if=/dev/zero of=/dev/null` with fairly large block sizes. +You could also ask for an off-CPU report using the -O switch. Here's a +histogram of task block times while the system is heavily loaded: + +# ./cpudist -O -p $(parprimes) +Tracing off-CPU time... Hit Ctrl-C to end. +^C + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 1 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 3 | | + 64 -> 127 : 1 | | + 128 -> 255 : 1 | | + 256 -> 511 : 0 | | + 512 -> 1023 : 2 | | + 1024 -> 2047 : 4 | | + 2048 -> 4095 : 3 | | + 4096 -> 8191 : 70 |*** | + 8192 -> 16383 : 867 |****************************************| + 16384 -> 32767 : 141 |****** | + 32768 -> 65535 : 8 | | + 65536 -> 131071 : 0 | | + 131072 -> 262143 : 1 | | + 262144 -> 524287 : 2 | | + 524288 -> 1048575 : 3 | | + +As you can see, threads are switching out for relatively long intervals, even +though we know the workload doesn't have any significant blocking. This can be +a result of over-subscription -- too many threads contending over too few CPUs. +Indeed, there are four available CPUs and more than four runnable threads: + +# nproc +4 +# cat /proc/loadavg +0.04 0.11 0.06 9/147 7494 + +(This shows we have 9 threads runnable out of 147 total. This is more than 4, +the number of available CPUs.) + Finally, let's ask for a per-thread report and values in milliseconds instead of microseconds: @@ -235,7 +280,7 @@ USAGE message: # ./cpudist.py -h -usage: cpudist.py [-h] [-T] [-m] [-P] [-L] [-p PID] [interval] [count] +usage: cpudist.py [-h] [-O] [-T] [-m] [-P] [-L] [-p PID] [interval] [count] Summarize on-CPU time per task as a histogram. @@ -245,6 +290,7 @@ positional arguments: optional arguments: -h, --help show this help message and exit + -O, --offcpu measure off-CPU time -T, --timestamp include timestamp on output -m, --milliseconds millisecond histogram -P, --pids print a histogram per process ID @@ -253,6 +299,7 @@ optional arguments: examples: cpudist # summarize on-CPU time as a histogram + cpudist -O # summarize off-CPU time as a histogram cpudist 1 10 # print 1 second summaries, 10 times cpudist -mT 1 # 1s summaries, milliseconds, and timestamps cpudist -P # show each PID separately