diff --git a/README.md b/README.md index 3f14366a7a681f01a8c79ff204a3e794da06ce75..e491de6c0b1e6d55604008a4649e613ee11c0021 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,7 @@ pair of .c and .py files, and some are directories of files. - tools/[mysqld_qslower](tools/mysqld_qslower.py): Trace MySQL server queries slower than a threshold. [Examples](tools/mysqld_qslower_example.txt). - tools/[memleak](tools/memleak.py): Display outstanding memory allocations to find memory leaks. [Examples](tools/memleak_example.txt). - tools/[nfsslower](tools/nfsslower.py): Trace slow NFS operations. [Examples](tools/nfsslower_example.txt). +- tools/[nfsdist](tools/nfsdist.py): Summarize NFS operation latency distribution as a histogram. [Examples](tools/nfsdist_example.txt). - tools/[offcputime](tools/offcputime.py): Summarize off-CPU time by kernel stack trace. [Examples](tools/offcputime_example.txt). - tools/[offwaketime](tools/offwaketime.py): Summarize blocked time by kernel off-CPU stack and waker stack. [Examples](tools/offwaketime_example.txt). - tools/[oomkill](tools/oomkill.py): Trace the out-of-memory (OOM) killer. [Examples](tools/oomkill_example.txt). diff --git a/man/man8/nfsdist.8 b/man/man8/nfsdist.8 new file mode 100644 index 0000000000000000000000000000000000000000..c72cfaa6021495779da3c8dc6f783b2b45a81ed9 --- /dev/null +++ b/man/man8/nfsdist.8 @@ -0,0 +1,80 @@ +.TH nfsdist 8 "2017-09-08" "USER COMMANDS" +.SH NAME +nfsdist \- Summarize NFS operation latency. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B nfsdist [\-h] [\-T] [\-m] [\-p PID] [interval] [count] +.SH DESCRIPTION +This tool summarizes time (latency) spent in common NFS file operations: reads, +writes, opens, and getattrs, and presents it as a power-of-2 histogram. It uses an +in-kernel eBPF map to store the histogram for efficiency. + +Since this works by tracing the nfs_file_operations interface functions, it +will need updating to match any changes to these functions. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.TP +\-T +Don't include timestamps on interval output. +.TP +\-m +Output in milliseconds. +.TP +\-p PID +Trace this PID only. +.SH EXAMPLES +.TP +Trace NFS operation time, and print a summary on Ctrl-C: +# +.B nfsdist +.TP +Trace PID 181 only: +# +.B nfsdist -p 181 +.TP +Print 1 second summaries, 10 times: +# +.B nfsdist 1 10 +.TP +1 second summaries, printed in milliseconds +# +.B nfsdist \-m 1 +.SH FIELDS +.TP +msecs +Range of milliseconds for this bucket. +.TP +usecs +Range of microseconds for this bucket. +.TP +count +Number of operations in this time range. +.TP +distribution +ASCII representation of the distribution (the count column). +.SH OVERHEAD +This adds low-overhead instrumentation to these NFS operations, +including reads and writes from the file system cache. Such reads and writes +can be very frequent (depending on the workload; eg, 1M/sec), at which +point the overhead of this tool may become noticeable. +Measure and quantify before use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Samuel Nair +.SH SEE ALSO +nfsslower(8) diff --git a/tests/python/test_tools_smoke.py b/tests/python/test_tools_smoke.py index 2d9519818a84037c017d7ad6f5e0c511068d4c8a..2fa7c7c37219006896bd126633e7e8032754e5ff 100755 --- a/tests/python/test_tools_smoke.py +++ b/tests/python/test_tools_smoke.py @@ -225,6 +225,13 @@ class SmokeTests(TestCase): else: pass + @skipUnless(kernel_version_ge(4,4), "requires kernel >= 4.4") + def test_nfsdist(self): + if(self.kmod_loaded("nfs")): + self.run_with_duration("nfsdist.py 1 1") + else: + pass + @skipUnless(kernel_version_ge(4,6), "requires kernel >= 4.6") def test_offcputime(self): self.run_with_duration("offcputime.py 1") diff --git a/tools/nfsdist.py b/tools/nfsdist.py new file mode 100755 index 0000000000000000000000000000000000000000..c38c05312df9b96f8c04f331425f2ebc2437968e --- /dev/null +++ b/tools/nfsdist.py @@ -0,0 +1,169 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# nfsdist Summarize NFS operation latency +# for Linux, uses BCC and eBPF +# +# USAGE: nfsdist [-h] [-T] [-m] [-p PID] [interval] [count] +# +# 4-Sep-2017 Samuel Nair created this + +from __future__ import print_function +from bcc import BPF +from time import sleep, strftime +import argparse + +# arguments +examples = """examples: + ./nfsdist # show operation latency as a histogram + ./nfsdist -p 181 # trace PID 181 only + ./nfsdist 1 10 # print 1 second summaries, 10 times + ./nfsdist -m 5 # 5s summaries, milliseconds +""" +parser = argparse.ArgumentParser( + description="Summarize NFS operation latency", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("-T", "--notimestamp", action="store_true", + help="don't include timestamp on interval output") +parser.add_argument("-m", "--milliseconds", action="store_true", + help="output in milliseconds") +parser.add_argument("-p", "--pid", + help="trace this PID only") +parser.add_argument("interval", nargs="?", + help="output interval, in seconds") +parser.add_argument("count", nargs="?", default=99999999, + help="number of outputs") +args = parser.parse_args() +pid = args.pid +countdown = int(args.count) +if args.milliseconds: + factor = 1000000 + label = "msecs" +else: + factor = 1000 + label = "usecs" + if args.interval and int(args.interval) == 0: + print("ERROR: interval 0. Exiting.") + exit() +debug = 0 + +# define BPF program +bpf_text = """ +#include <uapi/linux/ptrace.h> +#include <linux/fs.h> +#include <linux/sched.h> + +#define OP_NAME_LEN 8 +typedef struct dist_key { + char op[OP_NAME_LEN]; + u64 slot; +} dist_key_t; + +BPF_HASH(start, u32); +BPF_HISTOGRAM(dist, dist_key_t); + +// time operation +int trace_entry(struct pt_regs *ctx) +{ + u32 pid = bpf_get_current_pid_tgid(); + if (FILTER_PID) + return 0; + u64 ts = bpf_ktime_get_ns(); + start.update(&pid, &ts); + return 0; +} + +static int trace_return(struct pt_regs *ctx, const char *op) +{ + u64 *tsp; + u32 pid = bpf_get_current_pid_tgid(); + + // fetch timestamp and calculate delta + tsp = start.lookup(&pid); + if (tsp == 0) { + return 0; // missed start or filtered + } + u64 delta = (bpf_ktime_get_ns() - *tsp) / FACTOR; + + // store as histogram + dist_key_t key = {.slot = bpf_log2l(delta)}; + __builtin_memcpy(&key.op, op, sizeof(key.op)); + dist.increment(key); + + start.delete(&pid); + return 0; +} + +int trace_read_return(struct pt_regs *ctx) +{ + char *op = "read"; + return trace_return(ctx, op); +} + +int trace_write_return(struct pt_regs *ctx) +{ + char *op = "write"; + return trace_return(ctx, op); +} + +int trace_open_return(struct pt_regs *ctx) +{ + char *op = "open"; + return trace_return(ctx, op); +} + +int trace_getattr_return(struct pt_regs *ctx) +{ + char *op = "getattr"; + return trace_return(ctx, op); +} +""" +bpf_text = bpf_text.replace('FACTOR', str(factor)) +if args.pid: + bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) +else: + bpf_text = bpf_text.replace('FILTER_PID', '0') +if debug: + print(bpf_text) + +# load BPF program +b = BPF(text=bpf_text) + +# common file functions +b.attach_kprobe(event="nfs_file_read", fn_name="trace_entry") +b.attach_kprobe(event="nfs_file_write", fn_name="trace_entry") +b.attach_kprobe(event="nfs4_file_open", fn_name="trace_entry") +b.attach_kprobe(event="nfs_file_open", fn_name="trace_entry") +b.attach_kprobe(event="nfs_getattr", fn_name="trace_entry") + +b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return") +b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return") +b.attach_kretprobe(event="nfs4_file_open", fn_name="trace_open_return") +b.attach_kretprobe(event="nfs_file_open", fn_name="trace_open_return") +b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return") + +print("Tracing NFS operation latency... Hit Ctrl-C to end.") + +# output +exiting = 0 +dist = b.get_table("dist") +while (1): + try: + if args.interval: + sleep(int(args.interval)) + else: + sleep(99999999) + except KeyboardInterrupt: + exiting = 1 + + print() + if args.interval and (not args.notimestamp): + print(strftime("%H:%M:%S:")) + + dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode) + dist.clear() + + countdown -= 1 + if exiting or countdown == 0: + exit() diff --git a/tools/nfsdist_example.txt b/tools/nfsdist_example.txt new file mode 100644 index 0000000000000000000000000000000000000000..b057569bd8b4baa241a7fea085bfc3c007758736 --- /dev/null +++ b/tools/nfsdist_example.txt @@ -0,0 +1,160 @@ +Demonstrations of nfsdist, the Linux eBPF/bcc version. + +nfsdist traces NFS reads, writes, opens, and getattr, and summarizes their +latency as a power-of-2 histogram. For example: + + +./nfsdist.py + +Tracing NFS operation latency... Hit Ctrl-C to end. + +operation = read + usecs : count distribution + 0 -> 1 : 4 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 7107 |************** | + 16 -> 31 : 19864 |****************************************| + 32 -> 63 : 1494 |*** | + 64 -> 127 : 491 | | + 128 -> 255 : 1810 |*** | + 256 -> 511 : 6356 |************ | + 512 -> 1023 : 4860 |********* | + 1024 -> 2047 : 3070 |****** | + 2048 -> 4095 : 1853 |*** | + 4096 -> 8191 : 921 |* | + 8192 -> 16383 : 122 | | + 16384 -> 32767 : 15 | | + 32768 -> 65535 : 5 | | + 65536 -> 131071 : 2 | | + 131072 -> 262143 : 1 | | + +operation = write + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 1 | | + 16 -> 31 : 0 | | + 32 -> 63 : 9 | | + 64 -> 127 : 19491 |****************************************| + 128 -> 255 : 3064 |****** | + 256 -> 511 : 940 |* | + 512 -> 1023 : 365 | | + 1024 -> 2047 : 312 | | + 2048 -> 4095 : 119 | | + 4096 -> 8191 : 31 | | + 8192 -> 16383 : 84 | | + 16384 -> 32767 : 31 | | + 32768 -> 65535 : 5 | | + 65536 -> 131071 : 3 | | + 131072 -> 262143 : 0 | | + 262144 -> 524287 : 1 | | + +operation = getattr + usecs : count distribution + 0 -> 1 : 27 |****************************************| + 2 -> 3 : 2 |** | + 4 -> 7 : 3 |**** | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 2 |** | + 512 -> 1023 : 2 |** | + +operation = open + usecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 0 | | + 4 -> 7 : 0 | | + 8 -> 15 : 0 | | + 16 -> 31 : 0 | | + 32 -> 63 : 0 | | + 64 -> 127 : 0 | | + 128 -> 255 : 0 | | + 256 -> 511 : 2 |****************************************| + + +In this example you can see that the read traffic is rather bi-modal, with about +26K reads falling within 8 - 30 usecs and about 18K reads spread between 128 - +8191 usecs. Write traffic is largely clustered in the 64 - 127 usecs bracket. +The faster read traffic is probably coming from a filesystem cache and the slower +traffic from disk. The reason why the writes are so consistently fast is because +this example test was run on a couple of VM's and I believe the hypervisor was +caching all the write traffic to memory. + +This "latency" is measured from when the operation was issued from the VFS +interface to the file system, to when it completed. This spans everything: +RPC latency, network latency, file system CPU cycles, file system locks, run +queue latency, etc. This is a better measure of the latency suffered by +applications reading from a NFS share and can better expose problems +experienced by NFS clients. + +Note that this only traces the common NFS operations (read, write, open and +getattr). I chose to include getattr as a significant percentage of NFS +traffic end up being getattr calls and are a good indicator of problems +with an NFS server. + +An optional interval and a count can be provided, as well as -m to show the +distributions in milliseconds. For example: + +./nfsdist -m 1 5 +Tracing NFS operation latency... Hit Ctrl-C to end. + +11:02:39: + +operation = write + msecs : count distribution + 0 -> 1 : 1 | | + 2 -> 3 : 24 |******** | + 4 -> 7 : 114 |****************************************| + 8 -> 15 : 9 |*** | + 16 -> 31 : 1 | | + 32 -> 63 : 1 | | + +11:02:40: + +operation = write + msecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 11 |*** | + 4 -> 7 : 111 |****************************************| + 8 -> 15 : 13 |**** | + 16 -> 31 : 1 | | + +11:02:41: + +operation = write + msecs : count distribution + 0 -> 1 : 0 | | + 2 -> 3 : 21 |****** | + 4 -> 7 : 137 |****************************************| + 8 -> 15 : 3 | | + +This shows a write workload, with writes hovering primarily in the 4-7ms range. + +USAGE message: + + +./nfsdist -h +usage: nfsdist.py [-h] [-T] [-m] [-p PID] [interval] [count] + +Summarize NFS operation latency + +positional arguments: + interval output interval, in seconds + count number of outputs + +optional arguments: + -h, --help show this help message and exit + -T, --notimestamp don't include timestamp on interval output + -m, --milliseconds output in milliseconds + -p PID, --pid PID trace this PID only + +examples: + ./nfsdist # show operation latency as a histogram + ./nfsdist -p 181 # trace PID 181 only + ./nfsdist 1 10 # print 1 second summaries, 10 times + ./nfsdist -m 5 # 5s summaries, milliseconds