diff --git a/README.md b/README.md index 68c426a9310ef10bff5bc6c839a4a0a3638a47d3..c2b3375b06673dca3bb19e6dfd869160c7f9d65b 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,7 @@ pair of .c and .py files, and some are directories of files. - tools/[tcpaccept](tools/tcpaccept.py): Trace TCP passive connections (accept()). [Examples](tools/tcpaccept_example.txt). - tools/[tcpconnect](tools/tcpconnect.py): Trace TCP active connections (connect()). [Examples](tools/tcpconnect_example.txt). - tools/[tcpconnlat](tools/tcpconnlat.py): Trace TCP active connection latency (connect()). [Examples](tools/tcpconnlat_example.txt). +- tools/[tcpdrop](tools/tcpdrop.py): Trace kernel-based TCP packet drops with details. [Examples](tools/tcpdrop_example.txt). - tools/[tcplife](tools/tcplife.py): Trace TCP sessions and summarize lifespan. [Examples](tools/tcplife_example.txt). - tools/[tcpretrans](tools/tcpretrans.py): Trace TCP retransmits and TLPs. [Examples](tools/tcpretrans_example.txt). - tools/[tcpstates](tools/tcpstates.py): Trace TCP session state changes with durations. [Examples](tools/tcpstates_example.txt). diff --git a/man/man8/tcpdrop.8 b/man/man8/tcpdrop.8 new file mode 100644 index 0000000000000000000000000000000000000000..a21e885bb0f8ad6786d03c61a4ea1a048be941a0 --- /dev/null +++ b/man/man8/tcpdrop.8 @@ -0,0 +1,73 @@ +.TH tcpdrop 8 "2018-05-30" "USER COMMANDS" +.SH NAME +tcpdrop \- Trace kernel-based TCP packet drops with details. Uses Linux eBPF/bcc. +.SH SYNOPSIS +.B tcpdrop [\-h] [\-T] [\-t] [\-w] [\-s] [\-p PID] [\-D PORTS] [\-L PORTS] +.SH DESCRIPTION +This tool traces TCP packets or segments that were dropped by the kernel, and +shows details from the IP and TCP headers, the socket state, and the +kernel stack trace. This is useful for debugging cases of high kernel drops, +which can cause timer-based retransmits and performance issues. + +This tool works using dynamic tracing of the tcp_drop() kernel function, +which requires a recent kernel version. + +Since this uses BPF, only the root user can use this tool. +.SH REQUIREMENTS +CONFIG_BPF and bcc. +.SH OPTIONS +.TP +\-h +Print usage message. +.B tcpdrop +.SH FIELDS +.TP +TIME +Time of the drop, in HH:MM:SS format. +.TP +PID +Process ID that was on-CPU during the drop. This may be unrelated, as drops +can occur on the receive interrupt and be unrelated to the PID that was +interrupted. +.TP +IP +IP address family (4 or 6) +.TP +SADDR +Source IP address. +.TP +SPORT +Source TCP port. +.TP +DADDR +Destination IP address. +.TP +DPORT +Destionation TCP port. +.TP +STATE +TCP session state ("ESTABLISHED", etc). +.TP +FLAGS +TCP flags ("SYN", etc). +.SH OVERHEAD +This traces the kernel tcp_drop() function, which should be low frequency, +and therefore the overhead of this tool should be negligible. + +As always, test and understand this tools overhead for your types of +workloads before production use. +.SH SOURCE +This is from bcc. +.IP +https://github.com/iovisor/bcc +.PP +Also look in the bcc distribution for a companion _examples.txt file containing +example usage, output, and commentary for this tool. +.SH OS +Linux +.SH STABILITY +Unstable - in development. +.SH AUTHOR +Brendan Gregg +.SH SEE ALSO +tcplife(8), tcpaccept(8), tcpconnect(8), tcptop(8) diff --git a/src/python/bcc/tcp.py b/src/python/bcc/tcp.py new file mode 100644 index 0000000000000000000000000000000000000000..0d25348d8507ca2f892d12f67c20622254666d06 --- /dev/null +++ b/src/python/bcc/tcp.py @@ -0,0 +1,58 @@ +# Copyright 2018 Netflix, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# from include/net/tcp_states.h: +tcpstate = {} +tcpstate[1] = 'ESTABLISHED' +tcpstate[2] = 'SYN_SENT' +tcpstate[3] = 'SYN_RECV' +tcpstate[4] = 'FIN_WAIT1' +tcpstate[5] = 'FIN_WAIT2' +tcpstate[6] = 'TIME_WAIT' +tcpstate[7] = 'CLOSE' +tcpstate[8] = 'CLOSE_WAIT' +tcpstate[9] = 'LAST_ACK' +tcpstate[10] = 'LISTEN' +tcpstate[11] = 'CLOSING' +tcpstate[12] = 'NEW_SYN_RECV' + +# from include/net/tcp.h: +TCPHDR_FIN = 0x01; +TCPHDR_SYN = 0x02; +TCPHDR_RST = 0x04; +TCPHDR_PSH = 0x08; +TCPHDR_ACK = 0x10; +TCPHDR_URG = 0x20; +TCPHDR_ECE = 0x40; +TCPHDR_CWR = 0x80; + +def flags2str(flags): + arr = []; + if flags & TCPHDR_FIN: + arr.append("FIN"); + if flags & TCPHDR_SYN: + arr.append("SYN"); + if flags & TCPHDR_RST: + arr.append("RST"); + if flags & TCPHDR_PSH: + arr.append("PSH"); + if flags & TCPHDR_ACK: + arr.append("ACK"); + if flags & TCPHDR_URG: + arr.append("URG"); + if flags & TCPHDR_ECE: + arr.append("ECE"); + if flags & TCPHDR_CWR: + arr.append("CWR"); + return "|".join(arr); diff --git a/tools/tcpdrop.py b/tools/tcpdrop.py new file mode 100755 index 0000000000000000000000000000000000000000..3f3387f2248a7db10105c2649f6e81adfbe275b9 --- /dev/null +++ b/tools/tcpdrop.py @@ -0,0 +1,222 @@ +#!/usr/bin/python +# @lint-avoid-python-3-compatibility-imports +# +# tcpdrop Trace TCP kernel-dropped packets/segments. +# For Linux, uses BCC, eBPF. Embedded C. +# +# This provides information such as packet details, socket state, and kernel +# stack trace for packets/segments that were dropped via tcp_drop(). +# +# USAGE: tcpdrop [-c] [-h] [-l] +# +# This uses dynamic tracing of kernel functions, and will need to be updated +# to match kernel changes. +# +# Copyright 2018 Netflix, Inc. +# Licensed under the Apache License, Version 2.0 (the "License") +# +# 30-May-2018 Brendan Gregg Created this. + +from __future__ import print_function +from bcc import BPF +import argparse +from time import strftime +from socket import inet_ntop, AF_INET, AF_INET6 +from struct import pack +import ctypes as ct +from time import sleep +from bcc import tcp + +# arguments +examples = """examples: + ./tcpdrop # trace kernel TCP drops +""" +parser = argparse.ArgumentParser( + description="Trace TCP drops by the kernel", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=examples) +parser.add_argument("--ebpf", action="store_true", + help=argparse.SUPPRESS) +args = parser.parse_args() +debug = 0 + +# define BPF program +bpf_text = """ +#include <uapi/linux/ptrace.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/ip.h> +#include <net/sock.h> +#include <bcc/proto.h> + +BPF_STACK_TRACE(stack_traces, 1024); + +// separate data structs for ipv4 and ipv6 +struct ipv4_data_t { + // XXX: switch some to u32's when supported + u64 stack_id; + u64 pid; + u64 ip; + u64 saddr; + u64 daddr; + u64 sport; + u64 dport; + u64 state; + u64 tcpflags; +}; +BPF_PERF_OUTPUT(ipv4_events); + +struct ipv6_data_t { + u64 stack_id; + u64 pid; + u64 ip; + unsigned __int128 saddr; + unsigned __int128 daddr; + u64 sport; + u64 dport; + u64 state; + u64 tcpflags; +}; +BPF_PERF_OUTPUT(ipv6_events); + +static struct tcphdr *skb_to_tcphdr(const struct sk_buff *skb) +{ + // unstable API. verify logic in tcp_hdr() -> skb_transport_header(). + return (struct tcphdr *)(skb->head + skb->transport_header); +} + +static inline struct iphdr *skb_to_iphdr(const struct sk_buff *skb) +{ + // unstable API. verify logic in ip_hdr() -> skb_network_header(). + return (struct iphdr *)(skb->head + skb->network_header); +} + +// from include/net/tcp.h: +#ifndef tcp_flag_byte +#define tcp_flag_byte(th) (((u_int8_t *)th)[13]) +#endif + +int trace_tcp_drop(struct pt_regs *ctx, struct sock *sk, struct sk_buff *skb) +{ + if (sk == NULL) + return 0; + u32 pid = bpf_get_current_pid_tgid(); + + // pull in details from the packet headers and the sock struct + u16 family = sk->__sk_common.skc_family; + char state = sk->__sk_common.skc_state; + u16 sport = 0, dport = 0; + u8 tcpflags = 0; + struct tcphdr *tcp = skb_to_tcphdr(skb); + struct iphdr *ip = skb_to_iphdr(skb); + bpf_probe_read(&sport, sizeof(sport), &tcp->source); + bpf_probe_read(&dport, sizeof(dport), &tcp->dest); + bpf_probe_read(&tcpflags, sizeof(tcpflags), &tcp_flag_byte(tcp)); + sport = ntohs(sport); + dport = ntohs(dport); + + if (family == AF_INET) { + struct ipv4_data_t data4 = {.pid = pid, .ip = 4}; + bpf_probe_read(&data4.saddr, sizeof(u32), &ip->saddr); + bpf_probe_read(&data4.daddr, sizeof(u32), &ip->daddr); + data4.dport = dport; + data4.sport = sport; + data4.tcpflags = tcpflags; + data4.state = state; + data4.stack_id = stack_traces.get_stackid(ctx, 0); + ipv4_events.perf_submit(ctx, &data4, sizeof(data4)); + + } else if (family == AF_INET6) { + struct ipv6_data_t data6 = {.pid = pid, .ip = 6}; + bpf_probe_read(&data6.saddr, sizeof(data6.saddr), + sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + bpf_probe_read(&data6.daddr, sizeof(data6.daddr), + sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); + data6.dport = dport; + data6.sport = sport; + data6.tcpflags = tcpflags; + data6.state = state; + data6.stack_id = stack_traces.get_stackid(ctx, 0); + ipv6_events.perf_submit(ctx, &data6, sizeof(data6)); + } + // else drop + + return 0; +} +""" + +if debug or args.ebpf: + print(bpf_text) + if args.ebpf: + exit() + +# event data +class Data_ipv4(ct.Structure): + _fields_ = [ + ("stack_id", ct.c_ulonglong), + ("pid", ct.c_ulonglong), + ("ip", ct.c_ulonglong), + ("saddr", ct.c_ulonglong), + ("daddr", ct.c_ulonglong), + ("sport", ct.c_ulonglong), + ("dport", ct.c_ulonglong), + ("state", ct.c_ulonglong), + ("tcpflags", ct.c_ulonglong) + ] + +class Data_ipv6(ct.Structure): + _fields_ = [ + ("stack_id", ct.c_ulonglong), + ("pid", ct.c_ulonglong), + ("ip", ct.c_ulonglong), + ("saddr", (ct.c_ulonglong * 2)), + ("daddr", (ct.c_ulonglong * 2)), + ("sport", ct.c_ulonglong), + ("dport", ct.c_ulonglong), + ("state", ct.c_ulonglong), + ("tcpflags", ct.c_ulonglong) + ] + +# process event +def print_ipv4_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data_ipv4)).contents + print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % ( + strftime("%H:%M:%S"), event.pid, event.ip, + "%s:%d" % (inet_ntop(AF_INET, pack('I', event.saddr)), event.sport), + "%s:%s" % (inet_ntop(AF_INET, pack('I', event.daddr)), event.dport), + tcp.tcpstate[event.state], tcp.flags2str(event.tcpflags))) + for addr in stack_traces.walk(event.stack_id): + sym = b.ksym(addr, show_offset=True) + print("\t%s" % sym) + print("") + +def print_ipv6_event(cpu, data, size): + event = ct.cast(data, ct.POINTER(Data_ipv6)).contents + print("%-8s %-6d %-2d %-20s > %-20s %s (%s)" % ( + strftime("%H:%M:%S"), event.pid, event.ip, + "%s:%d" % (inet_ntop(AF_INET6, event.saddr), event.sport), + "%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport), + tcp.tcpstate[event.state], tcp.flags2str(event.tcpflags))) + for addr in stack_traces.walk(event.stack_id): + sym = b.ksym(addr, show_offset=True) + print("\t%s" % sym) + print("") + +# initialize BPF +b = BPF(text=bpf_text) +if b.get_kprobe_functions("tcp_drop"): + b.attach_kprobe(event="tcp_drop", fn_name="trace_tcp_drop") +else: + print("ERROR: tcp_drop() kernel function not found or traceable. " + "Older kernel versions not supported.") + exit() +stack_traces = b.get_table("stack_traces") + +# header +print("%-8s %-6s %-2s %-20s > %-20s %s (%s)" % ("TIME", "PID", "IP", + "SADDR:SPORT", "DADDR:DPORT", "STATE", "FLAGS")) + +# read events +b["ipv4_events"].open_perf_buffer(print_ipv4_event) +b["ipv6_events"].open_perf_buffer(print_ipv6_event) +while 1: + b.perf_buffer_poll() diff --git a/tools/tcpdrop_example.txt b/tools/tcpdrop_example.txt new file mode 100644 index 0000000000000000000000000000000000000000..752ec4bf6a9a9eb0de443163a0eeb6cf2211d1f1 --- /dev/null +++ b/tools/tcpdrop_example.txt @@ -0,0 +1,72 @@ +Demonstrations of tcpdrop, the Linux BPF/bcc version. + + +tcpdrop prints details of TCP packets or segments that were dropped by the +kernel, including the kernel stack trace that led to the drop: + +# ./tcpdrop.py +TIME PID IP SADDR:SPORT > DADDR:DPORT STATE (FLAGS) +20:49:06 0 4 10.32.119.56:443 > 10.66.65.252:22912 CLOSE (ACK) + tcp_drop+0x1 + tcp_v4_do_rcv+0x135 + tcp_v4_rcv+0x9c7 + ip_local_deliver_finish+0x62 + ip_local_deliver+0x6f + ip_rcv_finish+0x129 + ip_rcv+0x28f + __netif_receive_skb_core+0x432 + __netif_receive_skb+0x18 + netif_receive_skb_internal+0x37 + napi_gro_receive+0xc5 + ena_clean_rx_irq+0x3c3 + ena_io_poll+0x33f + net_rx_action+0x140 + __softirqentry_text_start+0xdf + irq_exit+0xb6 + do_IRQ+0x82 + ret_from_intr+0x0 + native_safe_halt+0x6 + default_idle+0x20 + arch_cpu_idle+0x15 + default_idle_call+0x23 + do_idle+0x17f + cpu_startup_entry+0x73 + rest_init+0xae + start_kernel+0x4dc + x86_64_start_reservations+0x24 + x86_64_start_kernel+0x74 + secondary_startup_64+0xa5 + +20:49:50 12431 4 127.0.0.1:8198 > 127.0.0.1:48280 CLOSE (RST|ACK) + tcp_drop+0x1 + tcp_v4_do_rcv+0x135 + __release_sock+0x88 + release_sock+0x30 + inet_stream_connect+0x47 + SYSC_connect+0x9e + sys_connect+0xe + do_syscall_64+0x73 + entry_SYSCALL_64_after_hwframe+0x3d + +[...] + +The last two columns show the state of the TCP session, and the TCP flags. +These two examples show packets arriving for a session in the closed state, +that were dropped by the kernel. + +This tool is useful for debugging high rates of drops, which can cause the +remote end to do timer-based retransmits, hurting performance. + + +USAGE: + +# ./tcpdrop.py -h +usage: tcpdrop.py [-h] + +Trace TCP drops by the kernel + +optional arguments: + -h, --help show this help message and exit + +examples: + ./tcpdrop # trace kernel TCP drops