diff --git a/man/man8/tcpretrans.8 b/man/man8/tcpretrans.8 index 8f04a159281442d6fe6a05755b00e87fb429c41d..abdb4bc58771235c1ac1b2d1f7cf3a09be822782 100644 --- a/man/man8/tcpretrans.8 +++ b/man/man8/tcpretrans.8 @@ -1,6 +1,6 @@ .TH tcpaccept 8 "2016-02-14" "USER COMMANDS" .SH NAME -tcpaccept \- Trace TCP retransmits and TLPs. Uses Linux eBPF/bcc. +tcpaccept \- Trace or count TCP retransmits and TLPs. Uses Linux eBPF/bcc. .SH SYNOPSIS .B tcpaccept [\-h] [\-l] .SH DESCRIPTION @@ -8,7 +8,9 @@ This traces TCP retransmits, showing address, port, and TCP state information, and sometimes the PID (although usually not, since retransmits are usually sent by the kernel on timeouts). To keep overhead very low, only the TCP retransmit functions are traced. This does not trace every packet -(like tcpdump(8) or a packet sniffer). +(like tcpdump(8) or a packet sniffer). Optionally, it can count retransmits +over a user signalled interval to spot potentially dropping network paths the +flows are traversing. This uses dynamic tracing of the kernel tcp_retransmit_skb() and tcp_send_loss_probe() functions, and will need to be updated to @@ -25,6 +27,9 @@ Print usage message. \-l Include tail loss probe attempts (in some cases the kernel may not complete the TLP send). +.TP +\-c +Count occurring retransmits per flow. .SH EXAMPLES .TP Trace TCP retransmits: @@ -63,6 +68,9 @@ Remote port. .TP STATE TCP session state. +.TP +RETRANSMITS +Accumulated occurred retransmits since start. .SH OVERHEAD Should be negligible: TCP retransmit events should be low (<1000/s), and the low overhead this tool adds to each event should make the cost negligible. diff --git a/tools/tcpretrans.py b/tools/tcpretrans.py index 21fcbc7a6b3f5155fdd1e79ecb78ac2a80d11f4f..e768260d8e2919db03164e6ca086962c3cd0a9ab 100755 --- a/tools/tcpretrans.py +++ b/tools/tcpretrans.py @@ -13,6 +13,7 @@ # Licensed under the Apache License, Version 2.0 (the "License") # # 14-Feb-2016 Brendan Gregg Created this. +# 03-Nov-2017 Matthias Tafelmeier Extended this. from __future__ import print_function from bcc import BPF @@ -21,6 +22,7 @@ from time import strftime from socket import inet_ntop, AF_INET, AF_INET6 from struct import pack import ctypes as ct +from time import sleep # arguments examples = """examples: @@ -33,6 +35,8 @@ parser = argparse.ArgumentParser( epilog=examples) parser.add_argument("-l", "--lossprobe", action="store_true", help="include tail loss probe attempts") +parser.add_argument("-c", "--count", action="store_true", + help="count occurred retransmits per flow") args = parser.parse_args() # define BPF program @@ -70,6 +74,23 @@ struct ipv6_data_t { }; BPF_PERF_OUTPUT(ipv6_events); +// separate flow keys per address family +struct ipv4_flow_key_t { + u32 saddr; + u32 daddr; + u16 lport; + u16 dport; +}; +BPF_HASH(ipv4_count, struct ipv4_flow_key_t); + +struct ipv6_flow_key_t { + unsigned __int128 saddr; + unsigned __int128 daddr; + u16 lport; + u16 dport; +}; +BPF_HASH(ipv6_count, struct ipv6_flow_key_t); + static int trace_event(struct pt_regs *ctx, struct sock *skp, int type) { if (skp == NULL) @@ -83,26 +104,11 @@ static int trace_event(struct pt_regs *ctx, struct sock *skp, int type) char state = skp->__sk_common.skc_state; if (family == AF_INET) { - struct ipv4_data_t data4 = {.pid = pid, .ip = 4, .type = type}; - data4.saddr = skp->__sk_common.skc_rcv_saddr; - data4.daddr = skp->__sk_common.skc_daddr; - // lport is host order - data4.lport = lport; - data4.dport = ntohs(dport); - data4.state = state; - ipv4_events.perf_submit(ctx, &data4, sizeof(data4)); - + IPV4_INIT + IPV4_CORE } else if (family == AF_INET6) { - struct ipv6_data_t data6 = {.pid = pid, .ip = 6, .type = type}; - bpf_probe_read(&data6.saddr, sizeof(data6.saddr), - skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); - bpf_probe_read(&data6.daddr, sizeof(data6.daddr), - skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32); - // lport is host order - data6.lport = lport; - data6.dport = ntohs(dport); - data6.state = state; - ipv6_events.perf_submit(ctx, &data6, sizeof(data6)); + IPV6_INIT + IPV6_CORE } // else drop @@ -122,6 +128,66 @@ int trace_tlp(struct pt_regs *ctx, struct sock *sk) } """ +struct_init = { 'ipv4': + { 'count' : + """ + struct ipv4_flow_key_t flow_key = {}; + flow_key.saddr = skp->__sk_common.skc_rcv_saddr; + flow_key.daddr = skp->__sk_common.skc_daddr; + // lport is host order + flow_key.lport = lport; + flow_key.dport = ntohs(dport);""", + 'trace' : + """ + struct ipv4_data_t data4 = {.pid = pid, .ip = 4, .type = type}; + data4.saddr = skp->__sk_common.skc_rcv_saddr; + data4.daddr = skp->__sk_common.skc_daddr; + // lport is host order + data4.lport = lport; + data4.dport = ntohs(dport); + data4.state = state; """ + }, + 'ipv6': + { 'count' : + """ + struct ipv6_flow_key_t flow_key = {}; + bpf_probe_read(&flow_key.saddr, sizeof(flow_key.saddr), + skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + bpf_probe_read(&flow_key.daddr, sizeof(flow_key.daddr), + skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32); + // lport is host order + flow_key.lport = lport; + flow_key.dport = ntohs(dport);""", + 'trace' : """ + struct ipv6_data_t data6 = {.pid = pid, .ip = 6, .type = type}; + bpf_probe_read(&data6.saddr, sizeof(data6.saddr), + skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); + bpf_probe_read(&data6.daddr, sizeof(data6.daddr), + skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32); + // lport is host order + data6.lport = lport; + data6.dport = ntohs(dport); + data6.state = state;""" + } + } + +count_core_base = """ + u64 zero = 0, *val; + val = COUNT_STRUCT.lookup_or_init(&flow_key, &zero); + (*val)++; +""" + +if args.count: + bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['count']) + bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['count']) + bpf_text = bpf_text.replace("IPV4_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv4_count')) + bpf_text = bpf_text.replace("IPV6_CORE", count_core_base.replace("COUNT_STRUCT", 'ipv6_count')) +else: + bpf_text = bpf_text.replace("IPV4_INIT", struct_init['ipv4']['trace']) + bpf_text = bpf_text.replace("IPV6_INIT", struct_init['ipv6']['trace']) + bpf_text = bpf_text.replace("IPV4_CORE", "ipv4_events.perf_submit(ctx, &data4, sizeof(data4));") + bpf_text = bpf_text.replace("IPV6_CORE", "ipv6_events.perf_submit(ctx, &data6, sizeof(data6));") + # event data class Data_ipv4(ct.Structure): _fields_ = [ @@ -186,18 +252,44 @@ def print_ipv6_event(cpu, data, size): "%s:%d" % (inet_ntop(AF_INET6, event.daddr), event.dport), tcpstate[event.state])) +def depict_cnt(counts_tab, l3prot='ipv4'): + for k, v in sorted(counts_tab.items(), key=lambda counts: counts[1].value): + depict_key = "" + ep_fmt = "[%s]#%d" + if l3prot == 'ipv4': + depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET, pack('I', k.saddr)), k.lport), + ep_fmt % (inet_ntop(AF_INET, pack('I', k.daddr)), k.dport)) + else: + depict_key = "%-20s <-> %-20s" % (ep_fmt % (inet_ntop(AF_INET6, k.saddr), k.lport), + ep_fmt % (inet_ntop(AF_INET6, k.daddr), k.dport)) + + print ("%s %10d" % (depict_key, v.value)) + # initialize BPF b = BPF(text=bpf_text) b.attach_kprobe(event="tcp_retransmit_skb", fn_name="trace_retransmit") if args.lossprobe: b.attach_kprobe(event="tcp_send_loss_probe", fn_name="trace_tlp") -# header -print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP", - "LADDR:LPORT", "T", "RADDR:RPORT", "STATE")) +print("Tracing retransmits ... Hit Ctrl-C to end") +if args.count: + try: + while 1: + sleep(99999999) + except BaseException: + pass + # header + print("\n%-25s %-25s %-10s" % ( + "LADDR:LPORT", "RADDR:RPORT", "RETRANSMITS")) + depict_cnt(b.get_table("ipv4_count")) + depict_cnt(b.get_table("ipv6_count"), l3prot='ipv6') # read events -b["ipv4_events"].open_perf_buffer(print_ipv4_event) -b["ipv6_events"].open_perf_buffer(print_ipv6_event) -while 1: - b.kprobe_poll() +else: + # header + print("%-8s %-6s %-2s %-20s %1s> %-20s %-4s" % ("TIME", "PID", "IP", + "LADDR:LPORT", "T", "RADDR:RPORT", "STATE")) + b["ipv4_events"].open_perf_buffer(print_ipv4_event) + b["ipv6_events"].open_perf_buffer(print_ipv6_event) + while 1: + b.kprobe_poll() diff --git a/tools/tcpretrans_example.txt b/tools/tcpretrans_example.txt index 970756e10750b84da5435026dde4e62afd2743f7..db63477589c8d69aab9a8d6c1fca603df488ef9e 100644 --- a/tools/tcpretrans_example.txt +++ b/tools/tcpretrans_example.txt @@ -44,6 +44,20 @@ TIME PID IP LADDR:LPORT T> RADDR:RPORT STATE See the "L>" in the "T>" column. These are attempts: the kernel probably sent a TLP, but in some cases it might not have been ultimately sent. +To spot heavily retransmitting flows quickly one can use the -c flag. It will +count occurring retransmits per flow. + +# ./tcpretrans.py -c +Tracing retransmits ... Hit Ctrl-C to end +^C +LADDR:LPORT RADDR:RPORT RETRANSMITS +192.168.10.50:60366 <-> 172.217.21.194:443 700 +192.168.10.50:666 <-> 172.213.11.195:443 345 +192.168.10.50:366 <-> 172.212.22.194:443 211 +[...] + +This can ease to quickly isolate congested or otherwise awry network paths +responsible for clamping tcp performance. USAGE message: @@ -55,6 +69,7 @@ Trace TCP retransmits optional arguments: -h, --help show this help message and exit -l, --lossprobe include tail loss probe attempts + -c, --count count occurred retransmits per flow examples: ./tcpretrans # trace TCP retransmits