diff --git a/man/man8/runqslower.8 b/man/man8/runqslower.8
index 2feef13fb1eead01adc05100eddaad5736ed572a..17052a1414220e6d8b4cff8c3517b247d6ee8de9 100644
--- a/man/man8/runqslower.8
+++ b/man/man8/runqslower.8
@@ -2,7 +2,7 @@
 .SH NAME
 runqslower \- Trace long process scheduling delays.
 .SH SYNOPSIS
-.B runqslower [\-p PID] [min_us]
+.B runqslower [\-p PID] [\-t TID] [min_us]
 .SH DESCRIPTION
 This measures the time a task spends waiting on a run queue (or equivalent
 scheduler data structure) for a turn on-CPU, and shows occurrences of time
@@ -38,6 +38,9 @@ Print usage message.
 \-p PID
 Only show this PID (filtered in kernel for efficiency).
 .TP
+\-t TID
+Only show this TID (filtered in kernel for efficiency).
+.TP
 min_us
 Minimum scheduling delay in microseconds to output.
 .SH EXAMPLES
diff --git a/tools/runqslower.py b/tools/runqslower.py
index b67853301699804ea3977b1f2d2046276e5f3276..8f790602a227a24f73cc9b030a6d858edebd543f 100755
--- a/tools/runqslower.py
+++ b/tools/runqslower.py
@@ -7,7 +7,7 @@
 # This script traces high scheduling delays between tasks being
 # ready to run and them running on CPU after that.
 #
-# USAGE: runqslower [-p PID] [min_us]
+# USAGE: runqslower [-p PID] [-t TID] [min_us]
 #
 # REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
 #
@@ -42,19 +42,25 @@ from time import strftime
 examples = """examples:
     ./runqslower         # trace run queue latency higher than 10000 us (default)
     ./runqslower 1000    # trace run queue latency higher than 1000 us
-    ./runqslower -p 123  # trace pid 123 only
+    ./runqslower -p 123  # trace pid 123
+    ./runqslower -t 123  # trace tid 123 (use for threads only)
 """
 parser = argparse.ArgumentParser(
     description="Trace high run queue latency",
     formatter_class=argparse.RawDescriptionHelpFormatter,
     epilog=examples)
-parser.add_argument("-p", "--pid", type=int, metavar="PID", dest="pid",
-    help="trace this PID only")
 parser.add_argument("min_us", nargs="?", default='10000',
     help="minimum run queue latecy to trace, in ms (default 10000)")
 parser.add_argument("--ebpf", action="store_true",
     help=argparse.SUPPRESS)
+
+thread_group = parser.add_mutually_exclusive_group()
+thread_group.add_argument("-p", "--pid", metavar="PID", dest="pid",
+    help="trace this PID only", type=int)
+thread_group.add_argument("-t", "--tid", metavar="TID", dest="tid",
+    help="trace this TID only", type=int)
 args = parser.parse_args()
+
 min_us = int(args.min_us)
 debug = 0
 
@@ -80,7 +86,7 @@ BPF_PERF_OUTPUT(events);
 // record enqueue timestamp
 static int trace_enqueue(u32 tgid, u32 pid)
 {
-    if (FILTER_PID || pid == 0)
+    if (FILTER_PID || FILTER_TGID || pid == 0)
         return 0;
     u64 ts = bpf_ktime_get_ns();
     start.update(&pid, &ts);
@@ -109,13 +115,14 @@ int trace_run(struct pt_regs *ctx, struct task_struct *prev)
     if (prev->state == TASK_RUNNING) {
         tgid = prev->tgid;
         pid = prev->pid;
-        if (!(FILTER_PID || pid == 0)) {
-            u64 ts = bpf_ktime_get_ns();
-            start.update(&pid, &ts);
+        u64 ts = bpf_ktime_get_ns();
+        if (pid != 0) {
+            if (!(FILTER_PID) && !(FILTER_TGID)) {
+                start.update(&pid, &ts);
+            }
         }
     }
 
-    tgid = bpf_get_current_pid_tgid() >> 32;
     pid = bpf_get_current_pid_tgid();
 
     u64 *tsp, delta_us;
@@ -167,17 +174,21 @@ RAW_TRACEPOINT_PROBE(sched_switch)
     // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
     struct task_struct *prev = (struct task_struct *)ctx->args[1];
     struct task_struct *next= (struct task_struct *)ctx->args[2];
-    u32 pid;
+    u32 tgid, pid;
     long state;
 
     // ivcsw: treat like an enqueue event and store timestamp
     bpf_probe_read(&state, sizeof(long), (const void *)&prev->state);
     if (state == TASK_RUNNING) {
+        bpf_probe_read(&tgid, sizeof(prev->tgid), &prev->tgid);
         bpf_probe_read(&pid, sizeof(prev->pid), &prev->pid);
-        if (!(FILTER_PID || pid == 0)) {
-            u64 ts = bpf_ktime_get_ns();
-            start.update(&pid, &ts);
+        u64 ts = bpf_ktime_get_ns();
+        if (pid != 0) {
+            if (!(FILTER_PID) && !(FILTER_TGID)) {
+                start.update(&pid, &ts);
+            }
         }
+
     }
 
     bpf_probe_read(&pid, sizeof(next->pid), &next->pid);
@@ -218,10 +229,17 @@ if min_us == 0:
     bpf_text = bpf_text.replace('FILTER_US', '0')
 else:
     bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_us))
-if args.pid:
-    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.pid)
+
+if args.tid:
+    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.tid)
 else:
     bpf_text = bpf_text.replace('FILTER_PID', '0')
+
+if args.pid:
+    bpf_text = bpf_text.replace('FILTER_TGID', 'tgid != %s' % args.pid)
+else:
+    bpf_text = bpf_text.replace('FILTER_TGID', '0')
+
 if debug or args.ebpf:
     print(bpf_text)
     if args.ebpf:
@@ -240,7 +258,7 @@ if not is_support_raw_tp:
     b.attach_kprobe(event="finish_task_switch", fn_name="trace_run")
 
 print("Tracing run queue latency higher than %d us" % min_us)
-print("%-8s %-16s %-6s %14s" % ("TIME", "COMM", "PID", "LAT(us)"))
+print("%-8s %-16s %-6s %14s" % ("TIME", "COMM", "TID", "LAT(us)"))
 
 # read events
 b["events"].open_perf_buffer(print_event, page_cnt=64)
diff --git a/tools/runqslower_example.txt b/tools/runqslower_example.txt
index 61825e5201c270ebbd09ba17d5da13186ccfcd79..e64f9451af2e931af81593898b1604e81373079f 100644
--- a/tools/runqslower_example.txt
+++ b/tools/runqslower_example.txt
@@ -5,8 +5,15 @@ runqslower shows high latency scheduling times between tasks being
 ready to run and them running on CPU after that. For example:
 
 # runqslower
-Tracing run queue latency higher than 10000 us
-TIME     COMM             PID           LAT(us)
+
+Note: Showing TID (thread id) in the report column. The smallest
+execution unit becomes a TID when using the --pid flag as
+in that case the tool reports not only the parent pid but
+its children threads as well.
+
+Tracing run queue latency higher than 10000 us.
+
+TIME     COMM             TID           LAT(us)
 04:16:32 cc1              12924           12739
 04:16:32 sh               13640           12118
 04:16:32 make             13639           12730
@@ -32,7 +39,7 @@ These delays can be analyzed in depth with "perf sched" tool, see:
 USAGE message:
 
 # ./runqslower -h
-usage: runqslower.py [-h] [-p PID] [min_us]
+usage: runqslower.py [-h] [-p PID | -t TID] [min_us]
 
 Trace high run queue latency
 
@@ -41,9 +48,11 @@ positional arguments:
 
 optional arguments:
   -h, --help         show this help message and exit
-  -p PID, --pid PID  trace this PID only
+  -p PID, --pid PID  trace this PID
+  -t TID, --tid TID  trace this TID only
 
 examples:
     ./runqslower         # trace run queue latency higher than 10000 us (default)
     ./runqslower 1000    # trace run queue latency higher than 1000 us
-    ./runqslower -p 123  # trace pid 123 only
+    ./runqslower -p 123  # trace pid 123
+    ./runqslower -t 123  # trace tid (thread) 123