diff --git a/man/man8/runqlat.8 b/man/man8/runqlat.8
index 9cc6176c0f7faaee2e551725d995f8fffac73b44..2986ff5bbb69080f6b7e423fb33f40e052cd11fa 100644
--- a/man/man8/runqlat.8
+++ b/man/man8/runqlat.8
@@ -2,7 +2,7 @@
 .SH NAME
 runqlat \- Run queue (scheduler) latency as a histogram.
 .SH SYNOPSIS
-.B runqlat [\-h] [\-T] [\-m] [\-P] [\-p PID] [interval] [count]
+.B runqlat [\-h] [\-T] [\-m] [\-P] [\-\-pidnss] [\-L] [\-p PID] [interval] [count]
 .SH DESCRIPTION
 This measures the time a task spends waiting on a run queue (or equivalent
 scheduler data structure) for a turn on-CPU, and shows this time as a
@@ -44,6 +44,13 @@ Output histogram in milliseconds.
 \-P
 Print a histogram for each PID.
 .TP
+\-\-pidnss
+Print a histogram for each PID namespace (short for PID namespaces). For
+container analysis.
+.TP
+\-L
+Print a histogram for each thread ID.
+.TP
 \-p PID
 Only show this PID (filtered in kernel for efficiency).
 .TP
diff --git a/tools/runqlat.py b/tools/runqlat.py
index bae0138d0fa47362d4fdd1a692788ab3761ea04d..1eb5d9ac9b44aadd1c2d58c0b4669c2228496ca9 100755
--- a/tools/runqlat.py
+++ b/tools/runqlat.py
@@ -37,7 +37,7 @@ examples = """examples:
     ./runqlat -p 185     # trace PID 185 only
 """
 parser = argparse.ArgumentParser(
-    description="Summarize run queue (schedular) latency as a histogram",
+    description="Summarize run queue (scheduler) latency as a histogram",
     formatter_class=argparse.RawDescriptionHelpFormatter,
     epilog=examples)
 parser.add_argument("-T", "--timestamp", action="store_true",
@@ -46,6 +46,10 @@ parser.add_argument("-m", "--milliseconds", action="store_true",
     help="millisecond histogram")
 parser.add_argument("-P", "--pids", action="store_true",
     help="print a histogram per process ID")
+# PID options are --pid and --pids, so namespaces should be --pidns (not done
+# yet) and --pidnss:
+parser.add_argument("--pidnss", action="store_true",
+    help="print a histogram per PID namespace")
 parser.add_argument("-L", "--tids", action="store_true",
     help="print a histogram per thread ID")
 parser.add_argument("-p", "--pid",
@@ -62,11 +66,19 @@ debug = 0
 bpf_text = """
 #include <uapi/linux/ptrace.h>
 #include <linux/sched.h>
+#include <linux/nsproxy.h>
+#include <linux/pid_namespace.h>
 
 typedef struct pid_key {
     u64 id;    // work around
     u64 slot;
 } pid_key_t;
+
+typedef struct pidns_key {
+    u64 id;    // work around
+    u64 slot;
+} pidns_key_t;
+
 BPF_HASH(start, u32);
 STORAGE
 
@@ -145,6 +157,13 @@ if args.pids or args.tids:
     bpf_text = bpf_text.replace('STORE',
         'pid_key_t key = {.id = ' + pid + ', .slot = bpf_log2l(delta)}; ' +
         'dist.increment(key);')
+elif args.pidnss:
+    section = "pidns"
+    bpf_text = bpf_text.replace('STORAGE',
+        'BPF_HISTOGRAM(dist, pidns_key_t);')
+    bpf_text = bpf_text.replace('STORE', 'pidns_key_t key = ' +
+        '{.id = prev->nsproxy->pid_ns_for_children->ns.inum, ' +
+        '.slot = bpf_log2l(delta)}; dist.increment(key);')
 else:
     section = ""
     bpf_text = bpf_text.replace('STORAGE', 'BPF_HISTOGRAM(dist);')
diff --git a/tools/runqlat_example.txt b/tools/runqlat_example.txt
index ee6335609be7f99c7c234e4d2520a0cc1aa3c1c2..857e5165df37dcf334714a482a2b06217f88ce3b 100644
--- a/tools/runqlat_example.txt
+++ b/tools/runqlat_example.txt
@@ -486,12 +486,75 @@ tid = 7
 [...]
 
 
+And a --pidnss option (short for PID namespaces)  will print for each PID
+namespace, for analyzing container performance:
+
+# ./runqlat --pidnss -m
+Tracing run queue latency... Hit Ctrl-C to end.
+^C
+
+pidns = 4026532870
+     msecs               : count     distribution
+         0 -> 1          : 40       |****************************************|
+         2 -> 3          : 1        |*                                       |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 2        |**                                      |
+        64 -> 127        : 5        |*****                                   |
+
+pidns = 4026532809
+     msecs               : count     distribution
+         0 -> 1          : 67       |****************************************|
+
+pidns = 4026532748
+     msecs               : count     distribution
+         0 -> 1          : 63       |****************************************|
+
+pidns = 4026532687
+     msecs               : count     distribution
+         0 -> 1          : 7        |****************************************|
+
+pidns = 4026532626
+     msecs               : count     distribution
+         0 -> 1          : 45       |****************************************|
+         2 -> 3          : 0        |                                        |
+         4 -> 7          : 0        |                                        |
+         8 -> 15         : 0        |                                        |
+        16 -> 31         : 0        |                                        |
+        32 -> 63         : 0        |                                        |
+        64 -> 127        : 3        |**                                      |
+
+pidns = 4026531836
+     msecs               : count     distribution
+         0 -> 1          : 314      |****************************************|
+         2 -> 3          : 1        |                                        |
+         4 -> 7          : 11       |*                                       |
+         8 -> 15         : 28       |***                                     |
+        16 -> 31         : 137      |*****************                       |
+        32 -> 63         : 86       |**********                              |
+        64 -> 127        : 1        |                                        |
+
+pidns = 4026532382
+     msecs               : count     distribution
+         0 -> 1          : 285      |****************************************|
+         2 -> 3          : 5        |                                        |
+         4 -> 7          : 16       |**                                      |
+         8 -> 15         : 9        |*                                       |
+        16 -> 31         : 69       |*********                               |
+        32 -> 63         : 25       |***                                     |
+
+Many of these distributions have two modes: the second, in this case, is
+caused by capping CPU usage via CPU shares.
+
+
 USAGE message:
 
 # ./runqlat -h
-usage: runqlat [-h] [-T] [-m] [-P] [-L] [-p PID] [interval] [count]
+usage: runqlat.py [-h] [-T] [-m] [-P] [--pidnss] [-L] [-p PID]
+                  [interval] [count]
 
-Summarize run queue (schedular) latency as a histogram
+Summarize run queue (scheduler) latency as a histogram
 
 positional arguments:
   interval            output interval, in seconds
@@ -502,6 +565,7 @@ optional arguments:
   -T, --timestamp     include timestamp on output
   -m, --milliseconds  millisecond histogram
   -P, --pids          print a histogram per process ID
+  --pidnss            print a histogram per PID namespace
   -L, --tids          print a histogram per thread ID
   -p PID, --pid PID   trace this PID only