Search code examples
c linux-kernel bpf ebpf bcc-bpf

How can I determine which namespaces a PID is in from kernel space?


I am trying to write an eBPF program to log every call of a particular syscall from containers running on the system. I am using bcc and can retrieve the PID using bpf_get_current_pid_tgid().

From userspace I could examine the proc filesystem to check whether the process's namespaces differ from the root namespaces, and so guess whether it is a container process, but I do not know how to do this from kernel space.


Solution

  • You can use the (Linux 4.8+ only) bpf_get_current_task helper to retrieve the struct task_struct of the current process. Then the PID as seen by processes inside the container is in t->nsproxy->pid_ns_for_children->last_pid.

    The following shows this in action when tracing execve syscalls (you can use top inside the container to check that the upid is correct):

    from bcc import BPF

    # Attach a kprobe to execve() and print, for every call, the host-side PID
    # next to the PID read from the task's PID namespace ("upid").
    # NOTE(review): last_pid appears to be the namespace's most recently
    # allocated PID, so it may only match the calling task shortly after the
    # task was created -- confirm against the kernel's struct pid_namespace.
    BPF(text="""
    #include <linux/pid_namespace.h>
    int kprobe__sys_execve(void *ctx) {
        // The helper packs tgid (upper 32 bits) and thread id (lower 32 bits);
        // the tgid is the PID in the userspace sense.
        u32 pid = bpf_get_current_pid_tgid() >> 32;
        struct task_struct *t = (struct task_struct *)bpf_get_current_task();
        // PID as recorded in the PID namespace used for this task's children.
        u32 upid = t->nsproxy->pid_ns_for_children->last_pid;
        bpf_trace_printk("pid=%d; upid=%d!\\n", pid, upid);
        return 0;
    }
    """).trace_print()
    

    The following diff (based on a44d26ed3) extends bcc's execsnoop.py to retrieve the upid:

    diff --git a/tools/execsnoop.py b/tools/execsnoop.py
    index 5711fd1..2134f69 100755
    --- a/tools/execsnoop.py
    +++ b/tools/execsnoop.py
    @@ -53,6 +53,7 @@ bpf_text = """
     #include <uapi/linux/ptrace.h>
     #include <linux/sched.h>
     #include <linux/fs.h>
    +#include <linux/pid_namespace.h>
    
     #define ARGSIZE  128
    
    @@ -63,6 +64,7 @@ enum event_type {
    
     struct data_t {
         u32 pid;  // PID as in the userspace term (i.e. task->tgid in kernel)
    +    u32 upid;
         char comm[TASK_COMM_LEN];
         enum event_type type;
         char argv[ARGSIZE];
    @@ -119,6 +121,8 @@ int kretprobe__sys_execve(struct pt_regs *ctx)
     {
         struct data_t data = {};
         data.pid = bpf_get_current_pid_tgid() >> 32;
    +    struct task_struct *t = (struct task_struct *)bpf_get_current_task();
    +    data.upid = t->nsproxy->pid_ns_for_children->last_pid;
         bpf_get_current_comm(&data.comm, sizeof(data.comm));
         data.type = EVENT_RET;
         data.retval = PT_REGS_RC(ctx);
    @@ -134,7 +138,7 @@ b = BPF(text=bpf_text.replace("MAXARG", args.max_args))
     # header
     if args.timestamp:
         print("%-8s" % ("TIME(s)"), end="")
    -print("%-16s %-6s %-6s %3s %s" % ("PCOMM", "PID", "PPID", "RET", "ARGS"))
    +print("%-16s %-6s %-6s %-6s %3s %s" % ("PCOMM", "PID", "UPID", "PPID", "RET", "ARGS"))
    
     TASK_COMM_LEN = 16      # linux/sched.h
     ARGSIZE = 128           # should match #define in C above
    @@ -142,6 +146,7 @@ ARGSIZE = 128           # should match #define in C above
     class Data(ct.Structure):
         _fields_ = [
             ("pid", ct.c_uint),
    +        ("upid", ct.c_uint),
             ("comm", ct.c_char * TASK_COMM_LEN),
             ("type", ct.c_int),
             ("argv", ct.c_char * ARGSIZE),
    @@ -189,8 +194,8 @@ def print_event(cpu, data, size):
                 if args.timestamp:
                     print("%-8.3f" % (time.time() - start_ts), end="")
                 ppid = get_ppid(event.pid)
    -            print("%-16s %-6s %-6s %3s %s" % (event.comm.decode(), event.pid,
    -                    ppid if ppid > 0 else "?", event.retval,
    +            print("%-16s %-6s %-6s %-6s %3s %s" % (event.comm.decode(), event.pid,
    +                    event.upid, ppid if ppid > 0 else "?", event.retval,
                         b' '.join(argv[event.pid]).decode()))
             try:
                 del(argv[event.pid])