Containerd shim 进程 PPID 之谜
这件事困扰了我很久,现在终于有时间来一探究竟了。
Kubernetes 自 1.20 版起宣布弃用 dockershim(并在 1.24 版中正式移除),推荐改用 Containerd[1] 这类 CRI 运行时作为容器运行时。
我们使用 ps
命令来观察一下 Containerd 相关进程:
$ ps -ef | grep containerd
root 1002 1 3 02:29 ? 00:00:19 /usr/bin/kubelet --bootstrap-kubeconfig=/etc/kubernetes/bootstrap-kubelet.conf --kubeconfig=/etc/kubernetes/kubelet.conf --config=/var/lib/kubelet/config.yaml --container-runtime=remote --container-runtime-endpoint=/run/containerd/containerd.sock
root 1011 1 1 02:29 ? 00:00:07 /usr/bin/containerd
root 1622 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 5ca114f2233d4638fae47b86ed058c0774a248168b3bb66d41f94bdcd1e56626 -address /run/containerd/containerd.sock
root 1624 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 4727e762c3fa1a7f2d4beebfeb79a4ee22298e48018beee5204cc8fd98e7bd41 -address /run/containerd/containerd.sock
root 1660 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 35d2d1cafe57afde4a1e3041a75d216e48f75312760b1c77ffaa7acc0ee8802f -address /run/containerd/containerd.sock
root 1661 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id c7769877c77465c86e803b1522ad44ec5ee62b4ff90d1e7f9afd13680215f048 -address /run/containerd/containerd.sock
root 2003 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id f4165704fb540c52586e2edcff1c420fe4177d0494205a019201689c7d65d5d4 -address /run/containerd/containerd.sock
root 2090 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id ebe6394198639bfe1e9a09e5e72bf4fc6f55fb1c1e617cdda5409a7d35941010 -address /run/containerd/containerd.sock
root 2637 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 68611b98a5fa4e19a18494898d084b1c025ff94bf840ffc035dd00694bb3fd17 -address /run/containerd/containerd.sock
root 2792 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 5aba84afafa0e41a93034903d079aaa5ba730b7b134fff3a1fd533e4db85f28b -address /run/containerd/containerd.sock
root 2957 1 0 02:29 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id ced3e51f8774077aaaf52bf6e381f0e1d21be585cc1f2f1abd285a1588da638b -address /run/containerd/containerd.sock
我们会发现一个奇怪的现象,containerd 进程是由 PID 1 号进程 systemd 托管,所以 containerd 进程的父进程 ID(PPID)毫无疑问就是 1;而由 containerd 拉起的 containerd-shim 进程的 PPID 也是 1,但实际上 containerd-shim 并非由 systemd 托管。
这一定是有意为之,containerd-shim 进程与 containerd 进程彻底脱离关系,containerd 进程即使崩溃重启也不会对 containerd-shim 进程造成任何影响,而 Kubernetes 集群中的各个容器进程正是由 containerd-shim 拉起的。
我们使用 pstree
观察系统的进程树:
$ pstree
systemd─┬─NetworkManager─┬─dhclient
│ └─2*[{NetworkManager}]
├─agetty
├─auditd───{auditd}
├─chronyd
├─containerd───24*[{containerd}]
├─containerd-shim─┬─etcd───14*[{etcd}]
│ ├─pause
│ └─13*[{containerd-shim}]
├─containerd-shim─┬─kube-apiserver───9*[{kube-apiserver}]
│ ├─pause
│ └─12*[{containerd-shim}]
├─containerd-shim─┬─kube-controller───7*[{kube-controller}]
│ ├─pause
│ └─12*[{containerd-shim}]
├─containerd-shim─┬─kube-scheduler───8*[{kube-scheduler}]
│ ├─pause
│ └─13*[{containerd-shim}]
├─containerd-shim─┬─kube-proxy───7*[{kube-proxy}]
│ ├─pause
│ └─13*[{containerd-shim}]
├─containerd-shim─┬─flanneld───9*[{flanneld}]
│ ├─pause
│ └─14*[{containerd-shim}]
├─containerd-shim─┬─coredns───9*[{coredns}]
│ ├─pause
│ └─13*[{containerd-shim}]
├─containerd-shim─┬─coredns───9*[{coredns}]
│ ├─pause
│ └─14*[{containerd-shim}]
├─containerd-shim─┬─pause
│ └─14*[{containerd-shim}]
├─crond
我们尝试干掉 etcd 容器的父进程 containerd-shim:
$ pstree -p 1890 -s
systemd(1)───containerd-shim(1622)───etcd(1890)
$ kill -9 1622
$ ps -ef | grep etcd
root 10286 10221 17 03:03 ? 00:00:00 etcd --advertise-client-urls=https://10.211.55.13:2379 --cert-file=/etc/kubernetes/pki/etcd/server.crt --client-cert-auth=true --data-dir=/var/lib/etcd --initial-advertise-peer-urls=https://10.211.55.13:2380 --initial-cluster=k8s-test20=https://10.211.55.13:2380 --key-file=/etc/kubernetes/pki/etcd/server.key --listen-client-urls=https://127.0.0.1:2379,https://10.211.55.13:2379 --listen-metrics-urls=http://127.0.0.1:2381 --listen-peer-urls=https://10.211.55.13:2380 --name=k8s-test20 --peer-cert-file=/etc/kubernetes/pki/etcd/peer.crt --peer-client-cert-auth=true --peer-key-file=/etc/kubernetes/pki/etcd/peer.key --peer-trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt --snapshot-count=10000 --trusted-ca-file=/etc/kubernetes/pki/etcd/ca.crt
etcd 进程的 PID 变为 10286,其父进程 containerd-shim 的 PID 也变为 10221,很明显两者都重启过了。
$ pstree -p 10221 -s
systemd(1)───containerd-shim(10221)─┬─etcd(10286)
那么 Containerd 是如何做到 fork 出 containerd-shim 进程后保留其 PPID 的呢?
fork
在 Linux/Unix 中,新进程是由父进程通过 fork 系统调用创建出来的。
这里教大家一个小技巧,在 github[2] 上浏览代码时,只要在域名中补上 1s,即 github1s.com,就可以打开一个 web 版的 VS Code 编辑器,非常好用。
最终落实下来是一个名为 do_fork
的函数 https://github.com/torvalds/linux/blob/v3.10/kernel/fork.c#L1557-L1636:
long do_fork(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *parent_tidptr,
int __user *child_tidptr)
{
struct task_struct *p;
int trace = 0;
long nr;
/*
* Do some preliminary argument and permissions checking before we
* actually start allocating stuff
*/
if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) {
if (clone_flags & (CLONE_THREAD|CLONE_PARENT))
return -EINVAL;
}
// a lot of code here
p = copy_process(clone_flags, stack_start, stack_size,
child_tidptr, NULL, trace);
}
fork 系统调用会通过 copy_process
函数复制进程结构,第一个参数 clone_flags
标记子进程从父进程中需要继承的资源清单。
再找同一文件下 copy_process
函数的定义 https://github.com/torvalds/linux/blob/v3.10/kernel/fork.c#L1124-L1533:
static struct task_struct *copy_process(unsigned long clone_flags,
unsigned long stack_start,
unsigned long stack_size,
int __user *child_tidptr,
struct pid *pid,
int trace)
{
int retval;
struct task_struct *p;
// a lot of code here
/* CLONE_PARENT re-uses the old parent */
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
}
}
clone_flags
参数只要传入 CLONE_PARENT
即可在复制进程结构时保留原先的父进程信息。
Containerd 是 Golang 编写的程序,又是如何实现的呢?
https://github.com/containerd/containerd/blob/v1.4.3/runtime/v2/runc/v2/service.go#L134-L159
func newCommand(ctx context.Context, id, containerdBinary, containerdAddress, containerdTTRPCAddress string) (*exec.Cmd, error) {
ns, err := namespaces.NamespaceRequired(ctx)
if err != nil {
return nil, err
}
self, err := os.Executable()
if err != nil {
return nil, err
}
cwd, err := os.Getwd()
if err != nil {
return nil, err
}
args := []string{
"-namespace", ns,
"-id", id,
"-address", containerdAddress,
}
cmd := exec.Command(self, args...)
cmd.Dir = cwd
cmd.Env = append(os.Environ(), "GOMAXPROCS=4")
cmd.SysProcAttr = &syscall.SysProcAttr{
Setpgid: true,
}
return cmd, nil
}
结合上面 ps
命令的输出,containerd-shim 进程启动时确实带上了 namespace
、id
、address
这几个参数,但可执行二进制文件却是通过 os.Executable()
得到的,并不是通过变量传递来的,我们已经知道了这个执行文件就是 /usr/bin/containerd-shim-runc-v2。那就说明 containerd-shim 进程也是由一个 containerd-shim 父进程拉起来的。
为了证实这个猜想,我利用 execsnoop[3] 这个脚本来监控一把进程的 exec() 行为。
首先我们要关掉 Kubelet 的开机自启 systemctl disable kubelet
来手动控制容器们的启动时机,重启。
开启两个终端,一个运行 execsnoop 实时监控,在另一个终端中将 kubelet 启动:
$ ./execsnoop -a 11 -r -t
62.027700 1687 1139 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd -id f3ee5e89639f483ac05ecbf556800999de36d2c25fed9fe217f06e30c45f1513 start
62.028335 1688 1118 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd -id 66cbc3f2e8a67d59177959801cd6b9b3c76cb27833068c426e14dee5667b20d3 start
62.061869 1698 1693 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 66cbc3f2e8a67d59177959801cd6b9b3c76cb27833068c426e14dee5667b20d3 -address /run/containerd/containerd.sock
62.066636 1706 1700 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id f3ee5e89639f483ac05ecbf556800999de36d2c25fed9fe217f06e30c45f1513 -address /run/containerd/containerd.sock
62.066859 1708 1140 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd -id dc847ec919c40b53e8c64d03646fcc1a31dd6ef58ea9b50364f6852c01eb1b42 start
62.072287 1727 1128 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd -id b00ccf06ecec5b31a1b1d238cb9fd70b8da9d7fc174a3c55e20f8766b08c4b9d start
62.072586 1730 1723 runc --root /run/containerd/runc/k8s.io --log /run/containerd/io.containerd.runtime.v2.task/k8s.io/f3ee5e89639f483ac05ecbf556800999de36d2c25fed9fe217f06e30c45f1513/log.json --log-format json create --bundle /run/containerd/io.containerd.runtime.v2.task/k8s.io/f3ee5e89639f483ac05ecbf556800999de36d2c25fed9fe217f06e30c45f1513 --pid-file /run/containerd/io.containerd.runtime.v2.task/k8s.io/f3ee5e89639f483ac05ecbf556800999de36d2c25fed9fe217f06e30c45f1513/init.pid [...]
$ ps -ef | grep containerd
root 1698 1 0 06:59 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 66cbc3f2e8a67d59177959801cd6b9b3c76cb27833068c426e14dee5667b20d3 -address /run/containerd/containerd.sock
root 1706 1 0 06:59 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id f3ee5e89639f483ac05ecbf556800999de36d2c25fed9fe217f06e30c45f1513 -address /run/containerd/containerd.sock
root 1737 1 0 06:59 ? 00:00:00 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id
我们看到
62.028335 1688 1118 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd -id 66cbc3f2e8a67d59177959801cd6b9b3c76cb27833068c426e14dee5667b20d3 start
62.061869 1698 1693 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 66cbc3f2e8a67d59177959801cd6b9b3c76cb27833068c426e14dee5667b20d3 -address /run/containerd/containerd.sock
结合源码 https://github.com/containerd/containerd/blob/v1.4.3/runtime/v2/shim/shim.go#L221-L229
func run(id string, initFunc Init, config Config) error {
// a lot of code here
switch action {
case "start":
address, err := service.StartShim(ctx, idFlag, containerdBinaryFlag, addressFlag, ttrpcAddress)
if err != nil {
return err
}
if _, err := os.Stdout.WriteString(address); err != nil {
return err
}
return nil
}
}
还有 StartShim
方法 https://github.com/containerd/containerd/blob/v1.4.3/runtime/v2/runc/v2/service.go#L174-L286
func (s *service) StartShim(ctx context.Context, id, containerdBinary, containerdAddress, containerdTTRPCAddress string) (_ string, retErr error) {
cmd, err := newCommand(ctx, id, containerdBinary, containerdAddress, containerdTTRPCAddress)
if err != nil {
return "", err
}
// a lot of code here
}
证实了我的猜想,containerd-shim 进程都是由一个 containerd-shim 父进程通过 start
子命令启动的。那就回到原来的问题了,containerd-shim 是如何将 PPID 设置为 1 的,毕竟 execsnoop 显示该进程实际的 PPID 是 1693。
我们来看一下源码 https://github.com/containerd/containerd/blob/v1.4.3/runtime/v2/runc/v2/service.go#L230-L233:
if err := cmd.Start(); err != nil {
f.Close()
return "", err
}
containerd-shim 进程是通过 os/exec 包中的 Start
方法启动的 https://github.com/golang/go/blob/master/src/os/exec/exec.go#L370-L458:
// Start starts the specified command but does not wait for it to complete.
//
// If Start returns successfully, the c.Process field will be set.
//
// The Wait method will return the exit code and release associated resources
// once the command exits.
func (c *Cmd) Start() error {
if c.lookPathErr != nil {
c.closeDescriptors(c.closeAfterStart)
c.closeDescriptors(c.closeAfterWait)
return c.lookPathErr
}
// a lot of code here
c.Process, err = os.StartProcess(c.Path, c.argv(), &os.ProcAttr{
Dir: c.Dir,
Files: c.childFiles,
Env: addCriticalEnv(dedupEnv(envv)),
Sys: c.SysProcAttr,
})
if err != nil {
c.closeDescriptors(c.closeAfterStart)
c.closeDescriptors(c.closeAfterWait)
return err
}
}
再跳到 os 包的 StartProcess
函数 https://github.com/golang/go/blob/master/src/os/exec_posix.go:
// StartProcess starts a new process with the program, arguments and attributes
// specified by name, argv and attr. The argv slice will become os.Args in the
// new process, so it normally starts with the program name.
//
// If the calling goroutine has locked the operating system thread
// with runtime.LockOSThread and modified any inheritable OS-level
// thread state (for example, Linux or Plan 9 name spaces), the new
// process will inherit the caller's thread state.
//
// StartProcess is a low-level interface. The os/exec package provides
// higher-level interfaces.
//
// If there is an error, it will be of type *PathError.
func StartProcess(name string, argv []string, attr *ProcAttr) (*Process, error) {
return startProcess(name, argv, attr)
}
func startProcess(name string, argv []string, attr *ProcAttr) (p *Process, err error) {
// a lot of code here
pid, h, e := syscall.StartProcess(name, argv, sysattr)
// Make sure we don't run the finalizers of attr.Files.
runtime.KeepAlive(attr)
if e != nil {
return nil, &PathError{"fork/exec", name, e}
}
return newProcess(pid, h), nil
}
再跳到 syscall 包的 StartProcess
函数 https://github.com/golang/go/blob/master/src/syscall/exec_unix.go
// StartProcess wraps ForkExec for package os.
func StartProcess(argv0 string, argv []string, attr *ProcAttr) (pid int, handle uintptr, err error) {
pid, err = forkExec(argv0, argv, attr)
}
func forkExec(argv0 string, argv []string, attr *ProcAttr) (pid int, err error) {
// a lot of code here
// Convert args to C form.
argv0p, err := BytePtrFromString(argv0)
if err != nil {
return 0, err
}
argvp, err := SlicePtrFromStrings(argv)
if err != nil {
return 0, err
}
envvp, err := SlicePtrFromStrings(attr.Env)
if err != nil {
return 0, err
}
if (runtime.GOOS == "freebsd" || runtime.GOOS == "dragonfly") && len(argv[0]) > len(argv0) {
argvp[0] = argv0p
}
// a lot of code here
// Acquire the fork lock so that no other threads
// create new fds that are not yet close-on-exec
// before we fork.
ForkLock.Lock()
// Allocate child status pipe close on exec.
if err = forkExecPipe(p[:]); err != nil {
goto error
}
// Kick off child.
pid, err1 = forkAndExecInChild(argv0p, argvp, envvp, chroot, dir, attr, sys, p[1])
if err1 != 0 {
err = Errno(err1)
goto error
}
ForkLock.Unlock()
// a lot of code here
}
因为 Containerd 运行在 Linux 系统,所以 forkAndExecInChild
函数要看 Linux 的那份 https://github.com/golang/go/blob/master/src/syscall/exec_linux.go
// Fork, dup fd onto 0..len(fd), and exec(argv0, argvv, envv) in child.
// If a dup or exec fails, write the errno error to pipe.
// (Pipe is close-on-exec so if exec succeeds, it will be closed.)
// In the child, this function must not acquire any locks, because
// they might have been locked at the time of the fork. This means
// no rescheduling, no malloc calls, and no new stack segments.
// For the same reason compiler does not race instrument it.
// The calls to RawSyscall are okay because they are assembly
// functions that do not grow the stack.
//go:norace
func forkAndExecInChild(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (pid int, err Errno) {
// Set up and fork. This returns immediately in the parent or
// if there's an error.
r1, err1, p, locked := forkAndExecInChild1(argv0, argv, envv, chroot, dir, attr, sys, pipe)
if locked {
runtime_AfterFork()
}
if err1 != 0 {
return 0, err1
}
// parent; return PID
pid = int(r1)
if sys.UidMappings != nil || sys.GidMappings != nil {
Close(p[0])
var err2 Errno
// uid/gid mappings will be written after fork and unshare(2) for user
// namespaces.
if sys.Unshareflags&CLONE_NEWUSER == 0 {
if err := writeUidGidMappings(pid, sys); err != nil {
err2 = err.(Errno)
}
}
RawSyscall(SYS_WRITE, uintptr(p[1]), uintptr(unsafe.Pointer(&err2)), unsafe.Sizeof(err2))
Close(p[1])
}
return pid, 0
}
// forkAndExecInChild1 implements the body of forkAndExecInChild up to
// the parent's post-fork path. This is a separate function so we can
// separate the child's and parent's stack frames if we're using
// vfork.
//
// This is go:noinline because the point is to keep the stack frames
// of this and forkAndExecInChild separate.
//
//go:noinline
//go:norace
func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, attr *ProcAttr, sys *SysProcAttr, pipe int) (r1 uintptr, err1 Errno, p [2]int, locked bool) {
// a lot of code here
// Time to exec.
_, _, err1 = RawSyscall(SYS_EXECVE,
uintptr(unsafe.Pointer(argv0)),
uintptr(unsafe.Pointer(&argv[0])),
uintptr(unsafe.Pointer(&envv[0])))
}
根据 https://github.com/golang/go/blob/master/src/syscall/zsysnum_linux_amd64.go#L68 在 Linux amd64 架构中 SYS_EXECVE
为 59,这与 Linux 系统调用表 sys_call_table[4] 是完全相同的。
再追下去就是汇编了。。。
// func RawSyscall(trap, a1, a2, a3 uintptr) (r1, r2, err uintptr)
TEXT ·RawSyscall(SB),NOSPLIT,$0-56
MOVQ a1+8(FP), DI
MOVQ a2+16(FP), SI
MOVQ a3+24(FP), DX
MOVQ $0, R10
MOVQ $0, R8
MOVQ $0, R9
MOVQ trap+0(FP), AX // syscall entry
SYSCALL
CMPQ AX, $0xfffffffffffff001
JLS ok1
MOVQ $-1, r1+32(FP)
MOVQ $0, r2+40(FP)
NEGQ AX
MOVQ AX, err+48(FP)
RET
ok1:
MOVQ AX, r1+32(FP)
MOVQ DX, r2+40(FP)
MOVQ $0, err+48(FP)
RET
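所以搞了半天,这里追到的系统调用是 execve 而不是 fork。。。execve 落实下来是一个名为 do_execve
的函数 https://github.com/torvalds/linux/blob/v3.10/fs/exec.c
需要注意的是,execve 本身并不会创建新进程,它只是把当前进程的程序映像替换成新的可执行文件,PID 和父子关系都保持不变;真正创建子进程的是 forkAndExecInChild1 在 execve 之前(上面被省略的代码里)发起的 clone 系统调用。由于 containerd 只在 SysProcAttr 中设置了 Setpgid,并没有传入 CLONE_PARENT,所以新进程的父进程就是调用者本身。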
62.028335 1688 1118 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -address /run/containerd/containerd.sock -publish-binary /usr/bin/containerd -id 66cbc3f2e8a67d59177959801cd6b9b3c76cb27833068c426e14dee5667b20d3 start
62.061869 1698 1693 /usr/bin/containerd-shim-runc-v2 -namespace k8s.io -id 66cbc3f2e8a67d59177959801cd6b9b3c76cb27833068c426e14dee5667b20d3 -address /run/containerd/containerd.sock
根据源码,1693 进程(也就是 1688 进程)在拉起 1698 之后很快就会结束,1698 也就成了孤儿进程;孤儿进程会被 init 进程,也就是 1 号进程(systemd)收养,这就是 containerd-shim 进程的 PPID 全都是 1 的原因。
引用链接
[1]Containerd: https://containerd.io/
[2]github: https://github.com
[3]execsnoop: https://github.com/brendangregg/perf-tools/blob/master/execsnoop
[4]sys_call_table: https://filippo.io/linux-syscall-table/
原文链接:https://blog.crazytaxii.com/posts/containerd_shim_ppid_confusion/
你可能还喜欢
点击下方图片即可阅读
云原生是一种信仰 🤘
关注公众号
后台回复◉k8s◉获取史上最方便快捷的 Kubernetes 高可用部署工具,只需一条命令,连 ssh 都不需要!
点击 "阅读原文" 获取更好的阅读体验!
发现朋友圈变“安静”了吗?