From 12df56b9f13fe64ff1e9272dada4398a1e3d7146 Mon Sep 17 00:00:00 2001 From: Andrew Haberlandt Date: Wed, 9 Nov 2022 23:43:44 -0500 Subject: [PATCH] Setup cgroup.subtree_control controllers when necessary in cgroupsv2 This commit adds extra setup when cgroupsv2 is enabled. In particular, we make sure that the root namespace has setup cgroup.subtree_control with the controllers we need. If the necessary controller are not listed, we have to move all processes out of the root namespace before we can change this (the 'no internal processes' rule: https://unix.stackexchange.com/a/713343). Currently we only handle the case where the nsjail process is the only process in the cgroup. It seems like this would be relatively rare, but since nsjail is frequently the root process in a Docker container (e.g. for hosting CTF challenges), I think this case is common enough to make it worth implementing. This also adds `--detect_cgroupv2`, which will attempt to detect whether `--cgroupv2_mount` is a valid cgroupv2 mount, and if so it will set `use_cgroupv2`. This is useful in containerized environments where you may not know the kernel version ahead of time. References: https://github.com/redpwn/jail/blob/master/internal/cgroup/cgroup2.go --- cgroup2.cc | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++ cgroup2.h | 2 + cmdline.cc | 5 +++ config.cc | 1 + config.proto | 3 ++ nsjail.cc | 14 +++++++ nsjail.h | 1 + util.cc | 10 +++-- util.h | 2 +- 9 files changed, 144 insertions(+), 4 deletions(-) diff --git a/cgroup2.cc b/cgroup2.cc index 1902c5e..b013fa6 100644 --- a/cgroup2.cc +++ b/cgroup2.cc @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -39,9 +41,14 @@ namespace cgroup2 { +static bool addPidToProcList(const std::string &cgroup_path, pid_t pid); + static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) { return nsjconf->cgroupv2_mount + "/NSJAIL." + std::to_string(pid); } +static std::string getJailCgroupPath(nsjconf_t *nsjconf) { + return nsjconf->cgroupv2_mount + "/NSJAIL_SELF." + std::to_string(getpid()); +} static bool createCgroup(const std::string &cgroup_path, pid_t pid) { LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid); @@ -52,6 +59,39 @@ static bool createCgroup(const std::string &cgroup_path, pid_t pid) { return true; } +static bool moveSelfIntoChildCgroup(nsjconf_t *nsjconf) { + // Move ourselves into another group to avoid the 'No internal processes' rule + // https://unix.stackexchange.com/a/713343 + std::string jail_cgroup_path = getJailCgroupPath(nsjconf); + LOG_I("nsjail is moving itself to a new child cgroup: %s\n", jail_cgroup_path.c_str()); + RETURN_ON_FAILURE(createCgroup(jail_cgroup_path, getpid())); + RETURN_ON_FAILURE(addPidToProcList(jail_cgroup_path, 0)); + return true; +} + + +static bool enableCgroupSubtree(nsjconf_t *nsjconf, const std::string &controller, pid_t pid) { + std::string cgroup_path = nsjconf->cgroupv2_mount; + LOG_D("Enable cgroup.subtree_control +'%s' to '%s' for pid=%d", controller.c_str(), cgroup_path.c_str(), pid); + std::string val = "+" + controller; + + // Try once without moving the nsjail process and if that fails then try moving the nsjail process + // into a child cgroup before trying a second time. + if (util::writeBufToFile( + (cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY, false)) { + return true; + } + if (errno == EBUSY) { + RETURN_ON_FAILURE(moveSelfIntoChildCgroup(nsjconf)); + if (util::writeBufToFile( + (cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY)) { + return true; + } + } + LOG_E("Could not apply '%s' to cgroup.subtree_control in '%s'. If you are running in Docker, nsjail MUST be the root process to use cgroups.", val.c_str(), cgroup_path.c_str()); + return false; +} + static bool writeToCgroup( const std::string &cgroup_path, const std::string &resource, const std::string &value) { LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str()); @@ -83,6 +123,76 @@ static void removeCgroup(const std::string &cgroup_path) { } } +static bool needMemoryController(nsjconf_t *nsjconf) { + // Check if we need 'memory' + // This matches the check in initNsFromParentMem + ssize_t swap_max = nsjconf->cgroup_mem_swap_max; + if (nsjconf->cgroup_mem_memsw_max > (size_t)0) { + swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max; + } + if (nsjconf->cgroup_mem_max == (size_t)0 && swap_max < (ssize_t)0) { + return false; + } + return true; +} + +static bool needPidsController(nsjconf_t *nsjconf) { + return nsjconf->cgroup_pids_max != 0; +} + +static bool needCpuController(nsjconf_t *nsjconf) { + return nsjconf->cgroup_cpu_ms_per_sec != 0U; +} + +// We will use this buf to read from cgroup.subtree_control to see if +// the root cgroup has the necessary controllers listed +#define SUBTREE_CONTROL_BUF_LEN 0x40 + +bool setup(nsjconf_t *nsjconf) { + // Read from cgroup.subtree_control in the root to see if + // the controllers we need are there. + auto p = nsjconf->cgroupv2_mount + "/cgroup.subtree_control"; + char buf[SUBTREE_CONTROL_BUF_LEN]; + int read = util::readFromFile(p.c_str(), buf, SUBTREE_CONTROL_BUF_LEN-1); + if (read < 0) { + LOG_W("cgroupv2 setup: Could not read root subtree_control"); + return false; + } + buf[read] = 0; + + // Are the controllers we need there? + bool subtree_ok = (!needMemoryController(nsjconf) || strstr(buf, "memory")) && + (!needPidsController(nsjconf) || strstr(buf, "pids")) && + (!needCpuController(nsjconf) || strstr(buf, "cpu")); + if (!subtree_ok) { + // Now we can write to the root cgroup.subtree_control + if (needMemoryController(nsjconf)) { + RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "memory", getpid())); + } + + if (needPidsController(nsjconf)) { + RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "pids", getpid())); + } + + if (needCpuController(nsjconf)) { + RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "cpu", getpid())); + } + } + return true; +} + +bool detectCgroupv2(nsjconf_t *nsjconf) { + // Check cgroupv2_mount, if it is a cgroup2 mount, use it. + struct statfs buf; + if (statfs(nsjconf->cgroupv2_mount.c_str(), &buf)) { + LOG_D("statfs %s failed with %d", nsjconf->cgroupv2_mount.c_str(), errno); + nsjconf->use_cgroupv2 = false; + return false; + } + nsjconf->use_cgroupv2 = (buf.f_type == CGROUP2_SUPER_MAGIC); + return true; +} + static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) { ssize_t swap_max = nsjconf->cgroup_mem_swap_max; if (nsjconf->cgroup_mem_memsw_max > (size_t)0) { diff --git a/cgroup2.h b/cgroup2.h index 3e0cc71..bd86aae 100644 --- a/cgroup2.h +++ b/cgroup2.h @@ -32,6 +32,8 @@ namespace cgroup2 { bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid); bool initNs(void); void finishFromParent(nsjconf_t* nsjconf, pid_t pid); +bool setup(nsjconf_t *nsjconf); +bool detectCgroupv2(nsjconf_t *nsjconf); } // namespace cgroup2 diff --git a/cmdline.cc b/cmdline.cc index 5194055..acfc5d8 100644 --- a/cmdline.cc +++ b/cmdline.cc @@ -158,6 +158,7 @@ struct custom_option custom_opts[] = { { { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" }, { { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"}, { { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"}, + { { "detect_cgroupv2", no_argument, NULL, 0x0836}, "Use cgroupv2, if it is available. (Specify instead of use_cgroupv2)"}, { { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" }, { { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" }, { { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" }, @@ -473,6 +474,7 @@ std::unique_ptr parseArgs(int argc, char* argv[]) { nsjconf->cgroup_cpu_ms_per_sec = 0U; nsjconf->cgroupv2_mount = "/sys/fs/cgroup"; nsjconf->use_cgroupv2 = false; + nsjconf->detect_cgroupv2 = false; nsjconf->iface_lo = true; nsjconf->iface_vs_ip = "0.0.0.0"; nsjconf->iface_vs_nm = "255.255.255.0"; @@ -912,6 +914,9 @@ std::unique_ptr parseArgs(int argc, char* argv[]) { case 0x835: nsjconf->use_cgroupv2 = true; break; + case 0x836: + nsjconf->detect_cgroupv2 = true; + break; case 'P': nsjconf->kafel_file_path = optarg; break; diff --git a/config.cc b/config.cc index 195b21c..5242ba4 100644 --- a/config.cc +++ b/config.cc @@ -266,6 +266,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent(); nsjconf->cgroupv2_mount = njc.cgroupv2_mount(); nsjconf->use_cgroupv2 = njc.use_cgroupv2(); + nsjconf->detect_cgroupv2 = njc.detect_cgroupv2(); nsjconf->iface_lo = !(njc.iface_no_lo()); for (ssize_t i = 0; i < njc.iface_own().size(); i++) { diff --git a/config.proto b/config.proto index ab2f839..bdc688d 100644 --- a/config.proto +++ b/config.proto @@ -272,4 +272,7 @@ message NsJailConfig { /* Set this to true to forward fatal signals to the child process instead * of always using SIGKILL. */ optional bool forward_signals = 94 [default = false]; + + /* Check whether cgroupv2 is available, and use it if available. */ + optional bool detect_cgroupv2 = 95 [default = false]; } diff --git a/nsjail.cc b/nsjail.cc index fdb0d30..66f8897 100644 --- a/nsjail.cc +++ b/nsjail.cc @@ -46,6 +46,7 @@ #include "sandbox.h" #include "subproc.h" #include "util.h" +#include "cgroup2.h" namespace nsjail { @@ -342,6 +343,19 @@ int main(int argc, char* argv[]) { if (!nsjail::setTimer(nsjconf.get())) { LOG_F("nsjail::setTimer() failed"); } + + if (nsjconf->detect_cgroupv2) { + cgroup2::detectCgroupv2(nsjconf.get()); + LOG_I("Detected cgroups version: %d", nsjconf->use_cgroupv2 ? 2 : 1); + } + + if (nsjconf->use_cgroupv2) { + if (!cgroup2::setup(nsjconf.get())) { + LOG_E("Couldn't setup parent cgroup (cgroupv2)"); + return -1; + } + } + if (!sandbox::preparePolicy(nsjconf.get())) { LOG_F("Couldn't prepare sandboxing policy"); } diff --git a/nsjail.h b/nsjail.h index 346dc0b..38c9851 100644 --- a/nsjail.h +++ b/nsjail.h @@ -163,6 +163,7 @@ struct nsjconf_t { unsigned int cgroup_cpu_ms_per_sec; std::string cgroupv2_mount; bool use_cgroupv2; + bool detect_cgroupv2; std::string kafel_file_path; std::string kafel_string; struct sock_fprog seccomp_fprog; diff --git a/util.cc b/util.cc index ba167fa..8e1ef8b 100644 --- a/util.cc +++ b/util.cc @@ -89,16 +89,20 @@ bool writeToFd(int fd, const void* buf, size_t len) { return true; } -bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags) { +bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors) { int fd; TEMP_FAILURE_RETRY(fd = open(filename, open_flags, 0644)); if (fd == -1) { - PLOG_E("Couldn't open '%s' for writing", filename); + if (log_errors) { + PLOG_E("Couldn't open '%s' for writing", filename); + } return false; } if (!writeToFd(fd, buf, len)) { - PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd); + if (log_errors) { + PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd); + } close(fd); if (open_flags & O_CREAT) { unlink(filename); diff --git a/util.h b/util.h index d3be2e9..7aca782 100644 --- a/util.h +++ b/util.h @@ -46,7 +46,7 @@ namespace util { ssize_t readFromFd(int fd, void* buf, size_t len); ssize_t readFromFile(const char* fname, void* buf, size_t len); bool writeToFd(int fd, const void* buf, size_t len); -bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags); +bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors = true); bool createDirRecursively(const char* dir); std::string* StrAppend(std::string* str, const char* format, ...) __attribute__((format(printf, 2, 3)));