diff --git a/cgroup2.cc b/cgroup2.cc index 1902c5e..b013fa6 100644 --- a/cgroup2.cc +++ b/cgroup2.cc @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -39,9 +41,14 @@ namespace cgroup2 { +static bool addPidToProcList(const std::string &cgroup_path, pid_t pid); + static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) { return nsjconf->cgroupv2_mount + "/NSJAIL." + std::to_string(pid); } +static std::string getJailCgroupPath(nsjconf_t *nsjconf) { + return nsjconf->cgroupv2_mount + "/NSJAIL_SELF." + std::to_string(getpid()); +} static bool createCgroup(const std::string &cgroup_path, pid_t pid) { LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid); @@ -52,6 +59,39 @@ static bool createCgroup(const std::string &cgroup_path, pid_t pid) { return true; } +static bool moveSelfIntoChildCgroup(nsjconf_t *nsjconf) { + // Move ourselves into another group to avoid the 'No internal processes' rule + // https://unix.stackexchange.com/a/713343 + std::string jail_cgroup_path = getJailCgroupPath(nsjconf); + LOG_I("nsjail is moving itself to a new child cgroup: %s\n", jail_cgroup_path.c_str()); + RETURN_ON_FAILURE(createCgroup(jail_cgroup_path, getpid())); + RETURN_ON_FAILURE(addPidToProcList(jail_cgroup_path, 0)); + return true; +} + + +static bool enableCgroupSubtree(nsjconf_t *nsjconf, const std::string &controller, pid_t pid) { + std::string cgroup_path = nsjconf->cgroupv2_mount; + LOG_D("Enable cgroup.subtree_control +'%s' to '%s' for pid=%d", controller.c_str(), cgroup_path.c_str(), pid); + std::string val = "+" + controller; + + // Try once without moving the nsjail process and if that fails then try moving the nsjail process + // into a child cgroup before trying a second time. + if (util::writeBufToFile( + (cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY, false)) { + return true; + } + if (errno == EBUSY) { + RETURN_ON_FAILURE(moveSelfIntoChildCgroup(nsjconf)); + if (util::writeBufToFile( + (cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY)) { + return true; + } + } + LOG_E("Could not apply '%s' to cgroup.subtree_control in '%s'. If you are running in Docker, nsjail MUST be the root process to use cgroups.", val.c_str(), cgroup_path.c_str()); + return false; +} + static bool writeToCgroup( const std::string &cgroup_path, const std::string &resource, const std::string &value) { LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str()); @@ -83,6 +123,76 @@ static void removeCgroup(const std::string &cgroup_path) { } } +static bool needMemoryController(nsjconf_t *nsjconf) { + // Check if we need 'memory' + // This matches the check in initNsFromParentMem + ssize_t swap_max = nsjconf->cgroup_mem_swap_max; + if (nsjconf->cgroup_mem_memsw_max > (size_t)0) { + swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max; + } + if (nsjconf->cgroup_mem_max == (size_t)0 && swap_max < (ssize_t)0) { + return false; + } + return true; +} + +static bool needPidsController(nsjconf_t *nsjconf) { + return nsjconf->cgroup_pids_max != 0; +} + +static bool needCpuController(nsjconf_t *nsjconf) { + return nsjconf->cgroup_cpu_ms_per_sec != 0U; +} + +// We will use this buf to read from cgroup.subtree_control to see if +// the root cgroup has the necessary controllers listed +#define SUBTREE_CONTROL_BUF_LEN 0x40 + +bool setup(nsjconf_t *nsjconf) { + // Read from cgroup.subtree_control in the root to see if + // the controllers we need are there. + auto p = nsjconf->cgroupv2_mount + "/cgroup.subtree_control"; + char buf[SUBTREE_CONTROL_BUF_LEN]; + int read = util::readFromFile(p.c_str(), buf, SUBTREE_CONTROL_BUF_LEN-1); + if (read < 0) { + LOG_W("cgroupv2 setup: Could not read root subtree_control"); + return false; + } + buf[read] = 0; + + // Are the controllers we need there? + bool subtree_ok = (!needMemoryController(nsjconf) || strstr(buf, "memory")) && + (!needPidsController(nsjconf) || strstr(buf, "pids")) && + (!needCpuController(nsjconf) || strstr(buf, "cpu")); + if (!subtree_ok) { + // Now we can write to the root cgroup.subtree_control + if (needMemoryController(nsjconf)) { + RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "memory", getpid())); + } + + if (needPidsController(nsjconf)) { + RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "pids", getpid())); + } + + if (needCpuController(nsjconf)) { + RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "cpu", getpid())); + } + } + return true; +} + +bool detectCgroupv2(nsjconf_t *nsjconf) { + // Check cgroupv2_mount, if it is a cgroup2 mount, use it. + struct statfs buf; + if (statfs(nsjconf->cgroupv2_mount.c_str(), &buf)) { + LOG_D("statfs %s failed with %d", nsjconf->cgroupv2_mount.c_str(), errno); + nsjconf->use_cgroupv2 = false; + return false; + } + nsjconf->use_cgroupv2 = (buf.f_type == CGROUP2_SUPER_MAGIC); + return true; +} + static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) { ssize_t swap_max = nsjconf->cgroup_mem_swap_max; if (nsjconf->cgroup_mem_memsw_max > (size_t)0) { diff --git a/cgroup2.h b/cgroup2.h index 3e0cc71..bd86aae 100644 --- a/cgroup2.h +++ b/cgroup2.h @@ -32,6 +32,8 @@ namespace cgroup2 { bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid); bool initNs(void); void finishFromParent(nsjconf_t* nsjconf, pid_t pid); +bool setup(nsjconf_t *nsjconf); +bool detectCgroupv2(nsjconf_t *nsjconf); } // namespace cgroup2 diff --git a/cmdline.cc b/cmdline.cc index 5194055..acfc5d8 100644 --- a/cmdline.cc +++ b/cmdline.cc @@ -158,6 +158,7 @@ struct custom_option custom_opts[] = { { { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" }, { { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"}, { { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"}, + { { "detect_cgroupv2", no_argument, NULL, 0x0836}, "Use cgroupv2, if it is available. (Specify instead of use_cgroupv2)"}, { { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" }, { { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" }, { { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" }, @@ -473,6 +474,7 @@ std::unique_ptr parseArgs(int argc, char* argv[]) { nsjconf->cgroup_cpu_ms_per_sec = 0U; nsjconf->cgroupv2_mount = "/sys/fs/cgroup"; nsjconf->use_cgroupv2 = false; + nsjconf->detect_cgroupv2 = false; nsjconf->iface_lo = true; nsjconf->iface_vs_ip = "0.0.0.0"; nsjconf->iface_vs_nm = "255.255.255.0"; @@ -912,6 +914,9 @@ std::unique_ptr parseArgs(int argc, char* argv[]) { case 0x835: nsjconf->use_cgroupv2 = true; break; + case 0x836: + nsjconf->detect_cgroupv2 = true; + break; case 'P': nsjconf->kafel_file_path = optarg; break; diff --git a/config.cc b/config.cc index 195b21c..5242ba4 100644 --- a/config.cc +++ b/config.cc @@ -266,6 +266,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig& nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent(); nsjconf->cgroupv2_mount = njc.cgroupv2_mount(); nsjconf->use_cgroupv2 = njc.use_cgroupv2(); + nsjconf->detect_cgroupv2 = njc.detect_cgroupv2(); nsjconf->iface_lo = !(njc.iface_no_lo()); for (ssize_t i = 0; i < njc.iface_own().size(); i++) { diff --git a/config.proto b/config.proto index ab2f839..bdc688d 100644 --- a/config.proto +++ b/config.proto @@ -272,4 +272,7 @@ message NsJailConfig { /* Set this to true to forward fatal signals to the child process instead * of always using SIGKILL. */ optional bool forward_signals = 94 [default = false]; + + /* Check whether cgroupv2 is available, and use it if available. */ + optional bool detect_cgroupv2 = 95 [default = false]; } diff --git a/nsjail.cc b/nsjail.cc index fdb0d30..66f8897 100644 --- a/nsjail.cc +++ b/nsjail.cc @@ -46,6 +46,7 @@ #include "sandbox.h" #include "subproc.h" #include "util.h" +#include "cgroup2.h" namespace nsjail { @@ -342,6 +343,19 @@ int main(int argc, char* argv[]) { if (!nsjail::setTimer(nsjconf.get())) { LOG_F("nsjail::setTimer() failed"); } + + if (nsjconf->detect_cgroupv2) { + cgroup2::detectCgroupv2(nsjconf.get()); + LOG_I("Detected cgroups version: %d", nsjconf->use_cgroupv2 ? 2 : 1); + } + + if (nsjconf->use_cgroupv2) { + if (!cgroup2::setup(nsjconf.get())) { + LOG_E("Couldn't setup parent cgroup (cgroupv2)"); + return -1; + } + } + if (!sandbox::preparePolicy(nsjconf.get())) { LOG_F("Couldn't prepare sandboxing policy"); } diff --git a/nsjail.h b/nsjail.h index 346dc0b..38c9851 100644 --- a/nsjail.h +++ b/nsjail.h @@ -163,6 +163,7 @@ struct nsjconf_t { unsigned int cgroup_cpu_ms_per_sec; std::string cgroupv2_mount; bool use_cgroupv2; + bool detect_cgroupv2; std::string kafel_file_path; std::string kafel_string; struct sock_fprog seccomp_fprog; diff --git a/util.cc b/util.cc index ba167fa..8e1ef8b 100644 --- a/util.cc +++ b/util.cc @@ -89,16 +89,20 @@ bool writeToFd(int fd, const void* buf, size_t len) { return true; } -bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags) { +bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors) { int fd; TEMP_FAILURE_RETRY(fd = open(filename, open_flags, 0644)); if (fd == -1) { - PLOG_E("Couldn't open '%s' for writing", filename); + if (log_errors) { + PLOG_E("Couldn't open '%s' for writing", filename); + } return false; } if (!writeToFd(fd, buf, len)) { - PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd); + if (log_errors) { + PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd); + } close(fd); if (open_flags & O_CREAT) { unlink(filename); diff --git a/util.h b/util.h index d3be2e9..7aca782 100644 --- a/util.h +++ b/util.h @@ -46,7 +46,7 @@ namespace util { ssize_t readFromFd(int fd, void* buf, size_t len); ssize_t readFromFile(const char* fname, void* buf, size_t len); bool writeToFd(int fd, const void* buf, size_t len); -bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags); +bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors = true); bool createDirRecursively(const char* dir); std::string* StrAppend(std::string* str, const char* format, ...) __attribute__((format(printf, 2, 3)));