Setup cgroup.subtree_control controllers when necessary in cgroupsv2

This commit adds extra setup when cgroupsv2 is enabled. In particular,
we make sure that the root namespace has setup cgroup.subtree_control
with the controllers we need.

If the necessary controller are not listed, we have to move all
processes out of the root namespace before we can change this
(the 'no internal processes' rule:
https://unix.stackexchange.com/a/713343). Currently we only
handle the case where the nsjail process is the only process in
the cgroup. It seems like this would be relatively rare, but since
nsjail is frequently the root process in a Docker container (e.g.
for hosting CTF challenges), I think this case is common enough to
make it worth implementing.

This also adds `--detect_cgroupv2`, which will attempt to detect
whether `--cgroupv2_mount` is a valid cgroupv2 mount, and if so
it will set `use_cgroupv2`. This is useful in containerized
environments where you may not know the kernel version ahead of time.

References:
https://github.com/redpwn/jail/blob/master/internal/cgroup/cgroup2.go
This commit is contained in:
Andrew Haberlandt 2022-11-09 23:43:44 -05:00
parent 4567c78a27
commit 12df56b9f1
9 changed files with 144 additions and 4 deletions

View File

@ -28,6 +28,8 @@
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/vfs.h>
#include <linux/magic.h>
#include <unistd.h>
#include <fstream>
@ -39,9 +41,14 @@
namespace cgroup2 {
static bool addPidToProcList(const std::string &cgroup_path, pid_t pid);
static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
return nsjconf->cgroupv2_mount + "/NSJAIL." + std::to_string(pid);
}
static std::string getJailCgroupPath(nsjconf_t *nsjconf) {
return nsjconf->cgroupv2_mount + "/NSJAIL_SELF." + std::to_string(getpid());
}
static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid);
@ -52,6 +59,39 @@ static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
return true;
}
static bool moveSelfIntoChildCgroup(nsjconf_t *nsjconf) {
// Move ourselves into another group to avoid the 'No internal processes' rule
// https://unix.stackexchange.com/a/713343
std::string jail_cgroup_path = getJailCgroupPath(nsjconf);
LOG_I("nsjail is moving itself to a new child cgroup: %s\n", jail_cgroup_path.c_str());
RETURN_ON_FAILURE(createCgroup(jail_cgroup_path, getpid()));
RETURN_ON_FAILURE(addPidToProcList(jail_cgroup_path, 0));
return true;
}
static bool enableCgroupSubtree(nsjconf_t *nsjconf, const std::string &controller, pid_t pid) {
std::string cgroup_path = nsjconf->cgroupv2_mount;
LOG_D("Enable cgroup.subtree_control +'%s' to '%s' for pid=%d", controller.c_str(), cgroup_path.c_str(), pid);
std::string val = "+" + controller;
// Try once without moving the nsjail process and if that fails then try moving the nsjail process
// into a child cgroup before trying a second time.
if (util::writeBufToFile(
(cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY, false)) {
return true;
}
if (errno == EBUSY) {
RETURN_ON_FAILURE(moveSelfIntoChildCgroup(nsjconf));
if (util::writeBufToFile(
(cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY)) {
return true;
}
}
LOG_E("Could not apply '%s' to cgroup.subtree_control in '%s'. If you are running in Docker, nsjail MUST be the root process to use cgroups.", val.c_str(), cgroup_path.c_str());
return false;
}
static bool writeToCgroup(
const std::string &cgroup_path, const std::string &resource, const std::string &value) {
LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str());
@ -83,6 +123,76 @@ static void removeCgroup(const std::string &cgroup_path) {
}
}
static bool needMemoryController(nsjconf_t *nsjconf) {
// Check if we need 'memory'
// This matches the check in initNsFromParentMem
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {
swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max;
}
if (nsjconf->cgroup_mem_max == (size_t)0 && swap_max < (ssize_t)0) {
return false;
}
return true;
}
static bool needPidsController(nsjconf_t *nsjconf) {
return nsjconf->cgroup_pids_max != 0;
}
static bool needCpuController(nsjconf_t *nsjconf) {
return nsjconf->cgroup_cpu_ms_per_sec != 0U;
}
// We will use this buf to read from cgroup.subtree_control to see if
// the root cgroup has the necessary controllers listed
#define SUBTREE_CONTROL_BUF_LEN 0x40
bool setup(nsjconf_t *nsjconf) {
// Read from cgroup.subtree_control in the root to see if
// the controllers we need are there.
auto p = nsjconf->cgroupv2_mount + "/cgroup.subtree_control";
char buf[SUBTREE_CONTROL_BUF_LEN];
int read = util::readFromFile(p.c_str(), buf, SUBTREE_CONTROL_BUF_LEN-1);
if (read < 0) {
LOG_W("cgroupv2 setup: Could not read root subtree_control");
return false;
}
buf[read] = 0;
// Are the controllers we need there?
bool subtree_ok = (!needMemoryController(nsjconf) || strstr(buf, "memory")) &&
(!needPidsController(nsjconf) || strstr(buf, "pids")) &&
(!needCpuController(nsjconf) || strstr(buf, "cpu"));
if (!subtree_ok) {
// Now we can write to the root cgroup.subtree_control
if (needMemoryController(nsjconf)) {
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "memory", getpid()));
}
if (needPidsController(nsjconf)) {
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "pids", getpid()));
}
if (needCpuController(nsjconf)) {
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "cpu", getpid()));
}
}
return true;
}
bool detectCgroupv2(nsjconf_t *nsjconf) {
// Check cgroupv2_mount, if it is a cgroup2 mount, use it.
struct statfs buf;
if (statfs(nsjconf->cgroupv2_mount.c_str(), &buf)) {
LOG_D("statfs %s failed with %d", nsjconf->cgroupv2_mount.c_str(), errno);
nsjconf->use_cgroupv2 = false;
return false;
}
nsjconf->use_cgroupv2 = (buf.f_type == CGROUP2_SUPER_MAGIC);
return true;
}
static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {

View File

@ -32,6 +32,8 @@ namespace cgroup2 {
bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid);
bool initNs(void);
void finishFromParent(nsjconf_t* nsjconf, pid_t pid);
bool setup(nsjconf_t *nsjconf);
bool detectCgroupv2(nsjconf_t *nsjconf);
} // namespace cgroup2

View File

@ -158,6 +158,7 @@ struct custom_option custom_opts[] = {
{ { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" },
{ { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"},
{ { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"},
{ { "detect_cgroupv2", no_argument, NULL, 0x0836}, "Use cgroupv2, if it is available. (Specify instead of use_cgroupv2)"},
{ { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" },
{ { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" },
{ { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" },
@ -473,6 +474,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->cgroup_cpu_ms_per_sec = 0U;
nsjconf->cgroupv2_mount = "/sys/fs/cgroup";
nsjconf->use_cgroupv2 = false;
nsjconf->detect_cgroupv2 = false;
nsjconf->iface_lo = true;
nsjconf->iface_vs_ip = "0.0.0.0";
nsjconf->iface_vs_nm = "255.255.255.0";
@ -912,6 +914,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x835:
nsjconf->use_cgroupv2 = true;
break;
case 0x836:
nsjconf->detect_cgroupv2 = true;
break;
case 'P':
nsjconf->kafel_file_path = optarg;
break;

View File

@ -266,6 +266,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent();
nsjconf->cgroupv2_mount = njc.cgroupv2_mount();
nsjconf->use_cgroupv2 = njc.use_cgroupv2();
nsjconf->detect_cgroupv2 = njc.detect_cgroupv2();
nsjconf->iface_lo = !(njc.iface_no_lo());
for (ssize_t i = 0; i < njc.iface_own().size(); i++) {

View File

@ -272,4 +272,7 @@ message NsJailConfig {
/* Set this to true to forward fatal signals to the child process instead
* of always using SIGKILL. */
optional bool forward_signals = 94 [default = false];
/* Check whether cgroupv2 is available, and use it if available. */
optional bool detect_cgroupv2 = 95 [default = false];
}

View File

@ -46,6 +46,7 @@
#include "sandbox.h"
#include "subproc.h"
#include "util.h"
#include "cgroup2.h"
namespace nsjail {
@ -342,6 +343,19 @@ int main(int argc, char* argv[]) {
if (!nsjail::setTimer(nsjconf.get())) {
LOG_F("nsjail::setTimer() failed");
}
if (nsjconf->detect_cgroupv2) {
cgroup2::detectCgroupv2(nsjconf.get());
LOG_I("Detected cgroups version: %d", nsjconf->use_cgroupv2 ? 2 : 1);
}
if (nsjconf->use_cgroupv2) {
if (!cgroup2::setup(nsjconf.get())) {
LOG_E("Couldn't setup parent cgroup (cgroupv2)");
return -1;
}
}
if (!sandbox::preparePolicy(nsjconf.get())) {
LOG_F("Couldn't prepare sandboxing policy");
}

View File

@ -163,6 +163,7 @@ struct nsjconf_t {
unsigned int cgroup_cpu_ms_per_sec;
std::string cgroupv2_mount;
bool use_cgroupv2;
bool detect_cgroupv2;
std::string kafel_file_path;
std::string kafel_string;
struct sock_fprog seccomp_fprog;

10
util.cc
View File

@ -89,16 +89,20 @@ bool writeToFd(int fd, const void* buf, size_t len) {
return true;
}
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags) {
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors) {
int fd;
TEMP_FAILURE_RETRY(fd = open(filename, open_flags, 0644));
if (fd == -1) {
PLOG_E("Couldn't open '%s' for writing", filename);
if (log_errors) {
PLOG_E("Couldn't open '%s' for writing", filename);
}
return false;
}
if (!writeToFd(fd, buf, len)) {
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
if (log_errors) {
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
}
close(fd);
if (open_flags & O_CREAT) {
unlink(filename);

2
util.h
View File

@ -46,7 +46,7 @@ namespace util {
ssize_t readFromFd(int fd, void* buf, size_t len);
ssize_t readFromFile(const char* fname, void* buf, size_t len);
bool writeToFd(int fd, const void* buf, size_t len);
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags);
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors = true);
bool createDirRecursively(const char* dir);
std::string* StrAppend(std::string* str, const char* format, ...)
__attribute__((format(printf, 2, 3)));