Merge pull request #208 from ndrewh/cgroupsv2-fix

Setup cgroup.subtree_control controllers when necessary in cgroupsv2
This commit is contained in:
robertswiecki 2022-11-22 22:12:12 +01:00 committed by GitHub
commit 4437810830
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 144 additions and 4 deletions

View File

@ -28,6 +28,8 @@
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/vfs.h>
#include <linux/magic.h>
#include <unistd.h>
#include <fstream>
@ -39,9 +41,14 @@
namespace cgroup2 {
static bool addPidToProcList(const std::string &cgroup_path, pid_t pid);
static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
return nsjconf->cgroupv2_mount + "/NSJAIL." + std::to_string(pid);
}
static std::string getJailCgroupPath(nsjconf_t *nsjconf) {
return nsjconf->cgroupv2_mount + "/NSJAIL_SELF." + std::to_string(getpid());
}
static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid);
@ -52,6 +59,39 @@ static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
return true;
}
static bool moveSelfIntoChildCgroup(nsjconf_t *nsjconf) {
// Move ourselves into another group to avoid the 'No internal processes' rule
// https://unix.stackexchange.com/a/713343
std::string jail_cgroup_path = getJailCgroupPath(nsjconf);
LOG_I("nsjail is moving itself to a new child cgroup: %s\n", jail_cgroup_path.c_str());
RETURN_ON_FAILURE(createCgroup(jail_cgroup_path, getpid()));
RETURN_ON_FAILURE(addPidToProcList(jail_cgroup_path, 0));
return true;
}
static bool enableCgroupSubtree(nsjconf_t *nsjconf, const std::string &controller, pid_t pid) {
std::string cgroup_path = nsjconf->cgroupv2_mount;
LOG_D("Enable cgroup.subtree_control +'%s' to '%s' for pid=%d", controller.c_str(), cgroup_path.c_str(), pid);
std::string val = "+" + controller;
// Try once without moving the nsjail process and if that fails then try moving the nsjail process
// into a child cgroup before trying a second time.
if (util::writeBufToFile(
(cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY, false)) {
return true;
}
if (errno == EBUSY) {
RETURN_ON_FAILURE(moveSelfIntoChildCgroup(nsjconf));
if (util::writeBufToFile(
(cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY)) {
return true;
}
}
LOG_E("Could not apply '%s' to cgroup.subtree_control in '%s'. If you are running in Docker, nsjail MUST be the root process to use cgroups.", val.c_str(), cgroup_path.c_str());
return false;
}
static bool writeToCgroup(
const std::string &cgroup_path, const std::string &resource, const std::string &value) {
LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str());
@ -83,6 +123,76 @@ static void removeCgroup(const std::string &cgroup_path) {
}
}
static bool needMemoryController(nsjconf_t *nsjconf) {
// Check if we need 'memory'
// This matches the check in initNsFromParentMem
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {
swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max;
}
if (nsjconf->cgroup_mem_max == (size_t)0 && swap_max < (ssize_t)0) {
return false;
}
return true;
}
static bool needPidsController(nsjconf_t *nsjconf) {
return nsjconf->cgroup_pids_max != 0;
}
static bool needCpuController(nsjconf_t *nsjconf) {
return nsjconf->cgroup_cpu_ms_per_sec != 0U;
}
// We will use this buf to read from cgroup.subtree_control to see if
// the root cgroup has the necessary controllers listed
#define SUBTREE_CONTROL_BUF_LEN 0x40
bool setup(nsjconf_t *nsjconf) {
// Read from cgroup.subtree_control in the root to see if
// the controllers we need are there.
auto p = nsjconf->cgroupv2_mount + "/cgroup.subtree_control";
char buf[SUBTREE_CONTROL_BUF_LEN];
int read = util::readFromFile(p.c_str(), buf, SUBTREE_CONTROL_BUF_LEN-1);
if (read < 0) {
LOG_W("cgroupv2 setup: Could not read root subtree_control");
return false;
}
buf[read] = 0;
// Are the controllers we need there?
bool subtree_ok = (!needMemoryController(nsjconf) || strstr(buf, "memory")) &&
(!needPidsController(nsjconf) || strstr(buf, "pids")) &&
(!needCpuController(nsjconf) || strstr(buf, "cpu"));
if (!subtree_ok) {
// Now we can write to the root cgroup.subtree_control
if (needMemoryController(nsjconf)) {
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "memory", getpid()));
}
if (needPidsController(nsjconf)) {
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "pids", getpid()));
}
if (needCpuController(nsjconf)) {
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "cpu", getpid()));
}
}
return true;
}
bool detectCgroupv2(nsjconf_t *nsjconf) {
// Check cgroupv2_mount, if it is a cgroup2 mount, use it.
struct statfs buf;
if (statfs(nsjconf->cgroupv2_mount.c_str(), &buf)) {
LOG_D("statfs %s failed with %d", nsjconf->cgroupv2_mount.c_str(), errno);
nsjconf->use_cgroupv2 = false;
return false;
}
nsjconf->use_cgroupv2 = (buf.f_type == CGROUP2_SUPER_MAGIC);
return true;
}
static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {

View File

@ -32,6 +32,8 @@ namespace cgroup2 {
bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid);
bool initNs(void);
void finishFromParent(nsjconf_t* nsjconf, pid_t pid);
bool setup(nsjconf_t *nsjconf);
bool detectCgroupv2(nsjconf_t *nsjconf);
} // namespace cgroup2

View File

@ -158,6 +158,7 @@ struct custom_option custom_opts[] = {
{ { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" },
{ { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"},
{ { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"},
{ { "detect_cgroupv2", no_argument, NULL, 0x0836}, "Use cgroupv2, if it is available. (Specify instead of use_cgroupv2)"},
{ { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" },
{ { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" },
{ { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" },
@ -473,6 +474,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
nsjconf->cgroup_cpu_ms_per_sec = 0U;
nsjconf->cgroupv2_mount = "/sys/fs/cgroup";
nsjconf->use_cgroupv2 = false;
nsjconf->detect_cgroupv2 = false;
nsjconf->iface_lo = true;
nsjconf->iface_vs_ip = "0.0.0.0";
nsjconf->iface_vs_nm = "255.255.255.0";
@ -912,6 +914,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
case 0x835:
nsjconf->use_cgroupv2 = true;
break;
case 0x836:
nsjconf->detect_cgroupv2 = true;
break;
case 'P':
nsjconf->kafel_file_path = optarg;
break;

View File

@ -266,6 +266,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent();
nsjconf->cgroupv2_mount = njc.cgroupv2_mount();
nsjconf->use_cgroupv2 = njc.use_cgroupv2();
nsjconf->detect_cgroupv2 = njc.detect_cgroupv2();
nsjconf->iface_lo = !(njc.iface_no_lo());
for (ssize_t i = 0; i < njc.iface_own().size(); i++) {

View File

@ -272,4 +272,7 @@ message NsJailConfig {
/* Set this to true to forward fatal signals to the child process instead
* of always using SIGKILL. */
optional bool forward_signals = 94 [default = false];
/* Check whether cgroupv2 is available, and use it if available. */
optional bool detect_cgroupv2 = 95 [default = false];
}

View File

@ -46,6 +46,7 @@
#include "sandbox.h"
#include "subproc.h"
#include "util.h"
#include "cgroup2.h"
namespace nsjail {
@ -342,6 +343,19 @@ int main(int argc, char* argv[]) {
if (!nsjail::setTimer(nsjconf.get())) {
LOG_F("nsjail::setTimer() failed");
}
if (nsjconf->detect_cgroupv2) {
cgroup2::detectCgroupv2(nsjconf.get());
LOG_I("Detected cgroups version: %d", nsjconf->use_cgroupv2 ? 2 : 1);
}
if (nsjconf->use_cgroupv2) {
if (!cgroup2::setup(nsjconf.get())) {
LOG_E("Couldn't setup parent cgroup (cgroupv2)");
return -1;
}
}
if (!sandbox::preparePolicy(nsjconf.get())) {
LOG_F("Couldn't prepare sandboxing policy");
}

View File

@ -163,6 +163,7 @@ struct nsjconf_t {
unsigned int cgroup_cpu_ms_per_sec;
std::string cgroupv2_mount;
bool use_cgroupv2;
bool detect_cgroupv2;
std::string kafel_file_path;
std::string kafel_string;
struct sock_fprog seccomp_fprog;

10
util.cc
View File

@ -89,16 +89,20 @@ bool writeToFd(int fd, const void* buf, size_t len) {
return true;
}
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags) {
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors) {
int fd;
TEMP_FAILURE_RETRY(fd = open(filename, open_flags, 0644));
if (fd == -1) {
PLOG_E("Couldn't open '%s' for writing", filename);
if (log_errors) {
PLOG_E("Couldn't open '%s' for writing", filename);
}
return false;
}
if (!writeToFd(fd, buf, len)) {
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
if (log_errors) {
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
}
close(fd);
if (open_flags & O_CREAT) {
unlink(filename);

2
util.h
View File

@ -46,7 +46,7 @@ namespace util {
ssize_t readFromFd(int fd, void* buf, size_t len);
ssize_t readFromFile(const char* fname, void* buf, size_t len);
bool writeToFd(int fd, const void* buf, size_t len);
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags);
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors = true);
bool createDirRecursively(const char* dir);
std::string* StrAppend(std::string* str, const char* format, ...)
__attribute__((format(printf, 2, 3)));