Merge pull request #208 from ndrewh/cgroupsv2-fix
Setup cgroup.subtree_control controllers when necessary in cgroupsv2
This commit is contained in:
commit
4437810830
110
cgroup2.cc
110
cgroup2.cc
@ -28,6 +28,8 @@
|
|||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
|
#include <sys/vfs.h>
|
||||||
|
#include <linux/magic.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
@ -39,9 +41,14 @@
|
|||||||
|
|
||||||
namespace cgroup2 {
|
namespace cgroup2 {
|
||||||
|
|
||||||
|
static bool addPidToProcList(const std::string &cgroup_path, pid_t pid);
|
||||||
|
|
||||||
static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
|
static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
|
||||||
return nsjconf->cgroupv2_mount + "/NSJAIL." + std::to_string(pid);
|
return nsjconf->cgroupv2_mount + "/NSJAIL." + std::to_string(pid);
|
||||||
}
|
}
|
||||||
|
static std::string getJailCgroupPath(nsjconf_t *nsjconf) {
|
||||||
|
return nsjconf->cgroupv2_mount + "/NSJAIL_SELF." + std::to_string(getpid());
|
||||||
|
}
|
||||||
|
|
||||||
static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
|
static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
|
||||||
LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid);
|
LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid);
|
||||||
@ -52,6 +59,39 @@ static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool moveSelfIntoChildCgroup(nsjconf_t *nsjconf) {
|
||||||
|
// Move ourselves into another group to avoid the 'No internal processes' rule
|
||||||
|
// https://unix.stackexchange.com/a/713343
|
||||||
|
std::string jail_cgroup_path = getJailCgroupPath(nsjconf);
|
||||||
|
LOG_I("nsjail is moving itself to a new child cgroup: %s\n", jail_cgroup_path.c_str());
|
||||||
|
RETURN_ON_FAILURE(createCgroup(jail_cgroup_path, getpid()));
|
||||||
|
RETURN_ON_FAILURE(addPidToProcList(jail_cgroup_path, 0));
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static bool enableCgroupSubtree(nsjconf_t *nsjconf, const std::string &controller, pid_t pid) {
|
||||||
|
std::string cgroup_path = nsjconf->cgroupv2_mount;
|
||||||
|
LOG_D("Enable cgroup.subtree_control +'%s' to '%s' for pid=%d", controller.c_str(), cgroup_path.c_str(), pid);
|
||||||
|
std::string val = "+" + controller;
|
||||||
|
|
||||||
|
// Try once without moving the nsjail process and if that fails then try moving the nsjail process
|
||||||
|
// into a child cgroup before trying a second time.
|
||||||
|
if (util::writeBufToFile(
|
||||||
|
(cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY, false)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (errno == EBUSY) {
|
||||||
|
RETURN_ON_FAILURE(moveSelfIntoChildCgroup(nsjconf));
|
||||||
|
if (util::writeBufToFile(
|
||||||
|
(cgroup_path + "/cgroup.subtree_control").c_str(), val.c_str(), val.length(), O_WRONLY)) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
LOG_E("Could not apply '%s' to cgroup.subtree_control in '%s'. If you are running in Docker, nsjail MUST be the root process to use cgroups.", val.c_str(), cgroup_path.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
static bool writeToCgroup(
|
static bool writeToCgroup(
|
||||||
const std::string &cgroup_path, const std::string &resource, const std::string &value) {
|
const std::string &cgroup_path, const std::string &resource, const std::string &value) {
|
||||||
LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str());
|
LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str());
|
||||||
@ -83,6 +123,76 @@ static void removeCgroup(const std::string &cgroup_path) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool needMemoryController(nsjconf_t *nsjconf) {
|
||||||
|
// Check if we need 'memory'
|
||||||
|
// This matches the check in initNsFromParentMem
|
||||||
|
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
|
||||||
|
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {
|
||||||
|
swap_max = nsjconf->cgroup_mem_memsw_max - nsjconf->cgroup_mem_max;
|
||||||
|
}
|
||||||
|
if (nsjconf->cgroup_mem_max == (size_t)0 && swap_max < (ssize_t)0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool needPidsController(nsjconf_t *nsjconf) {
|
||||||
|
return nsjconf->cgroup_pids_max != 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool needCpuController(nsjconf_t *nsjconf) {
|
||||||
|
return nsjconf->cgroup_cpu_ms_per_sec != 0U;
|
||||||
|
}
|
||||||
|
|
||||||
|
// We will use this buf to read from cgroup.subtree_control to see if
|
||||||
|
// the root cgroup has the necessary controllers listed
|
||||||
|
#define SUBTREE_CONTROL_BUF_LEN 0x40
|
||||||
|
|
||||||
|
bool setup(nsjconf_t *nsjconf) {
|
||||||
|
// Read from cgroup.subtree_control in the root to see if
|
||||||
|
// the controllers we need are there.
|
||||||
|
auto p = nsjconf->cgroupv2_mount + "/cgroup.subtree_control";
|
||||||
|
char buf[SUBTREE_CONTROL_BUF_LEN];
|
||||||
|
int read = util::readFromFile(p.c_str(), buf, SUBTREE_CONTROL_BUF_LEN-1);
|
||||||
|
if (read < 0) {
|
||||||
|
LOG_W("cgroupv2 setup: Could not read root subtree_control");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
buf[read] = 0;
|
||||||
|
|
||||||
|
// Are the controllers we need there?
|
||||||
|
bool subtree_ok = (!needMemoryController(nsjconf) || strstr(buf, "memory")) &&
|
||||||
|
(!needPidsController(nsjconf) || strstr(buf, "pids")) &&
|
||||||
|
(!needCpuController(nsjconf) || strstr(buf, "cpu"));
|
||||||
|
if (!subtree_ok) {
|
||||||
|
// Now we can write to the root cgroup.subtree_control
|
||||||
|
if (needMemoryController(nsjconf)) {
|
||||||
|
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "memory", getpid()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (needPidsController(nsjconf)) {
|
||||||
|
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "pids", getpid()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (needCpuController(nsjconf)) {
|
||||||
|
RETURN_ON_FAILURE(enableCgroupSubtree(nsjconf, "cpu", getpid()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool detectCgroupv2(nsjconf_t *nsjconf) {
|
||||||
|
// Check cgroupv2_mount, if it is a cgroup2 mount, use it.
|
||||||
|
struct statfs buf;
|
||||||
|
if (statfs(nsjconf->cgroupv2_mount.c_str(), &buf)) {
|
||||||
|
LOG_D("statfs %s failed with %d", nsjconf->cgroupv2_mount.c_str(), errno);
|
||||||
|
nsjconf->use_cgroupv2 = false;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
nsjconf->use_cgroupv2 = (buf.f_type == CGROUP2_SUPER_MAGIC);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
|
static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
|
||||||
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
|
ssize_t swap_max = nsjconf->cgroup_mem_swap_max;
|
||||||
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {
|
if (nsjconf->cgroup_mem_memsw_max > (size_t)0) {
|
||||||
|
@ -32,6 +32,8 @@ namespace cgroup2 {
|
|||||||
bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid);
|
bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid);
|
||||||
bool initNs(void);
|
bool initNs(void);
|
||||||
void finishFromParent(nsjconf_t* nsjconf, pid_t pid);
|
void finishFromParent(nsjconf_t* nsjconf, pid_t pid);
|
||||||
|
bool setup(nsjconf_t *nsjconf);
|
||||||
|
bool detectCgroupv2(nsjconf_t *nsjconf);
|
||||||
|
|
||||||
} // namespace cgroup2
|
} // namespace cgroup2
|
||||||
|
|
||||||
|
@ -158,6 +158,7 @@ struct custom_option custom_opts[] = {
|
|||||||
{ { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" },
|
{ { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" },
|
||||||
{ { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"},
|
{ { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"},
|
||||||
{ { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"},
|
{ { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"},
|
||||||
|
{ { "detect_cgroupv2", no_argument, NULL, 0x0836}, "Use cgroupv2, if it is available. (Specify instead of use_cgroupv2)"},
|
||||||
{ { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" },
|
{ { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" },
|
||||||
{ { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" },
|
{ { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" },
|
||||||
{ { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" },
|
{ { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" },
|
||||||
@ -473,6 +474,7 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
|
|||||||
nsjconf->cgroup_cpu_ms_per_sec = 0U;
|
nsjconf->cgroup_cpu_ms_per_sec = 0U;
|
||||||
nsjconf->cgroupv2_mount = "/sys/fs/cgroup";
|
nsjconf->cgroupv2_mount = "/sys/fs/cgroup";
|
||||||
nsjconf->use_cgroupv2 = false;
|
nsjconf->use_cgroupv2 = false;
|
||||||
|
nsjconf->detect_cgroupv2 = false;
|
||||||
nsjconf->iface_lo = true;
|
nsjconf->iface_lo = true;
|
||||||
nsjconf->iface_vs_ip = "0.0.0.0";
|
nsjconf->iface_vs_ip = "0.0.0.0";
|
||||||
nsjconf->iface_vs_nm = "255.255.255.0";
|
nsjconf->iface_vs_nm = "255.255.255.0";
|
||||||
@ -912,6 +914,9 @@ std::unique_ptr<nsjconf_t> parseArgs(int argc, char* argv[]) {
|
|||||||
case 0x835:
|
case 0x835:
|
||||||
nsjconf->use_cgroupv2 = true;
|
nsjconf->use_cgroupv2 = true;
|
||||||
break;
|
break;
|
||||||
|
case 0x836:
|
||||||
|
nsjconf->detect_cgroupv2 = true;
|
||||||
|
break;
|
||||||
case 'P':
|
case 'P':
|
||||||
nsjconf->kafel_file_path = optarg;
|
nsjconf->kafel_file_path = optarg;
|
||||||
break;
|
break;
|
||||||
|
@ -266,6 +266,7 @@ static bool configParseInternal(nsjconf_t* nsjconf, const nsjail::NsJailConfig&
|
|||||||
nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent();
|
nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent();
|
||||||
nsjconf->cgroupv2_mount = njc.cgroupv2_mount();
|
nsjconf->cgroupv2_mount = njc.cgroupv2_mount();
|
||||||
nsjconf->use_cgroupv2 = njc.use_cgroupv2();
|
nsjconf->use_cgroupv2 = njc.use_cgroupv2();
|
||||||
|
nsjconf->detect_cgroupv2 = njc.detect_cgroupv2();
|
||||||
|
|
||||||
nsjconf->iface_lo = !(njc.iface_no_lo());
|
nsjconf->iface_lo = !(njc.iface_no_lo());
|
||||||
for (ssize_t i = 0; i < njc.iface_own().size(); i++) {
|
for (ssize_t i = 0; i < njc.iface_own().size(); i++) {
|
||||||
|
@ -272,4 +272,7 @@ message NsJailConfig {
|
|||||||
/* Set this to true to forward fatal signals to the child process instead
|
/* Set this to true to forward fatal signals to the child process instead
|
||||||
* of always using SIGKILL. */
|
* of always using SIGKILL. */
|
||||||
optional bool forward_signals = 94 [default = false];
|
optional bool forward_signals = 94 [default = false];
|
||||||
|
|
||||||
|
/* Check whether cgroupv2 is available, and use it if available. */
|
||||||
|
optional bool detect_cgroupv2 = 95 [default = false];
|
||||||
}
|
}
|
||||||
|
14
nsjail.cc
14
nsjail.cc
@ -46,6 +46,7 @@
|
|||||||
#include "sandbox.h"
|
#include "sandbox.h"
|
||||||
#include "subproc.h"
|
#include "subproc.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
#include "cgroup2.h"
|
||||||
|
|
||||||
namespace nsjail {
|
namespace nsjail {
|
||||||
|
|
||||||
@ -342,6 +343,19 @@ int main(int argc, char* argv[]) {
|
|||||||
if (!nsjail::setTimer(nsjconf.get())) {
|
if (!nsjail::setTimer(nsjconf.get())) {
|
||||||
LOG_F("nsjail::setTimer() failed");
|
LOG_F("nsjail::setTimer() failed");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (nsjconf->detect_cgroupv2) {
|
||||||
|
cgroup2::detectCgroupv2(nsjconf.get());
|
||||||
|
LOG_I("Detected cgroups version: %d", nsjconf->use_cgroupv2 ? 2 : 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nsjconf->use_cgroupv2) {
|
||||||
|
if (!cgroup2::setup(nsjconf.get())) {
|
||||||
|
LOG_E("Couldn't setup parent cgroup (cgroupv2)");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!sandbox::preparePolicy(nsjconf.get())) {
|
if (!sandbox::preparePolicy(nsjconf.get())) {
|
||||||
LOG_F("Couldn't prepare sandboxing policy");
|
LOG_F("Couldn't prepare sandboxing policy");
|
||||||
}
|
}
|
||||||
|
1
nsjail.h
1
nsjail.h
@ -163,6 +163,7 @@ struct nsjconf_t {
|
|||||||
unsigned int cgroup_cpu_ms_per_sec;
|
unsigned int cgroup_cpu_ms_per_sec;
|
||||||
std::string cgroupv2_mount;
|
std::string cgroupv2_mount;
|
||||||
bool use_cgroupv2;
|
bool use_cgroupv2;
|
||||||
|
bool detect_cgroupv2;
|
||||||
std::string kafel_file_path;
|
std::string kafel_file_path;
|
||||||
std::string kafel_string;
|
std::string kafel_string;
|
||||||
struct sock_fprog seccomp_fprog;
|
struct sock_fprog seccomp_fprog;
|
||||||
|
10
util.cc
10
util.cc
@ -89,16 +89,20 @@ bool writeToFd(int fd, const void* buf, size_t len) {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags) {
|
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors) {
|
||||||
int fd;
|
int fd;
|
||||||
TEMP_FAILURE_RETRY(fd = open(filename, open_flags, 0644));
|
TEMP_FAILURE_RETRY(fd = open(filename, open_flags, 0644));
|
||||||
if (fd == -1) {
|
if (fd == -1) {
|
||||||
PLOG_E("Couldn't open '%s' for writing", filename);
|
if (log_errors) {
|
||||||
|
PLOG_E("Couldn't open '%s' for writing", filename);
|
||||||
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!writeToFd(fd, buf, len)) {
|
if (!writeToFd(fd, buf, len)) {
|
||||||
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
|
if (log_errors) {
|
||||||
|
PLOG_E("Couldn't write '%zu' bytes to file '%s' (fd='%d')", len, filename, fd);
|
||||||
|
}
|
||||||
close(fd);
|
close(fd);
|
||||||
if (open_flags & O_CREAT) {
|
if (open_flags & O_CREAT) {
|
||||||
unlink(filename);
|
unlink(filename);
|
||||||
|
2
util.h
2
util.h
@ -46,7 +46,7 @@ namespace util {
|
|||||||
ssize_t readFromFd(int fd, void* buf, size_t len);
|
ssize_t readFromFd(int fd, void* buf, size_t len);
|
||||||
ssize_t readFromFile(const char* fname, void* buf, size_t len);
|
ssize_t readFromFile(const char* fname, void* buf, size_t len);
|
||||||
bool writeToFd(int fd, const void* buf, size_t len);
|
bool writeToFd(int fd, const void* buf, size_t len);
|
||||||
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags);
|
bool writeBufToFile(const char* filename, const void* buf, size_t len, int open_flags, bool log_errors = true);
|
||||||
bool createDirRecursively(const char* dir);
|
bool createDirRecursively(const char* dir);
|
||||||
std::string* StrAppend(std::string* str, const char* format, ...)
|
std::string* StrAppend(std::string* str, const char* format, ...)
|
||||||
__attribute__((format(printf, 2, 3)));
|
__attribute__((format(printf, 2, 3)));
|
||||||
|
Loading…
Reference in New Issue
Block a user