nsjail/contain.c

557 lines
14 KiB
C
Raw Normal View History

2015-05-15 05:44:48 +08:00
/*
nsjail - isolating the binary
-----------------------------------------
Copyright 2014 Google Inc. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
#include "contain.h"
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <linux/capability.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/personality.h>
#include <sys/prctl.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>
#include "log.h"
/*
 * Writes "deny" to /proc/self/setgroups.
 *
 * A kernel that does not expose the file is not treated as an error.
 * NOTE(review): presumably needed so an unprivileged process may write
 * gid_map afterwards - see user_namespaces(7).
 *
 * Returns true when the file is absent or the write succeeded, false when
 * the write failed.
 */
static bool containSetGroups(void)
{
	int setgroups_fd = open("/proc/self/setgroups", O_WRONLY | O_CLOEXEC);
	if (setgroups_fd == -1) {
		/* Not present with all kernels */
		PLOG_D("'/proc/self/setgroups' not present in this kernel?");
		return true;
	}

	const char *policy = "deny";
	bool written = (write(setgroups_fd, policy, strlen(policy)) != -1);
	if (!written) {
		/* Log before close() so errno still belongs to write() */
		PLOG_E("write('/proc/self/setgroups', '%s') failed", policy);
	}

	close(setgroups_fd);
	return written;
}
/*
 * Writes a single-entry ID mapping ("<inside_id> <outside_id> 1") to one of
 * the /proc/self/{uid,gid}_map files.
 *
 * path       - map file to write
 * inside_id  - ID as seen inside the user namespace
 * outside_id - ID it is mapped to outside the namespace
 *
 * Returns true on success, false if the file cannot be opened or written.
 */
static bool containWriteIdMap(const char *path, unsigned long inside_id, unsigned long outside_id)
{
	int fd = open(path, O_WRONLY | O_CLOEXEC);
	if (fd == -1) {
		PLOG_E("open('%s', O_WRONLY | O_CLOEXEC)", path);
		return false;
	}

	char map[64];
	snprintf(map, sizeof(map), "%lu %lu 1", inside_id, outside_id);
	LOG_D("Writing '%s' to %s", map, path);

	bool written = (write(fd, map, strlen(map)) != -1);
	if (!written) {
		/* Log before close() so errno still belongs to write() */
		PLOG_E("write('%s', %d, '%s')", path, fd, map);
	}

	close(fd);
	return written;
}

/*
 * Installs the UID and GID mappings for the current process.
 *
 * Maps 'uid' to nsjconf->initial_uid and 'gid' to nsjconf->initial_gid, one
 * ID each. Does nothing (and succeeds) when nsjconf->clone_newuser is false.
 * The UID map is written first; the GID map is attempted only if that worked.
 *
 * Returns true on success, false on the first failing open()/write().
 */
static bool containUidGidMap(struct nsjconf_t *nsjconf, uid_t uid, gid_t gid)
{
	if (nsjconf->clone_newuser == false) {
		return true;
	}

	if (!containWriteIdMap("/proc/self/uid_map", (unsigned long)uid,
			       (unsigned long)nsjconf->initial_uid)) {
		return false;
	}

	return containWriteIdMap("/proc/self/gid_map", (unsigned long)gid,
				 (unsigned long)nsjconf->initial_gid);
}
2015-08-16 02:48:48 +08:00
bool containInitUserNs(struct nsjconf_t * nsjconf)
2015-05-15 05:44:48 +08:00
{
if (containSetGroups() == false) {
return false;
}
if (containUidGidMap(nsjconf, nsjconf->uid, nsjconf->gid) == false) {
return false;
}
2015-08-16 02:48:48 +08:00
return true;
}
/* Fallback for build environments whose headers predate this prctl option */
#ifndef PR_SET_NO_NEW_PRIVS
#define PR_SET_NO_NEW_PRIVS 38
#endif

/*
 * Drops privileges of the current process.
 *
 * Steps, in order:
 *   1. clear supplementary groups (failure is only logged),
 *   2. set real/effective/saved GID to nsjconf->gid,
 *   3. set real/effective/saved UID to nsjconf->uid,
 *   4. set PR_SET_NO_NEW_PRIVS (failure is only a warning),
 *   5. unless nsjconf->keep_caps is set, clear PR_SET_KEEPCAPS and empty all
 *      capability sets via capset().
 *
 * Returns false if step 2, 3 or 5 fails, true otherwise.
 */
bool containDropPrivs(struct nsjconf_t * nsjconf)
{
	/*
	 * Best effort because of /proc/self/setgroups
	 */
	gid_t *no_groups = NULL;
	if (setgroups(0, no_groups) == -1) {
		PLOG_D("setgroups(NULL) failed");
	}

	if (setresgid(nsjconf->gid, nsjconf->gid, nsjconf->gid) == -1) {
		PLOG_E("setresgid(%u)", nsjconf->gid);
		return false;
	}
	if (setresuid(nsjconf->uid, nsjconf->uid, nsjconf->uid) == -1) {
		PLOG_E("setresuid(%u)", nsjconf->uid);
		return false;
	}

	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
		/* Only new kernels support it */
		PLOG_W("prctl(PR_SET_NO_NEW_PRIVS, 1)");
	}

	if (nsjconf->keep_caps) {
		return true;
	}

	if (prctl(PR_SET_KEEPCAPS, 0, 0, 0, 0) == -1) {
		PLOG_E("prctl(PR_SET_KEEPCAPS, 0)");
		return false;
	}

	struct __user_cap_header_struct hdr = {
		.version = _LINUX_CAPABILITY_VERSION_3,
		.pid = 0,
	};
	/* All-zero data: empty inheritable, effective and permitted sets */
	struct __user_cap_data_struct data[_LINUX_CAPABILITY_U32S_3];
	memset(data, 0, sizeof(data));

	if (syscall(__NR_capset, &hdr, data) == -1) {
		PLOG_E("capset()");
		return false;
	}
	return true;
}
/*
 * Sets up process-level environment for the jailed process:
 *   - hostname (only when nsjconf->clone_newuts is set),
 *   - SIGKILL as the parent-death signal,
 *   - execution personality (only when nsjconf->personality is non-zero),
 *   - nice value 19 (failure is only a warning).
 *
 * Returns false if sethostname(), prctl() or personality() fails.
 */
bool containPrepareEnv(struct nsjconf_t * nsjconf)
{
	const char *host = nsjconf->hostname;

	/* Logged regardless of whether the hostname is actually applied below */
	LOG_D("Setting hostname to '%s'", host);
	if (nsjconf->clone_newuts && sethostname(host, strlen(host)) == -1) {
		PLOG_E("sethostname('%s')", nsjconf->hostname);
		return false;
	}

	if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) == -1) {
		PLOG_E("prctl(PR_SET_PDEATHSIG, SIGKILL)");
		return false;
	}

	if (nsjconf->personality) {
		if (personality(nsjconf->personality) == -1) {
			PLOG_E("personality(%lx)", nsjconf->personality);
			return false;
		}
	}

	/* -1 is also a legal return value here, so errno is the real error flag */
	errno = 0;
	int prio_ret = setpriority(PRIO_PROCESS, 0, 19);
	if (prio_ret == -1 && errno != 0) {
		PLOG_W("setpriority(19)");
	}

	return true;
}
/* findSpecDestination mutates spec (source:dest) to have a null byte instead
* of ':' in between source and dest, then returns a pointer to the dest
* string. */
2015-07-08 06:54:59 +08:00
static char *findSpecDestination(char *spec)
{
	/* Only the first ':' separates source from destination */
	char *separator = strchr(spec, ':');

	if (separator == NULL) {
		/* No "source:dest" form: the destination is the source itself */
		return spec;
	}

	/* Terminate the source part in place; dest starts right after it */
	*separator = '\0';
	return separator + 1;
}
static bool bindMountRW(struct nsjconf_t *nsjconf, const char *newrootdir, const char *spec)
2015-07-08 06:54:59 +08:00
{
char mount_pt[PATH_MAX];
bool success = false;
char *source = strdup(spec);
if (source == NULL) {
PLOG_E("strdup('%s')", spec);
return false;
}
char *dest = findSpecDestination(source);
snprintf(mount_pt, sizeof(mount_pt), "%s/%s", newrootdir, dest);
struct stat st;
if (stat(source, &st) == -1) {
PLOG_W("stat('%s')", source);
goto cleanup;
}
if (S_ISDIR(st.st_mode)) {
// Create mount_pt dir, only if the source bind mount point is also a directory
if (mkdir(mount_pt, 0700) == -1 && errno != EEXIST) {
2015-08-16 02:10:07 +08:00
PLOG_E("mkdir('%s') failed. Try creating the '%s/%s' directory manually",
mount_pt, nsjconf->chroot, dest);
goto cleanup;
}
} else {
// For everything else (files, sockets, pipes, devices), create a regular file
int fd = open(mount_pt, O_CREAT | O_RDONLY, 0700);
if (fd == -1) {
2015-08-16 02:10:07 +08:00
PLOG_E("creat('%s') failed. Try creating the '%s/%s' file manually",
mount_pt, nsjconf->chroot, dest);
goto cleanup;
}
close(fd);
}
LOG_D("Mounting (bind) '%s' on '%s'", source, mount_pt);
if (mount(source, mount_pt, NULL, MS_BIND | MS_REC, NULL) == -1) {
PLOG_E("mount('%s', '%s', MS_BIND|MS_REC)", source, mount_pt);
goto cleanup;
}
success = true;
2015-07-08 06:54:59 +08:00
cleanup:
free(source);
return success;
}
2015-07-08 06:54:59 +08:00
/*
 * Remounts the destination of 'spec' onto itself as a bind mount with
 * MS_NOSUID | MS_REMOUNT | MS_PRIVATE plus the caller's extra 'flags'.
 *
 * spec is "path" or "source:dest"; only the destination part is used.
 * With flags == 0 nothing is done and true is returned.
 *
 * Returns true on success (or no-op), false if strdup() or mount() fails.
 */
static bool remountBindMount(const char *spec, unsigned long flags)
{
	if (flags == 0ULL) {
		return true;
	}

	/* Private copy: findSpecDestination() splits the string in place */
	char *spec_copy = strdup(spec);
	if (spec_copy == NULL) {
		PLOG_E("strdup('%s')", spec);
		return false;
	}
	char *target = findSpecDestination(spec_copy);

	LOG_D("Remounting (bind(0x%lx)) '%s' on '%s'", flags, target, target);

	unsigned long mount_flags = MS_BIND | MS_NOSUID | MS_REMOUNT | MS_PRIVATE | flags;
	bool remounted = (mount(target, target, NULL, mount_flags, NULL) != -1);
	if (!remounted) {
		PLOG_E("mount('%s', '%s', MS_BIND|MS_NOSUID|MS_REMOUNT|MS_PRIVATE|%lu)", target, target,
		       flags);
	}

	free(spec_copy);
	return remounted;
}
/*
 * Provides "<newrootdir>/proc" when nsjconf->mount_proc is set.
 *
 * In MODE_STANDALONE_EXECVE the existing /proc is bind-mounted recursively;
 * in every other mode a new procfs instance is mounted with
 * MS_NOSUID | MS_NOEXEC | MS_NODEV.
 *
 * Returns true if nothing had to be done or the mount succeeded.
 */
static bool containMountProc(struct nsjconf_t *nsjconf, const char *newrootdir)
{
	if (!nsjconf->mount_proc) {
		return true;
	}

	char proc_target[PATH_MAX];
	snprintf(proc_target, sizeof(proc_target), "%s/proc", newrootdir);

	if (nsjconf->mode != MODE_STANDALONE_EXECVE) {
		if (mount(NULL, proc_target, "proc", MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL) == -1) {
			PLOG_E("mount('%s', 'proc')", proc_target);
			return false;
		}
		return true;
	}

	if (mount("/proc", proc_target, NULL, MS_REC | MS_BIND, NULL) == -1) {
		PLOG_E("mount('/proc', '%s', MS_REC|MS_BIND)", proc_target);
		return false;
	}
	return true;
}
2015-05-15 05:44:48 +08:00
/*
 * Builds the filesystem view of the jailed process.
 *
 * Without a mount namespace (nsjconf->clone_newns == false) this is a plain
 * chroot() into nsjconf->chroot followed by chdir("/").
 *
 * Otherwise the new root is assembled on a scratch tmpfs mounted over /tmp:
 *   1. nsjconf->chroot is bind-mounted at /tmp/new_root,
 *   2. /proc, the tmpfs mount points and all bind mounts are placed under it,
 *   3. pivot_root() makes the scratch tmpfs the root, the old root is
 *      detached, and the process chroot()s into /new_root,
 *   4. "/" (unless nsjconf->is_root_rw) and the read-only bind mounts are
 *      remounted with MS_RDONLY.
 *
 * Returns true on success, false on the first failing step. Mounts made
 * before the failure are not rolled back.
 */
bool containMountFS(struct nsjconf_t * nsjconf)
{
	if (nsjconf->clone_newns == false) {
		if (chroot(nsjconf->chroot) == -1) {
			PLOG_E("chroot('%s')", nsjconf->chroot);
			return false;
		}
		if (chdir("/") == -1) {
			PLOG_E("chdir('/')");
			return false;
		}
		return true;
	}

	/* Scratch tmpfs that will hold both new_root and the pivot_root put-old dir */
	const char *destdir = "/tmp";
	if (mount("none", destdir, "tmpfs", 0, NULL) == -1) {
		PLOG_E("mount('%s', 'tmpfs')", destdir);
		return false;
	}

	char newrootdir[PATH_MAX];
	snprintf(newrootdir, sizeof(newrootdir), "%s/%s", destdir, "new_root");
	if (mkdir(newrootdir, 0755) == -1) {
		PLOG_E("mkdir(/tmp/new_root)");
		return false;
	}
	if (mount(nsjconf->chroot, newrootdir, NULL, MS_BIND | MS_REC, NULL) == -1) {
		PLOG_E("mount('%s', '%s', MS_BIND | MS_REC)", nsjconf->chroot, newrootdir);
		return false;
	}

	if (containMountProc(nsjconf, newrootdir) == false) {
		return false;
	}

	struct constchar_t *p;
	char tmpfs_size[128];
	snprintf(tmpfs_size, sizeof(tmpfs_size), "size=%zu", nsjconf->tmpfs_size);
	LIST_FOREACH(p, &nsjconf->tmpfsmountpts, pointers) {
		if (strchr(p->value, ':') != NULL) {
			/* Pure validation error: errno carries no information here */
			LOG_E("invalid tmpfs mount spec. source:dest format unsupported.");
			return false;
		}
		char tmpfsdir[PATH_MAX];
		snprintf(tmpfsdir, sizeof(tmpfsdir), "%s/%s", newrootdir, p->value);
		if (mkdir(tmpfsdir, 0700) == -1 && errno != EEXIST) {
			PLOG_E
			    ("mkdir('%s') (for tmpfs:'%s'); You probably need to create it inside your "
			     "--chroot ('%s') directory", tmpfsdir, p->value, nsjconf->chroot);
			return false;
		}
		LOG_D("Mounting (tmpfs) '%s' at '%s'", p->value, tmpfsdir);
		if (mount(NULL, tmpfsdir, "tmpfs", 0, tmpfs_size) == -1) {
			PLOG_E("mount('%s', 'tmpfs') for '%s'", tmpfsdir, p->value);
			return false;
		}
	}

	/*
	 * Read-only binds are first mounted like the read-write ones; they are
	 * switched to MS_RDONLY by the remount pass at the end of this function.
	 */
	LIST_FOREACH(p, &nsjconf->robindmountpts, pointers) {
		if (!bindMountRW(nsjconf, newrootdir, p->value)) {
			return false;
		}
	}
	LIST_FOREACH(p, &nsjconf->rwbindmountpts, pointers) {
		if (!bindMountRW(nsjconf, newrootdir, p->value)) {
			return false;
		}
	}

	/* Directory that receives the old root during pivot_root() */
	char pivotrootdir[PATH_MAX];
	snprintf(pivotrootdir, sizeof(pivotrootdir), "%s/%s", destdir, "pivot_root");
	if (mkdir(pivotrootdir, 0755) == -1) {
		PLOG_E("mkdir('%s')", pivotrootdir);
		return false;
	}
	if (syscall(__NR_pivot_root, destdir, pivotrootdir) == -1) {
		PLOG_E("pivot_root('%s', '%s')", destdir, pivotrootdir);
		return false;
	}

	/* The scratch tmpfs is "/" now, so the old root shows up at /pivot_root */
	if (umount2("/pivot_root", MNT_DETACH) == -1) {
		PLOG_E("umount2('/pivot_root', MNT_DETACH)");
		return false;
	}
	if (chroot("/new_root") == -1) {
		PLOG_E("CHROOT('/new_root')");
		return false;
	}
	if (chdir("/") == -1) {
		PLOG_E("chdir('/')");
		return false;
	}

	if (nsjconf->is_root_rw == false) {
		if (!remountBindMount("/", MS_RDONLY)) {
			return false;
		}
	}
	/* Paths are resolved inside the new root from here on */
	LIST_FOREACH(p, &nsjconf->robindmountpts, pointers) {
		if (!remountBindMount(p->value, MS_RDONLY)) {
			return false;
		}
	}

	return true;
}
/*
 * Sets both the soft and the hard limit of one resource to 'value'.
 *
 * resource - RLIMIT_* constant passed to setrlimit()
 * name     - textual name of the resource, used only for the error log
 * value    - limit to apply
 *
 * Returns true on success, false (after logging) if setrlimit() fails.
 */
static bool containSetLimit(int resource, const char *name, rlim_t value)
{
	struct rlimit rl = {
		.rlim_cur = value,
		.rlim_max = value,
	};

	if (setrlimit(resource, &rl) == -1) {
		/* rlim_t has no printf length modifier of its own; widen explicitly */
		PLOG_E("setrlimit(%s, %llu)", name, (unsigned long long)value);
		return false;
	}
	return true;
}

/*
 * Applies the configured resource limits (address space, core size, CPU
 * time, file size, open files, processes, stack) to the current process,
 * in that order.
 *
 * Returns true if every limit was applied; stops and returns false at the
 * first failure.
 */
bool containSetLimits(struct nsjconf_t * nsjconf)
{
	return containSetLimit(RLIMIT_AS, "RLIMIT_AS", nsjconf->rl_as)
	    && containSetLimit(RLIMIT_CORE, "RLIMIT_CORE", nsjconf->rl_core)
	    && containSetLimit(RLIMIT_CPU, "RLIMIT_CPU", nsjconf->rl_cpu)
	    && containSetLimit(RLIMIT_FSIZE, "RLIMIT_FSIZE", nsjconf->rl_fsize)
	    && containSetLimit(RLIMIT_NOFILE, "RLIMIT_NOFILE", nsjconf->rl_nofile)
	    && containSetLimit(RLIMIT_NPROC, "RLIMIT_NPROC", nsjconf->rl_nproc)
	    && containSetLimit(RLIMIT_STACK, "RLIMIT_STACK", nsjconf->rl_stack);
}
2015-08-16 02:48:48 +08:00
/*
 * Marks every open descriptor in the range (STDERR_FILENO, 1024) as
 * close-on-exec by probing each number in turn.
 *
 * Always returns true; numbers that are not open descriptors are skipped.
 */
static bool containMakeFdsCOENaive(void)
{
	// Don't use getrlimit(RLIMIT_NOFILE) here, as it can return an artificially small value
	// (e.g. 32), which could be smaller than a maximum assigned number to file-descriptors
	// in this process. Just use some reasonably sane value (e.g. 1024)
	int fd = STDERR_FILENO + 1;

	while (fd < 1024) {
		int fd_flags = fcntl(fd, F_GETFD, 0);
		if (fd_flags != -1) {
			fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC);
			LOG_D("Set fd '%d' flag to FD_CLOEXEC", fd);
		}
		fd++;
	}
	return true;
}
/*
 * Marks every descriptor above stderr as close-on-exec, using the entries
 * of /proc/self/fd to find the open descriptors.
 *
 * Entries whose name is not a plain non-negative decimal number that fits
 * an int are skipped with a warning. A failing F_SETFD is logged and the
 * scan continues.
 *
 * Returns true when the whole directory was processed, false if the
 * directory cannot be opened or read, or if F_GETFD fails on a listed
 * descriptor. The directory stream is closed on every path.
 */
static bool containMakeFdsCOEProc(void)
{
	/* Make all fds above stderr close-on-exec */
	DIR *dir = opendir("/proc/self/fd");
	if (dir == NULL) {
		PLOG_D("opendir('/proc/self/fd')");
		return false;
	}

	bool success = false;
	for (;;) {
		/* readdir() returns NULL for both end-of-directory and error */
		errno = 0;
		struct dirent *entry = readdir(dir);
		if (entry == NULL) {
			if (errno != 0) {
				PLOG_D("readdir('/proc/self/fd')");
				goto out;
			}
			break;
		}
		if (strcmp(".", entry->d_name) == 0 || strcmp("..", entry->d_name) == 0) {
			continue;
		}

		/* Accept only a complete, in-range decimal number */
		char *end = NULL;
		errno = 0;
		long parsed = strtol(entry->d_name, &end, 10);
		int fd = (int)parsed;
		if (errno != 0 || end == entry->d_name || *end != '\0' || parsed < 0
		    || (long)fd != parsed) {
			LOG_W("Cannot convert /proc/self/fd/%s to a number", entry->d_name);
			continue;
		}
		if (fd <= STDERR_FILENO) {
			continue;
		}

		int flags = fcntl(fd, F_GETFD, 0);
		if (flags == -1) {
			PLOG_D("fcntl(fd, F_GETFD, 0)");
			goto out;
		}
		if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) {
			PLOG_W("fcntl(%d, F_SETFD, FD_CLOEXEC)", fd);
			continue;
		}
		LOG_D("Set fd '%d' flag to FD_CLOEXEC", fd);
	}
	success = true;

out:
	closedir(dir);
	return success;
}
2015-08-16 02:48:48 +08:00
/*
 * Marks all descriptors above stderr as close-on-exec.
 *
 * Tries the /proc/self/fd based method first and falls back to the naive
 * numeric scan only if that one fails.
 *
 * Returns true as soon as one method succeeds, false (after logging an
 * error) if both fail.
 */
bool containMakeFdsCOE(void)
{
	/* Short-circuit keeps the fallback from running after a success */
	if (containMakeFdsCOEProc() || containMakeFdsCOENaive()) {
		return true;
	}

	LOG_E("Couldn't mark relevant file-descriptors as close-on-exec with any known method");
	return false;
}
bool containSetupFD(struct nsjconf_t * nsjconf, int fd_in, int fd_out, int fd_err, int fd_log)
2015-05-15 05:44:48 +08:00
{
/* Make sure all logs go to the parent process from now on */
if (fd_log != -1) {
logRedirectLogFD(fd_log);
}
2015-05-15 05:44:48 +08:00
if (nsjconf->mode != MODE_LISTEN_TCP) {
if (nsjconf->is_silent == false) {
return true;
}
if ((fd_in = fd_out = fd_err = open("/dev/null", O_RDWR)) == -1) {
PLOG_E("open('/dev/null', O_RDWR)");
return false;
}
}
/* Set stdin/stdout/stderr to the net */
if (dup2(fd_in, STDIN_FILENO) == -1) {
PLOG_E("dup2(%d, STDIN_FILENO)", fd_in);
return false;
}
if (dup2(fd_out, STDOUT_FILENO) == -1) {
PLOG_E("dup2(%d, STDOUT_FILENO)", fd_out);
return false;
}
if (dup2(fd_err, STDERR_FILENO) == -1) {
PLOG_E("dup2(%d, STDERR_FILENO)", fd_err);
return false;
}
return true;
}