Bitcoin Core 22.99.0
P2P Digital Currency
syscall_sandbox.cpp
Go to the documentation of this file.
1// Copyright (c) 2020 The Bitcoin Core developers
2// Distributed under the MIT software license, see the accompanying
3// file COPYING or http://www.opensource.org/licenses/mit-license.php.
4
5#if defined(HAVE_CONFIG_H)
7#endif // defined(HAVE_CONFIG_H)
8
10
11#if defined(USE_SYSCALL_SANDBOX)
12#include <array>
13#include <cassert>
14#include <cstdint>
15#include <exception>
16#include <map>
17#include <new>
18#include <set>
19#include <string>
20#include <vector>
21
22#include <logging.h>
23#include <tinyformat.h>
24#include <util/threadnames.h>
25
26#include <linux/audit.h>
27#include <linux/filter.h>
28#include <linux/seccomp.h>
29#include <linux/unistd.h>
30#include <signal.h>
31#include <sys/prctl.h>
32#include <sys/types.h>
33#include <unistd.h>
34
35namespace {
36bool g_syscall_sandbox_enabled{false};
37bool g_syscall_sandbox_log_violation_before_terminating{false};
38
39#if !defined(__x86_64__)
40#error Syscall sandbox is an experimental feature currently available only under Linux x86-64.
41#endif // defined(__x86_64__)
42
43#ifndef SECCOMP_RET_KILL_PROCESS
44#define SECCOMP_RET_KILL_PROCESS 0x80000000U
45#endif
46
47// Define system call numbers for x86_64 that are referenced in the system call profile
48// but not provided by the kernel headers used in the GUIX build.
49// Usually, they can be found via "grep name /usr/include/x86_64-linux-gnu/asm/unistd_64.h"
50
51#ifndef __NR_clone3
52#define __NR_clone3 435
53#endif
54
55#ifndef __NR_statx
56#define __NR_statx 332
57#endif
58
59#ifndef __NR_getrandom
60#define __NR_getrandom 318
61#endif
62
63#ifndef __NR_membarrier
64#define __NR_membarrier 324
65#endif
66
67#ifndef __NR_copy_file_range
68#define __NR_copy_file_range 326
69#endif
70
71// This list of syscalls in LINUX_SYSCALLS is only used to map syscall numbers to syscall names in
72// order to be able to print user friendly error messages which include the syscall name in addition
73// to the syscall number.
74//
75// Example output in case of a syscall violation where the syscall is present in LINUX_SYSCALLS:
76//
77// ```
78// 2021-06-09T12:34:56Z ERROR: The syscall "execve" (syscall number 59) is not allowed by the syscall sandbox in thread "msghand". Please report.
79// ```
80//
81// Example output in case of a syscall violation where the syscall is not present in LINUX_SYSCALLS:
82//
83// ```
84// 2021-06-09T12:34:56Z ERROR: The syscall "*unknown*" (syscall number 314) is not allowed by the syscall sandbox in thread "msghand". Please report.
85// ```
86//
87// LINUX_SYSCALLS contains two types of syscalls:
88// 1.) Syscalls that are present under all architectures or relevant Linux kernel versions for which
89// we support the syscall sandbox feature (currently only Linux x86-64). Examples include read,
90// write, open, close, etc.
91// 2.) Syscalls that are present under a subset of architectures or relevant Linux kernel versions
92// for which we support the syscall sandbox feature. This type of syscalls should be added to
93// LINUX_SYSCALLS conditional on availability like in the following example:
94// ...
95// #if defined(__NR_arch_dependent_syscall)
96// {__NR_arch_dependent_syscall, "arch_dependent_syscall"},
97// #endif // defined(__NR_arch_dependent_syscall)
98// ...
// Lookup table from Linux x86-64 syscall number to its human-readable name.
// In the code visible here it is consulted only by GetLinuxSyscallName() when
// formatting a sandbox-violation report; it is not used to decide which
// syscalls are allowed.
99const std::map<uint32_t, std::string> LINUX_SYSCALLS{
100 {__NR_accept, "accept"},
101 {__NR_accept4, "accept4"},
102 {__NR_access, "access"},
103 {__NR_acct, "acct"},
104 {__NR_add_key, "add_key"},
105 {__NR_adjtimex, "adjtimex"},
106 {__NR_afs_syscall, "afs_syscall"},
107 {__NR_alarm, "alarm"},
108 {__NR_arch_prctl, "arch_prctl"},
109 {__NR_bind, "bind"},
110 {__NR_bpf, "bpf"},
111 {__NR_brk, "brk"},
112 {__NR_capget, "capget"},
113 {__NR_capset, "capset"},
114 {__NR_chdir, "chdir"},
115 {__NR_chmod, "chmod"},
116 {__NR_chown, "chown"},
117 {__NR_chroot, "chroot"},
118 {__NR_clock_adjtime, "clock_adjtime"},
119 {__NR_clock_getres, "clock_getres"},
120 {__NR_clock_gettime, "clock_gettime"},
121 {__NR_clock_nanosleep, "clock_nanosleep"},
122 {__NR_clock_settime, "clock_settime"},
123 {__NR_clone, "clone"},
124 {__NR_clone3, "clone3"},
125 {__NR_close, "close"},
126 {__NR_connect, "connect"},
127 {__NR_copy_file_range, "copy_file_range"},
128 {__NR_creat, "creat"},
129 {__NR_create_module, "create_module"},
130 {__NR_delete_module, "delete_module"},
131 {__NR_dup, "dup"},
132 {__NR_dup2, "dup2"},
133 {__NR_dup3, "dup3"},
134 {__NR_epoll_create, "epoll_create"},
135 {__NR_epoll_create1, "epoll_create1"},
136 {__NR_epoll_ctl, "epoll_ctl"},
137 {__NR_epoll_ctl_old, "epoll_ctl_old"},
138 {__NR_epoll_pwait, "epoll_pwait"},
139 {__NR_epoll_wait, "epoll_wait"},
140 {__NR_epoll_wait_old, "epoll_wait_old"},
141 {__NR_eventfd, "eventfd"},
142 {__NR_eventfd2, "eventfd2"},
143 {__NR_execve, "execve"},
144 {__NR_execveat, "execveat"},
145 {__NR_exit, "exit"},
146 {__NR_exit_group, "exit_group"},
147 {__NR_faccessat, "faccessat"},
148 {__NR_fadvise64, "fadvise64"},
149 {__NR_fallocate, "fallocate"},
150 {__NR_fanotify_init, "fanotify_init"},
151 {__NR_fanotify_mark, "fanotify_mark"},
152 {__NR_fchdir, "fchdir"},
153 {__NR_fchmod, "fchmod"},
154 {__NR_fchmodat, "fchmodat"},
155 {__NR_fchown, "fchown"},
156 {__NR_fchownat, "fchownat"},
157 {__NR_fcntl, "fcntl"},
158 {__NR_fdatasync, "fdatasync"},
159 {__NR_fgetxattr, "fgetxattr"},
160 {__NR_finit_module, "finit_module"},
161 {__NR_flistxattr, "flistxattr"},
162 {__NR_flock, "flock"},
163 {__NR_fork, "fork"},
164 {__NR_fremovexattr, "fremovexattr"},
165 {__NR_fsetxattr, "fsetxattr"},
166 {__NR_fstat, "fstat"},
167 {__NR_fstatfs, "fstatfs"},
168 {__NR_fsync, "fsync"},
169 {__NR_ftruncate, "ftruncate"},
170 {__NR_futex, "futex"},
171 {__NR_futimesat, "futimesat"},
172 {__NR_get_kernel_syms, "get_kernel_syms"},
173 {__NR_get_mempolicy, "get_mempolicy"},
174 {__NR_get_robust_list, "get_robust_list"},
175 {__NR_get_thread_area, "get_thread_area"},
176 {__NR_getcpu, "getcpu"},
177 {__NR_getcwd, "getcwd"},
178 {__NR_getdents, "getdents"},
179 {__NR_getdents64, "getdents64"},
180 {__NR_getegid, "getegid"},
181 {__NR_geteuid, "geteuid"},
182 {__NR_getgid, "getgid"},
183 {__NR_getgroups, "getgroups"},
184 {__NR_getitimer, "getitimer"},
185 {__NR_getpeername, "getpeername"},
186 {__NR_getpgid, "getpgid"},
187 {__NR_getpgrp, "getpgrp"},
188 {__NR_getpid, "getpid"},
189 {__NR_getpmsg, "getpmsg"},
190 {__NR_getppid, "getppid"},
191 {__NR_getpriority, "getpriority"},
192 {__NR_getrandom, "getrandom"},
193 {__NR_getresgid, "getresgid"},
194 {__NR_getresuid, "getresuid"},
195 {__NR_getrlimit, "getrlimit"},
196 {__NR_getrusage, "getrusage"},
197 {__NR_getsid, "getsid"},
198 {__NR_getsockname, "getsockname"},
199 {__NR_getsockopt, "getsockopt"},
200 {__NR_gettid, "gettid"},
201 {__NR_gettimeofday, "gettimeofday"},
202 {__NR_getuid, "getuid"},
203 {__NR_getxattr, "getxattr"},
204 {__NR_init_module, "init_module"},
205 {__NR_inotify_add_watch, "inotify_add_watch"},
206 {__NR_inotify_init, "inotify_init"},
207 {__NR_inotify_init1, "inotify_init1"},
208 {__NR_inotify_rm_watch, "inotify_rm_watch"},
209 {__NR_io_cancel, "io_cancel"},
210 {__NR_io_destroy, "io_destroy"},
211 {__NR_io_getevents, "io_getevents"},
212 {__NR_io_setup, "io_setup"},
213 {__NR_io_submit, "io_submit"},
214 {__NR_ioctl, "ioctl"},
215 {__NR_ioperm, "ioperm"},
216 {__NR_iopl, "iopl"},
217 {__NR_ioprio_get, "ioprio_get"},
218 {__NR_ioprio_set, "ioprio_set"},
219 {__NR_kcmp, "kcmp"},
220 {__NR_kexec_file_load, "kexec_file_load"},
221 {__NR_kexec_load, "kexec_load"},
222 {__NR_keyctl, "keyctl"},
223 {__NR_kill, "kill"},
224 {__NR_lchown, "lchown"},
225 {__NR_lgetxattr, "lgetxattr"},
226 {__NR_link, "link"},
227 {__NR_linkat, "linkat"},
228 {__NR_listen, "listen"},
229 {__NR_listxattr, "listxattr"},
230 {__NR_llistxattr, "llistxattr"},
231 {__NR_lookup_dcookie, "lookup_dcookie"},
232 {__NR_lremovexattr, "lremovexattr"},
233 {__NR_lseek, "lseek"},
234 {__NR_lsetxattr, "lsetxattr"},
235 {__NR_lstat, "lstat"},
236 {__NR_madvise, "madvise"},
237 {__NR_mbind, "mbind"},
238 {__NR_membarrier, "membarrier"},
239 {__NR_memfd_create, "memfd_create"},
240 {__NR_migrate_pages, "migrate_pages"},
241 {__NR_mincore, "mincore"},
242 {__NR_mkdir, "mkdir"},
243 {__NR_mkdirat, "mkdirat"},
244 {__NR_mknod, "mknod"},
245 {__NR_mknodat, "mknodat"},
246 {__NR_mlock, "mlock"},
247 {__NR_mlock2, "mlock2"},
248 {__NR_mlockall, "mlockall"},
249 {__NR_mmap, "mmap"},
250 {__NR_modify_ldt, "modify_ldt"},
251 {__NR_mount, "mount"},
252 {__NR_move_pages, "move_pages"},
253 {__NR_mprotect, "mprotect"},
254 {__NR_mq_getsetattr, "mq_getsetattr"},
255 {__NR_mq_notify, "mq_notify"},
256 {__NR_mq_open, "mq_open"},
257 {__NR_mq_timedreceive, "mq_timedreceive"},
258 {__NR_mq_timedsend, "mq_timedsend"},
259 {__NR_mq_unlink, "mq_unlink"},
260 {__NR_mremap, "mremap"},
261 {__NR_msgctl, "msgctl"},
262 {__NR_msgget, "msgget"},
263 {__NR_msgrcv, "msgrcv"},
264 {__NR_msgsnd, "msgsnd"},
265 {__NR_msync, "msync"},
266 {__NR_munlock, "munlock"},
267 {__NR_munlockall, "munlockall"},
268 {__NR_munmap, "munmap"},
269 {__NR_name_to_handle_at, "name_to_handle_at"},
270 {__NR_nanosleep, "nanosleep"},
271 {__NR_newfstatat, "newfstatat"},
272 {__NR_nfsservctl, "nfsservctl"},
273 {__NR_open, "open"},
274 {__NR_open_by_handle_at, "open_by_handle_at"},
275 {__NR_openat, "openat"},
276 {__NR_pause, "pause"},
277 {__NR_perf_event_open, "perf_event_open"},
278 {__NR_personality, "personality"},
279 {__NR_pipe, "pipe"},
280 {__NR_pivot_root, "pivot_root"},
// The pkey_* entries are compiled in only when the kernel headers in use
// define the corresponding __NR_* macro.
282#ifdef __NR_pkey_alloc
283 {__NR_pkey_alloc, "pkey_alloc"},
284#endif
285#ifdef __NR_pkey_free
286 {__NR_pkey_free, "pkey_free"},
287#endif
288#ifdef __NR_pkey_mprotect
289 {__NR_pkey_mprotect, "pkey_mprotect"},
290#endif
291 {__NR_poll, "poll"},
292 {__NR_ppoll, "ppoll"},
293 {__NR_prctl, "prctl"},
294 {__NR_pread64, "pread64"},
295 {__NR_preadv, "preadv"},
// preadv2 is likewise listed only if the headers define its number.
296#ifdef __NR_preadv2
297 {__NR_preadv2, "preadv2"},
298#endif
299 {__NR_prlimit64, "prlimit64"},
300 {__NR_process_vm_readv, "process_vm_readv"},
301 {__NR_process_vm_writev, "process_vm_writev"},
302 {__NR_pselect6, "pselect6"},
303 {__NR_ptrace, "ptrace"},
304 {__NR_putpmsg, "putpmsg"},
305 {__NR_pwrite64, "pwrite64"},
306 {__NR_pwritev, "pwritev"},
// pwritev2 is likewise listed only if the headers define its number.
307#ifdef __NR_pwritev2
308 {__NR_pwritev2, "pwritev2"},
309#endif
310 {__NR__sysctl, "_sysctl"},
311 {__NR_query_module, "query_module"},
312 {__NR_quotactl, "quotactl"},
313 {__NR_read, "read"},
314 {__NR_readahead, "readahead"},
315 {__NR_readlink, "readlink"},
316 {__NR_readlinkat, "readlinkat"},
317 {__NR_readv, "readv"},
318 {__NR_reboot, "reboot"},
319 {__NR_recvfrom, "recvfrom"},
320 {__NR_recvmmsg, "recvmmsg"},
321 {__NR_recvmsg, "recvmsg"},
322 {__NR_remap_file_pages, "remap_file_pages"},
323 {__NR_removexattr, "removexattr"},
324 {__NR_rename, "rename"},
325 {__NR_renameat, "renameat"},
326 {__NR_renameat2, "renameat2"},
327 {__NR_request_key, "request_key"},
328 {__NR_restart_syscall, "restart_syscall"},
329 {__NR_rmdir, "rmdir"},
330 {__NR_rt_sigaction, "rt_sigaction"},
331 {__NR_rt_sigpending, "rt_sigpending"},
332 {__NR_rt_sigprocmask, "rt_sigprocmask"},
333 {__NR_rt_sigqueueinfo, "rt_sigqueueinfo"},
334 {__NR_rt_sigreturn, "rt_sigreturn"},
335 {__NR_rt_sigsuspend, "rt_sigsuspend"},
336 {__NR_rt_sigtimedwait, "rt_sigtimedwait"},
337 {__NR_rt_tgsigqueueinfo, "rt_tgsigqueueinfo"},
338 {__NR_sched_get_priority_max, "sched_get_priority_max"},
339 {__NR_sched_get_priority_min, "sched_get_priority_min"},
340 {__NR_sched_getaffinity, "sched_getaffinity"},
341 {__NR_sched_getattr, "sched_getattr"},
342 {__NR_sched_getparam, "sched_getparam"},
343 {__NR_sched_getscheduler, "sched_getscheduler"},
344 {__NR_sched_rr_get_interval, "sched_rr_get_interval"},
345 {__NR_sched_setaffinity, "sched_setaffinity"},
346 {__NR_sched_setattr, "sched_setattr"},
347 {__NR_sched_setparam, "sched_setparam"},
348 {__NR_sched_setscheduler, "sched_setscheduler"},
349 {__NR_sched_yield, "sched_yield"},
350 {__NR_seccomp, "seccomp"},
351 {__NR_security, "security"},
352 {__NR_select, "select"},
353 {__NR_semctl, "semctl"},
354 {__NR_semget, "semget"},
355 {__NR_semop, "semop"},
356 {__NR_semtimedop, "semtimedop"},
357 {__NR_sendfile, "sendfile"},
358 {__NR_sendmmsg, "sendmmsg"},
359 {__NR_sendmsg, "sendmsg"},
360 {__NR_sendto, "sendto"},
361 {__NR_set_mempolicy, "set_mempolicy"},
362 {__NR_set_robust_list, "set_robust_list"},
363 {__NR_set_thread_area, "set_thread_area"},
364 {__NR_set_tid_address, "set_tid_address"},
365 {__NR_setdomainname, "setdomainname"},
366 {__NR_setfsgid, "setfsgid"},
367 {__NR_setfsuid, "setfsuid"},
368 {__NR_setgid, "setgid"},
369 {__NR_setgroups, "setgroups"},
370 {__NR_sethostname, "sethostname"},
371 {__NR_setitimer, "setitimer"},
372 {__NR_setns, "setns"},
373 {__NR_setpgid, "setpgid"},
374 {__NR_setpriority, "setpriority"},
375 {__NR_setregid, "setregid"},
376 {__NR_setresgid, "setresgid"},
377 {__NR_setresuid, "setresuid"},
378 {__NR_setreuid, "setreuid"},
379 {__NR_setrlimit, "setrlimit"},
380 {__NR_setsid, "setsid"},
381 {__NR_setsockopt, "setsockopt"},
382 {__NR_settimeofday, "settimeofday"},
383 {__NR_setuid, "setuid"},
384 {__NR_setxattr, "setxattr"},
385 {__NR_shmat, "shmat"},
386 {__NR_shmctl, "shmctl"},
387 {__NR_shmdt, "shmdt"},
388 {__NR_shmget, "shmget"},
389 {__NR_shutdown, "shutdown"},
390 {__NR_sigaltstack, "sigaltstack"},
391 {__NR_signalfd, "signalfd"},
392 {__NR_signalfd4, "signalfd4"},
393 {__NR_socket, "socket"},
394 {__NR_socketpair, "socketpair"},
395 {__NR_splice, "splice"},
396 {__NR_stat, "stat"},
397 {__NR_statfs, "statfs"},
398 {__NR_statx, "statx"},
399 {__NR_swapoff, "swapoff"},
400 {__NR_swapon, "swapon"},
401 {__NR_symlink, "symlink"},
402 {__NR_symlinkat, "symlinkat"},
403 {__NR_sync, "sync"},
404 {__NR_sync_file_range, "sync_file_range"},
405 {__NR_syncfs, "syncfs"},
406 {__NR_sysfs, "sysfs"},
407 {__NR_sysinfo, "sysinfo"},
408 {__NR_syslog, "syslog"},
409 {__NR_tee, "tee"},
410 {__NR_tgkill, "tgkill"},
411 {__NR_time, "time"},
412 {__NR_timer_create, "timer_create"},
413 {__NR_timer_delete, "timer_delete"},
414 {__NR_timer_getoverrun, "timer_getoverrun"},
415 {__NR_timer_gettime, "timer_gettime"},
416 {__NR_timer_settime, "timer_settime"},
417 {__NR_timerfd_create, "timerfd_create"},
418 {__NR_timerfd_gettime, "timerfd_gettime"},
419 {__NR_timerfd_settime, "timerfd_settime"},
420 {__NR_times, "times"},
421 {__NR_tkill, "tkill"},
422 {__NR_truncate, "truncate"},
423 {__NR_tuxcall, "tuxcall"},
424 {__NR_umask, "umask"},
425 {__NR_umount2, "umount2"},
426 {__NR_uname, "uname"},
427 {__NR_unlink, "unlink"},
428 {__NR_unlinkat, "unlinkat"},
429 {__NR_unshare, "unshare"},
430 {__NR_uselib, "uselib"},
431 {__NR_userfaultfd, "userfaultfd"},
432 {__NR_ustat, "ustat"},
433 {__NR_utime, "utime"},
434 {__NR_utimensat, "utimensat"},
435 {__NR_utimes, "utimes"},
436 {__NR_vfork, "vfork"},
437 {__NR_vhangup, "vhangup"},
438 {__NR_vmsplice, "vmsplice"},
439 {__NR_vserver, "vserver"},
440 {__NR_wait4, "wait4"},
441 {__NR_waitid, "waitid"},
442 {__NR_write, "write"},
443 {__NR_writev, "writev"},
444};
445
446std::string GetLinuxSyscallName(uint32_t syscall_number)
447{
448 const auto element = LINUX_SYSCALLS.find(syscall_number);
449 if (element != LINUX_SYSCALLS.end()) {
450 return element->second;
451 }
452 return "*unknown*";
453}
454
455// See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/> for
456// an accessible introduction to using seccomp.
457//
458// This function largely follows <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c> and
459// <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>.
460//
461// Seccomp BPF resources:
462// * Seccomp BPF documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
463// * seccomp(2) manual page: <https://www.kernel.org/doc/man-pages/online/pages/man2/seccomp.2.html>
464// * Seccomp BPF demo code samples: <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/samples/seccomp>
465void SyscallSandboxDebugSignalHandler(int, siginfo_t* signal_info, void* void_signal_context)
466{
467 // The si_code field inside the siginfo_t argument that is passed to a SA_SIGINFO signal handler
468 // is a value indicating why the signal was sent.
469 //
470 // The following value can be placed in si_code for a SIGSYS signal:
471 // * SYS_SECCOMP (since Linux 3.5): Triggered by a seccomp(2) filter rule.
472 constexpr int32_t SYS_SECCOMP_SI_CODE{1};
473 assert(signal_info->si_code == SYS_SECCOMP_SI_CODE);
474
475 // The ucontext_t structure contains signal context information that was saved on the user-space
476 // stack by the kernel.
477 const ucontext_t* signal_context = static_cast<ucontext_t*>(void_signal_context);
478 assert(signal_context != nullptr);
479
480 std::set_new_handler(std::terminate);
481 // Portability note: REG_RAX is Linux x86_64 specific.
482 const uint32_t syscall_number = static_cast<uint32_t>(signal_context->uc_mcontext.gregs[REG_RAX]);
483 const std::string syscall_name = GetLinuxSyscallName(syscall_number);
484 const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*";
485 const std::string error_message = strprintf("ERROR: The syscall \"%s\" (syscall number %d) is not allowed by the syscall sandbox in thread \"%s\". Please report.", syscall_name, syscall_number, thread_name);
486 tfm::format(std::cerr, "%s\n", error_message);
487 LogPrintf("%s\n", error_message);
488 std::terminate();
489}
490
491// This function largely follows install_syscall_reporter from Kees Cook's seccomp guide:
492// <https://outflux.net/teach-seccomp/step-3/syscall-reporter.c>
493bool SetupSyscallSandboxDebugHandler()
494{
495 struct sigaction action = {};
496 sigset_t mask;
497 sigemptyset(&mask);
498 sigaddset(&mask, SIGSYS);
499 action.sa_sigaction = &SyscallSandboxDebugSignalHandler;
500 action.sa_flags = SA_SIGINFO;
501 if (sigaction(SIGSYS, &action, nullptr) < 0) {
502 return false;
503 }
504 if (sigprocmask(SIG_UNBLOCK, &mask, nullptr)) {
505 return false;
506 }
507 return true;
508}
509
/** What the seccomp filter does with a syscall that is not on the allowlist. */
enum class SyscallSandboxAction {
    /** Kill the whole process (filter returns SECCOMP_RET_KILL_PROCESS). */
    KILL_PROCESS,
    /** Raise SIGSYS so the debug handler can report the syscall (filter returns SECCOMP_RET_TRAP). */
    INVOKE_SIGNAL_HANDLER,
};
514
515class SeccompPolicyBuilder
516{
517 std::set<uint32_t> allowed_syscalls;
518
519public:
520 SeccompPolicyBuilder()
521 {
522 // Allowed by default.
523 AllowAddressSpaceAccess();
524 AllowEpoll();
525 AllowEventFd();
526 AllowFutex();
527 AllowGeneralIo();
528 AllowGetRandom();
529 AllowGetSimpleId();
530 AllowGetTime();
531 AllowGlobalProcessEnvironment();
532 AllowGlobalSystemStatus();
533 AllowKernelInternalApi();
534 AllowNetworkSocketInformation();
535 AllowOperationOnExistingFileDescriptor();
536 AllowPipe();
537 AllowPrctl();
538 AllowProcessStartOrDeath();
539 AllowScheduling();
540 AllowSignalHandling();
541 AllowSleep();
542 AllowUmask();
543 }
544
545 void AllowAddressSpaceAccess()
546 {
547 allowed_syscalls.insert(__NR_brk); // change data segment size
548 allowed_syscalls.insert(__NR_madvise); // give advice about use of memory
549 allowed_syscalls.insert(__NR_membarrier); // issue memory barriers on a set of threads
550 allowed_syscalls.insert(__NR_mincore); // check if virtual memory is in RAM
551 allowed_syscalls.insert(__NR_mlock); // lock memory
552 allowed_syscalls.insert(__NR_mmap); // map files or devices into memory
553 allowed_syscalls.insert(__NR_mprotect); // set protection on a region of memory
554 allowed_syscalls.insert(__NR_mremap); // remap a file in memory
555 allowed_syscalls.insert(__NR_munlock); // unlock memory
556 allowed_syscalls.insert(__NR_munmap); // unmap files or devices into memory
557 }
558
559 void AllowEpoll()
560 {
561 allowed_syscalls.insert(__NR_epoll_create1); // open an epoll file descriptor
562 allowed_syscalls.insert(__NR_epoll_ctl); // control interface for an epoll file descriptor
563 allowed_syscalls.insert(__NR_epoll_pwait); // wait for an I/O event on an epoll file descriptor
564 allowed_syscalls.insert(__NR_epoll_wait); // wait for an I/O event on an epoll file descriptor
565 }
566
567 void AllowEventFd()
568 {
569 allowed_syscalls.insert(__NR_eventfd2); // create a file descriptor for event notification
570 }
571
572 void AllowFileSystem()
573 {
574 allowed_syscalls.insert(__NR_access); // check user's permissions for a file
575 allowed_syscalls.insert(__NR_chdir); // change working directory
576 allowed_syscalls.insert(__NR_chmod); // change permissions of a file
577 allowed_syscalls.insert(__NR_copy_file_range); // copy a range of data from one file to another
578 allowed_syscalls.insert(__NR_fallocate); // manipulate file space
579 allowed_syscalls.insert(__NR_fchmod); // change permissions of a file
580 allowed_syscalls.insert(__NR_fchown); // change ownership of a file
581 allowed_syscalls.insert(__NR_fdatasync); // synchronize a file's in-core state with storage device
582 allowed_syscalls.insert(__NR_flock); // apply or remove an advisory lock on an open file
583 allowed_syscalls.insert(__NR_fstat); // get file status
584 allowed_syscalls.insert(__NR_newfstatat); // get file status
585 allowed_syscalls.insert(__NR_fsync); // synchronize a file's in-core state with storage device
586 allowed_syscalls.insert(__NR_ftruncate); // truncate a file to a specified length
587 allowed_syscalls.insert(__NR_getcwd); // get current working directory
588 allowed_syscalls.insert(__NR_getdents); // get directory entries
589 allowed_syscalls.insert(__NR_getdents64); // get directory entries
590 allowed_syscalls.insert(__NR_lstat); // get file status
591 allowed_syscalls.insert(__NR_mkdir); // create a directory
592 allowed_syscalls.insert(__NR_open); // open and possibly create a file
593 allowed_syscalls.insert(__NR_openat); // open and possibly create a file
594 allowed_syscalls.insert(__NR_readlink); // read value of a symbolic link
595 allowed_syscalls.insert(__NR_rename); // change the name or location of a file
596 allowed_syscalls.insert(__NR_rmdir); // delete a directory
597 allowed_syscalls.insert(__NR_stat); // get file status
598 allowed_syscalls.insert(__NR_statfs); // get filesystem statistics
599 allowed_syscalls.insert(__NR_statx); // get file status (extended)
600 allowed_syscalls.insert(__NR_unlink); // delete a name and possibly the file it refers to
601 }
602
603 void AllowFutex()
604 {
605 allowed_syscalls.insert(__NR_futex); // fast user-space locking
606 allowed_syscalls.insert(__NR_set_robust_list); // set list of robust futexes
607 }
608
609 void AllowGeneralIo()
610 {
611 allowed_syscalls.insert(__NR_ioctl); // control device
612 allowed_syscalls.insert(__NR_lseek); // reposition read/write file offset
613 allowed_syscalls.insert(__NR_poll); // wait for some event on a file descriptor
614 allowed_syscalls.insert(__NR_ppoll); // wait for some event on a file descriptor
615 allowed_syscalls.insert(__NR_pread64); // read from a file descriptor at a given offset
616 allowed_syscalls.insert(__NR_pwrite64); // write to a file descriptor at a given offset
617 allowed_syscalls.insert(__NR_read); // read from a file descriptor
618 allowed_syscalls.insert(__NR_readv); // read data into multiple buffers
619 allowed_syscalls.insert(__NR_recvfrom); // receive a message from a socket
620 allowed_syscalls.insert(__NR_recvmsg); // receive a message from a socket
621 allowed_syscalls.insert(__NR_select); // synchronous I/O multiplexing
622 allowed_syscalls.insert(__NR_sendmmsg); // send multiple messages on a socket
623 allowed_syscalls.insert(__NR_sendmsg); // send a message on a socket
624 allowed_syscalls.insert(__NR_sendto); // send a message on a socket
625 allowed_syscalls.insert(__NR_write); // write to a file descriptor
626 allowed_syscalls.insert(__NR_writev); // write data into multiple buffers
627 }
628
629 void AllowGetRandom()
630 {
631 allowed_syscalls.insert(__NR_getrandom); // obtain a series of random bytes
632 }
633
634 void AllowGetSimpleId()
635 {
636 allowed_syscalls.insert(__NR_getegid); // get group identity
637 allowed_syscalls.insert(__NR_geteuid); // get user identity
638 allowed_syscalls.insert(__NR_getgid); // get group identity
639 allowed_syscalls.insert(__NR_getpgid); // get process group
640 allowed_syscalls.insert(__NR_getpid); // get process identification
641 allowed_syscalls.insert(__NR_getppid); // get process identification
642 allowed_syscalls.insert(__NR_getresgid); // get real, effective and saved group IDs
643 allowed_syscalls.insert(__NR_getresuid); // get real, effective and saved user IDs
644 allowed_syscalls.insert(__NR_getsid); // get session ID
645 allowed_syscalls.insert(__NR_gettid); // get thread identification
646 allowed_syscalls.insert(__NR_getuid); // get user identity
647 }
648
649 void AllowGetTime()
650 {
651 allowed_syscalls.insert(__NR_clock_getres); // find the resolution (precision) of the specified clock
652 allowed_syscalls.insert(__NR_clock_gettime); // retrieve the time of the specified clock
653 allowed_syscalls.insert(__NR_gettimeofday); // get timeval
654 }
655
656 void AllowGlobalProcessEnvironment()
657 {
658 allowed_syscalls.insert(__NR_getrlimit); // get resource limits
659 allowed_syscalls.insert(__NR_getrusage); // get resource usage
660 allowed_syscalls.insert(__NR_prlimit64); // get/set resource limits
661 }
662
663 void AllowGlobalSystemStatus()
664 {
665 allowed_syscalls.insert(__NR_sysinfo); // return system information
666 allowed_syscalls.insert(__NR_uname); // get name and information about current kernel
667 }
668
669 void AllowKernelInternalApi()
670 {
671 allowed_syscalls.insert(__NR_restart_syscall); // restart a system call after interruption by a stop signal
672 }
673
674 void AllowNetwork()
675 {
676 allowed_syscalls.insert(__NR_accept); // accept a connection on a socket
677 allowed_syscalls.insert(__NR_accept4); // accept a connection on a socket
678 allowed_syscalls.insert(__NR_bind); // bind a name to a socket
679 allowed_syscalls.insert(__NR_connect); // initiate a connection on a socket
680 allowed_syscalls.insert(__NR_listen); // listen for connections on a socket
681 allowed_syscalls.insert(__NR_setsockopt); // set options on sockets
682 allowed_syscalls.insert(__NR_socket); // create an endpoint for communication
683 allowed_syscalls.insert(__NR_socketpair); // create a pair of connected sockets
684 }
685
686 void AllowNetworkSocketInformation()
687 {
688 allowed_syscalls.insert(__NR_getpeername); // get name of connected peer socket
689 allowed_syscalls.insert(__NR_getsockname); // get socket name
690 allowed_syscalls.insert(__NR_getsockopt); // get options on sockets
691 }
692
693 void AllowOperationOnExistingFileDescriptor()
694 {
695 allowed_syscalls.insert(__NR_close); // close a file descriptor
696 allowed_syscalls.insert(__NR_dup); // duplicate a file descriptor
697 allowed_syscalls.insert(__NR_dup2); // duplicate a file descriptor
698 allowed_syscalls.insert(__NR_fcntl); // manipulate file descriptor
699 allowed_syscalls.insert(__NR_shutdown); // shut down part of a full-duplex connection
700 }
701
702 void AllowPipe()
703 {
704 allowed_syscalls.insert(__NR_pipe); // create pipe
705 allowed_syscalls.insert(__NR_pipe2); // create pipe
706 }
707
708 void AllowPrctl()
709 {
710 allowed_syscalls.insert(__NR_arch_prctl); // set architecture-specific thread state
711 allowed_syscalls.insert(__NR_prctl); // operations on a process
712 }
713
714 void AllowProcessStartOrDeath()
715 {
716 allowed_syscalls.insert(__NR_clone); // create a child process
717 allowed_syscalls.insert(__NR_clone3); // create a child process
718 allowed_syscalls.insert(__NR_exit); // terminate the calling process
719 allowed_syscalls.insert(__NR_exit_group); // exit all threads in a process
720 allowed_syscalls.insert(__NR_fork); // create a child process
721 allowed_syscalls.insert(__NR_tgkill); // send a signal to a thread
722 allowed_syscalls.insert(__NR_wait4); // wait for process to change state, BSD style
723 }
724
725 void AllowScheduling()
726 {
727 allowed_syscalls.insert(__NR_sched_getaffinity); // set a thread's CPU affinity mask
728 allowed_syscalls.insert(__NR_sched_getparam); // get scheduling parameters
729 allowed_syscalls.insert(__NR_sched_getscheduler); // get scheduling policy/parameters
730 allowed_syscalls.insert(__NR_sched_setscheduler); // set scheduling policy/parameters
731 allowed_syscalls.insert(__NR_sched_yield); // yield the processor
732 }
733
734 void AllowSignalHandling()
735 {
736 allowed_syscalls.insert(__NR_rt_sigaction); // examine and change a signal action
737 allowed_syscalls.insert(__NR_rt_sigprocmask); // examine and change blocked signals
738 allowed_syscalls.insert(__NR_rt_sigreturn); // return from signal handler and cleanup stack frame
739 allowed_syscalls.insert(__NR_sigaltstack); // set and/or get signal stack context
740 }
741
742 void AllowSleep()
743 {
744 allowed_syscalls.insert(__NR_clock_nanosleep); // high-resolution sleep with specifiable clock
745 allowed_syscalls.insert(__NR_nanosleep); // high-resolution sleep
746 }
747
748 void AllowUmask()
749 {
750 allowed_syscalls.insert(__NR_umask); // set file mode creation mask
751 }
752
753 // See Linux kernel developer Kees Cook's seccomp guide at <https://outflux.net/teach-seccomp/>
754 // for an accessible introduction to using seccomp.
755 //
756 // This function largely follows <https://outflux.net/teach-seccomp/step-3/seccomp-bpf.h>.
757 std::vector<sock_filter> BuildFilter(SyscallSandboxAction default_action)
758 {
759 std::vector<sock_filter> bpf_policy;
760 // See VALIDATE_ARCHITECTURE in seccomp-bpf.h referenced above.
761 bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, arch)));
762 // Portability note: AUDIT_ARCH_X86_64 is Linux x86_64 specific.
763 bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, AUDIT_ARCH_X86_64, 1, 0));
764 bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS));
765 // See EXAMINE_SYSCALL in seccomp-bpf.h referenced above.
766 bpf_policy.push_back(BPF_STMT(BPF_LD + BPF_W + BPF_ABS, offsetof(struct seccomp_data, nr)));
767 for (const uint32_t allowed_syscall : allowed_syscalls) {
768 // See ALLOW_SYSCALL in seccomp-bpf.h referenced above.
769 bpf_policy.push_back(BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, allowed_syscall, 0, 1));
770 bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_ALLOW));
771 }
772 switch (default_action) {
773 case SyscallSandboxAction::KILL_PROCESS:
774 // Disallow syscall and kill the process.
775 //
776 // See KILL_PROCESS in seccomp-bpf.h referenced above.
777 //
778 // Note that we're using SECCOMP_RET_KILL_PROCESS (kill the process) instead
779 // of SECCOMP_RET_KILL_THREAD (kill the thread). The SECCOMP_RET_KILL_PROCESS
780 // action was introduced in Linux 4.14.
781 //
782 // SECCOMP_RET_KILL_PROCESS: Results in the entire process exiting immediately without
783 // executing the system call.
784 //
785 // SECCOMP_RET_KILL_PROCESS documentation:
786 // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
787 bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_KILL_PROCESS));
788 break;
789 case SyscallSandboxAction::INVOKE_SIGNAL_HANDLER:
790 // Disallow syscall and force a SIGSYS to trigger syscall debug reporter.
791 //
792 // SECCOMP_RET_TRAP: Results in the kernel sending a SIGSYS signal to the triggering
793 // task without executing the system call.
794 //
795 // SECCOMP_RET_TRAP documentation:
796 // <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
797 bpf_policy.push_back(BPF_STMT(BPF_RET + BPF_K, SECCOMP_RET_TRAP));
798 break;
799 }
800 return bpf_policy;
801 }
802};
803} // namespace
804
805bool SetupSyscallSandbox(bool log_syscall_violation_before_terminating)
806{
807 assert(!g_syscall_sandbox_enabled && "SetupSyscallSandbox(...) should only be called once.");
808 g_syscall_sandbox_enabled = true;
809 g_syscall_sandbox_log_violation_before_terminating = log_syscall_violation_before_terminating;
810 if (log_syscall_violation_before_terminating) {
811 if (!SetupSyscallSandboxDebugHandler()) {
812 return false;
813 }
814 }
816 return true;
817}
818
/**
 * Deliberately issue a syscall that the sandbox policy is assumed to reject.
 *
 * The result of the call is irrelevant; only the act of making the syscall
 * matters.
 */
void TestDisallowedSandboxCall()
{
    // getgroups is assumed NOT to be on the sandbox allowlist.
    std::array<gid_t, 1> group_buffer;
    gid_t* const out = group_buffer.data();
    [[maybe_unused]] const int32_t discarded = getgroups(group_buffer.size(), out);
}
825#endif // defined(USE_SYSCALL_SANDBOX)
826
828{
    // Builds a seccomp-bpf filter for the requested `syscall_policy` and
    // installs it with prctl(). The whole body compiles away when
    // USE_SYSCALL_SANDBOX is not defined, and returns early when the sandbox
    // has not been enabled.
    //
    // Throws std::runtime_error if either prctl() call below fails.
829#if defined(USE_SYSCALL_SANDBOX)
    // Nothing to install unless the sandbox was switched on (the flag is set
    // by SetupSyscallSandbox).
830 if (!g_syscall_sandbox_enabled) {
831 return;
832 }
833 SeccompPolicyBuilder seccomp_policy_builder;
    // Each policy opts in to the extra syscall groups (file system and/or
    // network) on top of whatever the freshly constructed builder allows.
    // NOTE(review): the builder's constructor is not visible here — confirm
    // what the baseline allowlist contains.
834 switch (syscall_policy) {
835 case SyscallSandboxPolicy::INITIALIZATION: // Thread: main thread (state: init)
836 // SyscallSandboxPolicy::INITIALIZATION is the first policy loaded.
837 //
838 // Subsequently loaded policies can reduce the abilities further, but
839 // abilities can never be regained.
840 //
841 // SyscallSandboxPolicy::INITIALIZATION must thus be a superset of all
842 // other policies.
843 seccomp_policy_builder.AllowFileSystem();
844 seccomp_policy_builder.AllowNetwork();
845 break;
846 case SyscallSandboxPolicy::INITIALIZATION_DNS_SEED: // Thread: dnsseed
847 seccomp_policy_builder.AllowFileSystem();
848 seccomp_policy_builder.AllowNetwork();
849 break;
    // NOTE(review): the next two statements follow a `break` with no case
    // label in this listing (the embedded numbering skips 850), so as shown
    // they are unreachable. A `case` label appears to have been lost in
    // extraction — confirm against the upstream file.
851 seccomp_policy_builder.AllowFileSystem();
852 break;
853 case SyscallSandboxPolicy::INITIALIZATION_MAP_PORT: // Thread: mapport
854 seccomp_policy_builder.AllowFileSystem();
855 seccomp_policy_builder.AllowNetwork();
856 break;
857 case SyscallSandboxPolicy::MESSAGE_HANDLER: // Thread: msghand
858 seccomp_policy_builder.AllowFileSystem();
859 break;
860 case SyscallSandboxPolicy::NET: // Thread: net
861 seccomp_policy_builder.AllowFileSystem();
862 seccomp_policy_builder.AllowNetwork();
863 break;
864 case SyscallSandboxPolicy::NET_ADD_CONNECTION: // Thread: addcon
865 seccomp_policy_builder.AllowFileSystem();
866 seccomp_policy_builder.AllowNetwork();
867 break;
868 case SyscallSandboxPolicy::NET_HTTP_SERVER: // Thread: http
869 seccomp_policy_builder.AllowFileSystem();
870 seccomp_policy_builder.AllowNetwork();
871 break;
872 case SyscallSandboxPolicy::NET_HTTP_SERVER_WORKER: // Thread: httpworker.<N>
873 seccomp_policy_builder.AllowFileSystem();
874 seccomp_policy_builder.AllowNetwork();
875 break;
876 case SyscallSandboxPolicy::NET_OPEN_CONNECTION: // Thread: opencon
877 seccomp_policy_builder.AllowFileSystem();
878 seccomp_policy_builder.AllowNetwork();
879 break;
880 case SyscallSandboxPolicy::SCHEDULER: // Thread: scheduler
881 seccomp_policy_builder.AllowFileSystem();
882 break;
883 case SyscallSandboxPolicy::TOR_CONTROL: // Thread: torcontrol
884 seccomp_policy_builder.AllowFileSystem();
885 seccomp_policy_builder.AllowNetwork();
886 break;
887 case SyscallSandboxPolicy::TX_INDEX: // Thread: txindex
888 seccomp_policy_builder.AllowFileSystem();
889 break;
    // Script-check threads add neither the file-system nor the network group.
890 case SyscallSandboxPolicy::VALIDATION_SCRIPT_CHECK: // Thread: scriptch.<N>
891 break;
892 case SyscallSandboxPolicy::SHUTOFF: // Thread: main thread (state: shutoff)
893 seccomp_policy_builder.AllowFileSystem();
894 break;
895 }
896
    // Action for any syscall not on the allowlist: raise SIGSYS so the debug
    // handler can report it when violation logging was requested, otherwise
    // kill the process outright.
897 const SyscallSandboxAction default_action = g_syscall_sandbox_log_violation_before_terminating ? SyscallSandboxAction::INVOKE_SIGNAL_HANDLER : SyscallSandboxAction::KILL_PROCESS;
898 std::vector<sock_filter> filter = seccomp_policy_builder.BuildFilter(default_action);
    // `prog` only points into `filter`'s storage, so `filter` must stay alive
    // until the prctl(PR_SET_SECCOMP, ...) call below has returned.
    // NOTE(review): the instruction count is narrowed to uint16_t; this
    // assumes the generated filter never exceeds 65535 entries — confirm.
899 const sock_fprog prog = {
900 .len = static_cast<uint16_t>(filter.size()),
901 .filter = filter.data(),
902 };
903 // Do not allow abilities to be regained after being dropped.
904 //
905 // PR_SET_NO_NEW_PRIVS documentation: <https://www.kernel.org/doc/html/latest/userspace-api/no_new_privs.html>
906 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) != 0) {
907 throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_NO_NEW_PRIVS)");
908 }
909 // Install seccomp-bpf syscall filter.
910 //
911 // PR_SET_SECCOMP documentation: <https://www.kernel.org/doc/html/latest/userspace-api/seccomp_filter.html>
912 if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog) != 0) {
913 throw std::runtime_error("Syscall sandbox enforcement failed: prctl(PR_SET_SECCOMP)");
914 }
915
    // Log which thread received the filter; threads without an internal name
    // are reported as "*unnamed*".
916 const std::string thread_name = !util::ThreadGetInternalName().empty() ? util::ThreadGetInternalName() : "*unnamed*";
917 LogPrint(BCLog::UTIL, "Syscall filter installed for thread \"%s\"\n", thread_name);
918#endif // defined(USE_SYSCALL_SANDBOX)
919}
#define LogPrint(category,...)
Definition: logging.h:191
#define LogPrintf(...)
Definition: logging.h:187
@ UTIL
Definition: logging.h:63
void format(std::ostream &out, const char *fmt, const Args &... args)
Format list of arguments to the stream according to given format string.
Definition: tinyformat.h:1062
const std::string & ThreadGetInternalName()
Get the thread's internal (in-memory) name; used e.g.
Definition: threadnames.cpp:53
void SetSyscallSandboxPolicy(SyscallSandboxPolicy syscall_policy)
Force the current thread (and threads created from the current thread) into a restricted-service oper...
SyscallSandboxPolicy
#define strprintf
Format arguments and return the string or write to given std::ostream (see tinyformat::format doc for...
Definition: tinyformat.h:1164
assert(!tx.IsCoinBase())