//
// Syd: rock-solid application kernel
// src/config.rs: Static configuration, edit & recompile!
//
// Copyright (c) 2023, 2024, 2025, 2026 Ali Polatel <alip@chesswob.org>
// Based in part upon HardenedBSD's sys/hardenedbsd/hbsd_pax_segvguard.c which is:
//   Copyright (c) 2006 Elad Efrat <elad@NetBSD.org>
//   Copyright (c) 2013-2017, by Oliver Pinter <oliver.pinter@hardenedbsd.org>
//   Copyright (c) 2014, by Shawn Webb <shawn.webb@hardenedbsd.org>
//   Copyright (c) 2014, by Danilo Egea Gondolfo <danilo at FreeBSD.org>
//   All rights reserved.
//   SPDX-License-Identifier: BSD-3-Clause
// Based in part upon gVisor's kvm_const*.go which is:
//   Copyright 2018 The gVisor Authors.
//   SPDX-License-Identifier: Apache-2.0
// Based in part upon kvm-ioctls' kvm-ioctls.rs which is:
//   Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
//   SPDX-License-Identifier: Apache-2.0 OR MIT
//
// SPDX-License-Identifier: GPL-3.0

use std::{
    os::{
        fd::{AsRawFd, BorrowedFd, IntoRawFd, RawFd},
        unix::ffi::OsStrExt,
    },
    sync::{LazyLock, OnceLock},
};

use btoi::btoi;
use dur::Duration;
use memchr::memchr;
use nix::{
    errno::Errno,
    fcntl::OFlag,
    sys::{resource::rlim_t, stat::Mode, utsname::uname},
    unistd::{sysconf, Gid, SysconfVar, Uid},
};

use crate::{
    compat::{
        fs_type_t, openat2, MFdFlags, OpenHow, Persona, ResolveFlag, STATX_MNT_ID,
        STATX_MNT_ID_UNIQUE, STATX_MODE,
    },
    confine::check_cross_memory_attach,
    fd::{close, is_dev_null, is_proc, AT_BADFD},
    fstatx, info,
    proc::proc_mmap_min_addr,
    rng::{duprand, randint},
    sandbox::Version,
    sysinfo::RandTimer,
};

// Convenience type to list {io,pr}ctls with their names.
// Each entry pairs a human-readable name with its numeric value.
type KeyValue<'a> = (&'a str, u64);

/// Path to /etc.
pub const PATH_ETC: &[u8] = b"/etc";

/// The default shell to execute.
///
/// Change this if your system doesn't have /bin/sh,
/// or set the environment variable SYD_SHELL.
pub const SYD_SH: &str = "/bin/sh";

/// The contents of the file `esyd.sh`, embedded at compile time.
pub const ESYD_SH: &str = include_str!("esyd.sh");

/// The contents of the file `syd.el`, embedded at compile time.
pub const SYD_EL: &str = include_str!("syd.el");

// Names of the general-purpose environment variables recognised by Syd.

/// The environment variable to override the host Linux kernel version.
pub const ENV_ASSUME_KERNEL: &str = "SYD_ASSUME_KERNEL";
/// The environment variable to read the default shell from.
pub const ENV_SH: &str = "SYD_SHELL";
/// The environment variable to read the log level from.
pub const ENV_LOG: &str = "SYD_LOG";
/// The environment variable to set in order to log to a file descriptor
/// other than standard error.
pub const ENV_LOG_FD: &str = "SYD_LOG_FD";
/// The environment variable to read the syslog(2) capacity from.
pub const ENV_LOG_BUF_LEN: &str = "SYD_LOG_BUF_LEN";
/// The environment variable to set the number of core syscall handler threads.
pub const ENV_NPROC: &str = "SYD_NPROC";
/// The environment variable to set the maximum number of syscall handler threads.
pub const ENV_NPROC_MAX: &str = "SYD_NPROC_MAX";
/// The environment variable to read the pid filename from.
pub const ENV_PID_FN: &str = "SYD_PID_FN";
/// The environment variable to force RESOLVE_NO_SYMLINKS at open(2) boundary.
pub const ENV_FORCE_NO_SYMLINKS: &str = "SYD_FORCE_NO_SYMLINKS";
/// The environment variable to force RESOLVE_NO_MAGICLINKS at open(2) boundary.
pub const ENV_FORCE_NO_MAGICLINKS: &str = "SYD_FORCE_NO_MAGICLINKS";
/// The environment variable to force RESOLVE_NO_XDEV at open(2) boundary.
pub const ENV_FORCE_NO_XDEV: &str = "SYD_FORCE_NO_XDEV";
/// The environment variable to force O_CLOEXEC at open(2) boundary.
pub const ENV_FORCE_CLOEXEC: &str = "SYD_FORCE_CLOEXEC";
/// The environment variable to force randomized fds.
///
/// Setting this environment variable is equivalent to setting trace/force_rand_fd:1.
pub const ENV_FORCE_RAND_FD: &str = "SYD_FORCE_RAND_FD";
/// The environment variable to reject open(2) calls that create or write.
pub const ENV_FORCE_RO_OPEN: &str = "SYD_FORCE_RO_OPEN";
/// The environment variable to force local networking.
pub const ENV_FORCE_LOCAL_NET: &str = "SYD_FORCE_LOCAL_NET";
/// The environment variable to force TTY output.
pub const ENV_FORCE_TTY: &str = "SYD_FORCE_TTY";
/// The environment variable to quiet TTY output.
pub const ENV_QUIET_TTY: &str = "SYD_QUIET_TTY";

/// The environment variable to set the default value for rlimit/as.
pub const ENV_RLIMIT_AS: &str = "SYD_RLIMIT_AS";
/// The environment variable to set the default value for rlimit/core.
pub const ENV_RLIMIT_CORE: &str = "SYD_RLIMIT_CORE";
/// The environment variable to set the default value for rlimit/data.
pub const ENV_RLIMIT_DATA: &str = "SYD_RLIMIT_DATA";
/// The environment variable to set the default value for rlimit/fsize.
pub const ENV_RLIMIT_FSIZE: &str = "SYD_RLIMIT_FSIZE";
/// The environment variable to set the default value for rlimit/memlock.
pub const ENV_RLIMIT_MEMLOCK: &str = "SYD_RLIMIT_MEMLOCK";
/// The environment variable to set the default value for rlimit/nice.
pub const ENV_RLIMIT_NICE: &str = "SYD_RLIMIT_NICE";
/// The environment variable to set the default value for rlimit/nofile.
pub const ENV_RLIMIT_NOFILE: &str = "SYD_RLIMIT_NOFILE";
/// The environment variable to set the default value for rlimit/nproc.
pub const ENV_RLIMIT_NPROC: &str = "SYD_RLIMIT_NPROC";
/// The environment variable to set the default value for rlimit/rtprio.
pub const ENV_RLIMIT_RTPRIO: &str = "SYD_RLIMIT_RTPRIO";
/// The environment variable to set the default value for rlimit/rttime.
pub const ENV_RLIMIT_RTTIME: &str = "SYD_RLIMIT_RTTIME";
/// The environment variable to set the default value for rlimit/sigpending.
pub const ENV_RLIMIT_SIGPENDING: &str = "SYD_RLIMIT_SIGPENDING";
/// The environment variable to set the default value for rlimit/stack.
pub const ENV_RLIMIT_STACK: &str = "SYD_RLIMIT_STACK";
/// The environment variable to set the default value for timeout.
pub const ENV_TMOUT: &str = "SYD_TMOUT";

/// The environment variable to hold the PTY child fd (used internally).
pub const ENV_PTY_FD: &str = "SYD_PTY_FD";

/// The environment variable to set the IPC socket.
pub const ENV_IPC: &str = "SYD_IPC";

/// The environment variable to hold the IPC epoll fd (used internally).
pub const ENV_IPC_POLL_FD: &str = "SYD_IPC_POLL_FD";
/// The environment variable to hold the IPC UNIX socket (used internally).
pub const ENV_IPC_UNIX_FD: &str = "SYD_IPC_UNIX_FD";

/// The environment variable to read the default external address from.
pub const ENV_PROXY_HOST: &str = "SYD_PROXY_HOST";
/// The environment variable to read the default external port from.
pub const ENV_PROXY_PORT: &str = "SYD_PROXY_PORT";
/// The environment variable to read the default external unix socket from.
pub const ENV_PROXY_UNIX: &str = "SYD_PROXY_UNIX";

/// The environment variable to set in order to disable cross memory
/// attach and fall back to /proc/pid/mem.
pub const ENV_NO_CROSS_MEMORY_ATTACH: &str = "SYD_NO_CROSS_MEMORY_ATTACH";

/// The environment variable to set in order to fall back to
/// /proc/pid/mem if process_vm_{read,write}v(2) is not available
/// (i.e. returns ENOSYS).
///
/// The environment variable `ENV_NO_CROSS_MEMORY_ATTACH` has precedence
/// over this variable.
pub const ENV_PROC_PID_MEM_FALLBACK: &str = "SYD_PROC_PID_MEM_FALLBACK";

/// The environment variable to dump seccomp filters.
pub const ENV_DUMP_SCMP: &str = "SYD_DUMP_SCMP";

/// The environment variable to skip seccomp confinement of Syd threads.
pub const ENV_SKIP_SCMP: &str = "SYD_SKIP_SCMP";

/// The environment variable to set for quick boot.
///
/// This makes Syd startup noticeably faster,
/// however it removes a layer of defense against some container breaks.
/// Use this if you frequently re-execute syd{,-oci} such as we do on
/// Exherbo during Paludis' generate metadata phase.
pub const ENV_QUICK_BOOT: &str = "SYD_QUICK_BOOT";

/// The environment variable to save AT_RANDOM bytes at boot.
///
/// This is only intended for internal use and the user MUST NOT set it.
pub const ENV_RAND: &str = "SYD_RAND";

/// The environment variable to save Syd sandbox random ID.
///
/// If this variable is not set by the user,
/// Syd generates it by hashing AT_RANDOM bytes with SHA3-512.
pub const ENV_ID: &str = "SYD_ID";

/// The environment variable to make syd-oci skip configuration parsing.
///
/// This is primarily intended for testing.
pub const ENV_OCI_NO_CONFIG: &str = "SYD_OCI_NO_CONFIG";

/// The environment variable to read the parent-death signal from.
pub const ENV_PDS: &str = "SYD_PDS";

/// Lower bound for SafeSetID UID transitions.
///
/// SafeSetID does not allow UID transitions for UIDs less than or equal to UID_MIN.
/// 11 is typically the `operator` user.
pub const UID_MIN: Uid = Uid::from_raw(11);

/// Lower bound for SafeSetID GID transitions.
///
/// SafeSetID does not allow GID transitions for GIDs less than or equal to GID_MIN.
/// 14 is typically the `uucp` group.
pub const GID_MIN: Gid = Gid::from_raw(14);

/// Path prefix for magic stat commands.
pub const MAGIC_PREFIX: &[u8] = b"/dev/syd";

/// Syd version string.
///
/// Resolves to the compile-time value of `SYD_GIT_HEAD` when that is
/// non-empty, and to `CARGO_PKG_VERSION` otherwise.
pub static VERSION: LazyLock<&'static str> = LazyLock::new(|| match env!("SYD_GIT_HEAD") {
    "" => env!("CARGO_PKG_VERSION"),
    git_head => git_head,
});

/// Api version of the syd(2) configuration.
pub const API_VERSION: Version = Version::new(3, 1);

/// Api major version of the syd(2) configuration.
///
/// Taken from `CARGO_PKG_VERSION_MAJOR` at compile time.
pub const API_MAJOR_VERSION: &str = env!("CARGO_PKG_VERSION_MAJOR");

/// Api minor version of the syd(2) configuration.
pub const API_MINOR_VERSION: &str = "1";

/// Api minor version for the IPC service.
pub const IPC_MINOR_VERSION: &str = "1";

/// File format marker for Crypt sandboxing.
///
/// The marker is the byte 0x7F, the ASCII letters `SYD`, and a final
/// byte 3 (presumably a format version; verify against the Crypt code).
pub const CRYPT_MAGIC: &[u8] = &[0x7F, b'S', b'Y', b'D', 3];

/// Path to the file which will be used as file status for magic stat commands.
pub const MAGIC_FILE: &[u8] = b"/dev/null";

/// Limit on the maximum number of path components for path canonicalization.
pub const PATH_MAX_COMP: usize = 1024;

/// Default buffer size, in bytes, for directory entries.
///
/// We use the same default as musl, see:
/// https://git.musl-libc.org/cgit/musl/tree/src/dirent/__dirent.h#n10
/// GNU libc uses 32k:
/// https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/unix/sysv/linux/opendir.c;h=48e3dc4fdbdf219c9f354f9c8bc30108ec7046c4;hb=84977600dace5a7cfcb0918e6757939fd4969839#l106
pub const DIRENT_BUF_SIZE: usize = 2048;

/// System page size in bytes.
///
/// Queried once via sysconf(3); falls back to 4096 when the query
/// fails or yields no value.
#[expect(clippy::cast_sign_loss)]
pub static PAGE_SIZE: LazyLock<u64> = LazyLock::new(|| {
    let size = match sysconf(SysconfVar::PAGE_SIZE) {
        Ok(Some(size)) => size,
        _ => 4096,
    };
    size as u64
});

/// System MAX_RW_COUNT calculated based on PAGE_SIZE.
///
/// This is `i32::MAX` rounded down to a multiple of the page size.
#[expect(clippy::cast_possible_truncation)]
pub static MAX_RW_COUNT: LazyLock<usize> = LazyLock::new(|| {
    let int_max = i32::MAX as usize;
    let page_size = *PAGE_SIZE as usize;
    // Clearing the low bits rounds down to a page boundary.
    int_max & !(page_size - 1)
});

/// Maximum size of one zero-copy operation for AF_ALG.
///
/// The system-inherent limit for the size of one zero-copy operation
/// is 16 pages. If more data is to be sent to AF_ALG, user space
/// must slice the input into segments with a maximum size of 16
/// pages.
///
/// Computed as 16 times `libc::PIPE_BUF`.
pub const KCAPI_SIZE: usize = 16 * libc::PIPE_BUF;

/// Pipe buffer size.
///
/// Before Linux 2.6.11, the capacity of a pipe was the same as the
/// system page size (e.g., 4096 bytes on i386). Since Linux 2.6.11,
/// the pipe capacity is 16 pages (i.e., 65,536 bytes in a system
/// with a page size of 4096 bytes). Since Linux 2.6.35, the default
/// pipe capacity is 16 pages, but the capacity can be queried and set
/// using the fcntl(2) F_GETPIPE_SZ and F_SETPIPE_SZ operations.
///
/// Note this is 16 times `libc::PIPE_BUF`, not `libc::PIPE_BUF` itself.
pub const PIPE_BUF: usize = 16 * libc::PIPE_BUF;

/// A version of `PIPE_BUF` that is safe to use with `AF_ALG` sockets.
///
/// This is slightly smaller than `PIPE_BUF` (15 instead of 16 times
/// `libc::PIPE_BUF`) to leave the kernel some space.
pub const PIPE_BUF_ALG: usize = 15 * libc::PIPE_BUF;

/// Number of CPUs on the system.
///
/// Initialized on first access with the value of `num_cpus::get`.
pub static NPROC: LazyLock<usize> = LazyLock::new(num_cpus::get);

/// Maximum number of symlinks in any one path lookup.
///
/// The default is the same as the Linux kernel.
/// See: https://docs.kernel.org/filesystems/path-lookup.html
pub const MAXSYMLINKS: u8 = 40;

/// The size of the syscall handler thread pool.
///
/// The pool grows and shrinks on its own; this value is the number of
/// core threads which stay alive and are never reaped, even when idle.
///
/// Read from the environment variable `SYD_NPROC` when set, otherwise
/// defaults to `NPROC`.
///
/// # Panics
///
/// Panics on first access if `SYD_NPROC` is not a valid unsigned
/// integer or is less than 1.
#[expect(clippy::disallowed_methods)]
pub static EMU_POOL_SIZE: LazyLock<usize> = LazyLock::new(|| {
    let size = match std::env::var_os(ENV_NPROC) {
        Some(val) => btoi::<usize>(val.as_bytes()).expect("SYD_NPROC"),
        None => *NPROC,
    };
    assert!(size >= 1, "SYD_NPROC must be at least 1!");
    size
});

/// The absolute maximum number of workers.
///
/// This corresponds to the maximum value that can be stored within
/// half the bits of u64, as two counters (total workers and busy
/// workers) are stored in one AtomicU64.
///
/// Read from the environment variable `SYD_NPROC_MAX` when set,
/// otherwise defaults to `usize::MAX`.
// NOTE(review): the default is `usize::MAX`, which on 64-bit targets
// exceeds half the bits of u64 -- confirm against the worker pool code.
///
/// # Panics
///
/// Panics on first access if `SYD_NPROC_MAX` is not a valid unsigned
/// integer or is not strictly greater than `EMU_POOL_SIZE`.
#[expect(clippy::disallowed_methods)]
pub static EMU_MAX_SIZE: LazyLock<usize> = LazyLock::new(|| {
    let limit = match std::env::var_os(ENV_NPROC_MAX) {
        Some(val) => btoi::<usize>(val.as_bytes()).expect("SYD_NPROC_MAX"),
        None => usize::MAX,
    };
    assert!(
        limit > *EMU_POOL_SIZE,
        "SYD_NPROC_MAX must be greater than SYD_NPROC!"
    );
    limit
});

/// Specify the duration in milliseconds for which additional threads
/// outside the core pool remain alive while not receiving any work
/// before giving up and terminating.
///
/// Defaults to 3 seconds, i.e. 3000 milliseconds.
pub const EMU_KEEP_ALIVE: u16 = 3000;

/// Specify the graceful wait time for the monitor thread.
///
/// Defaults to 250 milliseconds.
pub const MON_GRACE_TIME: Duration = Duration::from_millis(250);

/// Specify the cycle period of the monitor thread.
///
/// Defaults to 25 milliseconds.
pub const MON_CYCLE_TIME: Duration = Duration::from_millis(25);

/// Specify the cycle period of the interrupt thread.
///
/// Defaults to 25 milliseconds.
pub const INT_CYCLE_TIME: Duration = Duration::from_millis(25);

/// Stack size, in bytes, for the syscall handler threads.
///
/// Defaults to 2M.
pub const EMU_STACK_SIZE: usize = 2 << 20;

/// Stack size, in bytes, for the syscall monitor thread.
///
/// Defaults to 192k.
pub const MON_STACK_SIZE: usize = 192 << 10;

/// Stack size, in bytes, for the syscall interrupt thread.
///
/// Defaults to 128k.
pub const INT_STACK_SIZE: usize = 128 << 10;

/// Stack size, in bytes, for the syscall timeouter thread.
///
/// Defaults to 64k.
pub const OUT_STACK_SIZE: usize = 64 << 10;

/// Stack size, in bytes, for the IPC thread.
///
/// Defaults to 256k.
pub const IPC_STACK_SIZE: usize = 256 << 10;

/// Stack size, in bytes, for the AES encryption threads.
///
/// Defaults to 96k.
pub const AES_STACK_SIZE: usize = 96 << 10;

/// Stack size, in bytes, for the short-lived micro-threads Syd spawns
/// during system call emulation.
///
/// Defaults to 8k.
pub const MINI_STACK_SIZE: usize = 8 << 10;

/// Stack size, in bytes, for the main thread, this is asserted with RLIMIT_STACK.
///
/// Defaults to 2M.
pub const MAIN_STACK_SIZE: rlim_t = 2 * 1024 * 1024;

/// File size resource limit, in bytes, for the main thread, this is
/// asserted with RLIMIT_FSIZE.
///
/// This is off when Crypt sandboxing is on, otherwise the biggest file Syd writes
/// is proc_pid_status(5) which is ~1.6k.
///
/// Defaults to 2k.
pub const MAIN_RLIMIT_FSIZE: rlim_t = 2 * 1024;

// Syslog(2) defaults

// Per-architecture shift used to size the static syslog(2) buffer.
#[cfg(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "powerpc",
    target_arch = "powerpc64",
))]
const SYSLOG_CAPSHIFT: usize = 18;
#[cfg(target_arch = "s390x")]
const SYSLOG_CAPSHIFT: usize = 17;
#[cfg(target_arch = "arm")]
const SYSLOG_CAPSHIFT: usize = 13;
// aarch64, riscv64 and every architecture not listed above.
#[cfg(not(any(
    target_arch = "x86_64",
    target_arch = "x86",
    target_arch = "powerpc",
    target_arch = "powerpc64",
    target_arch = "s390x",
    target_arch = "arm",
)))]
const SYSLOG_CAPSHIFT: usize = 14;

/// Default static capacity, in bytes, for syslog(2) stack mode.
///
/// Stack mode is the default unless a capacity has been specified at
/// startup using the environment variable `SYD_LOG_BUF_LEN`.
///
/// Quoting syslog(2):
/// In early kernels, LOG_BUF_LEN had the value 4096; from Linux 1.3.54,
/// it was 8192; from Linux 2.1.113, it was 16384; since Linux
/// 2.4.23/2.6, the value is a kernel configuration option
/// (CONFIG_LOG_BUF_SHIFT, default value dependent on the architecture).
/// Since Linux 2.6.6, the size can be queried with command type 10 (see
/// below).
pub const SYSLOG_STACK_SIZE: usize = 1usize << SYSLOG_CAPSHIFT;

/// The ring buffer capacity, in bytes, for Syslog.
///
/// Note this buffer is allocated on the heap.
/// Defaults to 2M.
pub const SYSLOG_CAPACITY: usize = 2 << 20;

// SegvGuard Defaults

/// SegvGuard entry expiry timeout (2 minutes).
pub const SEGVGUARD_EXPIRY: Duration = Duration::from_secs(2 * 60);
/// SegvGuard entry suspension timeout (10 minutes).
pub const SEGVGUARD_SUSPENSION: Duration = Duration::from_secs(10 * 60);
/// SegvGuard max number of crashes before expiry.
pub const SEGVGUARD_MAXCRASHES: u8 = 5;

// Sandbox Restrictions

/// List of allowlisted personality(2) flags.
///
/// Each entry pairs the persona name with its numeric value.
///
/// This is consistent with podman(1) and docker(1), whose default
/// seccomp profiles allow the values 0x0, 0x8, 0x20000, 0x20008 and
/// 0xffffffff.
pub const SAFE_PERSONAS: &[KeyValue] = &[
    ("PER_LINUX", 0x0000),
    // PER_LINUX32 is 0x0008; 0x0800000 is ADDR_LIMIT_32BIT (PER_LINUX_32BIT).
    ("PER_LINUX32", 0x0008),
    ("UNAME26", 0x0020000),
    ("PER_LINUX32|UNAME26", 0x0008 | 0x0020000),
    ("GET_PERSONALITY", 0xffffffff),
];

/// Unsafe personality(2) flags.
///
/// This is the bitwise OR of `READ_IMPLIES_EXEC`, `ADDR_NO_RANDOMIZE`,
/// `ADDR_COMPAT_LAYOUT` and `MMAP_PAGE_ZERO`.
pub const UNSAFE_PERSONA: Persona = Persona::from_bits_retain(
    Persona::READ_IMPLIES_EXEC.bits()
        | Persona::ADDR_NO_RANDOMIZE.bits()
        | Persona::ADDR_COMPAT_LAYOUT.bits()
        | Persona::MMAP_PAGE_ZERO.bits(),
);

/// List of allowlisted madvise(2) advice.
///
/// Each entry pairs the advice name with its numeric value. Values
/// that are not taken from libc are spelled numerically.
pub const ALLOW_MADVISE: &[KeyValue] = &[
    ("MADV_NORMAL", libc::MADV_NORMAL as u64),
    ("MADV_SEQUENTIAL", libc::MADV_SEQUENTIAL as u64),
    ("MADV_DONTNEED", libc::MADV_DONTNEED as u64),
    ("MADV_REMOVE", libc::MADV_REMOVE as u64),
    ("MADV_HUGEPAGE", libc::MADV_HUGEPAGE as u64),
    ("MADV_NOHUGEPAGE", libc::MADV_NOHUGEPAGE as u64),
    ("MADV_DONTDUMP", libc::MADV_DONTDUMP as u64),
    // Libc does not define MADV_COLLAPSE for musl.
    ("MADV_COLLAPSE", 25),
    ("MADV_POPULATE_READ", libc::MADV_POPULATE_READ as u64),
    ("MADV_POPULATE_WRITE", libc::MADV_POPULATE_WRITE as u64),
    // Libc does not yet define MADV_GUARD_{INSTALL,REMOVE}.
    ("MADV_GUARD_INSTALL", 102),
    ("MADV_GUARD_REMOVE", 103),
];

/// List of no-opped madvise(2) advice.
///
/// Entries are the numeric advice values.
pub const NOOP_MADVISE: &[u64] = &[
    libc::MADV_RANDOM as u64,
    libc::MADV_WILLNEED as u64,
    libc::MADV_DONTFORK as u64,
    libc::MADV_DOFORK as u64,
    libc::MADV_MERGEABLE as u64,
    libc::MADV_UNMERGEABLE as u64,
    libc::MADV_SOFT_OFFLINE as u64,
    libc::MADV_DODUMP as u64,
    libc::MADV_FREE as u64,
    libc::MADV_WIPEONFORK as u64,
    libc::MADV_KEEPONFORK as u64,
    libc::MADV_COLD as u64,
    libc::MADV_PAGEOUT as u64,
];

/// List of denied madvise(2) advice.
///
/// Currently only `MADV_HWPOISON`.
pub const DENY_MADVISE: &[u64] = &[libc::MADV_HWPOISON as u64];

/// List of allowed madvise(2) advice for Syd threads.
///
/// Entries are the numeric advice values.
pub const SYD_MADVISE: &[u64] = &[
    libc::MADV_DONTNEED as u64,
    libc::MADV_FREE as u64,
    libc::MADV_HUGEPAGE as u64,
    // Libc does not yet define MADV_GUARD_{INSTALL,REMOVE}.
    102, // MADV_GUARD_INSTALL
    103, // MADV_GUARD_REMOVE
];

/// List of allowlisted prctl(2) requests.
///
/// Each entry pairs the request name, as spelled in linux/prctl.h,
/// with its numeric value. Trailing comments note requests that are
/// subject to additional options.
pub const ALLOW_PRCTL: &[KeyValue] = &[
    ("PR_SET_PDEATHSIG", 1),
    ("PR_GET_PDEATHSIG", 2),
    ("PR_GET_DUMPABLE", 3),
    ("PR_SET_DUMPABLE", 4),
    ("PR_GET_UNALIGN", 5),
    ("PR_SET_UNALIGN", 6),
    ("PR_GET_KEEPCAPS", 7),
    ("PR_SET_KEEPCAPS", 8),
    ("PR_GET_FPEMU", 9),
    ("PR_SET_FPEMU", 10),
    ("PR_GET_FPEXC", 11),
    ("PR_SET_FPEXC", 12),
    ("PR_GET_TIMING", 13),
    ("PR_SET_TIMING", 14),
    ("PR_SET_NAME", 15),
    ("PR_GET_NAME", 16),
    ("PR_GET_ENDIAN", 19),
    ("PR_SET_ENDIAN", 20),
    ("PR_GET_SECCOMP", 21), // Disallowed unless trace/allow_unsafe_cbpf:1
    ("PR_SET_SECCOMP", 22), // Disallowed unless trace/allow_unsafe_cbpf:1
    ("PR_CAPBSET_READ", 23),
    ("PR_CAPBSET_DROP", 24),
    ("PR_GET_TSC", 25),
    ("PR_SET_TSC", 26),
    ("PR_GET_SECUREBITS", 27),
    ("PR_SET_SECUREBITS", 28),
    ("PR_SET_TIMERSLACK", 29),
    ("PR_GET_TIMERSLACK", 30),
    ("PR_TASK_PERF_EVENTS_DISABLE", 31), // Disallowed unless trace/allow_unsafe_perf:1
    ("PR_TASK_PERF_EVENTS_ENABLE", 32),  // Disallowed unless trace/allow_unsafe_perf:1
    ("PR_MCE_KILL", 33),
    ("PR_MCE_KILL_GET", 34),
    // SAFETY: no self modifying executables! PR_SET_MM
    ("PR_SET_PTRACER", 0x59616d61), // Made no-op to prevent ptrace detection.
    ("PR_SET_CHILD_SUBREAPER", 36),
    ("PR_GET_CHILD_SUBREAPER", 37),
    ("PR_SET_NO_NEW_PRIVS", 38),
    ("PR_GET_NO_NEW_PRIVS", 39),
    ("PR_GET_TID_ADDRESS", 40),
    ("PR_SET_THP_DISABLE", 41),
    ("PR_GET_THP_DISABLE", 42),
    // SAFETY: unimplemented! PR_MPX_ENABLE_MANAGEMENT
    // SAFETY: unimplemented! PR_MPX_DISABLE_MANAGEMENT
    ("PR_SET_FP_MODE", 45),
    ("PR_GET_FP_MODE", 46),
    ("PR_CAP_AMBIENT", 47),
    ("PR_SVE_SET_VL", 50),
    ("PR_SVE_GET_VL", 51),
    ("PR_GET_SPECULATION_CTRL", 52), // Disallowed unless trace/allow_unsafe_exec_speculative:1
    ("PR_SET_SPECULATION_CTRL", 53), // Disallowed unless trace/allow_unsafe_exec_speculative:1
    ("PR_PAC_RESET_KEYS", 54),
    ("PR_SET_TAGGED_ADDR_CTRL", 55),
    ("PR_GET_TAGGED_ADDR_CTRL", 56),
    ("PR_SET_IO_FLUSHER", 57),
    ("PR_GET_IO_FLUSHER", 58),
    ("PR_SET_SYSCALL_USER_DISPATCH", 59),
    ("PR_PAC_SET_ENABLED_KEYS", 60),
    ("PR_PAC_GET_ENABLED_KEYS", 61),
    ("PR_SCHED_CORE", 62),
    ("PR_SET_MDWE", 65),
    ("PR_GET_MDWE", 66),
    ("PR_SET_VMA", 0x53564d41),
];

/// List of denied setsockopt(2) options.
///
/// Each entry is a `(level, optname)` pair. Options spelled numerically
/// carry their kernel name in a comment.
pub const DENY_SETSOCKOPT: &[(i32, i32)] = &[
    // SOL_SOCKET
    (libc::SOL_SOCKET, libc::SO_DEBUG),
    (libc::SOL_SOCKET, libc::SO_SNDBUFFORCE),
    (libc::SOL_SOCKET, libc::SO_RCVBUFFORCE),
    // SO_BINDTODEVICE & SO_BINDTOIFINDEX: Allows binding execution to a
    // specific network interface, potentially bypassing network
    // namespaces or restrictions.
    (libc::SOL_SOCKET, 25), // SO_BINDTODEVICE
    (libc::SOL_SOCKET, 62), // SO_BINDTOIFINDEX
    // SO_ATTACH_FILTER & SO_ATTACH_BPF: Allows attaching BPF filters.
    // While many BPF maps are restricted, attaching filters to sockets is a
    // significant attack surface for kernel exploit primitives (JIT spraying,
    // etc.).
    (libc::SOL_SOCKET, 26), // SO_ATTACH_FILTER
    (libc::SOL_SOCKET, 27), // SO_DETACH_FILTER, SO_DETACH_BPF
    (libc::SOL_SOCKET, 50), // SO_ATTACH_BPF
    (libc::SOL_SOCKET, 51), // SO_ATTACH_REUSEPORT_CBPF
    (libc::SOL_SOCKET, 52), // SO_ATTACH_REUSEPORT_EBPF
    (libc::SOL_SOCKET, 68), // SO_DETACH_REUSEPORT_BPF
    // IPv4: iptables / arptables and multicast filters
    (libc::IPPROTO_IP, 64), // IPT_SO_SET_REPLACE
    (libc::IPPROTO_IP, 65), // IPT_SO_SET_ADD_COUNTERS
    (libc::IPPROTO_IP, 96), // ARPT_SO_SET_REPLACE
    (libc::IPPROTO_IP, 97), // ARPT_SO_SET_ADD_COUNTERS
    (libc::IPPROTO_IP, 41), // IP_MSFILTER
    (libc::IPPROTO_IP, 48), // MCAST_MSFILTER
    // IPv4 multicast group membership
    (libc::IPPROTO_IP, libc::IP_ADD_MEMBERSHIP),
    (libc::IPPROTO_IP, libc::IP_DROP_MEMBERSHIP),
    (libc::IPPROTO_IP, libc::IP_ADD_SOURCE_MEMBERSHIP),
    (libc::IPPROTO_IP, libc::IP_DROP_SOURCE_MEMBERSHIP),
    (libc::IPPROTO_IP, libc::IP_BLOCK_SOURCE),
    (libc::IPPROTO_IP, libc::IP_UNBLOCK_SOURCE),
    // Protocol-independent multicast API (v4/v6) - advanced membership
    (libc::IPPROTO_IP, libc::MCAST_JOIN_GROUP),
    (libc::IPPROTO_IP, libc::MCAST_LEAVE_GROUP),
    (libc::IPPROTO_IP, libc::MCAST_JOIN_SOURCE_GROUP),
    (libc::IPPROTO_IP, libc::MCAST_LEAVE_SOURCE_GROUP),
    (libc::IPPROTO_IP, libc::MCAST_BLOCK_SOURCE),
    (libc::IPPROTO_IP, libc::MCAST_UNBLOCK_SOURCE),
    // IPv4 multicast routing (mroute)
    (libc::IPPROTO_IP, 200), // MRT_INIT
    (libc::IPPROTO_IP, 201), // MRT_DONE
    (libc::IPPROTO_IP, 202), // MRT_ADD_VIF
    (libc::IPPROTO_IP, 203), // MRT_DEL_VIF
    (libc::IPPROTO_IP, 204), // MRT_ADD_MFC
    (libc::IPPROTO_IP, 205), // MRT_DEL_MFC
    (libc::IPPROTO_IP, 206), // MRT_VERSION
    (libc::IPPROTO_IP, 207), // MRT_ASSERT
    (libc::IPPROTO_IP, 208), // MRT_PIM
    (libc::IPPROTO_IP, 209), // MRT_TABLE
    (libc::IPPROTO_IP, 210), // MRT_ADD_MFC_PROXY
    (libc::IPPROTO_IP, 211), // MRT_DEL_MFC_PROXY
    (libc::IPPROTO_IP, 212), // MRT_FLUSH
    // IPv6: ip6tables and header manipulation
    (libc::IPPROTO_IPV6, 64), // IP6T_SO_SET_REPLACE
    (libc::IPPROTO_IPV6, 65), // IP6T_SO_SET_ADD_COUNTERS
    (libc::IPPROTO_IPV6, libc::IPV6_ADDRFORM),
    (libc::IPPROTO_IPV6, libc::IPV6_RTHDR),
    (libc::IPPROTO_IPV6, libc::IPV6_DSTOPTS),
    // Bridging / ebtables (Netfilter, analogous to IPT_SO_* we already deny)
    (libc::IPPROTO_IP, 128), // EBT_SO_SET_ENTRIES
    (libc::IPPROTO_IP, 129), // EBT_SO_SET_COUNTERS
    // IPv6 multicast group membership
    (libc::IPPROTO_IPV6, 20 /* IPV6_JOIN_GROUP */),
    (libc::IPPROTO_IPV6, 21 /* IPV6_LEAVE_GROUP */),
    (libc::IPPROTO_IPV6, 27 /* IPV6_JOIN_ANYCAST */),
    (libc::IPPROTO_IPV6, 28 /* IPV6_LEAVE_ANYCAST */),
    (libc::IPPROTO_IPV6, libc::MCAST_JOIN_GROUP),
    (libc::IPPROTO_IPV6, libc::MCAST_LEAVE_GROUP),
    (libc::IPPROTO_IPV6, libc::MCAST_JOIN_SOURCE_GROUP),
    (libc::IPPROTO_IPV6, libc::MCAST_LEAVE_SOURCE_GROUP),
    (libc::IPPROTO_IPV6, libc::MCAST_BLOCK_SOURCE),
    (libc::IPPROTO_IPV6, libc::MCAST_UNBLOCK_SOURCE),
    // IPv6 multicast routing (mroute6)
    (libc::IPPROTO_IPV6, 200), // MRT6_INIT
    (libc::IPPROTO_IPV6, 201), // MRT6_DONE
    (libc::IPPROTO_IPV6, 202), // MRT6_ADD_MIF
    (libc::IPPROTO_IPV6, 203), // MRT6_DEL_MIF
    (libc::IPPROTO_IPV6, 204), // MRT6_ADD_MFC
    (libc::IPPROTO_IPV6, 205), // MRT6_DEL_MFC
    (libc::IPPROTO_IPV6, 206), // MRT6_VERSION
    (libc::IPPROTO_IPV6, 207), // MRT6_ASSERT
    (libc::IPPROTO_IPV6, 208), // MRT6_PIM
    (libc::IPPROTO_IPV6, 209), // MRT6_TABLE
    (libc::IPPROTO_IPV6, 210), // MRT6_ADD_MFC_PROXY
    (libc::IPPROTO_IPV6, 211), // MRT6_DEL_MFC_PROXY
    (libc::IPPROTO_IPV6, 212), // MRT6_FLUSH
    // TCP: repair / ULP
    (libc::IPPROTO_TCP, libc::TCP_REPAIR),
    (libc::IPPROTO_TCP, libc::TCP_REPAIR_QUEUE),
    (libc::IPPROTO_TCP, libc::TCP_QUEUE_SEQ),
    (libc::IPPROTO_TCP, libc::TCP_REPAIR_OPTIONS),
    (libc::IPPROTO_TCP, libc::TCP_REPAIR_WINDOW),
    (libc::IPPROTO_TCP, libc::TCP_ULP),
    // TCP: congestion control selection (used in recent mptcp/tcp_setsockopt CVEs)
    (libc::IPPROTO_TCP, libc::TCP_CONGESTION),
    // TCP Authentication Option (TCP-AO) key / repair controls
    (libc::IPPROTO_TCP, 38), // TCP_AO_ADD_KEY: TCP-AO key management; complex per-socket key handling increases bug exposure
    (libc::IPPROTO_TCP, 39), // TCP_AO_DEL_KEY: TCP-AO key deletion; shares fragile key-lifecycle code paths
    (libc::IPPROTO_TCP, 40), // TCP_AO_INFO: TCP-AO per-socket configuration; mutates internal auth state
    (libc::IPPROTO_TCP, 41), // TCP_AO_GET_KEYS: TCP-AO key enumeration; expands surface around key lifetime and metadata
    (libc::IPPROTO_TCP, 42), // TCP_AO_REPAIR: TCP-AO repair hooks; interacts with TCP_REPAIR-style state mutation paths
    // UDP: corking (had IPv6/UDP interaction CVEs)
    (libc::IPPROTO_UDP, libc::UDP_CORK),
    // AF_PACKET: tpacket rings / fanout / bypass
    (libc::SOL_PACKET, 5),  // PACKET_RX_RING
    (libc::SOL_PACKET, 10), // PACKET_VERSION
    (libc::SOL_PACKET, 13), // PACKET_TX_RING
    (libc::SOL_PACKET, 18), // PACKET_FANOUT
    (libc::SOL_PACKET, 19), // PACKET_TX_HAS_OFF
    (libc::SOL_PACKET, 20), // PACKET_QDISC_BYPASS
    (libc::SOL_PACKET, 21), // PACKET_ROLLOVER_STATS
    (libc::SOL_PACKET, 22), // PACKET_FANOUT_DATA
    (libc::SOL_PACKET, 23), // PACKET_IGNORE_OUTGOING
    // Remaining BPF-related SOL_SOCKET options. The attach/detach
    // options, including SO_DETACH_REUSEPORT_BPF (68), are already
    // listed in the SOL_SOCKET section at the top of this table.
    // Option 53 is SO_CNX_ADVICE, not a BPF option, and is not denied.
    (libc::SOL_SOCKET, 44 /* SO_LOCK_FILTER */),
    (libc::SOL_SOCKET, 48 /* SO_BPF_EXTENSIONS */),
    // VSOCK: buffer size controls (CVE-2021-26708 in vsock_stream_setsockopt)
    (libc::AF_VSOCK, 0), // SO_VM_SOCKETS_BUFFER_SIZE
    (libc::AF_VSOCK, 1), // SO_VM_SOCKETS_BUFFER_MIN_SIZE
    (libc::AF_VSOCK, 2), // SO_VM_SOCKETS_BUFFER_MAX_SIZE
];

/// List of environment variable names considered unsafe.
///
/// Based on the list of environment variables that glibc/musl remove
/// for programs running under secure-execution mode, extended with
/// GUI and sanitizer variables. Each name is listed exactly once.
pub const UNSAFE_ENV: &[&[u8]] = &[
    b"GCONV_PATH",
    b"GETCONF_DIR",
    b"GLIBC_TUNABLES",
    b"HOSTALIASES",
    b"LD_AOUT_LIBRARY_PATH",
    b"LD_AOUT_PRELOAD",
    b"LD_AUDIT",
    b"LD_BIND_NOT",
    b"LD_DEBUG",
    b"LD_DEBUG_OUTPUT",
    b"LD_DYNAMIC_WEAK",
    b"LD_ELF_HINTS_PATH",
    b"LD_HWCAP_MASK",
    b"LD_LIBMAP",
    b"LD_LIBMAP_DISABLE",
    b"LD_LIBRARY_PATH",
    b"LD_LIBRARY_PATH_FDS",
    b"LD_LIBRARY_PATH_RPATH",
    b"LD_LOADFLTR",
    b"LD_ORIGIN_PATH",
    b"LD_PREFER_MAP_32BIT_EXEC",
    b"LD_PRELOAD",
    b"LD_PRELOAD_FDS",
    b"LD_PROFILE",
    b"LD_SHOW_AUXV",
    b"LD_USE_LOAD_BIAS",
    b"LOCALDOMAIN",
    b"LOCPATH",
    b"MALLOC_TRACE",
    b"MUSL_LOCPATH",
    b"NIS_PATH",
    b"NLSPATH",
    b"RESOLV_HOST_CONF",
    b"RES_OPTIONS",
    b"TMPDIR",
    b"TZ",
    b"TZDIR",
    b"LANG",
    b"LC_CTYPE",
    b"LC_NUMERIC",
    b"LC_TIME",
    b"LC_COLLATE",
    b"LC_MONETARY",
    b"LC_MESSAGES",
    b"LC_PAPER",
    b"LC_NAME",
    b"LC_ADDRESS",
    b"LC_TELEPHONE",
    b"LC_MEASUREMENT",
    b"LC_IDENTIFICATION",
    b"LC_ALL",
    b"OLDPWD",
    // GUI environment variables.
    b"DBUS_SESSION_BUS_ADDRESS",
    b"DISPLAY",
    b"GDK_PIXBUF_MODULE_FILE",
    b"GDM_LANG",
    b"GTK_MODULES",
    b"QT_QPA_PLATFORM_PLUGIN_PATH",
    b"SESSION_MANAGER",
    b"WAYLAND_DISPLAY",
    b"XAUTHORITY",
    b"XDG_SESSION_COOKIE",
    // Sanitizer environment variables.
    // See: https://www.openwall.com/lists/oss-security/2016/02/17/9
    b"ASAN_OPTIONS",
    b"ASAN_SYMBOLIZER_PATH",
    b"LSAN_OPTIONS",
    b"MSAN_OPTIONS",
    b"MSAN_SYMBOLIZER_PATH",
    b"TSAN_OPTIONS",
    b"UBSAN_OPTIONS",
];

/// Filesystems supported by Filesystem sandboxing.
///
/// Each entry maps a filesystem name to one superblock magic number.
/// A name may appear more than once when the filesystem has several
/// magic numbers (see `cramfs`, `minix` and `minix2` below).
///
/// Every literal is written as an `i64` and converted with `as`, so the
/// table compiles regardless of the width or signedness of `fs_type_t`.
/// NOTE(review): values above `i32::MAX` presumably wrap on targets
/// where `fs_type_t` is a signed 32-bit type -- confirm this matches how
/// the magic is compared at lookup time.
pub const FS_MAGIC: &[(&str, fs_type_t)] = &[
    ("aafs", 0x5a3c69f0i64 as fs_type_t),
    ("adfs", 0xadf5i64 as fs_type_t),
    ("affs", 0xadffi64 as fs_type_t),
    ("afs", 0x5346414Fi64 as fs_type_t),
    ("anon_inode_fs", 0x09041934i64 as fs_type_t),
    ("autofs", 0x0187i64 as fs_type_t),
    ("bcachefs", 0xca451a4ei64 as fs_type_t),
    ("bdevfs", 0x62646576i64 as fs_type_t),
    ("binderfs", 0x6c6f6f70i64 as fs_type_t),
    ("binfmtfs", 0x42494e4di64 as fs_type_t),
    ("bpf_fs", 0xcafe4a11i64 as fs_type_t),
    ("btrfs", 0x9123683Ei64 as fs_type_t),
    ("btrfs_test", 0x73727279i64 as fs_type_t),
    ("ceph", 0x00c36400i64 as fs_type_t),
    ("cgroup", 0x27e0ebi64 as fs_type_t),
    ("cgroup2", 0x63677270i64 as fs_type_t),
    ("cifs", 0xFF534D42i64 as fs_type_t),
    ("coda", 0x73757245i64 as fs_type_t),
    ("cramfs", 0x28cd3d45i64 as fs_type_t),
    ("cramfs", 0x453dcd28i64 as fs_type_t), /* magic number with the wrong endianness */
    ("daxfs", 0x64646178i64 as fs_type_t),
    ("debugfs", 0x64626720i64 as fs_type_t),
    ("devmem", 0x454d444di64 as fs_type_t),
    ("devpts", 0x1cd1i64 as fs_type_t),
    ("dma_buf", 0x444d4142i64 as fs_type_t),
    ("ecryptfs", 0xf15fi64 as fs_type_t),
    ("efivarfs", 0xde5e81e4i64 as fs_type_t),
    ("efs", 0x414A53i64 as fs_type_t),
    ("erofs", 0xE0F5E1E2i64 as fs_type_t),
    ("exfat", 0x2011BAB0i64 as fs_type_t),
    ("ext", 0xEF53i64 as fs_type_t),
    // ext2, ext3 and ext4 share the magic number 0xEF53, so they are
    // covered by the single "ext" entry above; the per-version entries
    // are kept here for reference only.
    /*
    ("ext4", 0xEF53i64 as fs_type_t),
    ("ext3", 0xEF53i64 as fs_type_t),
    ("ext2", 0xEF53i64 as fs_type_t),
    */
    ("f2fs", 0xF2F52010i64 as fs_type_t),
    ("fuse", 0x65735546i64 as fs_type_t),
    ("futexfs", 0xBAD1DEAi64 as fs_type_t),
    ("hostfs", 0x00c0ffeei64 as fs_type_t),
    ("hpfs", 0xf995e849i64 as fs_type_t),
    ("hugetlbfs", 0x958458f6i64 as fs_type_t),
    ("isofs", 0x9660i64 as fs_type_t),
    ("jffs2", 0x72b6i64 as fs_type_t),
    ("minix", 0x137Fi64 as fs_type_t), /* minix v1 fs, 14 char names */
    ("minix", 0x138Fi64 as fs_type_t), /* minix v1 fs, 30 char names */
    ("minix2", 0x2468i64 as fs_type_t), /* minix v2 fs, 14 char names */
    ("minix2", 0x2478i64 as fs_type_t), /* minix v2 fs, 30 char names */
    ("minix3", 0x4d5ai64 as fs_type_t), /* minix v3 fs, 60 char names */
    ("msdos", 0x4d44i64 as fs_type_t),
    ("mtd_inode_fs", 0x11307854i64 as fs_type_t),
    ("ncp", 0x564ci64 as fs_type_t),
    ("nfs", 0x6969i64 as fs_type_t),
    ("nilfs", 0x3434i64 as fs_type_t),
    ("nsfs", 0x6e736673i64 as fs_type_t),
    ("ocfs2", 0x7461636fi64 as fs_type_t),
    ("openprom", 0x9fa1i64 as fs_type_t),
    ("overlayfs", 0x794c7630i64 as fs_type_t),
    ("pid_fd", 0x50494446i64 as fs_type_t),
    ("pipefs", 0x50495045i64 as fs_type_t),
    ("proc", 0x9fa0i64 as fs_type_t),
    ("pstorefs", 0x6165676Ci64 as fs_type_t),
    ("qnx4", 0x002fi64 as fs_type_t),
    ("qnx6", 0x68191122i64 as fs_type_t),
    ("ramfs", 0x858458f6i64 as fs_type_t),
    ("rdtgroup", 0x7655821i64 as fs_type_t),
    ("reiserfs", 0x52654973i64 as fs_type_t),
    ("reiserfs2", 0x52654974i64 as fs_type_t),
    ("reiserfs3", 0x52654975i64 as fs_type_t),
    ("secretmem", 0x5345434di64 as fs_type_t),
    ("securityfs", 0x73636673i64 as fs_type_t),
    ("selinux", 0xf97cff8ci64 as fs_type_t),
    ("smack", 0x43415d53i64 as fs_type_t),
    ("smb", 0x517Bi64 as fs_type_t),
    ("smb2", 0xFE534D42i64 as fs_type_t),
    ("sockfs", 0x534F434Bi64 as fs_type_t),
    ("squashfs", 0x73717368i64 as fs_type_t),
    ("sysfs", 0x62656572i64 as fs_type_t),
    ("tmpfs", 0x01021994i64 as fs_type_t),
    ("tracefs", 0x74726163i64 as fs_type_t),
    ("udf", 0x15013346i64 as fs_type_t),
    ("usbdevice", 0x9fa2i64 as fs_type_t),
    ("v9fs", 0x01021997i64 as fs_type_t),
    ("xenfs", 0xabba1974i64 as fs_type_t),
    ("xfs", 0x58465342i64 as fs_type_t),
    ("zfs", 0x2fc12fc1i64 as fs_type_t),
    ("zonefs", 0x5a4f4653i64 as fs_type_t),
];

/// Default ioctl allow/denylist.
/// false -> allow, true -> deny.
///
/// Each entry is `(request name, deny)`: the ioctl(2) request is named
/// symbolically and the boolean selects the list it goes on.
///
/// The Landlock rationale quoted below is background only. This table
/// is stricter than Landlock in several places: FIFREEZE, FITHAW,
/// FS_IOC_FIEMAP, FIGETBSZ, FS_IOC_GETFSUUID and FS_IOC_GETFSSYSFSPATH
/// are described as permitted there but are denied here.
/*
 * Quotes from Landlock ioctl access helpers
 * is_masked_device_ioctl and is_masked_device_ioctl_compat:
 *
 * 1. FIOCLEX, FIONCLEX, FIONBIO and FIOASYNC manipulate the FD's
 *    close-on-exec and the file's buffered-IO and async flags.
 *    These operations are also available through fcntl(2),
 *    and are unconditionally permitted in Landlock.
 * 2. FIOQSIZE queries the size of a regular file, directory, or link.
 *    We still permit it, because it always returns -ENOTTY for
 *    other file types.
 * 3. FIFREEZE and FITHAW freeze and thaw the file system which the
 *    given file belongs to. Requires CAP_SYS_ADMIN.
 *    These commands operate on the file system's superblock rather
 *    than on the file itself. The same operations can also be
 *    done through any other file or directory on the same file
 *    system, so it is safe to permit these.
 * 4. FS_IOC_FIEMAP queries information about the allocation of
 *    blocks within a file.
 *    This IOCTL command only makes sense for regular files and is
 *    not implemented by devices. It is harmless to permit.
 * 5. FIGETBSZ queries the file system's block size for a file or
 *    directory.
 *    This command operates on the file system's superblock rather
 *    than on the file itself. The same operation can also be done
 *    through any other file or directory on the same file system,
 *    so it is safe to permit it.
 * 6. FICLONE, FICLONERANGE and FIDEDUPERANGE make files share
 *    their underlying storage ("reflink") between source and
 *    destination FDs, on file systems which support that.
 *    These IOCTL commands only apply to regular files
 *    and are harmless to permit for device files.
 * 7. FS_IOC_GETFSUUID and FS_IOC_GETFSSYSFSPATH both operate on
 *    the file system superblock, not on the specific file, so
 *    these operations are available through any other file on the
 *    same file system as well.
 * 8. FIONREAD, FS_IOC_GETFLAGS, FS_IOC_SETFLAGS, FS_IOC_FSGETXATTR and
 *    FS_IOC_FSSETXATTR are forwarded to device implementations.
 * 9. file_ioctl() commands (FIBMAP, FS_IOC_RESVSP, FS_IOC_RESVSP64,
 *    FS_IOC_UNRESVSP, FS_IOC_UNRESVSP64 and FS_IOC_ZERO_RANGE) are
 *    forwarded to device implementations, so not permitted.
 * 10. FICLONE is permitted, same as in the non-compat variant.
 * 11. CONFIG_X86_64: FS_IOC_RESVSP_32, FS_IOC_RESVSP64_32, FS_IOC_UNRESVSP_32,
 *     FS_IOC_UNRESVSP64_32, FS_IOC_ZERO_RANGE_32: not blanket-permitted,
 *     for consistency with their non-compat variants.
 * 12. FS_IOC32_GETFLAGS, FS_IOC32_SETFLAGS are forwarded to their device
 *     implementations.
 * 13. RNDGETENTCNT is the single ioctl(2) request for /dev/{u,}random
 *     which does not require CAP_SYS_ADMIN. The other ioctl(2) requests
 *     of the same interface RNDADDTOENTCNT, RNDGETPOOL, RNDADDENTROPY,
 *     RNDZAPENTCNT, and RNDCLEARPOOL are privileged.
 */
pub const DEFAULT_IOCTL: &[(&str, bool)] = &[
    ("FIOCLEX", false),
    ("FIONCLEX", false),
    ("FIONBIO", false),
    ("FIONREAD", false),
    ("FIOASYNC", false),
    ("FIOQSIZE", false),
    ("FIFREEZE", true),      // CAP_SYS_ADMIN!
    ("FITHAW", true),        // ditto!
    ("FS_IOC_FIEMAP", true), // Leaks on-disk layout.
    ("FIGETBSZ", true),      // ditto!
    ("FICLONE", false),
    ("FICLONERANGE", false),
    ("FIDEDUPERANGE", false),
    ("FS_IOC_GETFSUUID", true),      // Leaks on-disk layout.
    ("FS_IOC_GETFSSYSFSPATH", true), // ditto!
    ("FIBMAP", true),                // ditto!
    ("KDSETKEYCODE", true),
    ("KDSIGACCEPT", true),
    ("TIOCSETD", true),
    ("TIOCSTI", true),
    ("TIOCCONS", true),
    ("TIOCLINUX", true),
    ("TIOCSSERIAL", true),       // See kernel_lockdown(7).
    ("FS_IOC_FSGETXATTR", true), // (SAFETY: xattr restrictions)
    ("FS_IOC_FSSETXATTR", true), // ditto!
    ("FS_IOC_SETFLAGS", true),   // To deny immutable & append-only flag changes
    ("FS_IOC32_SETFLAGS", true), // ditto!
    ("PIDFD_GET_INFO", false),   // unprivileged.
    ("RNDGETENTCNT", false),     // unprivileged.
    ("RNDADDTOENTCNT", true),    // requires CAP_SYS_ADMIN.
    ("RNDGETPOOL", true),        // ditto.
    ("RNDADDENTROPY", true),     // ditto.
    ("RNDZAPENTCNT", true),      // ditto.
    ("RNDCLEARPOOL", true),      // ditto.
    // seccomp(2) user-notification requests are all denied.
    ("SECCOMP_IOCTL_NOTIF_RECV", true),
    ("SECCOMP_IOCTL_NOTIF_SEND", true),
    ("SECCOMP_IOCTL_NOTIF_ID_VALID", true),
    ("SECCOMP_IOCTL_NOTIF_ADDFD", true),
    ("SECCOMP_IOCTL_NOTIF_SET_FLAGS", true),
];

// Sandboxing profiles

/// Off sandboxing profile.
/// Used as shorthand to turn all sandboxing off.
///
/// Switches off `sandbox/all` and then, explicitly, the fs, ioctl, net,
/// mem, pid, pty, force and tpe categories.
/// NOTE(review): the second rule suggests `all` does not cover those
/// categories -- confirm against the sandbox command parser.
pub const PROFILE_OFF: &[&str] = &[
    "sandbox/all:off",
    "sandbox/fs,ioctl,net,mem,pid,pty,force,tpe:off",
];

/// Libsyd helper sandboxing profile.
/// Turns all sandboxing off by including the `off` profile,
/// then sets the sandbox lock to Exec.
/// Useful to configure syd in the application via libsyd.
pub const PROFILE_LIB: &[&str] = &["include_profile off", "lock:exec"];

/// Quiet sandboxing profile.
///
/// Sets the default action to `filter` for `all` and, explicitly, for
/// the net, block, mem, pid, force, segvguard and tpe categories.
/// NOTE(review): `filter` presumably denies without reporting the
/// violation -- confirm against the action documentation.
pub const PROFILE_QUIET: &[&str] = &[
    "default/all:filter",
    "default/net,block,mem,pid,force,segvguard,tpe:filter",
];

/// Trace sandboxing profile, used by pandora(1).
///
/// Sets the log level to `info`, turns Force and Ioctl sandboxing on,
/// and sets the default action to `warn` for `all` and, explicitly,
/// for the fs, ioctl, force and tpe categories.
pub const PROFILE_TRACE: &[&str] = &[
    "log/level:info",
    "sandbox/force,ioctl:on",
    "default/all,fs,ioctl,force,tpe:warn",
];

/// OCI default sandboxing profile, used by syd-oci(1).
///
/// Builds on the `nopie` and `trace` profiles, leaves the sandbox lock
/// off and allows safe syslog access. The `enforce` profile is the
/// counterpart that undoes these relaxations at runtime.
pub const PROFILE_OCI: &[&str] = &[
    "include_profile nopie", // most containers ship non-pie binaries...
    "include_profile trace",
    "lock:off", // allow esyd.
    "trace/allow_safe_syslog:true",
];

/// Enforce profile, used to practically undo `oci` and `trace` profiles at runtime.
///
/// Sets the default action to `deny` for `all` and, explicitly, for
/// net, force and tpe, reverts the syslog and non-PIE relaxations made
/// by the `oci` profile, and sets the sandbox lock to Exec.
pub const PROFILE_ENFORCE: &[&str] = &[
    "default/all,net,force,tpe:deny",
    "trace/allow_safe_syslog:false",
    "trace/allow_unsafe_exec_nopie:false",
    "lock:exec", // keep esyd.
];

/// No ipv4 sandboxing profile.
///
/// Turns Network sandboxing on, allows `cnet` to any IPv6 address on
/// all ports (0-65535) and denies `cnet` to any IPv4 address on all
/// ports. Mirror image of the `noipv6` profile.
pub const PROFILE_NOIPV4: &[&str] = &[
    "sandbox/net:on",
    "allow/cnet+any6!0-65535",
    "deny/cnet+any4!0-65535",
];

/// No ipv6 sandboxing profile.
///
/// Turns Network sandboxing on, allows `cnet` to any IPv4 address on
/// all ports (0-65535) and denies `cnet` to any IPv6 address on all
/// ports. Mirror image of the `noipv4` profile.
pub const PROFILE_NOIPV6: &[&str] = &[
    "sandbox/net:on",
    "allow/cnet+any4!0-65535",
    "deny/cnet+any6!0-65535",
];

/// Privileged sandboxing profile.
///
/// Consists of the single option `trace/allow_unsafe_caps:true`.
pub const PROFILE_PRIVILEGED: &[&str] = &["trace/allow_unsafe_caps:true"];

/// No core dump restrictions profile.
///
/// Lifts the core file size limit and allows the process to be
/// dumpable. Included by the `debug` profile.
pub const PROFILE_CORE: &[&str] = &[
    "rlimit/core:inf",                  // RLIMIT_CORE
    "trace/allow_unsafe_dumpable:true", // PR_SET_DUMPABLE
];

/// Debug sandboxing profile.
///
/// Builds on the `core` profile and additionally relaxes the
/// restrictions that get in the way of debuggers and tracers; see the
/// per-rule comments for what each option permits.
pub const PROFILE_DEBUG: &[&str] = &[
    "include_profile core",
    "trace/allow_unsafe_magiclinks:true", // allow /proc/$pid/mem access.
    "trace/allow_unsafe_exec_memory:true", // allow W^X memory.
    "trace/allow_unsafe_prctl:true",      // allow changing process name.
    "trace/allow_unsafe_prlimit:true",    // allow setting resource limits.
    "trace/allow_unsafe_ptrace:true",     // strace -f syd
];

/// No memory restrictions sandboxing profile.
///
/// Consists of the single option `trace/allow_unsafe_exec_memory:true`,
/// which other profiles in this file describe as allowing W^X memory.
pub const PROFILE_NOMEM: &[&str] = &["trace/allow_unsafe_exec_memory:true"];

/// No PIE sandboxing profile.
///
/// Consists of the single option `trace/allow_unsafe_exec_nopie:true`.
/// Included by the `oci` profile because most containers ship non-PIE
/// binaries.
pub const PROFILE_NOPIE: &[&str] = &["trace/allow_unsafe_exec_nopie:true"];

/// Random sandboxing profile, activated with:
/// `syd --profile rand`.
///
/// Consists of the single option `trace/force_rand_fd:true`.
/// Included by the `user` profile.
pub const PROFILE_RAND: &[&str] = &["trace/force_rand_fd:true"];

/// Local networking sandboxing profile, activated with:
/// `syd --profile local`.
///
/// Consists of the single option `trace/force_local_net:true`.
/// Included by the `user` profile.
pub const PROFILE_LOCAL: &[&str] = &["trace/force_local_net:true"];

/// GUI sandboxing profile, activated with:
/// `syd --profile gui`.
///
/// Builds on the `tty` profile, waits for all processes on exit,
/// leaves host and domain name untouched, and passes through the GUI
/// environment variables that `UNSAFE_ENV` lists.
pub const PROFILE_GUI: &[&str] = &[
    "include_profile tty",
    "trace/exit_wait_all:true", // GUIs typically daemonize.
    "uts/host:none",            // Overriding hostname confuses X11 auth.
    "uts/domain:none",          // Overriding domainname confuses X11 auth.
    // GUI environment variables.
    "passenv+DISPLAY,WAYLAND_DISPLAY",
    "passenv+SESSION_MANAGER",
    "passenv+DBUS_SESSION_BUS_ADDRESS",
    "passenv+GDM_LANG,GTK_MODULES",
    "passenv+GDK_PIXBUF_MODULE_FILE",
    "passenv+QT_QPA_PLATFORM_PLUGIN_PATH",
    "passenv+XAUTHORITY,XDG_SESSION_COOKIE",
];

/// Firefox sandboxing profile, activated with:
/// `syd --profile firefox`
///
/// Builds on the `gui` profile and relaxes the restrictions Firefox
/// needs for its own sandbox, JIT and IPC; see the per-rule comments.
pub const PROFILE_FIREFOX: &[&str] = &[
    "include_profile gui",
    "trace/allow_unsafe_cbpf:true", // Firefox has a seccomp(2) sandbox.
    "trace/allow_unsafe_chroot:true", // Make chroot a no-op.
    "trace/allow_unsafe_namespace:all", // Allow Firefox to sandbox itself.
    "trace/allow_unsafe_magiclinks:true", // Firefox needs this to fiddle with subprocesses.
    "trace/allow_unsafe_exec_libc:true", // Allow preloading libmozsandbox.so
    "trace/allow_unsafe_exec_memory:true", // W^X memory required for JIT
    "trace/allow_unsafe_page_cache:true", // mincore(2) may be needed for DRM
    "trace/allow_unsafe_prlimit:true", // permit setting resource limits
    "trace/allow_unsafe_shm:true",  // Firefox uses shared memory.
];

/// Readonly sandboxing profile.
/// `syd --profile ro`
pub const PROFILE_READONLY: &[&str] = &["deny/cpath,wpath+/***"];

/// Container sandboxing profile, activated with:
/// `syd --profile container`
///
/// Sets `unshare/all:true` and allows reading `/proc/uptime`.
/// Included by the `immutable` profile.
pub const PROFILE_CONTAINER: &[&str] = &[
    "unshare/all:true",
    "allow/lpath,rpath+/proc/uptime", // time-ns aware
];

/// Immutable container sandboxing profile, activated with:
/// `syd --profile immutable`
///
/// Builds on the `container` profile and then:
/// 1. bind-mounts /opt, /usr, /etc, /mnt, /srv and /home onto
///    themselves read-only,
/// 2. covers /boot with a read-only tmpfs limited to a single block
///    and inode,
/// 3. mounts fresh tmpfs instances on /run, /tmp and /dev/shm,
/// 4. mounts mqueue on /dev/mqueue and a read-only sysfs on /sys.
///
/// Each `bind+` rule has the form `source:target:options`.
pub const PROFILE_IMMUTABLE: &[&str] = &[
    "include_profile container",
    "bind+/opt:/opt:ro,nodev,nosuid",
    "bind+/usr:/usr:ro,nodev,nosuid",
    "bind+/etc:/etc:ro,nodev,nosuid,noexec",
    "bind+/mnt:/mnt:ro,nodev,nosuid,noexec,nosymfollow",
    "bind+/srv:/srv:ro,nodev,nosuid,noexec,nosymfollow",
    "bind+/home:/home:ro,nodev,nosuid,noexec,nosymfollow",
    "bind+tmpfs:/boot:ro,nodev,nosuid,noexec,nosymfollow,mode=0,nr_blocks=1,nr_inodes=1",
    "bind+tmpfs:/run:nodev,nosuid,nosymfollow,mode=1700",
    "bind+tmpfs:/tmp:nodev,nosuid,nosymfollow,mode=1700",
    "bind+tmpfs:/dev/shm:nodev,nosuid,noexec,nosymfollow,mode=1700",
    "bind+mqueue:/dev/mqueue:nodev,nosuid,noexec,nosymfollow",
    "bind+sysfs:/sys:ro,nodev,nosuid,noexec,nosymfollow",
];

/// Landlock sandboxing profile, activated with:
/// `syd --profile landlock`.
///
/// Turns Lock sandboxing on and grants `allow/lock/...` access to the
/// system directories, a few device nodes and the public temporary
/// directories listed below.
pub const PROFILE_LANDLOCK: &[&str] = &[
    "sandbox/lock:on",
    // Read and execute access to system directories.
    "allow/lock/rpath,exec+/etc",
    "allow/lock/rpath,exec+/bin",
    "allow/lock/rpath,exec+/sbin",
    "allow/lock/rpath,exec+/lib",
    "allow/lock/rpath,exec+/opt",
    "allow/lock/rpath,exec+/usr",
    // /proc is always allowed.
    // /dev/null is always allowed.
    "allow/lock/read,ioctl,write+/dev/full",
    "allow/lock/read,ioctl,write+/dev/zero",
    "allow/lock/read,ioctl+/dev/random",
    "allow/lock/read,ioctl+/dev/urandom",
    // As of version 3.39.0, Linux profile omits bnet, exec, ioctl,
    // spath and fown access for public directories.
    // Paludis profile overrides this as it may be needed
    // during package testing.
    "allow/lock/rpath,cpath,wpath,tpath+/dev/shm",
    "allow/lock/rpath,cpath,wpath,tpath+/var/tmp",
    "allow/lock/rpath,cpath,wpath,tpath+/tmp",
];

/// Paludis sandboxing profile, activated with:
/// `syd --profile paludis`.
///
/// Builds on the `linux` and `tty` profiles and relaxes a large number
/// of restrictions so that package builds and their test suites work.
/// The per-rule comments record which package or test needs each
/// relaxation.
///
/// Several sandbox categories default to off here and can be switched
/// through the environment variables `SYD_PALUDIS_FS`,
/// `SYD_PALUDIS_LPATH`, `SYD_PALUDIS_IOCTL` and `SYD_PALUDIS_PTY`.
///
/// This profile is the base of the `ltp` and `nix` profiles.
pub const PROFILE_PALUDIS: &[&str] = &[
    "include_profile linux",
    "include_profile tty",
    "default/lock:warn", // set Landlock compat-level to best-effort when stacked with Landlock profile.
    "allow/lock/rpath+/dev", // Relax Landlock to allow /dev when stacked with Landlock profile.
    "allow/lock/rpath+/run", // ditto.
    "allow/lock/rpath+/sys", // ditto.
    "allow/lock/rpath+/var", // ditto.
    "lock:exec",         // esandbox
    "mask^",             // do not inherit the default mask-list from linux profile.
    "rlimit/memlock:off", // libgcrypt's tests fail without this.
    "time/mono:0",       // libevent's epoll tests fail without this one, see: sydbox#235.
    "sandbox/fs:${SYD_PALUDIS_FS:-off}", // Filesystem sandboxing is unused.
    "sandbox/lpath:${SYD_PALUDIS_LPATH:-off}", // Path hiding is currently unused.
    "sandbox/ioctl:${SYD_PALUDIS_IOCTL:-off}", // Ioctl sandboxing is unused.
    "sandbox/pty:${SYD_PALUDIS_PTY:-off}", // Paludis creates a PTY on its own.
    "trace/memory_access:0", // fallback to unsafe proc_pid_mem(5) if cross memory attach fails.
    "trace/allow_safe_bind:true",
    "trace/allow_safe_kcapi:true",
    "trace/allow_unsafe_cbpf:true",     // stacked seccomp cbpf filters
    "trace/allow_unsafe_chroot:true",   // glibc uses this in src_install
    "trace/allow_unsafe_dumpable:true", // allows strace -f syd
    "trace/allow_unsafe_env:true",
    "trace/allow_unsafe_exec_libc:true",
    "trace/allow_unsafe_exec_memory:true",
    "trace/allow_unsafe_exec_nopie:true",
    "trace/allow_unsafe_exec_null:true",
    "trace/allow_unsafe_exec_stack:true",
    "trace/allow_unsafe_exec_script:true", // Do not set SECBIT_EXEC_RESTRICT_FILE_LOCKED.
    "trace/allow_unsafe_exec_interactive:true", // Do not set SECBIT_EXEC_DENY_INTERACTIVE_LOCKED.
    "trace/allow_unsafe_exec_speculative:true", // Do not enable Speculative Execution mitigation.
    "trace/allow_unsafe_filename:true",    // sydbox#118
    "trace/allow_unsafe_cap_fixup:true",   // For PR_SET_KEEPCAPS, see util-linux' setpriv test.
    "trace/allow_unsafe_create:true",      // poppler-data src_install fails without this.
    "trace/allow_unsafe_hardlinks:true",   // gnulib linkat test fails without this.
    "trace/allow_unsafe_magiclinks:true",  // tests love to access /proc/1.
    "trace/allow_unsafe_symlinks:true",    // git's tests fail without this.
    "trace/allow_unsafe_machine_id:true",  // make machine-id(5) visible to Paludis.
    "trace/allow_unsafe_memfd:true",       // executable memory-fds are OK.
    "trace/allow_unsafe_mkbdev:true",      // dev-python/backports-tarfile's tests need this.
    "trace/allow_unsafe_mkcdev:true",      // ditto.
    "trace/allow_unsafe_nice:true",
    "trace/allow_unsafe_nocookie:true", // syscookies are a mitigation against compromised Syd.
    "trace/allow_unsafe_nomseal:true",  // mseal(2) is a mitigation against compromised Syd.
    "trace/allow_unsafe_open_path:true",
    "trace/allow_unsafe_page_cache:true",
    "trace/allow_unsafe_perf:true",
    "trace/allow_unsafe_personality:true", // gawk tests require ADDR_NO_RANDOMIZE.
    "trace/allow_unsafe_pgrp:true",        // Paludis creates a PTY on its own, see: #242.
    "trace/allow_unsafe_ptrace:true",
    "trace/allow_unsafe_prctl:true", // perl tests want to set process name.
    "trace/allow_unsafe_prlimit:true",
    "trace/allow_unsafe_proc_files:true", // builds typically use /proc/cpuinfo and more.
    "trace/allow_unsafe_proc_pid_status:true", // proc_pid_status(5) hardening is for malware-analysis.
    "trace/allow_unsafe_msgqueue:true", // message queue syscalls are sometimes used in tests.
    "trace/allow_unsafe_perm_msgqueue:true", // ditto.
    "trace/allow_unsafe_shm:true",      // shm syscalls are often used in tests.
    "trace/allow_unsafe_perm_shm:true", // perl tests map executable shared memory.
    "trace/allow_unsafe_sigreturn:true",
    "trace/allow_unsafe_socket:true",
    "trace/allow_unsafe_sysinfo:true", // disables sysinfo(2) randomizer, >3.23.4
    "trace/allow_unsupp_socket:true",
    "trace/allow_unsafe_stat_cdev:true", // diffutils' tests break with this restriction.
    "trace/allow_unsafe_stat_bdev:true", // keep it consistent with ^^.
    "trace/allow_unsafe_time:true",
    "trace/allow_unsafe_uname:true", // coreutils' tests fail if proc_version(5) isn't a regular file.
    "trace/allow_unsafe_xattr:true", // setcap requires access to security.capability.*
    // Avoid annoying open(/, O_DIRECTORY) failures.
    // See: #208, #212, and #219.
    // no-op unless Landlock is enabled.
    "allow/lock/readdir+/",
    // Allow FS_IOC_SETFLAGS (used by libarchive, cpio, tar etc.)
    // Use `!` suffix as this ioctl request doesn't exist on i686, x32 etc.
    "deny/ioctl-FS_IOC_SETFLAGS!", // needed because it's denied by default.
    "allow/ioctl+FS_IOC_SETFLAGS!",
    // Turned on by esandbox in metadata phase.
    "sandbox/exec:off",
    // TPE is unnecessary for package builds.
    "sandbox/tpe:off",
    // Filter noisy systemd access.
    "filter/cnet+/run/systemd/userdb/io.systemd.DynamicUser",
    // Defaults for /selinux.
    "allow/lpath,rpath+/selinux",
    "allow/lpath,rpath+/selinux/context",
    "allow/lpath,rpath,write+/selinux/context/**",
    // Defaults for sysfs.
    "allow/lpath,rpath+/sys",
    "allow/lpath,rpath+/sys/devices",
    "allow/lpath,rpath+/sys/devices/system",
    "allow/lpath,rpath+/sys/devices/system/cpu",
    "allow/lpath,rpath+/sys/devices/system/cpu/isolated",
    "allow/lpath,rpath+/sys/devices/system/cpu/kernel_max",
    "allow/lpath,rpath+/sys/devices/system/cpu/online",
    "allow/lpath,rpath+/sys/devices/system/cpu/offline",
    "allow/lpath,rpath+/sys/devices/system/cpu/possible",
    "allow/lpath,rpath+/sys/devices/system/cpu/present",
    "allow/lpath,rpath+/sys/fs",
    "allow/lpath,rpath+/sys/fs/cgroup/***",
    "allow/lpath,rpath+/sys/kernel",
    "allow/lpath,rpath+/sys/kernel/mm",
    "allow/lpath,rpath+/sys/kernel/mm/transparent_hugepage/***",
    "allow/lpath,rpath+/sys/kernel/security",
    "allow/lpath,rpath+/sys/kernel/security/apparmor/***",
    // Defaults for procfs.
    "allow/lpath,rpath+/proc/crypto",
    "allow/lpath,rpath+/proc/cpuinfo",
    "allow/lpath,rpath+/proc/filesystems",
    "allow/lpath,rpath+/proc/loadavg",
    "allow/lpath,rpath+/proc/meminfo",
    "allow/lpath,rpath+/proc/mounts", // symlink to self/mounts
    "allow/lpath,rpath+/proc/stat",
    "allow/lpath,rpath+/proc/uptime",
    "allow/lpath,rpath+/proc/version",
    "allow/lpath,rpath+/proc/sys",
    "allow/lpath,rpath+/proc/sys/fs/***",
    "allow/lpath,rpath+/proc/sys/kernel/***",
    "allow/lpath,rpath+/proc/sys/net/***",
    "allow/lpath,rpath+/proc/sys/vm/***",
    // SAFETY: Allow /proc PID traversals.
    // Tests often assume they can identify fellow processes
    // by readdir'ing /proc hence we allow it here but deny
    // in the default secure `linux` profile.
    "allow/lpath,rpath+/proc/[0-9]*",
    // SAFETY:
    // 1. We allow relaxed stat access (/proc wide).
    //    This must be combined with trace/allow_unsafe_magiclinks:1.
    // 2. comm/cmdline, environ, status access is for ps(1).
    // 3. self/loginuid is for gnulib's test-getlogin test.
    "allow/lpath,rpath+/proc/self/cmdline",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/cmdline",
    "allow/lpath,rpath+/proc/self/comm",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/comm",
    "allow/lpath,rpath+/proc/self/environ",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/environ",
    "allow/lpath,rpath+/proc/self/loginuid",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/loginuid",
    "allow/lpath,rpath+/proc/self/stat*",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/stat*",
    "allow/lpath,rpath+/proc/[0-9]*/cmdline",
    "allow/lpath,rpath+/proc/[0-9]*/task/[0-9]*/cmdline",
    "allow/lpath,rpath+/proc/[0-9]*/comm",
    "allow/lpath,rpath+/proc/[0-9]*/task/[0-9]*/comm",
    "allow/lpath,rpath+/proc/[0-9]*/environ",
    "allow/lpath,rpath+/proc/[0-9]*/task/[0-9]*/environ",
    "allow/lpath,rpath+/proc/[0-9]*/stat*",
    "allow/lpath,rpath+/proc/[0-9]*/task/[0-9]*/stat*",
    // SAFETY:
    // 1. proc_pid_maps(5) access allows an attacker
    //    to easily circumvent ASLR, therefore the two rules
    //    below have been moved from Linux to Paludis profile.
    // 2. Access to proc_pid_smaps(5) and proc_pid_pagemap(5)
    //    have similar security characteristics.
    // 3. smaps has a `*` at the end to also allow smaps_rollup.
    // 4. ioctl(2) access is needed for PROCMAP_QUERY.
    "allow/ioctl+PROCMAP_QUERY",
    "allow/lpath,rpath+/proc/self/maps",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/maps",
    "allow/lpath,rpath+/proc/self/smaps*",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/smaps*",
    "allow/lpath,rpath+/proc/self/pagemap",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/pagemap",
    "allow/lpath,rpath+/proc/self/mounts",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/mounts",
    "allow/lpath,rpath+/proc/self/mountinfo",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/mountinfo",
    "allow/lpath,rpath+/proc/self/attr/***",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/attr/***",
    "allow/lpath,rpath+/proc/self/cgroup",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/cgroup",
    // Allow /dev/shm, /var/tmp and /tmp for Paludis builds.
    // As of version 3.39.0, Linux profile omits net, exec, ioctl,
    // spath and fown access for public directories which we re-add here.
    "allow/exec,spath,net,fown+/dev/shm/**",
    "allow/exec,spath,net,fown+/var/tmp/**",
    "allow/exec,spath,net,fown+/tmp/**",
    // Allow wider permissions for public directories.
    // no-op unless Landlock is enabled.
    "allow/lock/exec,ioctl,spath,bnet+/dev/shm",
    "allow/lock/exec,ioctl,spath,bnet+/var/tmp",
    "allow/lock/exec,ioctl,spath,bnet+/tmp",
    // Allow /var/cache for Paludis builds.
    "allow/lpath,rpath,mktemp+/var/cache/***",
    "allow/lpath,rpath,wpath,cpath,spath,tpath,exec,net,fown,fattr+/var/cache/**",
    "allow/lock/rpath,wpath,cpath,spath,tpath,exec,ioctl,bnet+/var/cache",
    // Defaults for network sandboxing.
    "allow/bnet+loopback!0",
    "allow/bnet+loopback!1024-65535",
    // Defaults for Landlock networking which is ABI>=4.
    "allow/lock/bnet+0",
    "allow/lock/bnet+1024-65535",
    "allow/lock/cnet+1024-65535",
    // Allow interaction with memory file descriptors.
    "allow/exec,lpath,rpath,wpath,cpath,fattr+!memfd:**",
    "allow/exec,lpath,rpath,wpath,cpath,fattr+!memfd-hugetlb:**",
    // Repository directory
    "allow/lpath,rpath+/var/db",
    "allow/lpath,rpath+/var/db/paludis/***",
];

/// LTP sandboxing profile, activated with:
/// `syd --profile ltp`
///
/// This profile allows wide filesystem & network access,
/// and it should only be used with LTP tests!
///
/// Builds on the `paludis` profile. Note that it turns
/// `trace/allow_unsafe_chroot` back off, overriding the value the
/// `paludis` profile sets.
pub const PROFILE_LTP: &[&str] = &[
    "include_profile paludis",
    "rlimit/memlock:off",                 // LTP has mlock(2) tests.
    "trace/allow_unsafe_chroot:false",    // LTP chroot tests break if chroot is no-op.
    "trace/allow_unsafe_deprecated:true", // LTP has remap_file_pages(2) tests.
    "trace/allow_unsafe_keyring:true",    // LTP tests for add_key.
    "trace/allow_unsafe_madvise:true",    // LTP has madvise(2) tests.
    "trace/allow_unsafe_mbind:true",      // LTP has mbind(2) tests.
    "trace/allow_unsafe_oob:true",        // LTP recvmsg tests break with MSG_OOB restriction.
    // Wide-open filesystem and network access.
    "allow/all+/***",
    "allow/net+@**",
    "allow/net+any!0-65535",
    "allow/ioctl+SIOCATMARK,SIOCGIFCONF,SIOCGIFFLAGS,SIOCSIFFLAGS", // sockioctl01 test needs these
    // FIGETBSZ is on the default deny list, so remove it from there
    // before allowing it.
    "deny/ioctl-FIGETBSZ",
    "allow/ioctl+FIGETBSZ,NS_GET_OWNER_UID",
];

/// NIX sandboxing profile, activated with:
/// `syd --profile nix`
///
/// This profile allows wide filesystem & network access,
/// and it should only be used with NIX crate tests!
///
/// Builds on the `paludis` profile.
pub const PROFILE_NIX: &[&str] = &[
    "include_profile paludis",
    "trace/allow_unsafe_madvise:true",  // NIX has madvise(2) tests.
    "trace/allow_unsafe_vmsplice:true", // NIX has vmsplice(2) tests.
    // Wide-open filesystem and network access.
    "allow/all+/***",
    "allow/net+@**",
    "allow/net+any!0-65535",
];

/// CWD sandboxing profile.
///
/// Grants all access to the current working directory.
pub const PROFILE_CWD: &[&str] = &["allow/all+${SYD_PWD}/***", "allow/lock/all+${SYD_PWD}"];

/// Path hiding sandbox profile.
///
/// Turns on Chdir, Stat and Walk sandboxing.
///
/// Consists of the single option `sandbox/lpath:on`.
/// NOTE(review): presumably `lpath` is the alias that groups the three
/// categories named above -- confirm against the sandbox category list.
pub const PROFILE_HIDE: &[&str] = &["sandbox/lpath:on"];

/// User sandboxing profile, activated with:
/// `syd --profile user`.
/// Syd sets the environment variables
/// UID, GID, USER, and HOME before parsing this
/// profile.
///
/// NOTE(review): the rules below expand `${SYD_GID}` and `${HOME}`;
/// confirm that the variable names listed above match what is actually
/// exported (`GID` vs. `SYD_GID`).
///
/// Builds on the `linux`, `local`, `rand` and `tty` profiles.
pub const PROFILE_USER: &[&str] = &[
    "include_profile linux",
    "include_profile local",
    "include_profile rand",
    "include_profile tty",
    // Path hiding is off by default,
    // here we set it on for secure defaults.
    // The user may override with SYD_USER_LPATH.
    "sandbox/lpath:${SYD_USER_LPATH:-on}",
    // dmesg(8)
    "trace/allow_safe_syslog:true",
    // TPE
    "tpe/negate:1",
    "tpe/user_owned:1",
    "tpe/gid:${SYD_GID}",
    // Enforce strict file modes:
    // Disallow executable bits to enhance W^X.
    // Disallow s{u,g}id bits on files.
    // Disallow setting group+other bits.
    "trace/force_umask:7177",
    // /home
    // 1. We allow read(${HOME}) but not write(${HOME}),
    //    read|write(${HOME}/**) is ok, i.e. the user can not delete
    //    their home directory under the sandbox which is a nice and
    //    funny protection.
    // 2. We disallow all dotfiles except history.
    // 3. We mark shell-history files as append-only.
    "allow/all+${HOME}/**",
    "allow/lpath,rpath+${HOME}/***",
    "deny/all+${HOME}/**/.*/***",
    "allow/all+${HOME}/**/.*history*",
    "append+${HOME}/.*history",
];

/// Common Linux system profile, used by oci, paludis and user profiles.
///
/// The rules are grouped by area: filesystem defaults, exec paths,
/// network, `/dev`, `/proc` (allow rules plus masking), `/sys` and
/// `/boot` masking, per-process `/proc/self` entries, `/run`, `/var`,
/// `/etc` and `/home`.
pub const PROFILE_LINUX: &[&str] = &[
    // Defaults for Filesystem sandboxing.
    "include_profile fs",
    // List root is safe.
    "allow/stat,chdir,readdir+/",
    // Safe defaults for Exec sandboxing
    "allow/lpath,rpath,exec+/bin/***",
    "allow/lpath,rpath,exec+/sbin/***",
    "allow/lpath,rpath,exec+/lib*/***",
    "allow/lpath,rpath,exec+/usr/***",
    "allow/lpath,rpath,exec+/opt/***",
    // SAFETY: Do not leak kernel config/module information.
    // This overrides the lib rule above.
    "deny/lpath,rpath,exec+/lib*/modules/***",
    // Safe defaults for Network sandboxing
    "allow/net/link+route", // allow NETLINK_ROUTE.
    "allow/cnet+/run/nscd/socket",
    "allow/cnet+/var/run/nscd/socket",
    "allow/cnet+/var/lib/sss/pipes/nss",
    "allow/cnet+loopback!65535", // getaddrinfo() with AI_ADDRCONFIG on musl.
    // /dev
    "allow/lpath,rpath+/dev",
    "allow/lpath,rpath+/dev/fd",
    // As of version 3.39.0, Linux profile omits bnet, exec, ioctl,
    // spath and fown access for public directories.
    // Paludis profile overrides this as it may be needed
    // during package testing.
    "allow/lpath,rpath,mktemp+/dev/shm/***",
    "allow/lpath,rpath,mktemp+/var/tmp/***",
    "allow/lpath,rpath,mktemp+/tmp/***",
    "allow/lpath,rpath,wpath,cpath,tpath,fattr+/dev/shm/**",
    "allow/lpath,rpath,wpath,cpath,tpath,fattr+/var/tmp/**",
    "allow/lpath,rpath,wpath,cpath,tpath,fattr+/tmp/**",
    "allow/lpath,rpath,wpath+/dev/full",
    "allow/lpath,rpath,wpath+/dev/zero",
    "allow/lpath,rpath,wpath+/dev/null",
    "allow/lpath,rpath,wpath+/dev/stdin",
    "allow/lpath,rpath,wpath+/dev/stdout",
    "allow/lpath,rpath,wpath+/dev/stderr",
    "allow/lpath,rpath,wpath+/dev/random",
    "allow/lpath,rpath,wpath+/dev/urandom",
    // /proc
    //
    // SAFETY: Note we allow readdir to `/proc`,
    // however we do _not_ allow PID traversals,
    // i.e. identifying fellow processes by readdir'ing `/proc`.
    // Notably, this is allowed in the `paludis` profile.
    "allow/lpath,rpath+/proc",
    // SAFETY: We allow basic stat access (not global but per-process and per-task).
    // trace/allow_unsafe_magiclinks:0 default is another layer of
    // protection against potential malicious activity with proc
    // magiclinks. Paludis profile allows global stat access for /proc/stat.
    "allow/lpath,rpath+/proc/self/stat",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/stat",
    "allow/lpath,rpath+/proc/self/statm",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/statm",
    "allow/lpath,rpath+/proc/self/status",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/status",
    // SAFETY: Mask global info/stat paths as an extra layer such as:
    // /proc/buddyinfo, /proc/cpuinfo, /proc/meminfo, /proc/pagetypeinfo,
    // /proc/slabinfo, /proc/vmallocinfo, /proc/zoneinfo,
    // /proc/diskstats, /proc/schedstat, /proc/stat, /proc/vmstat etc.
    "mask+/proc/*info*/***:/dev/null:/var/empty",
    "mask+/proc/*stat*/***:/dev/null:/var/empty",
    // SAFETY: We provide an empty file in place of /proc/cmdline
    // for compatibility rather than denying outright. In any case,
    // syd does not leak the contents of this file to the sandbox
    // process. See: https://gitlab.exherbo.org/sydbox/sydbox/-/issues/106
    "allow/lpath,rpath+/proc/cmdline",
    "mask+/proc/cmdline",
    // SAFETY: /proc/version is part of uname(2) handling as of 3.43.1.
    "allow/lpath,rpath+/proc/version",
    // SAFETY: Further masking for sa{f,n}e defaults.
    "mask+/proc/interrupts",
    "mask+/proc/kcore",
    "mask+/proc/keys",
    "mask+/proc/acpi/***:/dev/null:/var/empty",
    "mask+/proc/asound/***:/dev/null:/var/empty",
    "mask+/proc/bus/***:/dev/null:/var/empty",
    "mask+/proc/driver/***:/dev/null:/var/empty",
    // /proc/dynamic_debug
    // SAFETY: Provide wider debug glob for future-safety.
    "mask+/proc/*debug*/***:/dev/null:/var/empty",
    "mask+/proc/fs/***:/dev/null:/var/empty",
    "mask+/proc/irq/***:/dev/null:/var/empty",
    // /proc/latency_stats, /proc/timer_list, /proc/timer_stats etc.
    "mask+/proc/latency_*/***:/dev/null:/var/empty",
    "mask+/proc/timer_*/***:/dev/null:/var/empty",
    "mask+/proc/*_stats/***:/dev/null:/var/empty",
    "mask+/proc/pressure/***:/dev/null:/var/empty",
    // /proc/schedstat, /proc/sched_debug etc.
    "mask+/proc/sched*/***:/dev/null:/var/empty",
    "mask+/proc/scsi/***:/dev/null:/var/empty",
    // SAFETY: Modification of /proc/sys/kernel/modprobe requires no
    // capabilities (can cause arbitrary code to be inserted into the
    // kernel via a replacement modprobe)!
    // https://forums.grsecurity.net/viewtopic.php?f=7&t=2522
    //
    // As of 3.36.0 we extend this to protect more such as:
    // /proc/sys, /proc/sysvipc, /proc/sysrq-trigger etc.
    "mask+/proc/sys*/***:/dev/null:/var/empty",
    "mask+/proc/tty/***:/dev/null:/var/empty",
    // SAFETY: Mask boot_id with random UUID.
    // This prevents leaking host boot id unintentionally.
    // getrandom(2) is already in VDSO so allowing random UUID
    // file does not give any more power to the attacker.
    "allow/lpath,rpath+/proc/sys",
    "allow/lpath,rpath+/proc/sys/kernel",
    "allow/lpath,rpath+/proc/sys/kernel/random",
    "allow/lpath,rpath+/proc/sys/kernel/random/*id",
    "mask+/proc/sys/kernel/random/boot_id:/proc/sys/kernel/random/uuid",
    "mask+/proc/sys/kernel/random/uuid:/proc/sys/kernel/random/uuid",
    // Done /proc masking, move on!
    //
    // Wait, there is more!
    // SAFETY: Mask critical /sys paths as an extra layer.
    // Extra layer because this profile allows no access
    // to the sysfs directory otherwise.
    "mask+/sys/dev/***:/dev/null:/var/empty",
    "mask+/sys/devices/***:/dev/null:/var/empty",
    "mask+/sys/firmware/***:/dev/null:/var/empty",
    "mask+/sys/fs/***:/dev/null:/var/empty",
    // SAFETY: Provide an empty file in place of /sys/kernel/notes.
    // See: https://lwn.net/Articles/962782/
    // As of 3.36.0, we extend it to the directory recursively.
    "mask+/sys/kernel/***:/dev/null:/var/empty",
    // Done /sys masking, move on!
    //
    // Wait, there is even more!
    // SAFETY: Mask the critical /boot directory as an extra layer.
    // Extra layer because this profile allows no access to the
    // /boot directory. This is consistent with the immutable profile.
    "mask+/boot/***:/dev/null:/var/empty",
    // Done /boot masking, move on!
    //
    // Per-process and per-task /proc entries.
    "allow/lpath,rpath+/proc/self",
    "allow/lpath,rpath+/proc/thread-self",
    "allow/lpath,rpath+/proc/self/comm",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/comm",
    "allow/lpath,rpath+/proc/self/cmdline",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/cmdline",
    "allow/lpath,rpath+/proc/self/fd",
    "allow/lpath,rpath+/proc/self/fdinfo",
    "allow/lpath,rpath+/proc/self/task",
    "allow/lpath,rpath+/proc/self/task/[0-9]*",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/fd",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/fdinfo",
    // SAFETY: Avoid assigning wpath and ioctl sets
    // on /proc/self/{cwd,exe,root} magic symlinks
    // to provide hardening against container breaks.
    "allow/lpath,rpath+/proc/self/cwd",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/cwd",
    "allow/lpath,rpath+/proc/self/exe",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/exe",
    "allow/lpath,rpath+/proc/self/root",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/root",
    "allow/lpath,rpath+/proc/self/fdinfo/[0-9]*",
    "allow/lpath,rpath+/proc/self/task/[0-9]*/fdinfo/[0-9]*",
    "allow/lpath,rpath,wpath+/proc/self/fd/[0-9]*",
    "allow/lpath,rpath,wpath+/proc/self/task/[0-9]*/fd/[0-9]*",
    // Raw strings: the square brackets are backslash-escaped in the pattern.
    r"allow/lpath,rpath,wpath+/proc/self/fd/anon_inode:\[pidfd\]",
    r"allow/lpath,rpath,wpath+/proc/self/fd/pipe:\[[0-9]*\]",
    r"allow/lpath,rpath,wpath+/proc/self/fd/socket:\[[0-9]*\]",
    r"allow/lpath,rpath,wpath+/proc/self/task/[0-9]*/fd/anon_inode:\[pidfd\]",
    r"allow/lpath,rpath,wpath+/proc/self/task/[0-9]*/fd/pipe:\[[0-9]*\]",
    r"allow/lpath,rpath,wpath+/proc/self/task/[0-9]*/fd/socket:\[[0-9]*\]",
    // /run
    "allow/lpath,rpath+/run",
    "allow/lpath,rpath+/run/systemd",
    "allow/lpath,rpath+/run/systemd/resolve",
    "allow/lpath,rpath+/run/systemd/resolve/*.conf",
    // /var
    "allow/lpath,rpath+/var",
    "allow/lpath,rpath+/var/lib/sss/mc/passwd", // Required for LDAP.
    "allow/lpath,rpath+/var/lib/sss/mc/group",  // ditto.
    "allow/lpath,rpath,chroot+/var/empty",      // Allow daemons to chroot.
    // /etc
    "allow/lpath,rpath+/etc",
    "allow/lpath,rpath+/etc/DIR_COLORS",
    "allow/lpath,rpath+/etc/GREP_COLORS",
    "allow/lpath,rpath+/etc/bash*/***",
    "allow/lpath,rpath+/etc/alternatives/***",
    "allow/lpath,rpath+/etc/ca-certificates/***",
    "allow/lpath,rpath+/etc/env.d/***",
    "allow/lpath,rpath+/etc/groff/***",
    "allow/lpath,rpath+/etc/ld.so.conf.d/***",
    "allow/lpath,rpath+/etc/environment",
    "allow/lpath,rpath+/etc/ethertypes",
    "allow/lpath,rpath+/etc/gai.conf",
    "allow/lpath,rpath+/etc/group",
    "allow/lpath,rpath+/etc/hosts",
    "allow/lpath,rpath+/etc/inputrc",
    "allow/lpath,rpath+/etc/issue",
    "allow/lpath,rpath+/etc/ld*",
    "allow/lpath,rpath+/etc/locale.alias",
    "allow/lpath,rpath+/etc/locale.conf",
    "allow/lpath,rpath+/etc/localtime",
    "mask+/etc/localtime:/usr/share/zoneinfo/UTC", // prevent leaking the timezone.
    "mask+/usr/share/zoneinfo/**:/usr/share/zoneinfo/UTC:/var/empty", // ditto.
    "allow/lpath,rpath+/etc/machine-id", // randomized, unless trace/allow_unsafe_machine_id:1.
    "allow/lpath,rpath+/etc/hostid",     // ditto.
    "allow/lpath,rpath+/var/adm/hostid", // ditto.
    "allow/lpath,rpath+/etc/man_db.conf",
    "allow/lpath,rpath+/etc/nanorc",
    "allow/lpath,rpath+/etc/**/nsswitch.conf",
    "allow/lpath,rpath+/etc/passwd",
    "allow/lpath,rpath+/etc/php*",
    "allow/lpath,rpath+/etc/php*/**/*.ini",
    "allow/lpath,rpath+/etc/profile*/***",
    "deny/lpath,rpath+/etc/profile*/*systemd*", // mitigate init identification.
    "allow/lpath,rpath+/etc/services",
    "allow/lpath,rpath+/etc/*-release",
    "allow/lpath,rpath+/etc/protocols",
    "allow/lpath,rpath+/etc/resolv.conf",
    "allow/lpath,rpath+/etc/skel/***",
    "allow/stat,chdir,readdir+/etc/ssl",
    "allow/lpath,rpath+/etc/ssl/certs/***",
    "allow/lpath,rpath+/etc/ssl/misc/***",
    "allow/lpath,rpath+/etc/ssl/openssl.cnf",
    "allow/lpath,rpath+/etc/terminfo/***",
    "allow/lpath,rpath+/etc/zsh/***",
    // /home
    //
    // Do _not_ allow readdir which allows enumerating other users!
    "allow/lpath,chdir+/home",
    // Defaults for Network sandboxing:
    //
    // Allow network access to unnamed UNIX sockets.
    "allow/net+!unnamed",
];

/// Profile to set safe defaults for Filesystem sandboxing.
///
/// This is used by the linux profile.
///
/// The profile turns Filesystem sandboxing on, allows every filesystem
/// type, and then denies the types listed in the groups below.
pub const PROFILE_FS: &[&str] = &[
    "sandbox/fs:on",
    "allow/fs+all",
    // Kernel/Security Filesystems
    "deny/fs+aafs,bpf_fs,securityfs,selinux,smack",
    // Debugging/Tracing Filesystems
    "deny/fs+debugfs,pstorefs,tracefs",
    // Resource Control Filesystems
    "deny/fs+cgroup,cgroup2,nsfs,pid_fd,rdtgroup",
    // Firmware/Low-Level Filesystems
    "deny/fs+devmem,efivarfs,hostfs,mtd_inode_fs,openprom",
    // Memory/Device Filesystems
    "deny/fs+daxfs,secretmem",
    // Miscellaneous Filesystems
    "deny/fs+bdevfs,binderfs,usbdevice,xenfs,zonefs",
];

/// Profile to allowlist TTY ioctls without path check, used by oci,
/// paludis and user profiles.
/// Syd sets the environment variable TTY before parsing this profile.
///
/// The rules come in two layers for the same set of terminal devices:
/// `allow/lock/...` rules for Landlock, then path rules for seccomp.
pub const PROFILE_TTY: &[&str] = &[
    // Allow safe ioctl requests without path check.
    "include_profile tty_native",
    // TTY lock rules to be used with Landlock.
    // NOTE(review): the rules read `${SYD_TTY}` whereas the doc comment
    // above mentions `TTY`; confirm which variable name is current.
    "allow/lock/read,ioctl,write+/dev/console",
    "allow/lock/read,ioctl,write+/dev/tty",
    "allow/lock/read,ioctl,write+/dev/ptmx",
    "allow/lock/read,ioctl,write+${SYD_TTY}",
    "allow/lock/rpath,ioctl,write+/dev/pts",
    // TTY seccomp rules to act as the second layer.
    "allow/lpath,rpath,wpath+/dev/console",
    "allow/lpath,rpath,wpath+/dev/tty",
    "allow/lpath,rpath,wpath+/dev/ptmx",
    "allow/lpath,rpath,wpath+${SYD_TTY}",
    "allow/lpath,rpath+/dev/pts",
    "allow/lpath,rpath,wpath+/dev/pts/ptmx",
    "allow/lpath,rpath,wpath+/dev/pts/[0-9]*",
];

/// Profile to allowlist TTY-native ioctls without path check.
///
/// Each rule allows exactly one ioctl request by name. The list is kept
/// in alphabetical order with every request appearing once, so that a
/// request is easy to locate and an accidental double entry stands out.
pub const PROFILE_TTY_NATIVE: &[&str] = &[
    // Termios requests (TC*).
    "allow/ioctl+TCFLSH",
    "allow/ioctl+TCGETA",
    "allow/ioctl+TCGETS",
    "allow/ioctl+TCGETS2",
    "allow/ioctl+TCGETX",
    "allow/ioctl+TCSBRK",
    "allow/ioctl+TCSBRKP",
    "allow/ioctl+TCSETA",
    "allow/ioctl+TCSETAF",
    "allow/ioctl+TCSETAW",
    "allow/ioctl+TCSETS",
    "allow/ioctl+TCSETS2",
    "allow/ioctl+TCSETSF",
    "allow/ioctl+TCSETSF2",
    "allow/ioctl+TCSETSW",
    "allow/ioctl+TCSETSW2",
    "allow/ioctl+TCSETX",
    "allow/ioctl+TCSETXF",
    "allow/ioctl+TCSETXW",
    "allow/ioctl+TCXONC",
    // Terminal requests (TIOC*).
    "allow/ioctl+TIOCCBRK",
    "allow/ioctl+TIOCGDEV",
    "allow/ioctl+TIOCGEXCL",
    "allow/ioctl+TIOCGLCKTRMIOS",
    "allow/ioctl+TIOCGPGRP",
    "allow/ioctl+TIOCGPKT",
    "allow/ioctl+TIOCGPTLCK",
    "allow/ioctl+TIOCGPTN",
    "allow/ioctl+TIOCGPTPEER",
    "allow/ioctl+TIOCGSID",
    "allow/ioctl+TIOCGWINSZ",
    "allow/ioctl+TIOCPKT",
    "allow/ioctl+TIOCSBRK",
    "allow/ioctl+TIOCSCTTY",
    "allow/ioctl+TIOCSIG",
    "allow/ioctl+TIOCSLCKTRMIOS",
    "allow/ioctl+TIOCSPGRP",
    "allow/ioctl+TIOCSPTLCK",
    "allow/ioctl+TIOCSWINSZ",
    "allow/ioctl+TIOCVHANGUP",
];

/// Profile to allowlist KVM ioctls without path check.
/// Read: <https://www.kernel.org/doc/Documentation/virtual/kvm/api.txt>
///
/// The profile grants write access to `/dev/kvm`, pulls in the
/// arch-native request list via `kvm_native`, and then allows the
/// named KVM requests below. Every request is listed exactly once.
pub const PROFILE_KVM: &[&str] = &[
    "allow/wpath+/dev/kvm",
    "include_profile kvm_native",
    "allow/ioctl+KVM_GET_API_VERSION",
    "allow/ioctl+KVM_CREATE_VM",
    "allow/ioctl+KVM_GET_VCPU_MMAP_SIZE",
    "allow/ioctl+KVM_CREATE_VCPU",
    "allow/ioctl+KVM_GET_DIRTY_LOG",
    "allow/ioctl+KVM_SET_TSS_ADDR",
    "allow/ioctl+KVM_RUN",
    "allow/ioctl+KVM_NMI",
    "allow/ioctl+KVM_CHECK_EXTENSION",
    "allow/ioctl+KVM_GET_TSC_KHZ",
    "allow/ioctl+KVM_SET_TSC_KHZ",
    "allow/ioctl+KVM_INTERRUPT",
    "allow/ioctl+KVM_SET_MSRS",
    "allow/ioctl+KVM_SET_USER_MEMORY_REGION",
    "allow/ioctl+KVM_SET_REGS",
    "allow/ioctl+KVM_SET_SREGS",
    "allow/ioctl+KVM_GET_MSRS",
    "allow/ioctl+KVM_GET_REGS",
    "allow/ioctl+KVM_GET_SREGS",
    "allow/ioctl+KVM_GET_SUPPORTED_CPUID",
    "allow/ioctl+KVM_GET_EMULATED_CPUID",
    "allow/ioctl+KVM_SET_CPUID2",
    "allow/ioctl+KVM_SET_SIGNAL_MASK",
    "allow/ioctl+KVM_GET_VCPU_EVENTS",
    "allow/ioctl+KVM_SET_VCPU_EVENTS",
    "allow/ioctl+KVM_SET_IDENTITY_MAP_ADDR",
    "allow/ioctl+KVM_CREATE_IRQCHIP",
    "allow/ioctl+KVM_IRQ_LINE",
    "allow/ioctl+KVM_REGISTER_COALESCED_MMIO",
    "allow/ioctl+KVM_UNREGISTER_COALESCED_MMIO",
    "allow/ioctl+KVM_SET_GSI_ROUTING",
    "allow/ioctl+KVM_IRQFD",
    "allow/ioctl+KVM_IOEVENTFD",
    "allow/ioctl+KVM_GET_MP_STATE",
    "allow/ioctl+KVM_SET_MP_STATE",
    "allow/ioctl+KVM_SIGNAL_MSI",
    "allow/ioctl+KVM_SET_GUEST_DEBUG",
    // Device API: creation plus the set/get/has attribute trio.
    "allow/ioctl+KVM_CREATE_DEVICE",
    "allow/ioctl+KVM_SET_DEVICE_ATTR",
    "allow/ioctl+KVM_GET_DEVICE_ATTR",
    "allow/ioctl+KVM_HAS_DEVICE_ATTR",
];

/// Profile to allowlist KVM arch-native ioctls without path check.
///
/// This definition is compiled for arm and aarch64 targets only.
/// Requests are given as raw numbers; the trailing comment on each
/// line names the constant the value stands for.
#[cfg(any(target_arch = "arm", target_arch = "aarch64"))]
pub const PROFILE_KVM_NATIVE: &[&str] = &[
    "allow/ioctl+0x4680aea3",         // _KVM_ENABLE_CAP
    "allow/ioctl+0x4010aeab",         // _KVM_GET_ONE_REG
    "allow/ioctl+0x4010aeac",         // _KVM_SET_ONE_REG
    "allow/ioctl+0xc008aeb0",         // _KVM_GET_REG_LIST
    "allow/ioctl+5",                  // _KVM_ARM_TARGET_GENERIC_V8
    "allow/ioctl+0x8020aeaf",         // _KVM_ARM_PREFERRED_TARGET
    "allow/ioctl+0x4020aeae",         // _KVM_ARM_VCPU_INIT
    "allow/ioctl+0x4040aec2",         // _KVM_ARM_VCPU_FINALIZE
    "allow/ioctl+0x6030000000100042", // _KVM_ARM64_REGS_PSTATE
    "allow/ioctl+0x6030000000100044", // _KVM_ARM64_REGS_SP_EL1
    "allow/ioctl+0x6030000000100000", // _KVM_ARM64_REGS_R0
    "allow/ioctl+0x6030000000100002", // _KVM_ARM64_REGS_R1
    "allow/ioctl+0x6030000000100004", // _KVM_ARM64_REGS_R2
    "allow/ioctl+0x6030000000100006", // _KVM_ARM64_REGS_R3
    "allow/ioctl+0x6030000000100010", // _KVM_ARM64_REGS_R8
    "allow/ioctl+0x6030000000100024", // _KVM_ARM64_REGS_R18
    "allow/ioctl+0x6030000000100040", // _KVM_ARM64_REGS_PC
    "allow/ioctl+0x603000000013c510", // _KVM_ARM64_REGS_MAIR_EL1
    "allow/ioctl+0x603000000013c102", // _KVM_ARM64_REGS_TCR_EL1
    "allow/ioctl+0x603000000013c100", // _KVM_ARM64_REGS_TTBR0_EL1
    "allow/ioctl+0x603000000013c101", // _KVM_ARM64_REGS_TTBR1_EL1
    "allow/ioctl+0x603000000013c080", // _KVM_ARM64_REGS_SCTLR_EL1
    "allow/ioctl+0x603000000013c082", // _KVM_ARM64_REGS_CPACR_EL1
    "allow/ioctl+0x603000000013c600", // _KVM_ARM64_REGS_VBAR_EL1
    "allow/ioctl+0x603000000013df1a", // _KVM_ARM64_REGS_TIMER_CNT
    "allow/ioctl+0x603000000013df00", // _KVM_ARM64_REGS_CNTFRQ_EL0
    "allow/ioctl+0x6030000000138012", // _KVM_ARM64_REGS_MDSCR_EL1
    "allow/ioctl+0x603000000013c708", // _KVM_ARM64_REGS_CNTKCTL_EL1
    "allow/ioctl+0x603000000013c684", // _KVM_ARM64_REGS_TPIDR_EL1
];

/// Profile to allowlist KVM arch-native ioctls without path check.
///
/// This definition is compiled for x86 and x86_64 targets only.
/// Requests are given as raw numbers; the trailing comment on each
/// line names the constant the value stands for.
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
pub const PROFILE_KVM_NATIVE: &[&str] = &[
    // NOTE(review): the next two rules carry the same value (0x0), so
    // the second one repeats the first; confirm both are intended.
    "allow/ioctl+0x0",        // _KVM_VCPU_TSC_CTRL
    "allow/ioctl+0x0",        // _KVM_VCPU_TSC_OFFSET
    "allow/ioctl+0xc004ae02", // _KVM_GET_MSR_INDEX_LIST
    "allow/ioctl+0xc004ae0a", // _KVM_GET_MSR_FEATURE_INDEX_LIST
    "allow/ioctl+0x4040ae77", // _KVM_CREATE_PIT2
    "allow/ioctl+0xc208ae62", // _KVM_GET_IRQCHIP
    "allow/ioctl+0x4208ae63", // _KVM_SET_IRQCHIP
    "allow/ioctl+0x4030ae7b", // _KVM_SET_CLOCK
    "allow/ioctl+0x8030ae7c", // _KVM_GET_CLOCK
    "allow/ioctl+0x8070ae9f", // _KVM_GET_PIT2
    "allow/ioctl+0x4070aea0", // _KVM_SET_PIT2
    "allow/ioctl+0xc008aeba", // _KVM_MEMORY_ENCRYPT_OP
    "allow/ioctl+0x8010aebb", // _KVM_MEMORY_ENCRYPT_REG_REGION
    "allow/ioctl+0x8010aebc", // _KVM_MEMORY_ENCRYPT_UNREG_REGION
    "allow/ioctl+0xc018ae85", // _KVM_TRANSLATE
    "allow/ioctl+0x81a0ae8c", // _KVM_GET_FPU
    "allow/ioctl+0x41a0ae8d", // _KVM_SET_FPU
    "allow/ioctl+0x8400ae8e", // _KVM_GET_LAPIC
    "allow/ioctl+0x4400ae8f", // _KVM_SET_LAPIC
    "allow/ioctl+0x4008ae90", // _KVM_SET_CPUID2
    "allow/ioctl+0xc008ae91", // _KVM_GET_CPUID2
    "allow/ioctl+0x8080aea1", // _KVM_GET_DEBUGREGS
    "allow/ioctl+0x4080aea2", // _KVM_SET_DEBUGREGS
    "allow/ioctl+0x9000aea4", // _KVM_GET_XSAVE
    "allow/ioctl+0x5000aea5", // _KVM_SET_XSAVE
    "allow/ioctl+0x8188aea6", // _KVM_GET_XCRS
    "allow/ioctl+0x4188aea7", // _KVM_SET_XCRS
    "allow/ioctl+0xaead",     // _KVM_KVMCLOCK_CTRL
];

/// Profile to allowlist KVM arch-native ioctls without path check.
///
/// Empty fallback, compiled for every target other than arm, aarch64,
/// x86 and x86_64.
#[cfg(not(any(
    target_arch = "arm",
    target_arch = "x86",
    target_arch = "x86_64",
    target_arch = "aarch64"
)))]
pub const PROFILE_KVM_NATIVE: &[&str] = &[];

/// Profile granting access to an immutable /nix/store.
///
/// Readdir on /nix/store is denied so that paths outside of what
/// should be the runtime closure cannot be discovered.
pub const PROFILE_NIX_STORE: &[&str] = &[
    // Lookup, read and exec for entries beneath the store.
    "allow/lpath,rpath,exec+/nix/store/**",
];

/// System calls of the eBPF family.
pub const EBPF_SYSCALLS: &[&str] = &[
    // Sole member of the family.
    "bpf",
];

/// The list of mount family syscalls.
///
/// Covers the classic `mount`/`umount` calls as well as the newer
/// `fs*`, `*_tree`, `move_mount`, `listmount` and `statmount` calls.
pub const MOUNT_SYSCALLS: &[&str] = &[
    "fsconfig",
    "fsmount",
    "fsopen",
    "fspick",
    "listmount",
    "mount",
    "mount_setattr",
    "move_mount",
    "open_tree",
    "open_tree_attr",
    "statmount",
    "umount",
    "umount2",
];

/// The list of message queue syscalls.
///
/// All entries are `mq_*` calls, including the `_time64` variants of
/// the timed send and receive calls.
pub const MSGQUEUE_SYSCALLS: &[&str] = &[
    "mq_getsetattr",
    "mq_notify",
    "mq_open",
    "mq_timedreceive",
    "mq_timedreceive_time64",
    "mq_timedsend",
    "mq_timedsend_time64",
    "mq_unlink",
];

/// The list of shared memory syscalls.
///
/// Despite the name, the list is not limited to the `shm*` calls: it
/// also holds the `msg*` and `sem*` calls and the `ipc` call.
pub const SHM_SYSCALLS: &[&str] = &[
    "ipc",
    // msg* calls.
    "msgget",
    "msgsnd",
    "msgrcv",
    "msgctl",
    // sem* calls.
    "semget",
    "semop",
    "semctl",
    "semtimedop",
    "semtimedop_time64",
    // shm* calls.
    "shmat",
    "shmctl",
    "shmdt",
    "shmget",
];

/// The list of process, i/o priority changing system calls.
///
/// Only the setters are listed; no `get*` counterpart appears here.
pub const NICE_SYSCALLS: &[&str] = &[
    "ioprio_set",
    "sched_setattr",
    "sched_setscheduler",
    "sched_setparam",
    "setpriority",
];

/// System calls that query the page cache.
pub const PAGE_CACHE_SYSCALLS: &[&str] = &[
    // Both calls belong to the page cache set.
    "cachestat",
    "mincore",
];

/// The list of perf/debugging system calls.
///
/// NOTE(review): several names (`rtas`, `s390_runtime_instr`,
/// `sys_debug_setcontext`) look architecture-specific; presumably they
/// are ignored where the call does not exist. Confirm at the use site.
pub const PERF_SYSCALLS: &[&str] = &[
    "lookup_dcookie",
    "perf_event_open",
    "rtas",
    "s390_runtime_instr",
    "sys_debug_setcontext",
];

/// The list of ptrace system calls.
///
/// Besides `ptrace` itself, the list holds the other calls that are
/// grouped with it here: `kcmp`, `pidfd_getfd`, `process_madvise` and
/// the `process_vm_*` pair.
pub const PTRACE_SYSCALLS: &[&str] = &[
    "kcmp",
    "pidfd_getfd",
    "ptrace",
    "process_madvise",
    "process_vm_readv",
    "process_vm_writev",
];

/// The list of system calls which are of epoll family.
///
/// The two `epoll_create*` calls are deliberately left out; see the
/// SAFETY note inside the list.
pub const EPOLL_SYSCALLS: &[&str] = &[
    // SAFETY: epoll fd is created once at startup,
    // before the seccomp filters have been loaded.
    // "epoll_create",
    // "epoll_create1",
    "epoll_ctl",
    "epoll_ctl_old",
    "epoll_wait",
    "epoll_wait_old",
    "epoll_pwait",
    "epoll_pwait2",
];

/// System calls of the UTS family.
pub const UTS_SYSCALLS: &[&str] = &[
    // Setters for the domain name and the host name.
    "setdomainname",
    "sethostname",
];

/// The list of system calls which are of the getid family.
///
/// Every call is listed together with its `32`-suffixed variant.
pub const GET_ID_SYSCALLS: &[&str] = &[
    "getuid",
    "getuid32",
    "getgid",
    "getgid32",
    "geteuid",
    "geteuid32",
    "getegid",
    "getegid32",
    "getresuid",
    "getresuid32",
    "getresgid",
    "getresgid32",
    "getgroups",
    "getgroups32",
];

/// System calls of the fadvise family.
pub const FADVISE_SYSCALLS: &[&str] = &[
    // The arm-prefixed variant comes first, then the two generic names.
    "arm_fadvise64_64",
    "fadvise64",
    "fadvise64_64",
];

/// The list of system calls which are of the futex family.
///
/// The `swapcontext` and `sys_debug_swapcontext` calls are grouped
/// into this list as well.
pub const FUTEX_SYSCALLS: &[&str] = &[
    "futex",
    "futex_requeue",
    "futex_time64",
    "futex_wait",
    "futex_waitv",
    "futex_wake",
    "swapcontext",
    "sys_debug_swapcontext",
];

/// The list of system calls which are of the setid family.
///
/// Every call is listed together with its `32`-suffixed variant.
/// The `setgroups*` and `setfs*id*` calls are not part of this list.
pub const SET_ID_SYSCALLS: &[&str] = &[
    "setuid",
    "setuid32",
    "setgid",
    "setgid32",
    "setreuid",
    "setreuid32",
    "setregid",
    "setregid32",
    "setresuid",
    "setresuid32",
    "setresgid",
    "setresgid32",
];

/// Identity-changing system calls that are not supported.
///
/// SAFETY: We do not support diverging FsID from Effective ID.
/// SAFETY: We do not support setgroups (due to pointer deref -> TOCTOU vector)
pub const UNSAFE_ID_SYSCALLS: &[&str] = &[
    // Plain and 32-suffixed variant of the same call.
    "setgroups",
    "setgroups32",
];

/// The list of system calls which are in vDSO.
///
/// This set is always allowed for backwards compatibility.
///
/// The last entry is only part of the list on riscv64 targets.
pub const VDSO_SYSCALLS: &[&str] = &[
    "clock_getres",
    "clock_getres_time64",
    "clock_gettime",
    "clock_gettime64",
    "getcpu",
    "getrandom",
    "gettimeofday",
    "time",
    "uretprobe", // always allowed with special exception.
    // Conditional element: compiled in for riscv64 only.
    #[cfg(target_arch = "riscv64")]
    "riscv_hwprobe",
];

/// The list of system calls which syd has a `UNotify` callback for.
///
/// The entries are not sorted; the comments inside the list are only
/// rough groupings to help navigation.
pub const HOOK_SYSCALLS: &[&str] = &[
    // Sockets.
    "accept",
    "accept4",
    "bind",
    "connect",
    "getpeername",
    "getsockname",
    "getsockopt",
    "recvfrom",
    "recvmsg",
    "recvmmsg",
    "recvmmsg_time64",
    "sendto",
    "sendmsg",
    "sendmmsg",
    "socket",
    "socketcall",
    "socketpair",
    // Program execution.
    "execve",
    "execveat",
    // Directory listing.
    "getdents64",
    // File status and access checks.
    "stat",
    "access",
    "fstat",
    "fstat64",
    "fstatat64",
    "lstat",
    "newfstatat",
    "stat64",
    "statx",
    "faccessat",
    "faccessat2",
    // Working directory.
    "chdir",
    "fchdir",
    // Mode and ownership changes.
    "chmod",
    "fchmod",
    "fchmodat",
    "fchmodat2",
    "fchown",
    "fchown32",
    "chown",
    "lchown",
    "fchownat",
    // Creating and removing directory entries.
    "creat",
    "link",
    "symlink",
    "unlink",
    "linkat",
    "symlinkat",
    "unlinkat",
    "mkdir",
    "rmdir",
    "mkdirat",
    "mknod",
    "mknodat",
    // Opening files.
    "open",
    "openat",
    "openat2",
    // Reading symbolic links.
    "readlink",
    "readlinkat",
    // Renames.
    "rename",
    "renameat",
    "renameat2",
    // Timestamps.
    "utime",
    "utimes",
    "futimesat",
    "utimensat",
    // Truncation.
    "truncate",
    "truncate64",
    "ftruncate",
    "ftruncate64",
    // Extended attributes.
    "getxattr",
    "getxattrat",
    "fgetxattr",
    "lgetxattr",
    "setxattr",
    "setxattrat",
    "fsetxattr",
    "lsetxattr",
    "listxattr",
    "listxattrat",
    "flistxattr",
    "llistxattr",
    "removexattr",
    "removexattrat",
    "fremovexattr",
    "lremovexattr",
    // Device and process control.
    "ioctl",
    "prctl",
    // Signals and pidfds.
    "kill",
    "tkill",
    "tgkill",
    "rt_sigqueueinfo",
    "rt_tgsigqueueinfo",
    "pidfd_open",
    // Memory.
    "brk",
    "mmap",
    "mmap2",
    "mremap",
    // Resource limits.
    "setrlimit",
    "prlimit64",
    // Filesystem status.
    "statfs",
    "statfs64",
    "fstatfs",
    "fstatfs64",
    // Miscellaneous.
    "fallocate",
    "uname",
    "fanotify_mark",
    "inotify_add_watch",
    "memfd_create",
    "fcntl",
    "fcntl64",
    "sysinfo",
    "sigaction",
    "rt_sigaction",
    "chroot",
    "syslog",
];

// The list of socketcall(2) subcalls to hook for sandboxing.
//
// This list must be sorted because it's binary searched.
//
// Each value is a subcall number; the trailing comment names the
// subcall. Numbers missing from the sequence (0x4, 0xa, 0xd, 0xe)
// are not part of the list.
pub(crate) const HOOK_SCKCALLS: &[u8] = &[
    0x1,  // socket
    0x2,  // bind
    0x3,  // connect
    0x5,  // accept
    0x6,  // getsockname
    0x7,  // getpeername
    0x8,  // socketpair
    0x9,  // send
    0xb,  // sendto
    0xc,  // recvfrom
    0xf,  // getsockopt
    0x10, // sendmsg
    0x11, // recvmsg
    0x12, // accept4
    0x13, // recvmmsg
    0x14, // sendmmsg
];

/// The list of system calls which are confined by the Stat sandboxing category.
///
/// Note, this list _must_ be sorted!
///
/// The order is plain byte-wise string order, so a name sorts before
/// any longer name it is a prefix of (e.g. `stat` before `stat64`).
pub const STAT_SYSCALLS: &[&str] = &[
    "access",
    "faccessat",
    "faccessat2",
    "fanotify_mark",
    "fgetxattr",
    "flistxattr",
    "fstatat64",
    "fstatfs",
    "fstatfs64",
    "getxattr",
    "getxattrat",
    "inotify_add_watch",
    "lgetxattr",
    "listxattr",
    "listxattrat",
    "llistxattr",
    "lstat",
    "lstat64",
    "newfstatat",
    "stat",
    "stat64",
    "statfs",
    "statfs64",
    "statx",
];

/// System calls confined by the Rename sandboxing category.
///
/// Note, this list _must_ be sorted!
pub const RENAME_SYSCALLS: &[&str] = &[
    // Link calls.
    "link",
    "linkat",
    // Rename calls.
    "rename",
    "renameat",
    "renameat2",
];

/// The list of system calls which are confined by the Truncate sandboxing category.
///
/// Note, this list _must_ be sorted!
///
/// `fallocate` is confined by this category alongside the truncate calls.
pub const TRUNCATE_SYSCALLS: &[&str] = &[
    "fallocate",
    "ftruncate",
    "ftruncate64",
    "truncate",
    "truncate64",
];

/// System calls confined by the Connect network sandboxing category.
///
/// Note, this list _must_ be sorted!
pub const CONNECT_SYSCALLS: &[&str] = &[
    "connect",
    // The send calls confined by the same category.
    "sendmmsg",
    "sendmsg",
    "sendto",
];

/// The list of system calls which are for CPU emulation functionality.
///
/// NOTE(review): the names look architecture-specific; presumably each
/// one is ignored where the call does not exist. Confirm at the use site.
pub const CPU_SYSCALLS: &[&str] = &[
    "modify_ldt",
    "subpage_prot",
    "switch_endian",
    "vm86",
    "vm86old",
];

/// System calls for Kernel keyring access.
pub const KEYRING_SYSCALLS: &[&str] = &[
    // The three keyring entry points.
    "add_key",
    "keyctl",
    "request_key",
];

/// System calls used for memory protection keys.
pub const PKEY_SYSCALLS: &[&str] = &[
    // Key allocation and release.
    "pkey_alloc",
    "pkey_free",
    // Applying a key to a mapping.
    "pkey_mprotect",
];

/// The list of system calls that are denied in syd parent seccomp filter.
/// This filter is inherited by the sandbox process.
/// These system calls must not be used by syd.
///
/// Entries that are commented out are kept as markers: each one notes
/// the option that gates the call instead of denying it here.
pub const DEAD_SYSCALLS: &[&str] = &[
    "_sysctl", // deprecated
    "acct",
    "create_module",
    "delete_module",
    "finit_module",
    "get_kernel_syms",
    "init_module",
    "ioperm",
    "iopl",
    "kexec_file_load",
    "kexec_load",
    "lsm_get_self_attr",
    "lsm_set_self_attr",
    "lsm_list_modules",
    //"mbind", gated behind trace/allow_unsafe_mbind:1
    "migrate_pages",
    "move_pages",
    // As of 3.35.2, mincore may be enabled using
    // trace/allow_unsafe_page_cache:1 at startup.
    //"mincore",
    "name_to_handle_at",
    "nfsservctl",
    "pciconfig_iobase",
    "pciconfig_read",
    "pciconfig_write",
    "query_module",
    "quotactl",
    "quotactl_fd",
    "reboot",
    //"stime", // deprecated use clock_settime, gated behind trace/allow_unsafe_deprecated:1
    "swapoff",
    "swapon",
    "sysfs",
    "uselib",
    // Added in Linux-4.3
    // (Ab)used by most Project Zero Linux kernel exploits.
    "userfaultfd",
    //"ustat", // deprecated, gated behind trace/allow_unsafe_deprecated:1
    "vhangup",
    // https://lore.kernel.org/linux-mm/X+PoXCizo392PBX7@redhat.com/
    // "vmsplice", // gated behind trace/allow_unsafe_vmsplice:1
];

/// The list of system calls which are deprecated.
///
/// Commented-out entries record deprecated calls that are handled
/// elsewhere, with the reason given next to each.
pub const DEPRECATED_SYSCALLS: &[&str] = &[
    //"_sysctl",     // does not exist on Linux>=5.5.
    //"oldfstat",    // fstat(2) is provided by Syd.
    //"oldolduname", // uname(2) is provided by Syd.
    //"olduname",    // ditto.
    "remap_file_pages",
    "setfsgid",
    "setfsgid32",
    "setfsuid",
    "setfsuid32",
    "stime",
    //"uselib", deprecated but critical, moved to dead as of 3.45.1.
    "ustat",
];

/// The list of system calls which are part of time/clock adjustment.
///
/// `stime` is intentionally absent; it is marked as deprecated below.
pub const TIME_SYSCALLS: &[&str] = &[
    "adjtimex",
    "clock_adjtime",
    "clock_adjtime64",
    "clock_settime",
    "clock_settime64",
    "settimeofday",
    //"stime", deprecated.
];

/// System calls for filesystem sync.
/// SAFETY: By default sync(2), syncfs(2) are no-ops to prevent potential local DoS.
pub const SYNC_SYSCALLS: &[&str] = &[
    // The two calls named in the SAFETY note above.
    "sync",
    "syncfs",
];

/// System calls that make up the io_uring interface.
pub const IOURING_SYSCALLS: &[&str] = &[
    // All three io_uring_* calls.
    "io_uring_enter",
    "io_uring_register",
    "io_uring_setup",
];

#[cfg(feature = "prof")]
/// The list of system calls which are used by gperf for profiling.
///
/// Empty in case `prof` feature is disabled.
/// Note we also allow open(2), openat(2), lstat(2), stat(2), and
/// write(2) syscalls for the main thread if profiling is enabled.
///
/// This is the definition compiled when the `prof` feature is enabled.
pub const PROF_SYSCALLS: &[&str] = &["setitimer"];

#[cfg(not(feature = "prof"))]
/// The list of system calls which are used by gperf for profiling.
///
/// Empty in case `prof` feature is disabled.
/// Note we also allow open(2), openat(2), lstat(2), stat(2), and
/// write(2) syscalls for the main thread if profiling is enabled.
///
/// This is the definition compiled when the `prof` feature is disabled,
/// hence the empty list.
pub const PROF_SYSCALLS: &[&str] = &[];

/// The list of system calls which are allowlisted without any filtering.
///
/// Entries left commented out are not part of this list; the note next
/// to each one records why (handled by another list, restricted in
/// `setup_seccomp`, requires a capability, deprecated, and so on).
pub const SAFE_SYSCALLS: &[&str] = &[
    "_llseek",
    "_newselect",
    //"_sysctl", deprecated
    //"accept",
    //"accept4",
    //"acct", // CAP_SYS_PACCT
    //add_key, restrictions applied, see setup_seccomp.
    //adjtimex, restrictions applied, see setup_seccomp.
    //afs_syscall, unimplemented
    "alarm",
    "arch_prctl", // Used during platform-specific initialization by ld-linux.so.
    "arm_fadvise64_64",
    "arm_sync_file_range",
    "atomic_barrier",
    "atomic_cmpxchg_32",
    //"bpf", restrictions applied, see setup_seccomp.
    "breakpoint", // arm
    //"brk",
    //"cachestat", // moved to page_cache syscalls as of 3.35.2.
    "cacheflush", // arm
    "capget",
    "capset",
    //"chdir",
    //"chroot",
    //clock_adjtime, restrictions applied, see setup_seccomp.
    //clock_adjtime64, restrictions applied, see setup_seccomp.
    //"clock_getres", part of vDSO-syscalls.
    //"clock_getres_time64", ditto.
    //"clock_gettime", ditto.
    //"clock_gettime64", ditto.
    "clock_nanosleep",
    "clock_nanosleep_time64",
    //"clock_settime", restrictions applied, see setup_seccomp.
    //"clock_settime64", ditto
    "clone", // CLONE_NEW* is restricted in parent filter.
    // "clone3", // unsafe because namespaces cannot be restricted.
    "close",
    "close_range",
    "copy_file_range",
    //create_module
    //delete_module
    "dup",
    "dup2",
    "dup3",
    "epoll_create",
    "epoll_create1",
    "epoll_ctl",
    "epoll_ctl_old",
    "epoll_pwait",
    "epoll_pwait2",
    "epoll_wait",
    "epoll_wait_old",
    "eventfd",
    "eventfd2",
    "exit",
    "exit_group",
    "fadvise64",
    "fadvise64_64",
    //"fallocate",
    "fanotify_init",
    //"fanotify_mark", device side-channel mitigations
    //"fchdir",
    //"fchmod",
    //"fchown",
    //"fchown32",
    //"fcntl", restrictions applied for appendonly!
    //"fcntl64", ditto
    "fdatasync",
    //finit_module
    "flock",
    "fork",
    //fsconfig
    //fsmount
    //fsopen
    //fspick
    //"fstat",
    //"fstat64",
    //"oldfstat",
    //"fstatfs",
    //"fstatfs64",
    "fsync",
    //"ftruncate",
    //"ftruncate64",
    // "futex", See: FUTEX_SYSCALLS for futex family.
    //get_kernel_syms
    "getpagesize",
    "get_mempolicy",
    "get_robust_list",
    "get_thread_area",
    //"getcpu", part of vDSO-syscalls.
    "getcwd",
    //"getdents",
    //"getdents64",
    //"getegid",
    //"getegid32",
    //"geteuid",
    //"geteuid32",
    //"getgid",
    //"getgid32",
    "getgroups",
    "getgroups32",
    "getitimer",
    //"getpeername",
    "getpgid",
    "getpgrp",
    "getpid",
    "getpmsg",
    "getppid",
    "getpriority",
    //"getrandom", part of vDSO-syscalls.
    "getresgid",
    "getresuid",
    "getrlimit",
    "getrusage",
    "getsid",
    // "getuid", See: GET_ID_SYSCALLS for getid family.
    //"getsockname",
    //"getsockopt",
    "gettid",
    //"gettimeofday", part of vDSO-syscalls.
    //init_module
    //"inotify_add_watch",
    "inotify_init",
    "inotify_init1",
    "inotify_rm_watch",
    "io_cancel",
    "io_destroy",
    "io_getevents",
    "io_pgetevents",
    "io_pgetevents_time64",
    "io_setup",
    "io_submit",
    /*
     * io-uring: restrictions applied, see setup_seccomp.
    "io_uring_enter",
    "io_uring_register",
    "io_uring_setup",
    */
    //"ioctl", restrictions applied, see setup_seccomp.
    //"ioperm", // CAP_SYS_RAWIO
    //"iopl", // CAP_SYS_RAWIO
    "ioprio_get",
    //"ioprio_set", restrictions applied, see setup_seccomp.
    //"kcmp", part of PTRACE_SYSCALLS now as of 3.33.0.
    //kexec_file_load,
    //kexec_load,
    //keyctl, restrictions applied, see setup_seccomp.
    //"kill", restrictions applied, see setup_seccomp.
    "landlock_add_rule",
    "landlock_create_ruleset",
    "landlock_restrict_self",
    "listen",
    //"lookup_dcookie", restrictions applied, see setup_seccomp.
    "lseek",
    //"madvise", restrictions applied, see setup_seccomp.
    "map_shadow_stack",
    //"mbind", restrictions applied, see setup_seccomp.
    "membarrier",
    //"memfd_create",
    //"memfd_secret",
    //"migrate_pages",
    //"mincore", NO! https://arxiv.org/pdf/1901.01161
    "mlock",
    "mlock2",
    "mlockall",
    //"mmap",
    //"mmap2",
    //"modify_ldt", restrictions applied, see setup_seccomp.
    //"mount", restrictions applied, see setup_seccomp.
    //"mount_setattr", // ditto
    //"move_pages",
    "mprotect",
    //"mq_getsetattr", restrictions applied, see setup_seccomp.
    //"mq_notify", ditto
    //"mq_open", ditto
    //"mq_timedreceive", ditto
    //"mq_timedreceive_time64", ditto
    //"mq_timedsend", ditto
    //"mq_timedsend_time64", ditto
    //"mq_unlink",
    //"mremap",
    "mseal",
    //"msgctl", restrictions applied, see setup_seccomp.
    //"msgget", restrictions applied, see setup_seccomp.
    //"msgrcv", restrictions applied, see setup_seccomp.
    //"msgsnd", restrictions applied, see setup_seccomp.
    "msync",
    "munlock",
    "munlockall",
    "munmap",
    //name_to_handle_at
    "nanosleep",
    //nfsservctl,
    //"open_by_handle_at",
    //open_tree
    "pause",
    //"perf_event_open", restrictions applied see load_seccomp_parent and setup_seccomp.
    //"personality", restrictions applied, see setup_seccomp.
    //"pidfd_getfd", added to ptrace set as of 3.35.2.
    //"pidfd_open", restrictions applied, see setup_seccomp.
    "pidfd_send_signal",
    "pipe",
    //"pipe2", restrictions applied, see setup_seccomp.
    //pivot_root,
    //"pkey_alloc", restrictions applied, see setup_seccomp.
    //"pkey_free", ditto
    //"pkey_mprotect", ditto
    "poll",
    "ppoll",
    "ppoll_time64",
    // "prctl", restrictions applied, see setup_seccomp.
    "pread64",
    "preadv",
    "preadv2",
    //"prlimit64",
    //"process_madvise", added to ptrace set as of 3.35.2.
    "process_mrelease",
    //process_vm_readv // restrictions applied, see setup_seccomp.
    //process_vm_writev // ditto.
    "pselect6",
    "pselect6_time64",
    //"ptrace", // restrictions applied see load_seccomp_parent and setup_seccomp.
    //"putpmsg",
    "pwrite64",
    "pwritev",
    "pwritev2", // restrictions applied in unshare/child.rs.
    //query_module,
    //quotactl
    //quotactl_fd
    "read",
    "readahead",
    //"readlink", part of stat sandboxing as of 3.42.0
    //"readlinkat", ditto.
    "readv",
    //reboot
    "recv",
    //"recvfrom", // hooked for informational purposes.
    //"recvmmsg", // ditto
    //"recvmmsg_time64", // ditto
    //"recvmsg", // ditto
    //"remap_file_pages", // deprecated, prot _must_ be 0, not security critical.
    //request_key, restrictions applied, see setup_seccomp.
    "restart_syscall",
    #[cfg(target_arch = "riscv64")]
    "riscv_flush_icache",
    //"riscv_hwprobe", // in VDSO_SYSCALLS.
    //"rseq", restrictions applied, see setup_seccomp.
    //"rt_sigaction", SA_RESTART tracking.
    "rt_sigpending",
    "rt_sigprocmask",
    //"rt_sigqueueinfo", restrictions applied, see setup_seccomp.
    //"rt_sigreturn", SROP mitigations
    "rt_sigsuspend",
    "rt_sigtimedwait",
    "rt_sigtimedwait_time64",
    //"rt_tgsigqueueinfo", restrictions applied, see setup_seccomp.
    "s390_pci_mmio_read",
    "s390_pci_mmio_write",
    "s390_runtime_instr",
    "sched_get_priority_max",
    "sched_get_priority_min",
    "sched_getaffinity",
    "sched_getattr",
    "sched_rr_get_interval",
    "sched_rr_get_interval_time64",
    "sched_getparam",
    "sched_getscheduler",
    "sched_setaffinity",
    //"sched_setattr", restrictions applied, see setup_seccomp.
    //"sched_setscheduler", restrictions applied, see setup_seccomp.
    //"sched_setparam", restrictions applied, see setup_seccomp.
    "sched_yield",
    //"seccomp", restrictions applied, see setup_seccomp.
    //security, unimplemented
    "select",
    //"semctl", restrictions applied, see setup_seccomp.
    //"semget", restrictions applied, see setup_seccomp.
    //"semop", restrictions applied, see setup_seccomp.
    //"semtimedop", restrictions applied, see setup_seccomp.
    //"semtimedop_time64", restrictions applied, see setup_seccomp.
    "send",
    "sendfile",
    "sendfile64",
    //"sendmmsg", emulated.
    //"sendmsg", emulated.
    "set_mempolicy",
    "set_mempolicy_home_node",
    "set_robust_list",
    "set_thread_area",
    "set_tid_address",
    //"setdomainname", added to uts set as of 3.35.2.
    //"setfsgid",
    //"setfsgid32",
    //"setfsuid",
    //"setfsuid32",
    //"setgid",
    //"setgid32",
    //"setgroups",
    //"setgroups32",
    //"sethostname", added to uts set as of 3.35.2.
    "setitimer",
    "setns", // restrictions applied in load_seccomp_parent.
    "setpgid",
    //"setpriority", restrictions applied, see setup_seccomp.
    //"setregid",
    //"setregid32",
    //"setresgid",
    //"setresgid32",
    //"setresuid",
    //"setresuid32",
    //"setreuid",
    //"setreuid32",
    //"setrlimit",
    "setsid",
    "setsockopt",
    //"settimeofday"
    //"setuid",
    //"setuid32",
    "set_tls", // arm
    //"sgetmask", // x86, OBSOLETE!
    //"ssetmask", // x86, OBSOLETE!
    //"shmat", restrictions applied, see setup_seccomp.
    //"shmctl", restrictions applied, see setup_seccomp.
    //"shmdt", restrictions applied, see setup_seccomp.
    //"shmget", restrictions applied, see setup_seccomp.
    "shutdown",
    "signal",
    //"sigaction", SA_RESTART tracking.
    "sigaltstack",
    "signalfd",
    "signalfd4",
    "sigpending",
    "sigprocmask",
    "sigsuspend",
    //"sigreturn", SROP mitigations
    //"socket",
    //"socketpair",
    "splice",
    //"statfs",
    //"statfs64",
    //swapoff
    //swapon
    //"sync",
    "sync_file_range",
    "sync_file_range2", // arm & ppc
    //"syncfs",
    //"sysinfo", information-leak, see setup_seccomp.
    //"syslog",
    "tee",
    //"tgkill", restrictions applied, see setup_seccomp.
    //"time", part of vDSO-syscalls.
    "timer_create",
    "timer_delete",
    "timer_getoverrun",
    "timer_gettime",
    "timer_gettime64",
    "timer_settime",
    "timer_settime64",
    "timerfd_create",
    "timerfd_gettime",
    "timerfd_gettime64",
    "timerfd_settime",
    "timerfd_settime64",
    "times",
    //"tkill", an obsolete predecessor to tgkill, should be avoided.
    //tuxcall, // unimplemented
    "ugetrlimit",
    "umask",
    //"uname", restrictions applied, see setup_seccomp.
    //olduname, deprecated
    //oldolduname, deprecated
    //"umount", restrictions applied, see setup_seccomp.
    //"umount2", // ditto
    "unshare", // restrictions applied in load_seccomp_parent.
    //"uretprobe", restrictions applied, see setup_seccomp.
    //"uselib", deprecated
    // Added in Linux-4.3
    // (Ab)used by most Project Zero Linux kernel exploits.
    //"userfaultfd",
    //"ustat", deprecated
    "vfork",
    //"vhangup", // CAP_SYS_TTY_CONFIG
    //"vmsplice", // restrictions applied, see setup_seccomp.
    //vserver, unimplemented
    "wait4",
    "waitid",
    "waitpid",
    "write",
    "writev",
];

/// System calls allowed for emulator threads.
///
/// Entries left commented out are not part of this list; the inline
/// notes say how they are handled instead (for example protected with
/// syscall argument cookies, or covered by a dedicated list).
pub const EMU_SYSCALLS: &[&str] = &[
    "_llseek",
    //"accept4", protected with syscall argument cookies.
    //"bind", protected with syscall argument cookies.
    "brk",
    //"clock_gettime", part of vDSO-syscalls.
    //"clock_gettime64", part of vDSO-syscalls.
    "clock_nanosleep",
    //"clone", // CLONE_NEW* flags are restricted.
    //"clone3", // never allowed due to struct-pointer CLONE_NEW* bypass.
    "close",
    "close_range",
    //"connect", protected with syscall argument cookies.
    "exit",
    "exit_group",
    "fallocate", // TODO: protect with syscall cookies!
    "fanotify_mark",
    //"fchdir", protected with syscall argument cookies.
    "fchmod",
    "fchown",
    "fgetxattr",
    "flistxattr",
    "fremovexattr",
    "fsetxattr",
    "fstatfs",
    "fstatfs64",
    //"ftruncate", protected with syscall argument cookies.
    //"ftruncate64", ditto.
    // "futex", See: FUTEX_SYSCALLS for futex family.
    "get_robust_list",
    "getdents64",
    "getpgid",
    "getpgrp",
    "getpid",
    //"getrandom", part of vDSO-syscalls.
    "getsockname",
    "getsockopt",
    "gettid",
    "getxattrat",
    "inotify_add_watch",
    "kcmp",
    "landlock_create_ruleset",
    "landlock_restrict_self",
    "lgetxattr",
    //linkat, protected with syscall argument cookies.
    "listxattrat",
    "llistxattr",
    "lremovexattr",
    "lseek",
    "lsetxattr",
    "lstat",
    //"madvise", advice are confined.
    //"memfd_create", protected with syscall argument cookies.
    "mlock",
    "mmap",
    "mmap2",
    "mprotect",
    "mremap",
    "munlock",
    "munmap",
    "nanosleep",
    "pidfd_getfd",
    "pidfd_open",
    "pidfd_send_signal",
    //"pipe2", flags are confined, protected with syscall argument cookies.
    //"prctl", see EMU_PRCTL_OPS for permitted operations.
    "process_mrelease", // Used by kill action when signal is SIGKILL.
    "process_vm_readv",
    "process_vm_writev",
    "read",
    "readv",
    "recv",
    "recvfrom",
    "recvmsg",
    "recvmmsg",
    "recvmmsg_time64",
    "removexattrat",
    "restart_syscall",
    "rseq",
    "rt_sigprocmask",
    "rt_sigtimedwait",        // Used by interrupt handling.
    "rt_sigtimedwait_time64", // ditto.
    "sched_getaffinity",
    //"sched_setaffinity", we use sync seccomp rather than CPU pinning as of 3.37.6.
    "sched_yield",
    "seccomp", // Allowed until sandbox lock.
    "send",
    "sendmsg",
    "sendto",
    "set_robust_list",
    "setxattrat",
    "sigaltstack",
    "sigprocmask",
    //"socket", protected with syscall argument cookies.
    //"socketpair", ditto.
    "splice",
    "symlinkat", // TODO: protect with syscall cookies!
    "tee",
    "tgkill",
    "tkill",
    "timer_create",    // needed to unblock idle helper threads.
    "timer_delete",    // ditto.
    "timer_settime",   // ditto.
    "timer_settime64", // ditto.
    //"truncate", protected with syscall argument cookies.
    //"truncate64", ditto.
    //"umask", ditto.
    //"uname", ditto.
    "waitid",
    // Required to unblock FIFOs.
    "sigreturn",
    "rt_sigreturn",
    //"sigaction", installing new signal handlers is not permitted.
    //"rt_sigaction", ditto.
    // fd calls
    "faccessat2",
    "fchmodat",  // TODO: protect with syscall cookies!
    "fchmodat2", // TODO: ditto.
    "fchownat",  // TODO: ditto.
    "mkdirat",   // TODO: ditto.
    "mknodat",   // TODO: ditto.
    //"openat2", protected with syscall argument cookies.
    "readlinkat",
    //"renameat2", protected with syscall argument cookies.
    "statx", // fstat and newfstatat are implemented on top of this.
    //"unlinkat", protected with syscall argument cookies.
    "utimensat",
    // used by logging, peer_inode, and proc_pid_status(5) masking.
    "write",
];

/// System calls allowed for emulator threads until sandbox lock.
///
/// NOTE(review): "seccomp" also appears in `EMU_SYSCALLS` with the note
/// "Allowed until sandbox lock"; confirm the duplication is intended.
pub const EMU_LOCK_SYSCALLS: &[&str] = &["clone", "clone3", "unshare", "seccomp"];

/// Fcntl operations allowed for emulator threads.
///
/// Each entry is a `libc::F_*` command constant widened to `u64`.
pub const EMU_FCNTL_OPS: &[u64] = &[
    libc::F_GETFD as u64,
    libc::F_SETFD as u64,
    libc::F_GETFL as u64,
    libc::F_SETFL as u64,
    libc::F_OFD_SETLK as u64,
    libc::F_OFD_SETLKW as u64,
    libc::F_ADD_SEALS as u64,
];

/// Prctl operations allowed for emulator threads.
///
/// Each entry pairs the operation's symbolic name with its numeric value.
pub const EMU_PRCTL_OPS: &[KeyValue] = &[
    ("PR_SET_NAME", 15),         // used by syd_{mon->emu}
    ("PR_SET_VMA", 0x53564d41),  // used by allocator.
    ("PR_SET_NO_NEW_PRIVS", 38), // called by confine_scmp on sandbox lock.
    ("PR_GET_NO_NEW_PRIVS", 39), // safe.
];

/// Prctl operations allowed for emulator threads when SafeSetID is on.
///
/// Each entry pairs the operation's symbolic name with its numeric value.
pub const EMU_PRCTL_OPS_SAFESETID: &[KeyValue] = &[("PR_CAP_AMBIENT", 47), ("PR_CAPBSET_DROP", 24)];

/// System calls allowed for Interrupter thread.
///
/// Entries left commented out are not part of this list; the inline
/// notes say where they are handled instead.
pub const INT_SYSCALLS: &[&str] = &[
    "clock_nanosleep",
    "close",
    "exit",
    "exit_group", // to exit if inter-thread signaling does not work.
    //"clock_gettime", part of vDSO-syscalls.
    //"clock_gettime64", ditto.
    // "futex", See: FUTEX_SYSCALLS for futex family.
    "getpid",
    "gettid",
    // can {{dr}e,}allocate memory.
    // mmap{,2} and mprotect are further confined to disable PROT_EXEC.
    "brk",
    //"madvise", advice are confined.
    "mremap",
    "munmap",
    "nanosleep",
    "lseek",   // seek in /proc/$pid/status file.
    "_llseek", // ditto.
    //"prctl", see INT_PRCTL_OPS for permitted operations.
    "read",
    "restart_syscall",
    "rt_sigprocmask",
    "rseq",
    "sched_yield",
    "sigaltstack", // rare but necessary.
    "sigprocmask", // ditto.
];

/// System calls allowed for Timeouter thread.
///
/// Entries left commented out are not part of this list; the inline
/// notes say where they are handled instead.
pub const OUT_SYSCALLS: &[&str] = &[
    "clock_nanosleep",
    "exit",
    "exit_group", // to exit if timeout is exceeded.
    //"clock_gettime", part of vDSO-syscalls.
    //"clock_gettime64", ditto.
    // "futex", See: FUTEX_SYSCALLS for futex family.
    "getpid",
    "gettid",
    // can {{dr}e,}allocate memory.
    // mmap{,2} and mprotect are further confined to disable PROT_EXEC.
    "brk",
    //"madvise", advice are confined.
    "mremap",
    "munmap",
    "nanosleep",
    "restart_syscall",
    "rt_sigprocmask",
    "rseq",
    "sched_yield",
    "sigaltstack", // rare but necessary.
    "sigprocmask", // ditto.
];

/// Fcntl operations allowed for Interrupter thread.
///
/// Each entry is a `libc::F_*` command constant widened to `u64`.
pub const INT_FCNTL_OPS: &[u64] = &[
    libc::F_GETFD as u64,
    libc::F_SETFD as u64,
    libc::F_OFD_SETLK as u64,
    libc::F_OFD_SETLKW as u64,
];

/// Fcntl operations allowed for Timeouter thread.
///
/// Each entry is a `libc::F_*` command constant widened to `u64`.
pub const OUT_FCNTL_OPS: &[u64] = &[
    libc::F_GETFD as u64,
    libc::F_SETFD as u64,
    libc::F_OFD_SETLK as u64,
    libc::F_OFD_SETLKW as u64,
];

/// Prctl operations allowed for Interrupter thread.
///
/// Each entry pairs the operation's symbolic name with its numeric value.
pub const INT_PRCTL_OPS: &[KeyValue] = &[("PR_SET_VMA", 0x53564d41)];

/// System calls allowed for the IPC thread.
///
/// We do not protect system calls of this thread with system call argument cookies,
/// because `syd_ipc` thread exits as soon as the sandbox is locked
/// therefore this mitigation was deemed unnecessary.
pub const IPC_SYSCALLS: &[&str] = &[
    "close",
    "exit",
    //"clock_gettime",   // part of vDSO-syscalls.
    //"clock_gettime64", // ditto.
    // "futex", See: FUTEX_SYSCALLS for futex family.
    "getpid",
    "gettid",
    //"getrandom", part of vDSO-syscalls.
    "getsockopt",
    "accept4",
    "recvmsg",
    "sendmsg",
    // can {{dr}e,}allocate and seal memory.
    // mmap{,2} and mprotect are further confined to disable PROT_EXEC.
    "brk",
    //"madvise", advice are confined.
    "mremap",
    "mseal",
    "munmap",
    //"prctl", see IPC_PRCTL_OPS for permitted operations.
    "restart_syscall",
    "rt_sigprocmask",
    "rseq",
    "sched_yield",
    "sigaltstack", // rare but necessary.
    "sigprocmask", // ditto.
];

/// Fcntl operations allowed for IPC thread.
///
/// Each entry is a `libc::F_*` command constant widened to `u64`.
pub const IPC_FCNTL_OPS: &[u64] = &[
    libc::F_GETFD as u64,
    libc::F_SETFD as u64,
    libc::F_OFD_SETLK as u64,
    libc::F_OFD_SETLKW as u64,
];

/// Prctl operations allowed for IPC thread.
///
/// Each entry pairs the operation's symbolic name with its numeric value.
pub const IPC_PRCTL_OPS: &[KeyValue] = &[("PR_SET_VMA", 0x53564d41)];

/// System calls allowed for AES threads.
///
/// Entries left commented out are not part of this list; the inline
/// notes say how they are handled instead (for example protected with
/// syscall argument cookies, or covered by a dedicated list).
pub const AES_SYSCALLS: &[&str] = &[
    "_llseek",
    //"accept4", protected with syscall argument cookies.
    "brk",
    //"clock_gettime", part of vDSO-syscalls.
    //"clock_gettime64", ditto.
    "clock_nanosleep",
    //"clone", // CLONE_NEW* flags are restricted.
    //"clone3", // never allowed due to struct-pointer CLONE_NEW* bypass.
    "close",
    "exit",
    //"fadvise64", See: FADVISE_SYSCALLS for allowed fadvise syscalls.
    //"fcntl", See: AES_FCNTL_OPS for allowed fcntl operations.
    //"fcntl64",
    "fremovexattr",
    //"ftruncate", Protected with syscall argument cookies.
    //"ftruncate64", ditto.
    // "futex", See: FUTEX_SYSCALLS for futex family.
    "get_robust_list",
    "getpid",
    //"getrandom", part of vDSO-syscalls.
    "getsockopt",
    "gettid",
    "landlock_create_ruleset",
    "landlock_restrict_self",
    "lseek",
    //"madvise", advice are confined.
    "mlock",
    "mmap",
    "mmap2",
    "mprotect",
    "mremap",
    "munlock",
    "munmap",
    "nanosleep",
    //"pipe2", flags are confined, protected with syscall argument cookies.
    //"prctl", see AES_PRCTL_OPS for permitted operations.
    "recvmsg",
    "restart_syscall",
    "rseq",
    "rt_sigprocmask",
    "sched_getaffinity",
    "sched_yield",
    "send",    // TODO: protect with syscall cookies!
    "sendmsg", // TODO: protect with syscall cookies!
    "sendto",  // TODO: protect with syscall cookies!
    "set_robust_list",
    "sigaltstack",
    "sigprocmask",
    "splice",
    "tee",
    //"sigaction", installing new signal handlers is not permitted.
    //"rt_sigaction", ditto.
];

/// Fcntl operations allowed for AES threads.
///
/// Each entry is a `libc::F_*` command constant widened to `u64`.
pub const AES_FCNTL_OPS: &[u64] = &[
    libc::F_GETFD as u64,
    libc::F_SETFD as u64,
    libc::F_ADD_SEALS as u64,
    libc::F_OFD_SETLK as u64, // used by syd::log::LockedWriter.
    libc::F_OFD_SETLKW as u64,
];

/// Prctl operations allowed for AES threads.
///
/// Each entry pairs the operation's symbolic name with its numeric value.
pub const AES_PRCTL_OPS: &[KeyValue] = &[
    ("PR_SET_NAME", 15),        // used by syd_aes spawns
    ("PR_SET_VMA", 0x53564d41), // used by allocator.
];

/// System calls allowed for main wait thread.
///
/// Entries left commented out are not part of this list; the inline
/// notes say how they are handled instead.
pub const MAIN_SYSCALLS: &[&str] = &[
    "_llseek",
    "brk",
    //"clock_gettime", part of vDSO-syscalls.
    //"clock_gettime64", ditto.
    "clock_nanosleep",
    "close",
    "close_range",
    "exit_group",
    "fstatfs",
    "fstatfs64",
    // "futex", See: FUTEX_SYSCALLS for futex family.
    "get_robust_list",
    "getdents64", // used by pid-limiter.
    "getpgid",
    "getpgrp",
    "getpid",
    //"getrandom", part of vDSO-syscalls.
    "gettid",
    "kill", // used by pid-limiter.
    "lseek",
    //"madvise", advice are confined.
    "mmap",
    "mmap2",
    "mprotect",
    "mremap",
    "munlock",
    "munmap",
    "nanosleep",
    "pidfd_open",
    "pidfd_getfd",
    "pidfd_send_signal",
    //"prctl", see MAIN_PRCTL_OPS for permitted operations.
    "process_mrelease",  // Used by kill action when signal is SIGKILL.
    "process_vm_readv",  // needed to get AT_{RANDOM,SECURE}.
    "process_vm_writev", // needed to set AT_SECURE.
    "ptrace",
    "read",
    "readv",
    "restart_syscall",
    "rseq",
    "rt_sigprocmask",
    "sched_getaffinity",
    //"sched_setaffinity", set before confinement, unneeded after.
    "sched_yield",
    "set_robust_list",
    "sigaltstack",
    "sigprocmask",
    "statx",   // fstat and newfstatat are implemented on top of this.
    "sysinfo", // used by pid-limiter.
    "tgkill",
    "tkill",
    "waitid",
    //"sigaction", installing new signal handlers is not permitted.
    //"rt_sigaction", ditto.
    // fd-calls
    "faccessat2",
    //"openat2", protected with syscall argument cookies.
    "readlinkat",
];

/// Fcntl operations allowed for main wait thread.
///
/// Each entry is a `libc::F_*` command constant widened to `u64`.
pub const MAIN_FCNTL_OPS: &[u64] = &[
    libc::F_GETFD as u64,
    libc::F_SETFD as u64,
    libc::F_GETFL as u64,
    libc::F_SETFL as u64,
    libc::F_OFD_SETLK as u64,
    libc::F_OFD_SETLKW as u64,
];

/// Prctl operations allowed for main wait thread.
///
/// Each entry pairs the operation's symbolic name with its numeric value.
pub const MAIN_PRCTL_OPS: &[KeyValue] = &[("PR_SET_VMA", 0x53564d41)];

/// System calls that must be specifically allowed for syd-oci.
///
/// Note, this list _must_ be sorted! Keep new entries in ascending
/// byte order of their names.
pub const OCI_SYSCALLS: &[&str] = &[
    "kcmp",
    "pidfd_getfd",
    "process_mrelease",
    "process_vm_readv",
    "process_vm_writev",
    "ptrace",
    "syslog",
    "unshare",
];

/// List of pointer argument indexes for safe and hook syscalls.
///
/// Each entry maps a system call name to the zero-based positions of
/// its arguments that carry a userspace pointer. An empty index list
/// marks a system call that is treated specially (see the inline notes).
///
/// The extended attribute family lists every pointer argument: the
/// `path`, `name` and `value`/`list` buffers of the classic calls, and
/// the `pathname`, `name`/`list` and `struct xattr_args` pointers of
/// the `*xattrat` calls.
pub const SYSCALL_PTR_ARGS: &[(&str, &[u32])] = &[
    ("_llseek", &[3]),
    ("_newselect", &[1, 2, 3, 4]),
    ("accept", &[1, 2]),
    ("accept4", &[1, 2]),
    ("access", &[0]),
    ("add_key", &[0, 1, 2]),
    ("adjtimex", &[0]),
    ("arch_prctl", &[1]),
    ("bind", &[1]),
    ("bpf", &[1]),
    ("brk", &[0]),
    ("cacheflush", &[0, 1]),
    ("capget", &[0, 1]),
    ("capset", &[0, 1]),
    ("chdir", &[0]),
    ("chmod", &[0]),
    ("chown", &[0]),
    ("chroot", &[0]),
    ("clock_adjtime", &[1]),
    ("clock_adjtime64", &[1]),
    ("clock_getres", &[1]),
    ("clock_getres_time64", &[1]),
    ("clock_gettime", &[1]),
    ("clock_gettime64", &[1]),
    ("clock_nanosleep", &[2, 3]),
    ("clock_nanosleep_time64", &[2, 3]),
    ("clock_settime", &[1]),
    ("clock_settime64", &[1]),
    ("clone", &[1, 2, 3, 4]),
    ("clone3", &[0]),
    ("connect", &[1]),
    ("copy_file_range", &[1, 3]),
    ("creat", &[0]),
    ("epoll_ctl", &[3]),
    ("epoll_ctl_old", &[3]),
    ("epoll_pwait", &[1, 4]),
    ("epoll_pwait2", &[1, 3, 4]),
    ("epoll_wait", &[1]),
    ("epoll_wait_old", &[1]),
    ("execve", &[0, 1, 2]),
    ("execveat", &[1, 2, 3]),
    ("faccessat", &[1]),
    ("faccessat2", &[1]),
    ("fanotify_mark", &[4]),
    ("fchmodat", &[1]),
    ("fchmodat2", &[1]),
    ("fchownat", &[1]),
    ("fgetxattr", &[1, 2]),
    ("flistxattr", &[1]),
    ("fremovexattr", &[1]),
    ("fsetxattr", &[1, 2]),
    ("fstat", &[1]),
    ("fstat64", &[1]),
    ("fstatat64", &[1, 2]),
    ("fstatfs", &[1]),
    ("fstatfs64", &[1]),
    ("futex", &[0]),
    ("futimesat", &[1, 2]),
    ("get_mempolicy", &[1]),
    ("get_robust_list", &[1, 2]),
    ("get_thread_area", &[0]),
    ("getcpu", &[0, 1, 2]),
    ("getcwd", &[0]),
    ("getdents", &[1]),
    ("getdents64", &[1]),
    ("getgroups", &[1]),
    ("getgroups32", &[1]),
    ("getitimer", &[1]),
    ("getpeername", &[1, 2]),
    ("getrandom", &[0]),
    ("getresgid", &[0, 1, 2]),
    ("getresuid", &[0, 1, 2]),
    ("getrlimit", &[1]),
    ("getrusage", &[1]),
    ("getsockname", &[1, 2]),
    ("getsockopt", &[3, 4]),
    ("gettimeofday", &[0, 1]),
    ("getxattr", &[0, 1, 2]),
    ("getxattrat", &[1, 3, 4]), // pathname, name, struct xattr_args.
    ("inotify_add_watch", &[1]),
    ("io_cancel", &[1, 2]),
    ("io_getevents", &[3, 4]),
    ("io_pgetevents", &[3, 4, 5]),
    ("io_pgetevents_time64", &[3, 4, 5]),
    ("io_setup", &[1]),
    ("io_submit", &[2]),
    ("io_uring_enter", &[4]),
    ("io_uring_register", &[2]),
    ("io_uring_setup", &[1]),
    ("kexec_file_load", &[3]),
    ("kexec_load", &[2]),
    ("keyctl", &[]), // Treated specially, see confine_scmp_kptr.
    ("landlock_add_rule", &[2]),
    ("landlock_create_ruleset", &[0]),
    ("lchown", &[0]),
    ("lgetxattr", &[0, 1, 2]), // path, name, value: same as getxattr.
    ("link", &[0, 1]),
    ("linkat", &[1, 3]),
    ("listxattr", &[0, 1]),
    ("listxattrat", &[1, 3]), // pathname, list; index 2 is at_flags.
    ("llistxattr", &[0, 1]),
    ("lookup_dcookie", &[1]),
    ("lremovexattr", &[0, 1]), // path, name.
    ("lsetxattr", &[0, 1, 2]), // path, name, value: same as setxattr.
    ("lstat", &[0, 1]),
    ("lstat64", &[0, 1]),
    ("madvise", &[0]),
    ("map_shadow_stack", &[0]),
    ("mbind", &[0, 3]),
    ("memfd_create", &[0]),
    ("migrate_pages", &[2, 3]),
    ("mincore", &[0, 2]),
    ("mkdir", &[0]),
    ("mkdirat", &[1]),
    ("mknod", &[0]),
    ("mknodat", &[1]),
    ("mlock", &[0]),
    ("mlock2", &[0]),
    ("mmap", &[0]),
    ("mmap2", &[0]),
    ("modify_ldt", &[1]),
    ("mount", &[0, 1, 2, 4]),
    ("mount_setattr", &[1, 3]),
    ("move_pages", &[2, 3, 4]),
    ("mprotect", &[0]),
    ("mq_getsetattr", &[1, 2]),
    ("mq_notify", &[1]),
    ("mq_open", &[0, 3]),
    ("mq_timedreceive", &[1, 3, 4]),
    ("mq_timedreceive_time64", &[1, 3, 4]),
    ("mq_timedsend", &[1, 4]),
    ("mq_timedsend_time64", &[1, 4]),
    ("mq_unlink", &[0]),
    ("mremap", &[0, 4]),
    ("mseal", &[0]),
    ("msgctl", &[2]),
    ("msgrcv", &[1]),
    ("msgsnd", &[1]),
    ("msync", &[0]),
    ("munlock", &[0]),
    ("munmap", &[0]),
    ("name_to_handle_at", &[1, 2, 3]),
    ("nanosleep", &[0, 1]),
    ("newfstatat", &[1, 2]),
    ("nfsservctl", &[1, 2]),
    ("oldfstat", &[1]),
    ("open", &[0]),
    ("open_by_handle_at", &[1]),
    ("open_tree", &[1]),
    ("openat", &[1]),
    ("openat2", &[1, 2]),
    ("perf_event_open", &[0]),
    ("pidfd_send_signal", &[2]),
    ("pipe", &[0]),
    ("pipe2", &[0]),
    ("pivot_root", &[0, 1]),
    ("pkey_mprotect", &[0]),
    ("poll", &[0]),
    ("ppoll", &[0, 2, 3]),
    ("ppoll_time64", &[0, 2, 3]),
    ("prctl", &[]), // Treated specially, see confine_scmp_kptr.
    ("pread64", &[1]),
    ("preadv", &[1]),
    ("preadv2", &[1]),
    ("prlimit64", &[2, 3]),
    ("process_madvise", &[1]),
    ("process_vm_readv", &[1, 3]),
    ("process_vm_writev", &[1, 3]),
    ("pselect6", &[1, 2, 3, 4, 5]),
    ("pselect6_time64", &[1, 2, 3, 4, 5]),
    ("ptrace", &[2, 3]),
    ("putpmsg", &[1, 2]),
    ("pwrite64", &[1]),
    ("pwritev", &[1]),
    ("pwritev2", &[1]),
    ("query_module", &[0, 2, 4]),
    ("quotactl", &[1, 3]),
    ("quotactl_fd", &[3]),
    ("read", &[1]),
    ("readlink", &[0, 1]),
    ("readlinkat", &[1, 2]),
    ("readv", &[1]),
    ("reboot", &[3]),
    ("recv", &[1]),
    ("recvfrom", &[1, 4, 5]),
    ("recvmmsg", &[1, 4]),
    ("recvmmsg_time64", &[1, 4]),
    ("recvmsg", &[1]),
    ("remap_file_pages", &[0]),
    ("removexattr", &[0, 1]), // path, name.
    ("removexattrat", &[1, 3]), // pathname, name.
    ("rename", &[0, 1]),
    ("renameat", &[1, 3]),
    ("renameat2", &[1, 3]),
    ("request_key", &[0, 1, 2]),
    ("riscv_flush_icache", &[0, 1]),
    ("riscv_hwprobe", &[0, 3]),
    ("rmdir", &[0]),
    ("rseq", &[0]),
    ("rt_sigaction", &[1, 2]),
    ("rt_sigpending", &[0]),
    ("rt_sigprocmask", &[1, 2]),
    ("rt_sigqueueinfo", &[2]),
    ("rt_sigsuspend", &[0]),
    ("rt_sigtimedwait", &[0, 1, 2]),
    ("rt_sigtimedwait_time64", &[0, 1, 2]),
    ("rt_tgsigqueueinfo", &[3]),
    ("sched_getaffinity", &[2]),
    ("sched_getattr", &[1]),
    ("sched_getparam", &[1]),
    ("sched_rr_get_interval", &[1]),
    ("sched_rr_get_interval_time64", &[1]),
    ("sched_setaffinity", &[2]),
    ("sched_setattr", &[1]),
    ("sched_setparam", &[1]),
    ("sched_setscheduler", &[2]),
    ("seccomp", &[2]),
    ("select", &[1, 2, 3, 4]),
    ("semctl", &[3]),
    ("semop", &[1]),
    ("semtimedop", &[1, 3]),
    ("semtimedop_time64", &[1, 3]),
    ("send", &[1]),
    ("sendfile", &[2]),
    ("sendfile64", &[2]),
    ("sendmmsg", &[1]),
    ("sendmsg", &[1]),
    ("sendto", &[1, 4]),
    ("set_mempolicy", &[1]),
    ("set_robust_list", &[0]),
    ("set_thread_area", &[0]),
    ("set_tid_address", &[0]),
    ("setdomainname", &[0]),
    ("setgroups", &[1]),
    ("setgroups32", &[1]),
    ("sethostname", &[0]),
    ("setitimer", &[1, 2]),
    ("setrlimit", &[1]),
    ("setsockopt", &[3, 4]),
    ("settimeofday", &[0, 1]),
    ("setxattr", &[0, 1, 2]),
    ("setxattrat", &[1, 3, 4]), // pathname, name, struct xattr_args.
    ("shmat", &[1]),
    ("shmctl", &[2]),
    ("shmdt", &[0]),
    ("sigaction", &[1, 2]),
    ("sigaltstack", &[0, 1]),
    ("signal", &[1]),
    ("signalfd", &[1]),
    ("signalfd4", &[1]),
    ("sigpending", &[0]),
    ("sigprocmask", &[1, 2]),
    ("sigsuspend", &[0]),
    ("socketcall", &[1]),
    ("socketpair", &[3]),
    ("splice", &[1, 3]),
    ("stat", &[0, 1]),
    ("stat64", &[0, 1]),
    ("statfs", &[0, 1]),
    ("statfs64", &[0, 1]),
    ("statx", &[1, 4]),
    ("swapoff", &[0]),
    ("swapon", &[0]),
    ("symlink", &[0, 1]),
    ("symlinkat", &[0, 2]),
    ("sysinfo", &[0]),
    ("syslog", &[1]),
    ("time", &[0]),
    ("timer_create", &[1, 2]),
    ("timer_gettime", &[1]),
    ("timer_gettime64", &[1]),
    ("timer_settime", &[2, 3]),
    ("timer_settime64", &[2, 3]),
    ("timerfd_gettime", &[1]),
    ("timerfd_gettime64", &[1]),
    ("timerfd_settime", &[2, 3]),
    ("timerfd_settime64", &[2, 3]),
    ("times", &[0]),
    ("truncate", &[0]),
    ("truncate64", &[0]),
    ("ugetrlimit", &[1]),
    ("umount", &[0]),
    ("umount2", &[0]),
    ("uname", &[0]),
    ("unlink", &[0]),
    ("unlinkat", &[1]),
    ("uselib", &[0]),
    ("ustat", &[1]),
    ("utime", &[0, 1]),
    ("utimensat", &[1, 2]),
    ("utimes", &[0, 1]),
    ("vmsplice", &[1]),
    ("wait4", &[1, 3]),
    ("waitid", &[2]),
    ("waitpid", &[1]),
    ("write", &[1]),
    ("writev", &[1]),
];

/// Lazily computed `(major, minor)` version of the running kernel.
///
/// The release string is taken from the `SYD_ASSUME_KERNEL` environment
/// variable when it is set (supported as of version 3.36.1), and from
/// uname(2) otherwise. Only the first two dot-separated components are
/// parsed; anything after the second dot is ignored.
///
/// # Panics
///
/// Panics on first access if uname(2) fails, if the release string has
/// no dot, or if either component is not a valid `u32`.
#[expect(clippy::disallowed_methods)]
pub static KERNEL_VERSION: LazyLock<(u32, u32)> = LazyLock::new(|| {
    // The override takes precedence over what the kernel reports.
    let release: Vec<u8> = match std::env::var_os(ENV_ASSUME_KERNEL) {
        Some(assumed) => assumed.as_bytes().to_owned(),
        None => uname().unwrap().release().as_bytes().to_owned(),
    };

    // Split "MAJOR.REST" at the first dot.
    let dot = memchr(b'.', &release).expect("BUG: Invalid kernel version!");
    let (head, tail) = (&release[..dot], &release[dot + 1..]);
    let major = btoi::<u32>(head).expect("BUG: Invalid kernel version!");

    // The minor component runs up to the next dot, or to the end of the string.
    let minor_end = memchr(b'.', tail).unwrap_or_else(|| tail.len());
    let minor = btoi::<u32>(&tail[..minor_end]).expect("BUG: Invalid kernel version!");

    (major, minor)
});

/// Kernel version string derived from `KERNEL_VERSION`, used by uname(2).
///
/// The string has the form `MAJOR.MINOR.MICRO`:
///
/// 1. `MAJOR` and `MINOR` come from `KERNEL_VERSION`.
/// 2. `MICRO` is a random value in `0..=255`, chosen once on first access,
///    to prevent information leaks.
///
/// No truncation is done here. With two `u32` components, one `u8`
/// component and two dots the string is at most 25 bytes long.
///
/// # Panics
///
/// Panics on first access if the random micro version cannot be generated.
#[expect(clippy::disallowed_methods)]
pub static KERNEL_VERSION_STR: LazyLock<String> = LazyLock::new(|| {
    let (major, minor) = *KERNEL_VERSION;
    let micro = randint(0u8..=u8::MAX).expect("BUG: Failed to randomize kernel micro version!");

    [major.to_string(), minor.to_string(), micro.to_string()].join(".")
});

/// Whether MFD_NOEXEC_SEAL is supported (Linux-6.3 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_MFD_NOEXEC_SEAL: LazyLock<bool> = LazyLock::new(|| *KERNEL_VERSION >= (6, 3));

/// Safe default flags for memory file descriptors.
///
/// Evaluates to MFD_NOEXEC_SEAL on Linux>=6.3 and to MFD_ALLOW_SEALING on
/// older Linux. MFD_CLOEXEC is set in both cases.
pub static SAFE_MFD_FLAGS: LazyLock<MFdFlags> = LazyLock::new(|| {
    let sealing = if *HAVE_MFD_NOEXEC_SEAL {
        MFdFlags::MFD_NOEXEC_SEAL
    } else {
        MFdFlags::MFD_ALLOW_SEALING
    };

    sealing | MFdFlags::MFD_CLOEXEC
});

/// Whether PIDFD_THREAD is supported (Linux-6.9 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_PIDFD_THREAD: LazyLock<bool> = LazyLock::new(|| *KERNEL_VERSION >= (6, 9));

/// Whether SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP is supported (Linux-6.6 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP: LazyLock<bool> =
    LazyLock::new(|| *KERNEL_VERSION >= (6, 6));

/// Whether STATX_MNT_ID_UNIQUE is supported (Linux-6.8 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub(crate) static HAVE_STATX_MNT_ID_UNIQUE: LazyLock<bool> =
    LazyLock::new(|| *KERNEL_VERSION >= (6, 8));

/// Whether AT_EXECVE_CHECK is supported (Linux-6.14 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_AT_EXECVE_CHECK: LazyLock<bool> = LazyLock::new(|| *KERNEL_VERSION >= (6, 14));

/// Whether the PROCMAP_QUERY ioctl(2) is supported (Linux-6.11 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_PROCMAP_QUERY: LazyLock<bool> = LazyLock::new(|| *KERNEL_VERSION >= (6, 11));

/// Whether MADV_GUARD_{INSTALL,REMOVE} for madvise(2) is supported (Linux-6.13 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_MADV_GUARD: LazyLock<bool> = LazyLock::new(|| *KERNEL_VERSION >= (6, 13));

/// Check for CONFIG_CROSS_MEMORY_ATTACH support.
///
/// Unlike the version-gated flags around it, this value is not derived from
/// `KERNEL_VERSION`: it is whatever `check_cross_memory_attach` returns,
/// evaluated once on first access.
pub static HAVE_CROSS_MEMORY_ATTACH: LazyLock<bool> = LazyLock::new(check_cross_memory_attach);

/// Whether stat.st_size is supported in /proc/$pid/fd (Linux-6.2 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_PROC_PID_FD_STAT_SIZE: LazyLock<bool> =
    LazyLock::new(|| *KERNEL_VERSION >= (6, 2));

/// Whether RWF_NOAPPEND is supported for pwritev2(2) (Linux-6.9 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_RWF_NOAPPEND: LazyLock<bool> = LazyLock::new(|| *KERNEL_VERSION >= (6, 9));

/// Whether the kernel/pid_max sysctl is namespaced (Linux-6.14 or newer).
///
/// Derived from `KERNEL_VERSION`, compared lexicographically as `(major, minor)`.
pub static HAVE_NAMESPACED_PID_MAX: LazyLock<bool> = LazyLock::new(|| *KERNEL_VERSION >= (6, 14));

/// Whether Landlock scoped signals are supported (Linux-6.12 or newer).
///
/// Two conditions must hold: `KERNEL_VERSION` is at least 6.12, and the
/// current Landlock ABI is at least V6. The ABI is only queried when the
/// version check passes.
pub static HAVE_LANDLOCK_SCOPED_SIGNALS: LazyLock<bool> = LazyLock::new(|| {
    // Consult KERNEL_VERSION first so that the user can override
    // the result with SYD_ASSUME_KERNEL.
    if *KERNEL_VERSION < (6, 12) {
        return false;
    }

    crate::landlock::ABI::new_current() >= crate::landlock::ABI::V6
});

//
// Below is internal territory, you have been warned.

/// Initial backoff delay (in milliseconds) for EAGAIN.
pub(crate) const EAGAIN_INITIAL_DELAY: u64 = 10;

/// Backoff factor for EAGAIN.
///
/// NOTE(review): presumably the delay is multiplied by this factor on each
/// retry; confirm against the retry loop that consumes it.
pub(crate) const EAGAIN_BACKOFF_FACTOR: f64 = 2.0;

/// Cap on the backoff delay (in milliseconds), i.e. one second.
pub(crate) const EAGAIN_MAX_DELAY: u64 = 1000;

/// Cap on the backoff retry count.
pub(crate) const EAGAIN_MAX_RETRY: usize = 7;

/// Unsafe mode mask for SHM hardening.
///
/// Read as permission bits, `0o177` covers owner-execute plus all
/// group and other bits.
pub(crate) const SHM_UNSAFE_MASK: u64 = 0o177;

/// Name prefix for memory file descriptors.
/// This is used in access control.
pub const MFD_NAME_PREFIX: &[u8] = b"!memfd:";

/// Name prefix for hugetlbfs memory file descriptors.
/// This is used in access control.
pub const MFD_HUGETLB_NAME_PREFIX: &[u8] = b"!memfd-hugetlb:";

/// BIGTCP maximum for loopback (`0x30d40` is 200000 in decimal):
///
/// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=d6f938ce52f9adb23f4c31cc371654a5f18ff328
pub const LOOPBACK_BIGTCP_MAX: u32 = 0x30d40;

/// Minimum allowed mmap(2) address, determined via `proc_mmap_min_addr`
/// (which reads `/proc/sys/vm/mmap_min_addr`).
///
/// The value is never lower than PAGE_SIZE: a smaller reading is raised to
/// PAGE_SIZE, and PAGE_SIZE is also used when the value cannot be read.
/// This enforces OpenBSD's hard-coded minimum mmap(2) address as a floor;
/// See: https://github.com/openbsd/src/blob/94a7e27d04f23871848a8f225ef286e84969197a/sys/uvm/uvm_map.c#L235
pub static MMAP_MIN_ADDR: LazyLock<u64> = LazyLock::new(|| {
    let page_size = *PAGE_SIZE;
    proc_mmap_min_addr().map_or(page_size, |addr| addr.max(page_size))
});

// Trace data for ptrace(2) hooked system calls.
//
// This way we don't rely on the system call number,
// which is architecture/personality specific.
// Each hooked system call gets its own small, stable identifier.
pub(crate) const PTRACE_DATA_CHDIR: u16 = 0;
pub(crate) const PTRACE_DATA_FCHDIR: u16 = 1;
pub(crate) const PTRACE_DATA_EXECVE: u16 = 2;
pub(crate) const PTRACE_DATA_EXECVEAT: u16 = 3;
pub(crate) const PTRACE_DATA_SIGRETURN: u16 = 4;
pub(crate) const PTRACE_DATA_RT_SIGRETURN: u16 = 5;
pub(crate) const PTRACE_DATA_MMAP: u16 = 6;
pub(crate) const PTRACE_DATA_MMAP2: u16 = 7;

/// Storage for the static randomized timer.
///
/// Set once by `timer_init` and read through `RAND_TIMER()`.
pub(crate) static _RAND_TIMER: OnceLock<RandTimer> = OnceLock::new();

/// Returns a reference to the static randomized timer.
///
/// # Panics
///
/// Panics if `_RAND_TIMER` has not been set yet, i.e. when called
/// before `timer_init`.
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn RAND_TIMER() -> &'static RandTimer {
    let timer: Option<&'static RandTimer> = _RAND_TIMER.get();
    timer.unwrap()
}

/// Initialize the static randomized timer.
///
/// Creates a `RandTimer` with the given `timens` setting, logs its uptime
/// and idle offsets, and stores it in `_RAND_TIMER`.
///
/// # Errors
///
/// Propagates the error of `RandTimer::new`, and returns `EAGAIN` if the
/// timer has already been initialized.
pub fn timer_init(timens: bool) -> Result<(), Errno> {
    let timer = RandTimer::new(timens)?;

    // Wording of the log message depends on whether a time namespace is in use.
    let (kind, suffix) = if timens {
        ("zero", " in time namespace")
    } else {
        ("random", "")
    };

    info!("ctx": "run", "op": "sysinfo_init_timer",
        "msg": format!("initialized sysinfo(2) timer with {kind} offset{suffix}"),
        "off": [timer.uptime_offset, timer.idle_offset]);

    _RAND_TIMER.set(timer).map_err(|_| Errno::EAGAIN)
}

/// Returns the raw file descriptor of the static `/` directory.
///
/// # Panics
///
/// Panics if `_ROOT_FD` has not been set yet, i.e. when called
/// before `proc_init`.
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn ROOT_FD() -> RawFd {
    _ROOT_FD.get().copied().unwrap()
}

/// Returns the recorded mode of the static `/` directory.
///
/// # Panics
///
/// Panics if `_ROOT_F_MODE` has not been set yet, i.e. when called
/// before `proc_init`.
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn ROOT_F_MODE() -> u16 {
    _ROOT_F_MODE.get().copied().unwrap()
}

/// Returns the recorded (unique) mount id of the static `/` directory.
///
/// # Panics
///
/// Panics if `_ROOT_MNT_ID` has not been set yet, i.e. when called
/// before `proc_init`.
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn ROOT_MNT_ID() -> u64 {
    _ROOT_MNT_ID.get().copied().unwrap()
}

/// Returns the static `/` dirfd as a `BorrowedFd` with static lifetime.
///
/// # Panics
///
/// Panics when called before `proc_init`, because `ROOT_FD()` panics.
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn ROOT_FILE() -> BorrowedFd<'static> {
    let raw = ROOT_FD();
    // SAFETY: `proc_init' is called beforehand.
    unsafe { BorrowedFd::borrow_raw(raw) }
}

/// Returns the raw file descriptor of the static `/proc` directory.
///
/// # Panics
///
/// Panics if `_PROC_FD` has not been set yet, i.e. when called
/// before `proc_init` (or `proc_init_simple`).
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn PROC_FD() -> RawFd {
    _PROC_FD.get().copied().unwrap()
}

/// Returns the recorded mode of the static `/proc` directory.
///
/// # Panics
///
/// Panics if `_PROC_F_MODE` has not been set yet, i.e. when called
/// before `proc_init` (or `proc_init_simple`).
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn PROC_F_MODE() -> u16 {
    _PROC_F_MODE.get().copied().unwrap()
}

/// Returns the recorded (unique) mount id of the static `/proc` directory.
///
/// # Panics
///
/// Panics if `_PROC_MNT_ID` has not been set yet, i.e. when called
/// before `proc_init` (or `proc_init_simple`).
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn PROC_MNT_ID() -> u64 {
    _PROC_MNT_ID.get().copied().unwrap()
}

/// Returns the static `/proc` dirfd as a `BorrowedFd` with static lifetime.
///
/// # Panics
///
/// Panics when called before `proc_init` (or `proc_init_simple`),
/// because `PROC_FD()` panics.
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn PROC_FILE() -> BorrowedFd<'static> {
    let raw = PROC_FD();
    // SAFETY: `proc_init' is called beforehand.
    unsafe { BorrowedFd::borrow_raw(raw) }
}

/// Returns the raw file descriptor of the static `/dev/null` handle.
///
/// # Panics
///
/// Panics if `_NULL_FD` has not been set yet, i.e. when called
/// before `proc_init`.
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn NULL_FD() -> RawFd {
    _NULL_FD.get().copied().unwrap()
}

/// Returns the recorded mode of the static `/dev/null` handle.
///
/// # Panics
///
/// Panics if `_NULL_F_MODE` has not been set yet, i.e. when called
/// before `proc_init`.
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn NULL_F_MODE() -> u16 {
    _NULL_F_MODE.get().copied().unwrap()
}

/// Returns the recorded (unique) mount id of the static `/dev/null` handle.
///
/// # Panics
///
/// Panics if `_NULL_MNT_ID` has not been set yet, i.e. when called
/// before `proc_init`.
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn NULL_MNT_ID() -> u64 {
    _NULL_MNT_ID.get().copied().unwrap()
}

/*
/// Returns a reference to the static `/dev/null` fd.
///
/// Calling this before calling `proc_init` will panic!
#[expect(clippy::disallowed_methods)]
#[expect(non_snake_case)]
#[inline(always)]
pub(crate) fn NULL_FILE() -> BorrowedFd<'static> {
    // SAFETY: `proc_init' is called beforehand.
    unsafe { BorrowedFd::borrow_raw(NULL_FD()) }
}
*/

/// File descriptor to `/`, i.e. the root file system.
/// Set by `proc_init`, read via `ROOT_FD()`.
pub(crate) static _ROOT_FD: OnceLock<RawFd> = OnceLock::new();

/// Mode of `/`, i.e. the root file system, with the file type bits masked out.
/// Set by `proc_init`, read via `ROOT_F_MODE()`.
pub(crate) static _ROOT_F_MODE: OnceLock<u16> = OnceLock::new();

/// Unique mount id of `/`, i.e. the root file system.
/// Set by `proc_init`, read via `ROOT_MNT_ID()`.
pub(crate) static _ROOT_MNT_ID: OnceLock<u64> = OnceLock::new();

/// File descriptor to the /proc file system.
/// Set by `proc_init` or `proc_init_simple`, read via `PROC_FD()`.
pub(crate) static _PROC_FD: OnceLock<RawFd> = OnceLock::new();

/// Mode of the /proc file system, with the file type bits masked out.
/// Set by `proc_init` or `proc_init_simple`, read via `PROC_F_MODE()`.
pub(crate) static _PROC_F_MODE: OnceLock<u16> = OnceLock::new();

/// Unique mount id of the /proc file system.
/// Set by `proc_init` or `proc_init_simple`, read via `PROC_MNT_ID()`.
pub(crate) static _PROC_MNT_ID: OnceLock<u64> = OnceLock::new();

/// File descriptor to the /dev/null character device.
/// Set by `proc_init`, read via `NULL_FD()`.
pub(crate) static _NULL_FD: OnceLock<RawFd> = OnceLock::new();

/// Mode of the /dev/null character device, with the file type bits masked out.
/// Set by `proc_init`, read via `NULL_F_MODE()`.
pub(crate) static _NULL_F_MODE: OnceLock<u16> = OnceLock::new();

/// Unique mount id of the /dev/null character device.
/// Set by `proc_init`, read via `NULL_MNT_ID()`.
pub(crate) static _NULL_MNT_ID: OnceLock<u64> = OnceLock::new();

/// Initialize static file descriptors for use by syd::proc and friends.
///
/// This is the simple version which only opens a fd to /proc, not to / and /dev/null.
/// It fills `_PROC_FD`, `_PROC_F_MODE` and `_PROC_MNT_ID`.
///
/// # Errors
///
/// - Propagates errors from `openat2`, `fstatx` and `duprand`.
/// - Returns `ENODEV` if the opened `/proc` does not pass the `is_proc` check.
/// - Returns `EAGAIN` if any of the statics has already been set.
#[expect(clippy::cast_possible_truncation)]
#[expect(clippy::disallowed_methods)]
pub fn proc_init_simple() -> Result<(), Errno> {
    // Always request the mode; request the unique mount id when the
    // kernel supports it and the plain mount id otherwise.
    let mut mask = STATX_MODE;
    mask |= if *HAVE_STATX_MNT_ID_UNIQUE {
        STATX_MNT_ID_UNIQUE
    } else {
        STATX_MNT_ID
    };

    // Note, we may call getdents64(2) on /proc for PID sandboxing,
    // therefore we open it without O_PATH here.
    let how = OpenHow::new()
        .flags(OFlag::O_RDONLY | OFlag::O_DIRECTORY | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC)
        .resolve(ResolveFlag::RESOLVE_NO_MAGICLINKS | ResolveFlag::RESOLVE_NO_SYMLINKS);
    let fd_proc = openat2(AT_BADFD, "/proc", how)?;
    // Record the mode without the file type bits (S_IFMT), plus the mount id.
    let (f_mode_proc, mnt_id_proc) = fstatx(&fd_proc, mask)
        .map(|stx| (stx.stx_mode & !(libc::S_IFMT as u16), stx.stx_mnt_id))?;

    // SAFETY: Validate what we've opened is procfs(5).
    // A failing check is treated the same as a negative result.
    if !is_proc(&fd_proc).unwrap_or(false) {
        return Err(Errno::ENODEV);
    }

    // SAFETY: To make this file descriptor harder to spot by an
    // attacker we duplicate it to a random fd number.
    // The duplicate is turned into a raw fd and kept in the static below.
    let fd_proc = duprand(fd_proc.as_raw_fd(), OFlag::O_CLOEXEC)?.into_raw_fd();

    info!("ctx": "run", "op": "opendir_proc",
        "msg": "opened /proc directory",
        "fd": fd_proc,
        "f_mode": f_mode_proc,
        "mnt_id": mnt_id_proc);
    // Setting an already initialized static is reported as EAGAIN.
    _PROC_FD.set(fd_proc).or(Err(Errno::EAGAIN))?;
    _PROC_F_MODE.set(f_mode_proc).or(Err(Errno::EAGAIN))?;
    _PROC_MNT_ID.set(mnt_id_proc).or(Err(Errno::EAGAIN))?;

    Ok(())
}

/// Initialize static file descriptors for use by syd::proc and friends.
///
/// Opens `/`, then `proc` and `dev/null` relative to the root fd, and
/// records for each of them the file descriptor, the mode and the mount id
/// in the `_ROOT_*`, `_PROC_*` and `_NULL_*` statics.
///
/// # Errors
///
/// - Propagates errors from `openat2`, `fstatx` and `duprand`.
/// - Returns `ENODEV` if `proc` does not pass the `is_proc` check or
///   `dev/null` does not pass the `is_dev_null` check.
/// - Returns `EAGAIN` if any of the statics has already been set.
///
/// Statics set before a later step fails stay set.
#[expect(clippy::cast_possible_truncation)]
#[expect(clippy::cognitive_complexity)]
#[expect(clippy::disallowed_methods)]
pub fn proc_init() -> Result<(), Errno> {
    // Always request the mode; request the unique mount id when the
    // kernel supports it and the plain mount id otherwise.
    let mut mask = STATX_MODE;
    mask |= if *HAVE_STATX_MNT_ID_UNIQUE {
        STATX_MNT_ID_UNIQUE
    } else {
        STATX_MNT_ID
    };

    // The root directory is only used as an anchor, so O_PATH suffices.
    let how = OpenHow::new()
        .flags(OFlag::O_PATH | OFlag::O_DIRECTORY | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC)
        .mode(Mode::empty())
        .resolve(ResolveFlag::RESOLVE_NO_MAGICLINKS | ResolveFlag::RESOLVE_NO_SYMLINKS);

    let fd_root = openat2(AT_BADFD, "/", how)?;
    // Record the mode without the file type bits (S_IFMT), plus the mount id.
    let (f_mode_root, mnt_id_root) = fstatx(&fd_root, mask)
        .map(|stx| (stx.stx_mode & !(libc::S_IFMT as u16), stx.stx_mnt_id))?;

    // SAFETY: To make this file descriptor harder to spot by an
    // attacker we duplicate it to a random fd number.
    let fd_root = duprand(fd_root.as_raw_fd(), OFlag::O_CLOEXEC)?.into_raw_fd();

    info!("ctx": "run", "op": "opendir_root",
        "msg": "opened root directory",
        "fd": fd_root,
        "f_mode": f_mode_root,
        "mnt_id": mnt_id_root);
    // Setting an already initialized static is reported as EAGAIN.
    _ROOT_FD.set(fd_root).or(Err(Errno::EAGAIN))?;
    _ROOT_F_MODE.set(f_mode_root).or(Err(Errno::EAGAIN))?;
    _ROOT_MNT_ID.set(mnt_id_root).or(Err(Errno::EAGAIN))?;

    // SAFETY: fd_root is a valid fd for process lifetime.
    let fd_root = unsafe { BorrowedFd::borrow_raw(fd_root) };

    // Note, we may call getdents64(2) on /proc for PID sandboxing,
    // therefore we open it without O_PATH here.
    // From here on paths are resolved beneath the root fd.
    let how = how
        .flags(OFlag::O_RDONLY | OFlag::O_DIRECTORY | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC)
        .resolve(
            ResolveFlag::RESOLVE_BENEATH
                | ResolveFlag::RESOLVE_NO_MAGICLINKS
                | ResolveFlag::RESOLVE_NO_SYMLINKS,
        );
    let fd_proc = openat2(fd_root, "proc", how)?;
    let (f_mode_proc, mnt_id_proc) = fstatx(&fd_proc, mask)
        .map(|stx| (stx.stx_mode & !(libc::S_IFMT as u16), stx.stx_mnt_id))?;

    // SAFETY: Validate what we've opened is procfs(5).
    // A failing check is treated the same as a negative result.
    if !is_proc(&fd_proc).unwrap_or(false) {
        return Err(Errno::ENODEV);
    }

    // SAFETY: To make this file descriptor harder to spot by an
    // attacker we duplicate it to a random fd number.
    let fd_proc = duprand(fd_proc.as_raw_fd(), OFlag::O_CLOEXEC)?.into_raw_fd();

    info!("ctx": "run", "op": "opendir_proc",
        "msg": "opened /proc directory",
        "fd": fd_proc,
        "f_mode": f_mode_proc,
        "mnt_id": mnt_id_proc);
    _PROC_FD.set(fd_proc).or(Err(Errno::EAGAIN))?;
    _PROC_F_MODE.set(f_mode_proc).or(Err(Errno::EAGAIN))?;
    _PROC_MNT_ID.set(mnt_id_proc).or(Err(Errno::EAGAIN))?;

    // /dev/null is not a directory, hence no O_DIRECTORY here.
    // The resolve flags of the previous open are kept.
    let how = how.flags(OFlag::O_PATH | OFlag::O_NOFOLLOW | OFlag::O_CLOEXEC);
    let fd_null = openat2(fd_root, "dev/null", how)?;
    let (f_mode_null, mnt_id_null) = fstatx(&fd_null, mask)
        .map(|stx| (stx.stx_mode & !(libc::S_IFMT as u16), stx.stx_mnt_id))?;

    // SAFETY: Validate what we've opened is indeed `/dev/null`.
    // A failing check is treated the same as a negative result.
    if !is_dev_null(&fd_null).unwrap_or(false) {
        return Err(Errno::ENODEV);
    }

    // SAFETY: To make this file descriptor harder to spot by an
    // attacker we duplicate it to a random fd number.
    let fd_null = duprand(fd_null.as_raw_fd(), OFlag::O_CLOEXEC)?.into_raw_fd();

    info!("ctx": "run", "op": "opendev_null",
        "msg": "opened /dev/null",
        "fd": fd_null,
        "f_mode": f_mode_null,
        "mnt_id": mnt_id_null);
    _NULL_FD.set(fd_null).or(Err(Errno::EAGAIN))?;
    _NULL_F_MODE.set(f_mode_null).or(Err(Errno::EAGAIN))?;
    _NULL_MNT_ID.set(mnt_id_null).or(Err(Errno::EAGAIN))?;

    Ok(())
}

/// Close static file descriptors for use by syd::proc and friends.
///
/// Closes, in this order, the `/`, `/proc` and `/dev/null` file descriptors
/// that have been initialized. Statics that were never set are skipped and
/// errors from `close` are ignored.
pub fn proc_close() {
    for cell in [&_ROOT_FD, &_PROC_FD, &_NULL_FD] {
        if let Some(&fd) = cell.get() {
            let _ = close(fd);
        }
    }
}

/// Pink Floyd easter egg.
///
/// ASCII art wrapped in ANSI escape sequences: `\x1b[01;35m` switches the
/// colour on before the first row and `\x1b[0m` resets it after the last.
/// Every row ends with a newline; the final reset does not.
///
/// The percent signs in the art are single `%` characters. A `&str` is
/// emitted verbatim, so the printf-style `%%` escape would show up as two
/// characters and shift the rest of the row by one column.
pub const PINK_FLOYD: &str = concat!(
    "\x1b[01;35m",
    "       ..uu.                               \n",
    "       ?$\"\"`?i           z'              \n",
    "       `M  .@\"          x\"               \n",
    "       'Z :#\"  .   .    f 8M              \n",
    "       '&H?`  :$f U8   <  MP   x#'         \n",
    "       d#`    XM  $5.  $  M' xM\"          \n",
    "     .!\">     @  'f`$L:M  R.@!`           \n",
    "    +`  >     R  X  \"NXF  R\"*L           \n",
    "        k    'f  M   \"$$ :E  5.           \n",
    "        %    `~  \"    `  'K  'M          \n",
    "            .uH          'E   `h           \n",
    "         .x*`             X     `          \n",
    "      .uf`                *                \n",
    "    .@8     .                              \n",
    "   'E9F  uf\"          ,     ,             \n",
    "     9h+\"   $M    eH. 8b. .8    .....     \n",
    "    .8`     $'   M 'E  `R;'   d?\"\"\"`\"# \n",
    "   ` E      @    b  d   9R    ?*     @     \n",
    "     >      K.zM `%M'   9'    Xf   .f     \n",
    "    ;       R'          9     M  .=`       \n",
    "    t                   M     Mx~          \n",
    "    @                  lR    z\"           \n",
    "    @                  `   ;\"             \n",
    "                          `                \n",
    "\x1b[0m",
);

/// See Emily Play easter egg.
///
/// Five lines of text, each ending with a newline. The words are split
/// into short chunks, and every chunk is preceded by an ANSI escape sequence
/// that selects a bright colour; `\x1b[0m` resets the attributes before
/// each space and at the end of each line.
pub const SEE_EMILY_PLAY: &str = concat!(
    "\x1b[0;1;35;95mTh",
    "\x1b[0;1;31;91mer",
    "\x1b[0;1;33;93me",
    "\x1b[0m ",
    "\x1b[0;1;32;92mis",
    "\x1b[0m ",
    "\x1b[0;1;36;96mn",
    "\x1b[0;1;34;94mo",
    "\x1b[0m ",
    "\x1b[0;1;35;95mot",
    "\x1b[0;1;31;91mhe",
    "\x1b[0;1;33;93mr",
    "\x1b[0m ",
    "\x1b[0;1;32;92mda",
    "\x1b[0;1;36;96my",
    "\x1b[0m",
    "\n",
    "\x1b[0;1;35;95mLe",
    "\x1b[0;1;31;91mt'",
    "\x1b[0;1;33;93ms",
    "\x1b[0m ",
    "\x1b[0;1;32;92mtr",
    "\x1b[0;1;36;96my",
    "\x1b[0m ",
    "\x1b[0;1;34;94mit",
    "\x1b[0m ",
    "\x1b[0;1;35;95ma",
    "\x1b[0;1;31;91mno",
    "\x1b[0;1;33;93mth",
    "\x1b[0;1;32;92mer",
    "\x1b[0m ",
    "\x1b[0;1;36;96mw",
    "\x1b[0;1;34;94may",
    "\x1b[0m",
    "\n",
    "\x1b[0;1;35;95mYo",
    "\x1b[0;1;31;91mu'",
    "\x1b[0;1;33;93mll",
    "\x1b[0m ",
    "\x1b[0;1;32;92ml",
    "\x1b[0;1;36;96mos",
    "\x1b[0;1;34;94me",
    "\x1b[0m ",
    "\x1b[0;1;35;95myo",
    "\x1b[0;1;31;91mur",
    "\x1b[0m ",
    "\x1b[0;1;33;93mm",
    "\x1b[0;1;32;92min",
    "\x1b[0;1;36;96md",
    "\x1b[0m ",
    "\x1b[0;1;34;94man",
    "\x1b[0;1;35;95md",
    "\x1b[0m ",
    "\x1b[0;1;31;91mpl",
    "\x1b[0;1;33;93may",
    "\x1b[0m",
    "\n",
    "\x1b[0;1;35;95mFr",
    "\x1b[0;1;31;91mee",
    "\x1b[0m \x1b[0;1;33;93mg",
    "\x1b[0;1;32;92mam",
    "\x1b[0;1;36;96mes",
    "\x1b[0m ",
    "\x1b[0;1;34;94mf",
    "\x1b[0;1;35;95mor",
    "\x1b[0m ",
    "\x1b[0;1;31;91mm",
    "\x1b[0;1;33;93may",
    "\x1b[0m",
    "\n",
    "\x1b[0;1;35;95mSe",
    "\x1b[0;1;31;91me",
    "\x1b[0m ",
    "\x1b[0;1;33;93mEm",
    "\x1b[0;1;32;92mil",
    "\x1b[0;1;36;96my",
    "\x1b[0m ",
    "\x1b[0;1;34;94mpl",
    "\x1b[0;1;35;95may",
    "\x1b[0m",
    "\n",
);

/// The Piper easter egg.
///
/// Four lines of plain text without escape sequences,
/// each terminated by a newline.
pub const THE_PIPER: &str = concat!(
    "Helper and healer, I cheer\n",
    "Small waifs in the woodland wet\n",
    "Strays I find in it, wounds I bind in it\n",
    "Bidding them all forget!\n",
);

// Warn at compile time if this crate is built with `panic=abort`.
//
// The marker constant below only exists in such builds. It is tagged
// `#[deprecated]` so that referencing it makes the compiler print the note.
#[cfg(panic = "abort")]
#[deprecated(
    note = "Built with panic=abort. This configuration is not supported; you have been warned."
)]
#[allow(dead_code)]
const SYD_PANIC_ABORT_BUILD: () = ();

// Force a use of the deprecated item so the warning is actually emitted.
// The anonymous constant has no other purpose.
#[cfg(panic = "abort")]
const _: () = {
    let _ = SYD_PANIC_ABORT_BUILD;
};
