diff --git a/scripts/create-image.sh b/scripts/create-image.sh index 3726e450..129d90d2 100755 --- a/scripts/create-image.sh +++ b/scripts/create-image.sh @@ -50,7 +50,7 @@ fi # Build and copy over usertest cd "$base"/usertest -usertest_binary="$(cargo build --message-format=json | jq -r 'select(.reason == "compiler-artifact") | .filenames[]' | grep "usertest")" +usertest_binary="$(cargo build --message-format=json-render-diagnostics | jq -r 'select(.reason == "compiler-artifact" and .target.name == "usertest") | .executable // empty')" cp "$usertest_binary" "$base/build/rootfs/bin/usertest" # make image diff --git a/src/arch/arm64/memory/fault.rs b/src/arch/arm64/memory/fault.rs index a40df10e..01f3a7a7 100644 --- a/src/arch/arm64/memory/fault.rs +++ b/src/arch/arm64/memory/fault.rs @@ -74,7 +74,7 @@ fn run_mem_fault_handler( } fn handle_uacess_abort(exception: Exception, info: AbortIss, state: &mut ExceptionState) { - match run_mem_fault_handler(current_work().vm.clone(), exception, info) { + match run_mem_fault_handler(current_work().vm.shared_vm(), exception, info) { // We mapped in a page, the uacess handler can proceed. Ok(FaultResolution::Resolved) => (), // If the fault couldn't be resolved, signal to the uacess fixup that @@ -124,7 +124,7 @@ pub fn handle_kernel_mem_fault(exception: Exception, info: AbortIss, state: &mut } pub fn handle_mem_fault(ctx: &mut ProcessCtx, exception: Exception, info: AbortIss) { - match run_mem_fault_handler(ctx.shared().vm.clone(), exception, info) { + match run_mem_fault_handler(ctx.shared().vm.shared_vm(), exception, info) { Ok(FaultResolution::Resolved) => {} Ok(FaultResolution::Denied) => { ctx.task().process.deliver_signal(SigId::SIGSEGV); diff --git a/src/arch/arm64/proc.rs b/src/arch/arm64/proc.rs index 341e57d4..c1fca28d 100644 --- a/src/arch/arm64/proc.rs +++ b/src/arch/arm64/proc.rs @@ -1,15 +1,10 @@ use crate::process::Task; use alloc::sync::Arc; -use libkernel::memory::proc_vm::address_space::UserAddressSpace; pub mod idle; pub mod signal; pub mod vdso; pub fn context_switch(new: Arc) { - new.vm - .lock_save_irq() - .mm_mut() - .address_space_mut() - .activate(); + new.vm.activate(); } diff --git a/src/drivers/fs/proc/task/task_file.rs b/src/drivers/fs/proc/task/task_file.rs index 267b27fb..db589e0e 100644 --- a/src/drivers/fs/proc/task/task_file.rs +++ b/src/drivers/fs/proc/task/task_file.rs @@ -109,7 +109,8 @@ Threads:\t{tasks}\n", TaskFileType::Comm => format!("{name}\n", name = name.as_str()), TaskFileType::State => format!("{state}\n"), TaskFileType::Stat => { - let vm = task.vm.lock_save_irq(); + let proc_vm = task.vm.shared_vm(); + let vm = proc_vm.lock_save_irq(); let mut vsize = 0; let mut startcode = 0; @@ -217,7 +218,8 @@ Threads:\t{tasks}\n", TaskFileType::Root => task.root.lock_save_irq().1.as_str().to_string(), TaskFileType::Maps => { let mut output = String::new(); - let mut vm = task.vm.lock_save_irq(); + let proc_vm = task.vm.shared_vm(); + let mut vm = proc_vm.lock_save_irq(); for vma in vm.mm_mut().iter_vmas() { output.push_str(&format!( diff --git a/src/memory/brk.rs b/src/memory/brk.rs index 81b577b0..89f072fc 100644 --- a/src/memory/brk.rs +++ b/src/memory/brk.rs @@ -20,7 +20,8 @@ use crate::sched::syscall_ctx::ProcessCtx; /// - On a successful resize, it returns the new break. /// - On a failed resize, it returns the current, unchanged break. pub async fn sys_brk(ctx: &ProcessCtx, addr: VA) -> Result { - let mut vm = ctx.shared().vm.lock_save_irq(); + let proc_vm = ctx.shared().vm.shared_vm(); + let mut vm = proc_vm.lock_save_irq(); // The query case `brk(0)` is special and is handled separately from modifications. if addr.is_null() { diff --git a/src/memory/mincore.rs b/src/memory/mincore.rs index 658512fd..ff87a846 100644 --- a/src/memory/mincore.rs +++ b/src/memory/mincore.rs @@ -35,7 +35,8 @@ pub async fn sys_mincore(ctx: &ProcessCtx, start: u64, len: usize, vec: UA) -> R let mut buf: Vec = vec![0; pages]; { - let mut vm_guard = ctx.shared().vm.lock_save_irq(); + let proc_vm = ctx.shared().vm.shared_vm(); + let mut vm_guard = proc_vm.lock_save_irq(); let mm = vm_guard.mm_mut(); // Validate the entire region is covered by VMAs diff --git a/src/memory/mmap.rs b/src/memory/mmap.rs index 6a312a9a..78842a75 100644 --- a/src/memory/mmap.rs +++ b/src/memory/mmap.rs @@ -127,7 +127,8 @@ pub async fn sys_mmap( }; // Lock the task and call the core memory manager to perform the mapping. - let new_mapping_addr = ctx.shared().vm.lock_save_irq().mm_mut().mmap( + let proc_vm = ctx.shared().vm.shared_vm(); + let new_mapping_addr = proc_vm.lock_save_irq().mm_mut().mmap( address_request, requested_len, permissions, @@ -141,7 +142,8 @@ pub async fn sys_mmap( pub async fn sys_munmap(ctx: &ProcessCtx, addr: VA, len: usize) -> Result { let region = VirtMemoryRegion::new(addr, len); - let pages = ctx.shared().vm.lock_save_irq().mm_mut().munmap(region)?; + let proc_vm = ctx.shared().vm.shared_vm(); + let pages = proc_vm.lock_save_irq().mm_mut().munmap(region)?; // Free any physical frames that were unmapped. if !pages.is_empty() { @@ -165,11 +167,8 @@ pub fn sys_mprotect(ctx: &ProcessCtx, addr: VA, len: usize, prot: u64) -> Result let perms = prot_to_perms(prot); let region = VirtMemoryRegion::new(addr, len); - ctx.shared() - .vm - .lock_save_irq() - .mm_mut() - .mprotect(region, perms)?; + let proc_vm = ctx.shared().vm.shared_vm(); + proc_vm.lock_save_irq().mm_mut().mprotect(region, perms)?; Ok(0) } diff --git a/src/process/clone.rs b/src/process/clone.rs index 46114d3c..5579f046 100644 --- a/src/process/clone.rs +++ b/src/process/clone.rs @@ -1,6 +1,6 @@ use super::owned::OwnedTask; use super::ptrace::{PTrace, TracePoint, ptrace_stop}; -use super::{ITimers, Tid}; +use super::{ITimers, Tid, VmHandle}; use super::{ ctx::Context, thread_group::signal::{AtomicSigSet, SigSet}, @@ -120,11 +120,14 @@ pub async fn sys_clone( }; let vm = if flags.contains(CloneFlags::CLONE_VM) { - current_task.vm.clone() + if flags.contains(CloneFlags::CLONE_THREAD) { + current_task.vm.clone() + } else { + Arc::new(VmHandle::from_shared(current_task.vm.shared_vm())) + } } else { - Arc::new(SpinLock::new( - current_task.vm.lock_save_irq().clone_as_cow()?, - )) + let proc_vm = current_task.vm.shared_vm(); + Arc::new(VmHandle::new(proc_vm.lock_save_irq().clone_as_cow()?)) }; let files = if flags.contains(CloneFlags::CLONE_FILES) { @@ -195,8 +198,15 @@ pub async fn sys_clone( } }; + if flags.contains(CloneFlags::CLONE_VFORK) { + new_task.process.start_vfork(); + } + let desc = new_task.descriptor(); let work = Work::new(Box::new(new_task)); + let vfork_process = flags + .contains(CloneFlags::CLONE_VFORK) + .then(|| work.process.clone()); TASK_LIST .lock_save_irq() @@ -219,5 +229,9 @@ pub async fn sys_clone( copy_to_user(child_tidptr, desc.tid.value()).await?; } + if let Some(vfork_process) = vfork_process { + vfork_process.wait_for_vfork_release().await; + } + Ok(desc.tid.value() as _) } diff --git a/src/process/exec.rs b/src/process/exec.rs index 2a348e7a..de745b8e 100644 --- a/src/process/exec.rs +++ b/src/process/exec.rs @@ -187,14 +187,7 @@ async fn exec_elf( ptrace_stop(ctx, TracePoint::Exec).await; let user_ctx = ArchImpl::new_user_context(entry_addr, stack_ptr); - let mut vm = ProcessVM::from_map(mem_map); - - // We don't have to worry about actually calling for a full context switch - // here. Parts of the old process that are replaced will go out of scope and - // be cleaned up (open files, etc.); We don't need to preserve any extra - // state. Simply activate the new process's address space. - vm.mm_mut().address_space_mut().activate(); - + let vm = ProcessVM::from_map(mem_map); let new_comm = argv.first().map(|s| Comm::new(s.as_str())); { @@ -205,10 +198,15 @@ async fn exec_elf( } current_task.ctx = Context::from_user_ctx(user_ctx); - *current_task.vm.lock_save_irq() = vm; + current_task.vm.replace(vm); + current_task.vm.activate(); *current_task.process.signals.lock_save_irq() = SignalActionState::new_default(); } + // `CLONE_VFORK` parents must resume as soon as the child has stopped using + // the shared address space, before any later async cleanup can block. + ctx.shared().process.complete_vfork(); + // Close all the CLOEXEC FDs. let mut fd_table = ctx.shared().fd_table.lock_save_irq().clone(); fd_table.close_cloexec_entries().await; diff --git a/src/process/exit.rs b/src/process/exit.rs index 9d19db68..c1da55a8 100644 --- a/src/process/exit.rs +++ b/src/process/exit.rs @@ -62,6 +62,10 @@ pub fn do_exit_group(task: &Arc, exit_code: ChildState) { // to wait for all the processes to have stopped execution before tearing // down the address-space, etc. + // If this process was created with `CLONE_VFORK`, the parent may resume as + // soon as we are guaranteed not to run in the shared address space again. + process.complete_vfork(); + // Reparent children to `init` { let mut our_children = process.children.lock_save_irq(); diff --git a/src/process/mod.rs b/src/process/mod.rs index 95737a21..bf1da129 100644 --- a/src/process/mod.rs +++ b/src/process/mod.rs @@ -143,6 +143,49 @@ impl TaskDescriptor { pub type ProcVM = ProcessVM<::ProcessAddressSpace>; +/// A per-task handle to a process address space. +/// +/// Separate processes may temporarily share the same underlying `ProcVM` +/// (e.g. `CLONE_VM` without `CLONE_THREAD`, including `CLONE_VFORK`) while +/// still allowing one side to detach on `execve()` without affecting the +/// other. Tasks in the same thread group share the same `VmHandle` so that an +/// `execve()` updates the whole group consistently. +pub struct VmHandle { + current: SpinLock>>, +} + +impl VmHandle { + pub fn new(vm: ProcVM) -> Self { + Self::from_shared(Arc::new(SpinLock::new(vm))) + } + + pub fn from_shared(vm: Arc>) -> Self { + Self { + current: SpinLock::new(vm), + } + } + + pub fn shared_vm(&self) -> Arc> { + self.current.lock_save_irq().clone() + } + + pub fn replace(&self, vm: ProcVM) { + self.replace_shared(Arc::new(SpinLock::new(vm))); + } + + pub fn replace_shared(&self, vm: Arc>) { + *self.current.lock_save_irq() = vm; + } + + pub fn activate(&self) { + self.shared_vm() + .lock_save_irq() + .mm_mut() + .address_space_mut() + .activate(); + } +} + #[derive(Copy, Clone)] pub struct Comm([u8; 16]); @@ -183,7 +226,7 @@ pub struct Task { pub tid: Tid, pub comm: Arc>, pub process: Arc, - pub vm: Arc>, + pub vm: Arc, pub cwd: Arc, PathBuf)>>, pub root: Arc, PathBuf)>>, pub creds: SpinLock, @@ -276,8 +319,10 @@ impl Task { Box::into_pin(fut).await?; } + let proc_vm = self.vm.shared_vm(); + { - let mut vm = self.vm.lock_save_irq(); + let mut vm = proc_vm.lock_save_irq(); if let Some(pa) = vm.mm_mut().address_space_mut().translate(va) { let region = pa.pfn.as_phys_range(); @@ -302,7 +347,7 @@ impl Task { } // Try to handle the fault. - match handle_demand_fault(self.vm.clone(), va, access_kind)? { + match handle_demand_fault(proc_vm.clone(), va, access_kind)? { // Resolved the fault. Try again FaultResolution::Resolved => continue, FaultResolution::Denied => return Err(KernelError::Fault), diff --git a/src/process/owned.rs b/src/process/owned.rs index 779696f4..0508cd9e 100644 --- a/src/process/owned.rs +++ b/src/process/owned.rs @@ -1,5 +1,5 @@ use super::{ - Comm, ITimers, Task, Tid, + Comm, ITimers, Task, Tid, VmHandle, creds::Credentials, ctx::{Context, UserCtx}, fd_table::FileDescriptorTable, @@ -70,7 +70,7 @@ impl OwnedTask { cwd: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))), root: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))), creds: SpinLock::new(Credentials::new_root()), - vm: Arc::new(SpinLock::new(vm)), + vm: Arc::new(VmHandle::new(vm)), fd_table: Arc::new(SpinLock::new(FileDescriptorTable::new())), i_timers: SpinLock::new(ITimers::default()), ptrace: SpinLock::new(PTrace::new()), @@ -100,7 +100,7 @@ impl OwnedTask { cwd: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))), root: Arc::new(SpinLock::new((Arc::new(DummyInode {}), PathBuf::new()))), creds: SpinLock::new(Credentials::new_root()), - vm: Arc::new(SpinLock::new( + vm: Arc::new(VmHandle::new( ProcessVM::empty().expect("Could not create init process's VM"), )), i_timers: SpinLock::new(ITimers::default()), diff --git a/src/process/thread_group.rs b/src/process/thread_group.rs index ee136de1..5eafb46d 100644 --- a/src/process/thread_group.rs +++ b/src/process/thread_group.rs @@ -6,7 +6,7 @@ use crate::{ sched_task::{Work, state::TaskState}, waker::create_waker, }, - sync::SpinLock, + sync::{CondVar, SpinLock}, }; use alloc::{ collections::btree_map::BTreeMap, @@ -16,7 +16,7 @@ use alloc::{ use builder::ThreadGroupBuilder; use core::sync::atomic::AtomicUsize; use core::{fmt::Display, sync::atomic::Ordering}; -use libkernel::fs::pathbuf::PathBuf; +use libkernel::{fs::pathbuf::PathBuf, sync::condvar::WakeupType}; use pid::PidT; use rsrc_lim::ResourceLimits; use signal::{SigId, SigSet, SignalActionState}; @@ -112,6 +112,9 @@ pub struct ThreadGroup { pub pending_signals: SpinLock, pub priority: SpinLock, pub child_notifiers: Notifiers, + /// `true` while a parent is blocked in `CLONE_VFORK` waiting for this + /// process to either `execve()` successfully or exit. + pub vfork_blocked_parent: CondVar, pub utime: AtomicUsize, pub stime: AtomicUsize, pub last_account: AtomicUsize, @@ -151,6 +154,30 @@ impl ThreadGroup { TG_LIST.lock_save_irq().get(&id).and_then(|x| x.upgrade()) } + pub fn start_vfork(&self) { + self.vfork_blocked_parent.update(|blocked| { + *blocked = true; + WakeupType::None + }); + } + + pub async fn wait_for_vfork_release(&self) { + self.vfork_blocked_parent + .wait_until(|blocked| (!*blocked).then_some(())) + .await; + } + + pub fn complete_vfork(&self) { + self.vfork_blocked_parent.update(|blocked| { + if *blocked { + *blocked = false; + WakeupType::All + } else { + WakeupType::None + } + }); + } + pub fn notify_signal_waiters(&self) { let tasks: Vec<_> = self .tasks diff --git a/src/process/thread_group/builder.rs b/src/process/thread_group/builder.rs index 68bf8bc3..2217426d 100644 --- a/src/process/thread_group/builder.rs +++ b/src/process/thread_group/builder.rs @@ -2,7 +2,10 @@ use core::sync::atomic::AtomicUsize; use alloc::{collections::btree_map::BTreeMap, sync::Arc}; -use crate::{drivers::fs::cgroup, sync::SpinLock}; +use crate::{ + drivers::fs::cgroup, + sync::{CondVar, SpinLock}, +}; use super::{ Pgid, ProcessState, Sid, TG_LIST, Tgid, ThreadGroup, @@ -80,6 +83,7 @@ impl ThreadGroupBuilder { .unwrap_or_else(|| Arc::new(SpinLock::new(ResourceLimits::default()))), pending_signals: SpinLock::new(SigSet::empty()), child_notifiers: Notifiers::new(), + vfork_blocked_parent: CondVar::new(false), priority: SpinLock::new(self.pri.unwrap_or(0)), utime: AtomicUsize::new(0), stime: AtomicUsize::new(0), diff --git a/src/process/threading/futex/key.rs b/src/process/threading/futex/key.rs index b436908c..f64abd30 100644 --- a/src/process/threading/futex/key.rs +++ b/src/process/threading/futex/key.rs @@ -20,9 +20,8 @@ impl FutexKey { } pub fn new_shared(ctx: &ProcessCtx, uaddr: TUA) -> Result { - let pg_info = ctx - .shared() - .vm + let proc_vm = ctx.shared().vm.shared_vm(); + let pg_info = proc_vm .lock_save_irq() .mm_mut() .address_space_mut() diff --git a/usertest/src/main.rs b/usertest/src/main.rs index 3272f05d..aec75219 100644 --- a/usertest/src/main.rs +++ b/usertest/src/main.rs @@ -79,6 +79,54 @@ fn test_fork() { register_test!(test_fork); +#[expect(deprecated)] +fn test_vfork_exit() { + unsafe { + let pid = libc::vfork(); + if pid < 0 { + panic!("vfork failed"); + } else if pid == 0 { + libc::_exit(0); + } else { + let mut status = 0; + libc::waitpid(pid, &mut status, 0); + assert!(libc::WIFEXITED(status)); + assert_eq!(libc::WEXITSTATUS(status), 0); + } + } +} + +register_test!(test_vfork_exit); + +#[expect(deprecated)] +fn test_vfork_exec() { + static TRUE_PATH: &[u8] = b"/bin/true\0"; + + unsafe { + let pid = libc::vfork(); + if pid < 0 { + panic!("vfork failed"); + } else if pid == 0 { + let argv = [TRUE_PATH.as_ptr().cast::(), core::ptr::null()]; + let envp = [core::ptr::null()]; + + libc::execve( + TRUE_PATH.as_ptr().cast::(), + argv.as_ptr(), + envp.as_ptr(), + ); + libc::_exit(127); + } else { + let mut status = 0; + libc::waitpid(pid, &mut status, 0); + assert!(libc::WIFEXITED(status)); + assert_eq!(libc::WEXITSTATUS(status), 0); + } + } +} + +register_test!(test_vfork_exec); + fn test_rust_thread() { let handle = thread::spawn(|| 24);