fork() and Copy-on-Write (CoW) on Linux

The fork() System Call

The fork() system call in Unix-like operating systems, including Linux, creates a new process by duplicating the calling process. Here’s how it works:
- The kernel allocates a new task_struct (process descriptor) for the child process.
- The parent’s state is copied into the child’s task_struct.
- The child is kept in the TASK_UNINTERRUPTIBLE state until the fork completes.
- fork() returns the child’s PID to the parent and 0 to the child, distinguishing their roles.

Simplified Pseudocode for fork():
/*
 * sys_fork - simplified sketch of creating a child process.
 *
 * Allocates a task_struct for the child, then populates it from the calling
 * process (current): copy_process(), copy_mm(), copy_page_tables(),
 * copy_files() and copy_sighand(). Afterwards a PID is allocated, the kernel
 * stack is set up, and the child is handed to wake_up_new_task().
 *
 * Returns the child's PID (this is the parent's return path), or -ENOMEM if
 * the task_struct could not be allocated.
 */
pid_t sys_fork(void) {
    struct task_struct *child;
    pid_t pid;

    child = alloc_task_struct();
    if (!child) {
        return -ENOMEM;
    }

    copy_process(child, current);

    /*
     * NOTE(review): none of the copy steps below is checked for failure, and
     * no routine for releasing a half-built child is visible here -- a real
     * implementation must unwind on error.
     */
    child->mm = copy_mm(current);
    copy_page_tables(child);
    copy_files(child, current);
    copy_sighand(child, current);

    alloc_pid(child);
    setup_kernel_stack(child);

    /*
     * Read the PID before waking the child: once woken, the child may run
     * and exit, so it must not be dereferenced afterwards.
     */
    pid = child->pid;
    wake_up_new_task(child);

    return pid;
}
/*
 * copy_page_tables - share the parent's present pages with the child.
 *
 * Walks the user address range page by page. For every address whose
 * page-table entry is present in the parent (current), the entry is made
 * read-only and marked "special", then written back to the parent and
 * installed at the same address in the child. handle_page_fault() keys on
 * the "special" marking to recognise a write to such a shared page.
 *
 * @child: the new task; child->mm must already be set up by the caller.
 *
 * NOTE(review): assumes the child's page-table levels for each address
 * already exist so that pte_offset() on child->mm->pgd is valid -- confirm.
 */
void copy_page_tables(struct task_struct *child) {
    unsigned long addr;

    /* User mappings live in [0, TASK_SIZE); start from the bottom. */
    for (addr = 0; addr < TASK_SIZE; addr += PAGE_SIZE) {
        pgd_t *pgd = pgd_offset(current->mm, addr);
        pte_t *pte;
        pte_t new_pte;

        if (!pgd_present(*pgd)) {
            continue;
        }

        pte = pte_offset(pgd, addr);
        if (!pte_present(*pte)) {
            continue;
        }

        /*
         * The pte helpers take the entry by value, so their result has to
         * be stored back (same pattern as the pte_mk*() chain in
         * handle_cow_fault()).
         */
        new_pte = *pte;
        new_pte = pte_make_readonly(new_pte);
        new_pte = pte_mkspecial(new_pte);

        /* Both parent and child now map the page read-only. */
        set_pte(pte, new_pte);
        set_pte(pte_offset(child->mm->pgd, addr), new_pte);
    }
}
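Copy-on-Write after fork():

After fork(), parent and child share the same physical pages, mapped read-only; the first write to such a page raises a page fault, and the kernel gives the writer a private copy.

Simplified Pseudocode for CoW: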
/*
 * handle_page_fault - dispatch a page fault to the copy-on-write handler.
 *
 * @vma:     memory area containing the faulting address
 * @address: faulting virtual address
 * @flags:   fault flags; only FAULT_FLAG_WRITE is examined here
 *
 * A write fault on an entry that is both present and marked special is
 * passed to handle_cow_fault() and its result is returned. Every other
 * fault yields VM_FAULT_SIGBUS.
 */
int handle_page_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) {
    pte_t *entry;

    /* Only write faults are handled in this sketch. */
    if (!(flags & FAULT_FLAG_WRITE)) {
        return VM_FAULT_SIGBUS;
    }

    entry = pte_offset(vma->vm_mm->pgd, address);

    /* Anything that is not a present, special-marked page is a bus error. */
    if (!pte_present(*entry) || !pte_special(*entry)) {
        return VM_FAULT_SIGBUS;
    }

    return handle_cow_fault(vma, address, entry);
}
/*
 * handle_cow_fault - resolve a write fault on a shared page.
 *
 * @vma:     memory area containing the faulting address
 * @address: faulting virtual address
 * @pte:     page-table entry that currently maps the shared page
 *
 * Allocates a fresh page, copies the shared page's contents into it, and
 * replaces *pte with a writable, young, dirty entry for the new page. The
 * old page is then passed to page_remove_rmap() and the TLB entry for
 * @address is flushed.
 *
 * Returns VM_FAULT_WRITE on success, or VM_FAULT_OOM if no page could be
 * allocated.
 */
int handle_cow_fault(struct vm_area_struct *vma, unsigned long address, pte_t *pte) {
    struct page *shared = pte_page(*pte);
    struct page *private_copy = alloc_page(GFP_HIGHUSER_MOVABLE);

    if (!private_copy) {
        return VM_FAULT_OOM;
    }

    /* Duplicate the shared page's contents into the new page. */
    copy_user_highpage(private_copy, shared, address, vma);

    /* Build the replacement entry: writable, accessed and dirty. */
    pte_t fresh = pte_mkdirty(pte_mkyoung(pte_mkwrite(
        mk_pte(private_copy, vma->vm_page_prot))));
    set_pte_at(vma->vm_mm, address, pte, fresh);

    /* The faulting task no longer maps the old page. */
    page_remove_rmap(shared);
    flush_tlb_page(vma, address);

    return VM_FAULT_WRITE;
}