系统调用是用户态程序进入内核的唯一合法通道。在 x86_64 架构上，`syscall` 指令提供了最低延迟的陷门。本文基于 Linux 6.8.12 源码，完整剖析 64 位系统调用的生命周期：从 CPU 的 MSR 初始化，到 `entry_SYSCALL_64` 的现场保护，再到 `do_syscall_64` 的分发执行，以及最终 `sysret` 与 `iret` 的路径选择。同时，我们还将看到内核如何为了安全而在性能上做出必要取舍。

## 一、硬件准备：syscall_init 与 MSR 魔法

x86_64 CPU 通过一组模型特定寄存器（MSR）控制 `syscall`/`sysret` 的行为。Linux 在启动时通过 `syscall_init()` 配置这些寄存器（arch/x86/kernel/cpu/common.c）：

```c
void syscall_init(void)
{
	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
	// ... 兼容 32 位与 SYSENTER 处理 ...
	wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF| ... |X86_EFLAGS_ID);
}
```

- **MSR_STAR**：`syscall` 进入内核时，CPU 从该寄存器的 [47:32] 位加载内核段 CS 和 SS；退出时从 [63:48] 位加载用户段。
- **MSR_LSTAR**：存放内核系统调用入口点 `entry_SYSCALL_64`。
- **MSR_SYSCALL_MASK**：当 `syscall` 执行时，硬件会清除 RFLAGS 中该掩码所置位的标志（即 RFLAGS &= ~MASK），从而清除中断标志 IF、方向标志 DF 等，保证内核态运行时不会意外被中断。

一个小细节：`syscall_init` 没有标记为 `__init`，因为系统休眠唤醒后需要重新加载这些 MSR，所以该函数必须保留在运行时镜像中。

## 二、入口冲锋：entry_SYSCALL_64 的栈切换与现场保存

当用户程序执行 `syscall` 时，CPU 自动完成以下动作：

1. 将 RIP 保存到 RCX；
2. 将 RFLAGS 保存到 R11；
3. 从 MSR_LSTAR 加载 RIP（指向 `entry_SYSCALL_64`）；
4. 从 MSR_STAR 加载内核 CS；
5. 按 MSR_SYSCALL_MASK 屏蔽 RFLAGS；
6. **不**自动切换 RSP（仍指向用户栈），不压栈任何内容。

因此，入口代码的第一要务是切换到内核栈并保存所有寄存器（arch/x86/entry/entry_64.S）：

```asm
SYM_CODE_START(entry_SYSCALL_64)
	swapgs						/* 交换 GS 基址，访问 per-CPU 数据 */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* 暂存用户栈顶 */
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp	/* 切换到内核栈 */

	/* 构建 struct pt_regs */
	pushq	$__USER_DS				/* ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* sp (用户栈) */
	pushq	%r11					/* flags */
	pushq	$__USER_CS				/* cs */
	pushq	%rcx					/* ip (用户返回地址) */
	pushq	%rax					/* orig_ax (系统调用号) */
	PUSH_AND_CLEAR_REGS rax=$-ENOSYS	/* 保存剩余通用寄存器，rax 设为 -ENOSYS */
```

压栈顺序与 `struct pt_regs` 的定义完全匹配。注意 RCX 和 R11 被特意压入栈中——它们保存了用户态的 RIP 和 RFLAGS，将在返回时使用。

## 三、C 世界：do_syscall_64 与系统调用表

栈建立完成后，调用 `do_syscall_64(struct pt_regs *regs, int nr)`（arch/x86/entry/common.c）：

```c
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();
	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr !=
```
```c
	    -1)
		regs->ax = __x64_sys_ni_syscall(regs);
	instrumentation_end();
	syscall_exit_to_user_mode(regs);
	// ... 返回路径决策 ...
}
```

核心分发函数 `do_syscall_x64` 会通过系统调用表查找对应的内核函数：

```c
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = x64_sys_call(regs, unr);
		return true;
	}
	return false;
}
```

`x64_sys_call` 是由脚本根据 arch/x86/entry/syscalls/syscall_64.tbl 自动生成的巨大 `switch` 语句，将系统调用号映射到实际函数（如 `__x64_sys_read`）。

注意：代码中有一行调试输出 `trace_printk("do_syscall_64: nr=%d, ip=0x%lx\n", nr, regs->ip);`，这通常是内核开发期间用于追踪系统调用行为的临时日志，生产内核中不应出现。

## 四、返回决策：SYSRET 快速路 vs IRET 慢速路

系统调用返回时，内核可以选择两条路：

- **快速路**：使用 `sysretq` 指令，开销极小，但条件苛刻。
- **慢速路**：使用 `iretq` 指令，能够处理各种复杂情况（如改变 CS、SS、信号返回等）。

`do_syscall_64` 在返回前会执行一系列检查，决定是否能够使用 `sysret`：

```c
	/* 1. Xen PV 虚拟机强制走 IRET */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

	/* 2. RCX == RIP 且 R11 == RFLAGS 才能用 SYSRET */
	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
		return false;

	/* 3. CS/SS 必须为标准用户段 */
	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
		return false;

	/* 4. RIP 必须在用户空间范围内（拦截非规范地址） */
	if (unlikely(regs->ip >= TASK_SIZE_MAX))
		return false;

	/* 5.
```
```c
	   不能有 RF 或 TF 标志 */
	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
		return false;

	return true;	/* 所有检查通过，使用 SYSRET */
```

- **RCX/RIP 匹配**：因为 `sysret` 从 RCX 恢复 RIP，如果用户态不小心修改了 RCX，直接跳转会导致不可预知的结果。
- **非规范地址**：某些旧 CPU 在 `sysret` 遇到非规范地址时会在内核态触发 #GP，成为安全漏洞，因此必须拦截。
- **TF 标志**：如果用户态要求单步调试，`sysret` 恢复 TF 后会在用户态立即触发 #DB，破坏执行流。

## 五、快速返回的实现细节

如果决策通过，汇编代码会执行快速返回路径：

```asm
syscall_return_via_sysret:
	IBRS_EXIT
	POP_REGS pop_rdi=0	/* 恢复除 RDI、RSP 外的所有寄存器 */

	movq	%rsp, %rdi	/* 保存当前栈指针 */
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp	/* 切到 trampoline 栈 */
	pushq	RSP-RDI(%rdi)	/* 压入原用户 RSP */
	pushq	(%rdi)		/* 压入原 RDI */

	STACKLEAK_ERASE_NOCLOBBER
	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
	swapgs
	CLEAR_CPU_BUFFERS
	sysretq
```

最后一步 `sysretq` 由硬件完成：RIP ← RCX（用户返回地址），RFLAGS ← R11，CS/SS 从 MSR_STAR 加载用户段，特权级切换到 Ring 3。整个过程不经过任何软件中断或任务切换，因此延迟极低。

## 六、安全博弈：缓解现代 CPU 漏洞的代价

Linux 6.8.12 在系统调用路径中集成了大量针对推测执行漏洞的缓解措施：

| 宏 / 标签 | 作用 |
| --- | --- |
| IBRS_ENTER | 阻止用户态间接分支预测污染内核 |
| UNTRAIN_RET | 清空返回预测器（RSB），防御 Retbleed |
| CLEAR_BRANCH_HISTORY | 冲刷分支历史缓冲区，防御 Branch History Injection (BHI) |
| CLEAR_CPU_BUFFERS | 在返回用户态前清除 CPU 内部缓冲区（如填充缓冲区） |
| SWITCH_TO_KERNEL_CR3 | 配合内核页表隔离（KPTI）切换内核页表 |

这些措施在 `entry_SYSCALL_64` 中被精确放置在用户态可控状态刚刚进入内核时，以及返回用户态的前一刻。当然，它们也带来了可测量的性能开销：一次简单的 `getpid` 系统调用可能因为额外的 IBRS 和 RSB 清理而增加数十纳秒。然而在安全威胁面前，这已经是内核开发者能够做出的最优权衡。

## 七、总结：一条指令背后的复杂性

从用户态 `syscall` 到内核态 `sysret`，整个流程涉及：

1. MSR 的精心配置（STAR, LSTAR, SYSCALL_MASK）；
2. 栈的两次切换（用户栈 → 内核栈 → trampoline 栈）；
3. 上下文的完整保存与恢复（pt_regs）；
4. 系统调用表的快速分发（x64_sys_call）；
5. 安全返回的条件判断（do_syscall_64 中的五重检查）；
6. 多层次的漏洞缓解（IBRS, UNTRAIN_RET, CLEAR_BRANCH_HISTORY 等）。

每一处代码都凝聚了内核开发者对性能、正确性与安全的反复权衡。了解这些细节，不仅有助于写出更高效的应用程序，也能帮助我们真正理解“用户态与内核态边界”这一操作系统的核心抽象。

## 源码

```c
void syscall_init(void)
{
	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

	if (ia32_enabled()) {
		wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
		/*
		 * This only works on Intel CPUs.
		 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
		 * This does not cause SYSENTER to jump to the wrong location, because
		 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
```
```c
		 */
		wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
		wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
			    (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
		wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
	} else {
		wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
		wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
		wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
		wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
	}

	/*
	 * Flags to clear on syscall; clear as much as possible
	 * to minimize user space-kernel interference.
	 */
	wrmsrl(MSR_SYSCALL_MASK,
	       X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
	       X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
	       X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
	       X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
	       X86_EFLAGS_AC|X86_EFLAGS_ID);
}

/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls. The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries. There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax  system call number
 * rcx  return address
 * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi  arg0
 * rsi  arg1
 * rdx  arg2
 * r10  arg3 (needs to be moved to rcx to conform to C ABI)
 * r8   arg4
 * r9   arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
```
```asm
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */

SYM_CODE_START(entry_SYSCALL_64)
	UNWIND_HINT_ENTRY
	ENDBR

	swapgs
	/* tss.sp2 is scratch space. */
	movq	%rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
	movq	PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp

SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR

	/* Construct struct pt_regs on stack */
	pushq	$__USER_DS				/* pt_regs->ss */
	pushq	PER_CPU_VAR(cpu_tss_rw + TSS_sp2)	/* pt_regs->sp */
	pushq	%r11					/* pt_regs->flags */
	pushq	$__USER_CS				/* pt_regs->cs */
	pushq	%rcx					/* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
	pushq	%rax					/* pt_regs->orig_ax */

	PUSH_AND_CLEAR_REGS rax=$-ENOSYS

	/* IRQs are off. */
	movq	%rsp, %rdi
	/* Sign extend the lower 32bit as syscall numbers are treated as int */
	movslq	%eax, %rsi

	/* clobbers %rax, make sure it is after saving the syscall nr */
	IBRS_ENTER
	UNTRAIN_RET
	CLEAR_BRANCH_HISTORY

	call	do_syscall_64		/* returns with IRQs disabled */

	/*
	 * Try to use SYSRET instead of IRET if we're returning to
	 * a completely clean 64-bit userspace context. If we're not,
	 * go to the slow exit path.
	 * In the Xen PV case we must use iret anyway.
	 */
	ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
		"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

	/*
	 * We win! This label is here just for ease of understanding
	 * perf profiles. Nothing jumps here.
	 */
syscall_return_via_sysret:
	IBRS_EXIT
	POP_REGS pop_rdi=0

	/*
	 * Now all regs are restored except RSP and RDI.
	 * Save old stack pointer and switch to trampoline stack.
	 */
	movq	%rsp, %rdi
	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
	UNWIND_HINT_END_OF_STACK

	pushq	RSP-RDI(%rdi)	/* RSP */
	pushq	(%rdi)		/* RDI */

	/*
	 * We are on the trampoline stack. All regs except RDI are live.
	 * We can do future final exit work right here.
```
```asm
	 */
	STACKLEAK_ERASE_NOCLOBBER
	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi

	popq	%rdi
	popq	%rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	swapgs
	CLEAR_CPU_BUFFERS
	sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
	ANNOTATE_NOENDBR
	int3
SYM_CODE_END(entry_SYSCALL_64)
```

```c
/* Returns true to return using SYSRET, or false to use IRET */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
	add_random_kstack_offset();
	nr = syscall_enter_from_user_mode(regs, nr);

	instrumentation_begin();

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* Invalid system call, but still a system call. */
		regs->ax = __x64_sys_ni_syscall(regs);
	}

	instrumentation_end();
	syscall_exit_to_user_mode(regs);

	/*
	 * Check that the register state is valid for using SYSRET to exit
	 * to userspace. Otherwise use the slower but fully capable IRET
	 * exit path.
	 */

	/* XEN PV guests always use the IRET path */
	if (cpu_feature_enabled(X86_FEATURE_XENPV))
		return false;

	/* SYSRET requires RCX == RIP and R11 == EFLAGS */
	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
		return false;

	/* CS and SS must match the values set in MSR_STAR */
	if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
		return false;

	/*
	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
	 * in kernel space. This essentially lets the user take over
	 * the kernel, since userspace controls RSP.
	 *
	 * TASK_SIZE_MAX covers all user-accessible addresses other than
	 * the deprecated vsyscall page.
	 */
	if (unlikely(regs->ip >= TASK_SIZE_MAX))
		return false;

	/*
	 * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
	 * restoring TF results in a trap from userspace immediately after
	 * SYSRET.
	 */
	if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
		return false;

	/* Use SYSRET to exit to userspace */
	return true;
}

static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
```
```c
	 */
	unsigned int unr = nr;

	//yym-gaizao
	trace_printk("do_syscall_64: nr=%d, ip=0x%lx\n", nr, regs->ip);

	if (likely(unr < NR_syscalls)) {
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = x64_sys_call(regs, unr);
		return true;
	}
	return false;
}

long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	#include <asm/syscalls_64.h>
	default: return __x64_sys_ni_syscall(regs);
	}
};
```