C 调用克隆系统调用时,谁设置RIP寄存器?

C 调用克隆系统调用时,谁设置RIP寄存器?,c,binary,operating-system,system-calls,glibc,C,Binary,Operating System,System Calls,Glibc,我正在尝试实现一个最小内核,我正在尝试实现克隆系统调用。在手册页中,您可以看到定义为以下内容的克隆系统调用: int clone(int (*fn)(void *), void *stack, int flags, void *arg, ... /* pid_t *parent_tid, void *tls, pid_t *child_tid */ ); long _do_fork(unsigned long clone_flags, un

我正在尝试实现一个最小内核,我正在尝试实现克隆系统调用。在手册页中,您可以看到定义为以下内容的克隆系统调用:

int clone(int (*fn)(void *), void *stack, int flags, void *arg, ...
                 /* pid_t *parent_tid, void *tls, pid_t *child_tid */ );
/*
 * _do_fork - create a new task via copy_process() and start it running.
 *
 * @clone_flags:   CLONE_* flag bits; the bits selected by CSIGNAL are
 *                 compared with SIGCHLD below to pick the ptrace event.
 * @stack_start:   forwarded unchanged to copy_process().
 * @stack_size:    forwarded unchanged to copy_process().
 * @parent_tidptr: user-space pointer that receives the new task's id when
 *                 CLONE_PARENT_SETTID is set.
 * @child_tidptr:  forwarded unchanged to copy_process().
 * @tls:           forwarded unchanged to copy_process().
 *
 * Returns the pid_vnr() value of the new task on success, or the error
 * extracted with PTR_ERR() when copy_process() fails.
 *
 * Note that this function receives no function pointer argument.
 */
long _do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr,
          unsigned long tls)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Determine whether and which event to report to ptracer.  When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        /* Drop the event again if it is not enabled for the current task. */
        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

    /*
     * Build the new task.  The result is either a task pointer or an
     * error value encoded in the pointer (tested with IS_ERR() below).
     */
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace, tls);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        /* The completion lives on this function's stack frame. */
        struct completion vfork;
        struct pid *pid;

        trace_sched_process_fork(current, p);

        /* Obtained before the wake-up; released with put_pid() below. */
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        /* NOTE(review): the result of put_user() is not checked here. */
        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        /*
         * For vfork, register the completion with the child before it
         * is woken, so it can be waited on after the wake-up.
         */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
            get_task_struct(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
            ptrace_event_pid(trace, pid);

        /* vfork: wait on the completion registered above. */
        if (clone_flags & CLONE_VFORK) {
            if (!wait_for_vfork_done(p, &vfork))
                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
    } else {
        /* copy_process() failed: hand its error code back to the caller. */
        nr = PTR_ERR(p);
    }
    return nr;
}
如您所见,它接收一个函数指针。如果您更仔细地阅读手册页,实际上可以看到内核中的实际系统调用实现没有收到函数指针:

long clone(unsigned long flags, void *stack,
                      int *parent_tid, int *child_tid,
                      unsigned long tls);
所以,我的问题是,谁在创建线程后修改RIP寄存器?是libc吗?

我在glibc中发现了这段代码(clone.S),但我不确定函数实际上是在什么时候被调用的。

额外资料:

当查看clone.S源代码时,您可以看到它在系统调用之后跳转到thread_start分支。在克隆系统调用之后的这个分支上(因此只有子线程会执行它),它从堆栈中弹出函数地址和参数。到底是谁把这些参数和函数地址压入堆栈的?我猜这一定发生在内核中的某个地方,因为在执行 syscall 指令时,它们还不在那里。

以下是一些gdb输出:

就在系统调用之前:

[-------------------------------------code-------------------------------------]
   0x7ffff7d8af22 <clone+34>:   mov    r8,r9
   0x7ffff7d8af25 <clone+37>:   mov    r10,QWORD PTR [rsp+0x8]
   0x7ffff7d8af2a <clone+42>:   mov    eax,0x38
=> 0x7ffff7d8af2f <clone+47>:   syscall 
   0x7ffff7d8af31 <clone+49>:   test   rax,rax
   0x7ffff7d8af34 <clone+52>:   jl     0x7ffff7d8af49 <clone+73>
   0x7ffff7d8af36 <clone+54>:   je     0x7ffff7d8af39 <clone+57>
   0x7ffff7d8af38 <clone+56>:   ret
Guessed arguments:
arg[0]: 0x3d0f00 
arg[1]: 0x7ffff8020b60 --> 0x7ffff7d3fb30 (<do_something>:  push   rbx)
arg[2]: 0x7fffffffda90 --> 0x0 
[------------------------------------stack-------------------------------------]
0000| 0x7fffffffda78 --> 0x7ffff7d3f52c (<main+172>:    pop    rsi)
0008| 0x7fffffffda80 --> 0x7fffffffda94 --> 0x73658b0000000000 
0016| 0x7fffffffda88 --> 0x7fffffffda94 --> 0x73658b0000000000 
0024| 0x7fffffffda90 --> 0x0 
0032| 0x7fffffffda98 --> 0x492e085573658b00 
0040| 0x7fffffffdaa0 --> 0x7ffff7d3f0d0 (<_init>:   sub    rsp,0x8)
0048| 0x7fffffffdaa8 --> 0x7ffff7d40830 (<__libc_csu_init>: push   r15)
0056| 0x7fffffffdab0 --> 0x7ffff7d408d0 (<__libc_csu_fini>: push   rbp)
[------------------------------------------------------------------------------]
[---------------------------------------代码-----------------------------------------]
0x7ffff7d8af22:mov r8、r9
0x7ffff7d8af25:mov r10,QWORD PTR[rsp+0x8]
0x7ffff7d8af2a:mov eax,0x38
=>0x7ffff7d8af2f:系统调用
0x7ffff7d8af31:测试rax,rax
0x7ffff7d8af34:jl 0x7ffff7d8af49
0x7ffff7d8af36:je 0x7ffff7d8af39
0x7ffff7d8af38:ret
猜测的论点:
arg[0]:0x3d0f00
参数[1]:0x7ffff8020b60-->0x7ffff7d3fb30(:按rbx)
参数[2]:0x7FFFFFDA90-->0x0
[-------------------------------------堆栈---------------------------------------------------]
0000 | 0x7FFFFFDA78-->0x7ffff7d3f52c(:pop rsi)
0008 | 0x7FFFFFDA80-->0x7FFFFFDA94-->0x736580000000000
0016 | 0x7FFFFFDA88-->0x7FFFFFDA94-->0x736580000000000
0024 | 0x7FFFFFDA90-->0x0
0032 | 0x7FFFFFDA98-->0x492E08573658B00
0040 | 0x7FFFFFDAA0-->0x7ffff7d3f0d0(:子rsp,0x8)
0048 | 0x7FFFFFDAA8-->0x7ffff7d40830(:按r15)
0056 | 0x7FFFFFDAB0-->0x7ffff7d408d0(:按rbp)
[------------------------------------------------------------------------------]
在子线程上的syscall指令之后(检查堆栈顶部-这不会发生在父线程上):

[-------------------------------------code-------------------------------------]
   0x7ffff7d8af25 <clone+37>:   mov    r10,QWORD PTR [rsp+0x8]
   0x7ffff7d8af2a <clone+42>:   mov    eax,0x38
   0x7ffff7d8af2f <clone+47>:   syscall
=> 0x7ffff7d8af31 <clone+49>:   test   rax,rax
   0x7ffff7d8af34 <clone+52>:   jl     0x7ffff7d8af49 <clone+73>
   0x7ffff7d8af36 <clone+54>:   je     0x7ffff7d8af39 <clone+57>
   0x7ffff7d8af38 <clone+56>:   ret
   0x7ffff7d8af39 <clone+57>:   xor    ebp,ebp
[------------------------------------stack-------------------------------------]
0000| 0x7ffff8020b60 --> 0x7ffff7d3fb30 (<do_something>:  push   rbx)
0008| 0x7ffff8020b68 --> 0x7ffff7dd5add --> 0x4c414d0074736574 ("test")
0016| 0x7ffff8020b70 --> 0x0
0024| 0x7ffff8020b78 --> 0x411
0032| 0x7ffff8020b80 ("Parameters: 0x7ffff7d3fb30 4001536 0x7ffff8020b70 0x7fffffffda90 0x7ffff8000b60 0x7fffffffda94\n")
0040| 0x7ffff8020b88 ("rs: 0x7ffff7d3fb30 4001536 0x7ffff8020b70 0x7fffffffda90 0x7ffff8000b60 0x7fffffffda94\n")
0048| 0x7ffff8020b90 ("fff7d3fb30 4001536 0x7ffff8020b70 0x7fffffffda90 0x7ffff8000b60 0x7fffffffda94\n")
0056| 0x7ffff8020b98 ("30 4001536 0x7ffff8020b70 0x7fffffffda90 0x7ffff8000b60 0x7fffffffda94\n")
[------------------------------------------------------------------------------]

通常,它的工作方式是:计算机启动时,Linux会设置一个MSR(特定于型号的寄存器),用来处理汇编指令 syscall。汇编指令 syscall 会让RIP寄存器跳转到该MSR中指定的地址,从而进入内核模式。正如英特尔64-ia-32-architectures-software-developer-vol-2b-manual中所述:

SYSCALL调用特权级别为0的操作系统系统调用处理程序。它通过从IA32_LSTAR MSR加载RIP来实现这一点。

一旦进入内核模式,内核将查看通过通用寄存器(RAX、RBX等)传入的参数,以确定所请求的系统调用。然后内核将调用一个sys_XXX函数,其原型位于linux/syscalls.h中。sys_clone的定义在kernel/fork.c中:

/*
 * clone system call entry point, as defined in kernel/fork.c.
 *
 * Forwards its five arguments to _do_fork(), passing 0 for stack_size.
 * No function pointer is among the arguments.
 *
 * The stray "#endif" that sat between the macro invocation and the body
 * has been removed: its matching "#if" is not part of this excerpt, so
 * the snippet was not well-formed on its own.
 */
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         unsigned long, tls)
{
    return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
}
SYSCALL_DEFINE5宏接受第一个参数,并在其前面加上sys_前缀。这个函数实际上就是sys_clone,它调用_do_fork。

这意味着实际上并不存在一个由glibc调用、用来进入内核的 clone() 函数。内核是通过 syscall 指令被调用的:它跳转到MSR中指定的地址,然后调用sys_call_table中的某个系统调用。

x86内核的入口点在这里(原链接缺失)。如果向下滚动,您将看到这样一行:call *sys_call_table(, %rax, 8)。基本上,就是调用sys_call_table中的某个函数。sys_call_table的实现如下所示(原链接缺失):

生成表后,当您使用 syscall 汇编指令时,将跳转到其中一个条目。对于clone(),它将调用sys_clone(),而sys_clone()本身调用_do_fork()。其定义如下:

int clone(int (*fn)(void *), void *stack, int flags, void *arg, ...
                 /* pid_t *parent_tid, void *tls, pid_t *child_tid */ );
/*
 * _do_fork - create a new task via copy_process() and start it running.
 *
 * @clone_flags:   CLONE_* flag bits; the bits selected by CSIGNAL are
 *                 compared with SIGCHLD below to pick the ptrace event.
 * @stack_start:   forwarded unchanged to copy_process().
 * @stack_size:    forwarded unchanged to copy_process().
 * @parent_tidptr: user-space pointer that receives the new task's id when
 *                 CLONE_PARENT_SETTID is set.
 * @child_tidptr:  forwarded unchanged to copy_process().
 * @tls:           forwarded unchanged to copy_process().
 *
 * Returns the pid_vnr() value of the new task on success, or the error
 * extracted with PTR_ERR() when copy_process() fails.
 *
 * Note that this function receives no function pointer argument.
 */
long _do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr,
          unsigned long tls)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Determine whether and which event to report to ptracer.  When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        /* Drop the event again if it is not enabled for the current task. */
        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

    /*
     * Build the new task.  The result is either a task pointer or an
     * error value encoded in the pointer (tested with IS_ERR() below).
     */
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace, tls);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        /* The completion lives on this function's stack frame. */
        struct completion vfork;
        struct pid *pid;

        trace_sched_process_fork(current, p);

        /* Obtained before the wake-up; released with put_pid() below. */
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        /* NOTE(review): the result of put_user() is not checked here. */
        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        /*
         * For vfork, register the completion with the child before it
         * is woken, so it can be waited on after the wake-up.
         */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
            get_task_struct(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
            ptrace_event_pid(trace, pid);

        /* vfork: wait on the completion registered above. */
        if (clone_flags & CLONE_VFORK) {
            if (!wait_for_vfork_done(p, &vfork))
                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
    } else {
        /* copy_process() failed: hand its error code back to the caller. */
        nr = PTR_ERR(p);
    }
    return nr;
}
它调用wake_up_new_task(),将任务放入运行队列并将其唤醒。我很惊讶它甚至能立即唤醒任务。我会猜测调度程序会代替它,并且它会被赋予尽快运行的高优先级。内核本身不必接收函数指针,因为正如clone()手册页上所述:

原始的clone()系统调用与fork(2)更为接近,因为子进程中的执行是从调用点继续的。因此,clone()包装函数的fn和arg参数被省略了。

子进程从系统调用被调用的位置继续执行。
/*
 * _do_fork - create a new task via copy_process() and start it running.
 *
 * @clone_flags:   CLONE_* flag bits; the bits selected by CSIGNAL are
 *                 compared with SIGCHLD below to pick the ptrace event.
 * @stack_start:   forwarded unchanged to copy_process().
 * @stack_size:    forwarded unchanged to copy_process().
 * @parent_tidptr: user-space pointer that receives the new task's id when
 *                 CLONE_PARENT_SETTID is set.
 * @child_tidptr:  forwarded unchanged to copy_process().
 * @tls:           forwarded unchanged to copy_process().
 *
 * Returns the pid_vnr() value of the new task on success, or the error
 * extracted with PTR_ERR() when copy_process() fails.
 *
 * Note that this function receives no function pointer argument.
 */
long _do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr,
          unsigned long tls)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Determine whether and which event to report to ptracer.  When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        /* Drop the event again if it is not enabled for the current task. */
        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

    /*
     * Build the new task.  The result is either a task pointer or an
     * error value encoded in the pointer (tested with IS_ERR() below).
     */
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace, tls);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
        /* The completion lives on this function's stack frame. */
        struct completion vfork;
        struct pid *pid;

        trace_sched_process_fork(current, p);

        /* Obtained before the wake-up; released with put_pid() below. */
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        /* NOTE(review): the result of put_user() is not checked here. */
        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        /*
         * For vfork, register the completion with the child before it
         * is woken, so it can be waited on after the wake-up.
         */
        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
            get_task_struct(p);
        }

        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
            ptrace_event_pid(trace, pid);

        /* vfork: wait on the completion registered above. */
        if (clone_flags & CLONE_VFORK) {
            if (!wait_for_vfork_done(p, &vfork))
                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
    } else {
        /* copy_process() failed: hand its error code back to the caller. */
        nr = PTR_ERR(p);
    }
    return nr;
}
testq   %rax,%rax
jl  SYSCALL_ERROR_LABEL
jz  L(thread_start) //Child jumps to thread_start

ret //Parent returns to where it was
The parameters are passed in register and on the stack from userland:
rdi: fn
rsi: child_stack
rdx: flags
rcx: arg
r8d: TID field in parent
r9d: thread pointer
/* Insert the argument onto the new stack.  */
subq    $16,%rsi
movq    %rcx,8(%rsi)

/* Save the function pointer.  It will be popped off in the
      child in the ebx frobbing below.  */
movq    %rdi,0(%rsi)
retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
if (retval)
    goto bad_fork_cleanup_io;