
Linux Kernel Analysis Coursework (2)

The following is Li Yang's original coursework for the "Linux Kernel Analysis" MOOC course. Please credit the source when reposting.


The previous post got too long to edit comfortably, so I'm starting a new one to continue.


Week 5 Assignment

task_struct is the structure Linux uses to describe a process; it stores all of a process's information.

All processes in a Linux system form a tree. Process 0, which executes in kernel mode, is the ancestor of all processes. Process 0 creates process 1 (in kernel mode), and process 1 carries out part of the kernel's initialization and system configuration. Process 1 then calls execve() to run the executable init and turns into the user-mode process 1, i.e. the init process.
How fork is used in C:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(int argc, char * argv[])
{
    pid_t pid;
    pid = fork();
    if (pid < 0)
    {
        // pid < 0 means fork failed
        fprintf(stderr,"Fork Failed!");
        exit(-1);
    }
    else if (pid == 0)
    {
        // in the child, fork returns 0
        printf("This is Child Process!\n");
    }
    else
    {
        // in the parent, fork returns the child's actual pid
        printf("This is Parent Process!\n");
        // wait for the child to finish
        wait(NULL);
        printf("Child Complete!\n");
    }
    return 0;
}

fork() is how a process creates a new process: the child created by fork copies the parent's resources, including the contents of task_struct. All three system calls fork, vfork and clone can create a new process, and all of them do it by calling do_fork.
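
Since all three end up in do_fork, a new process can also be created directly through glibc's clone() wrapper. A minimal user-space sketch (the stack size and the SIGCHLD flag here are illustrative choices of mine, not from the course material):

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>

#define STACK_SIZE (1024 * 1024)

static int child_fn(void *arg)
{
    printf("This is Child Process! arg=%s\n", (char *)arg);
    return 0;
}

int main(void)
{
    /* the child needs its own stack; on x86 the stack grows downward,
       so pass the high end of the buffer to clone() */
    char *stack = malloc(STACK_SIZE);
    if (stack == NULL) { perror("malloc"); exit(-1); }

    /* SIGCHLD makes the child report termination like a fork()ed child */
    pid_t pid = clone(child_fn, stack + STACK_SIZE, SIGCHLD, "hello");
    if (pid < 0) { perror("clone"); exit(-1); }

    wait(NULL);
    printf("Child Complete!\n");
    free(stack);
    return 0;
}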

For example, the kernel handler that fork ends up in, sys_clone (modern glibc implements fork through the clone system call):

#ifdef __ARCH_WANT_SYS_CLONE
#ifdef CONFIG_CLONE_BACKWARDS
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int, tls_val,
         int __user *, child_tidptr)
#elif defined(CONFIG_CLONE_BACKWARDS2)
SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         int, tls_val)
#elif defined(CONFIG_CLONE_BACKWARDS3)
SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
        int, stack_size,
        int __user *, parent_tidptr,
        int __user *, child_tidptr,
        int, tls_val)
#else
SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
         int __user *, parent_tidptr,
         int __user *, child_tidptr,
         int, tls_val)
#endif
{
    return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
}
#endif

The code of do_fork:

long do_fork(unsigned long clone_flags,
          unsigned long stack_start,
          unsigned long stack_size,
          int __user *parent_tidptr,
          int __user *child_tidptr)
{
    struct task_struct *p;
    int trace = 0;
    long nr;

    /*
     * Determine whether and which event to report to ptracer.  When
     * called from kernel_thread or CLONE_UNTRACED is explicitly
     * requested, no event is reported; otherwise, report if the event
     * for the type of forking is enabled.
     */
        // decide whether this fork is being traced
    if (!(clone_flags & CLONE_UNTRACED)) {
        if (clone_flags & CLONE_VFORK)
            trace = PTRACE_EVENT_VFORK;
        else if ((clone_flags & CSIGNAL) != SIGCHLD)
            trace = PTRACE_EVENT_CLONE;
        else
            trace = PTRACE_EVENT_FORK;

        if (likely(!ptrace_event_enabled(current, trace)))
            trace = 0;
    }

        // copy_process does the initialization and returns the new task_struct; it is also where fork's "return twice" behavior comes from
    p = copy_process(clone_flags, stack_start, stack_size,
             child_tidptr, NULL, trace);
    /*
     * Do this prior waking up the new thread - the thread pointer
     * might get invalid after that point, if the thread exits quickly.
     */
    if (!IS_ERR(p)) {
                // success
        struct completion vfork;
        struct pid *pid;

        trace_sched_process_fork(current, p);
                // get the child's PID
        pid = get_task_pid(p, PIDTYPE_PID);
        nr = pid_vnr(pid);

        if (clone_flags & CLONE_PARENT_SETTID)
            put_user(nr, parent_tidptr);

        if (clone_flags & CLONE_VFORK) {
            p->vfork_done = &vfork;
            init_completion(&vfork);
            get_task_struct(p);
        }

                // put the new process on a CPU run queue
        wake_up_new_task(p);

        /* forking complete and child started to run, tell ptracer */
        if (unlikely(trace))
            ptrace_event_pid(trace, pid);

        if (clone_flags & CLONE_VFORK) {
            if (!wait_for_vfork_done(p, &vfork))
                ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
        }

        put_pid(pid);
    } else {
        nr = PTR_ERR(p);
    }
        // return the new process's PID (the new process itself returns 0 here)
    return nr;
}

Experiment screenshots: one setting a breakpoint, one having run to copy_process.
QQ20150412-1@2x.png
1QQ20150412-2@2x.png

Week 6 Assignment

The compile-and-link process of a program

Preprocess (.c -> .cpp):

gcc -E -o hello.cpp hello.c -m32

Compile (.cpp -> .s, assembly):

gcc -x cpp-output -S -o hello.s hello.cpp -m32

Assemble (.s -> .o, binary object code):

gcc -x assembler -c hello.s -o hello.o -m32

Link (.o -> a.out), dynamically against shared libraries:

gcc -o hello hello.o -m32

Statically linked build:

gcc -o hello.static hello.o -m32 -static

Besides the familiar int main(int argc, char *argv[]), a C main function can also take the form int main(int argc, char *argv[], char *envp[]), where envp is the environment variables. The shell calls execve to pass the command-line arguments and the environment to the program's main function.
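
For instance, a minimal sketch using the three-argument form to print the environment it was given:

#include <stdio.h>

int main(int argc, char *argv[], char *envp[])
{
    int i;
    /* envp is a NULL-terminated array of "NAME=value" strings */
    for (i = 0; envp[i] != NULL; i++)
        printf("%s\n", envp[i]);
    return 0;
}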

The exec* library functions are all wrappers around execve, whose prototype is int execve(const char *filename, char *const argv[], char *const envp[]);

For example:

#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>

int main(int argc, char * argv[])
{
    pid_t pid;
    /* fork another process */
    pid = fork();
    if (pid < 0)
    {
        /* error occurred */
        fprintf(stderr,"Fork Failed!");
        exit(-1);
    }
    else if (pid == 0)
    {
        /* child process */
        execlp("/bin/ls","ls",NULL);
    }
    else
    {
        /* parent process */
        /* parent will wait for the child to complete */
        wait(NULL);
        printf("Child Complete!\n");
        exit(0);
    }
}
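
The child could equally call execve directly with explicit argv and envp arrays; a minimal sketch:

#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char *argv[] = { "ls", "-l", NULL };
    char *envp[] = { "PATH=/bin:/usr/bin", NULL };

    /* on success execve does not return: the process image is replaced */
    execve("/bin/ls", argv, envp);
    perror("execve");   /* only reached on failure */
    return 1;
}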

Command-line arguments and environment variables are passed to the new program through the user-mode stack.
week7_args.png
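
A quick user-space check of this (a sketch; the exact addresses vary from run to run, but the argv[] and envp[] arrays sit next to each other near the top of the initial stack, envp starting right after argv's NULL terminator):

#include <stdio.h>

int main(int argc, char *argv[], char *envp[])
{
    /* argv[] and envp[] are adjacent arrays on the initial user stack */
    printf("argv[] at %p\n", (void *)argv);
    printf("envp[] at %p\n", (void *)envp);
    printf("envp == argv + argc + 1? %s\n",
           envp == argv + argc + 1 ? "yes" : "no");
    return 0;
}
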
Inside sys_execve the executable's format gets parsed; the flow is do_execve -> do_execve_common -> exec_binprm, after which search_binary_handler looks for the parser module that matches the file format:

list_for_each_entry(fmt, &formats, lh) {
    if (!try_module_get(fmt->module))
        continue;
    read_unlock(&binfmt_lock);
    bprm->recursion_depth++;
    retval = fmt->load_binary(bprm);
    read_lock(&binfmt_lock);

Experiment screenshots:
QQ20150420-1@2x.png
QQ20150420-2@2x.png

Linux Kernel Data Structures - Linked Lists

The Linux kernel provides a generic linked-list implementation, a circular doubly linked list. The code in this article follows https://git.kernel.org/cgit/linux/kernel/git/stable/linux-stable.git/tree/include/linux/list.h?id=refs/tags/v2.6.32.65

We usually define a linked list like this:

struct node
{
    int data;
    struct node *prev;
    struct node *next;
};

The Linux kernel instead provides a structure holding just the forward and backward pointers:

struct list_head {
    struct list_head *next, *prev;
};

We then embed this structure in our own node type:

struct node
{
    int data;
    struct list_head list;
};

With this, node.list.next and node.list.prev reach the list_head embedded in the neighboring nodes; getting from a list_head back to the enclosing node is the job of container_of.

container_of is a macro already defined in the Linux kernel:

/**
 * container_of - cast a member of a structure out to the containing structure
 * @ptr:    the pointer to the member.
 * @type:   the type of the container struct this is embedded in.
 * @member: the name of the member within the struct.
 *
 */
#define container_of(ptr, type, member) ({          \
    const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
    (type *)( (char *)__mptr - offsetof(type,member) );})

As the comment says, given a pointer to a member field inside a structure, it produces a pointer to the whole enclosing structure.
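
A minimal user-space sketch of the same trick, using offsetof from <stddef.h> (the kernel version adds a typeof-based type check on top of this):

#include <stddef.h>
#include <stdio.h>

struct list_head {
    struct list_head *next, *prev;
};

struct node {
    int data;
    struct list_head list;
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
    struct node n = { .data = 42 };
    struct list_head *p = &n.list;

    /* recover the enclosing node from the embedded list_head */
    struct node *got = container_of(p, struct node, list);
    printf("%d\n", got->data);   /* prints 42 */
    return 0;
}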

There are several functions for inserting a node into a list, all quite simple:

/*
 * Insert a new entry between two known consecutive entries.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
#ifndef CONFIG_DEBUG_LIST
static inline void __list_add(struct list_head *new,
                  struct list_head *prev,
                  struct list_head *next)
{
    next->prev = new;
    new->next = next;
    new->prev = prev;
    prev->next = new;
}
#else
extern void __list_add(struct list_head *new,
                  struct list_head *prev,
                  struct list_head *next);
#endif

/**
 * list_add - add a new entry
 * @new: new entry to be added
 * @head: list head to add it after
 *
 * Insert a new entry after the specified head.
 * This is good for implementing stacks.
 */
static inline void list_add(struct list_head *new, struct list_head *head)
{
    __list_add(new, head, head->next);
}


/**
 * list_add_tail - add a new entry
 * @new: new entry to be added
 * @head: list head to add it before
 *
 * Insert a new entry before the specified head.
 * This is useful for implementing queues.
 */
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
    __list_add(new, head->prev, head);
}

And several for deleting a node; note that none of them free the node:

/*
 * Delete a list entry by making the prev/next entries
 * point to each other.
 *
 * This is only for internal list manipulation where we know
 * the prev/next entries already!
 */
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
    next->prev = prev;
    prev->next = next;
}

/**
 * list_del - deletes entry from list.
 * @entry: the element to delete from the list.
 * Note: list_empty() on entry does not return true after this, the entry is
 * in an undefined state.
 */
#ifndef CONFIG_DEBUG_LIST
static inline void list_del(struct list_head *entry)
{
    __list_del(entry->prev, entry->next);
    entry->next = LIST_POISON1;
    entry->prev = LIST_POISON2;
}
#else
extern void list_del(struct list_head *entry);
#endif

The traversal operations, forward and backward:

/**
 * list_for_each    -   iterate over a list
 * @pos:    the &struct list_head to use as a loop cursor.
 * @head:   the head for your list.
 */
#define list_for_each(pos, head) \
    for (pos = (head)->next; prefetch(pos->next), pos != (head); \
            pos = pos->next)

/**
 * __list_for_each  -   iterate over a list
 * @pos:    the &struct list_head to use as a loop cursor.
 * @head:   the head for your list.
 *
 * This variant differs from list_for_each() in that it's the
 * simplest possible list iteration code, no prefetching is done.
 * Use this for code that knows the list to be very short (empty
 * or 1 entry) most of the time.
 */
#define __list_for_each(pos, head) \
    for (pos = (head)->next; pos != (head); pos = pos->next)

/**
 * list_for_each_prev   -   iterate over a list backwards
 * @pos:    the &struct list_head to use as a loop cursor.
 * @head:   the head for your list.
 */
#define list_for_each_prev(pos, head) \
    for (pos = (head)->prev; prefetch(pos->prev), pos != (head); \
            pos = pos->prev)
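
To see how the pieces fit together, here is a self-contained user-space sketch: a hand-rolled list_add_tail with the same logic as the kernel code above, plus the __list_for_each loop expanded by hand:

#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

struct node {
    int data;
    struct list_head list;
};

#define container_of(ptr, type, member) \
    ((type *)((char *)(ptr) - offsetof(type, member)))

/* same logic as __list_add/list_add_tail: insert before head, i.e. at the tail */
static void list_add_tail(struct list_head *new, struct list_head *head)
{
    new->prev = head->prev;
    new->next = head;
    head->prev->next = new;
    head->prev = new;
}

int main(void)
{
    struct list_head head = { &head, &head };   /* empty circular list */
    struct node a = { .data = 1 }, b = { .data = 2 };
    struct list_head *pos;

    list_add_tail(&a.list, &head);   /* queue behavior: 1 then 2 */
    list_add_tail(&b.list, &head);

    /* __list_for_each, expanded by hand */
    for (pos = head.next; pos != &head; pos = pos->next)
        printf("%d\n", container_of(pos, struct node, list)->data);

    return 0;
}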

Linux Kernel Analysis Coursework (1)

The following is Li Yang's original coursework for the "Linux Kernel Analysis" MOOC course. Please credit the source when reposting.


Week 1 Assignment

int g(int x)
{
  return x + 3;
}

int f(int x)
{
  return g(x);
}

int main(void)
{
  return f(8) + 1;
}

The code is compiled to assembly with gcc -S -o main.s main.c -m32, and the resulting assembly is then analyzed.

The assembly we get looks like this:

g:
.LFB0:
    pushl   %ebp
    movl    %esp, %ebp
        // load the argument at ebp+8 (the value 8) into eax
    movl    8(%ebp), %eax
        // eax += 3
    addl    $3, %eax
        // pop the saved value back into ebp
    popl    %ebp
        // ret is equivalent to popl %eip
    ret

f:
.LFB1:
    pushl   %ebp
    movl    %esp, %ebp
        // move esp down 4 bytes to reserve a slot
    subl    $4, %esp
        // load the argument at ebp+8 (the value 8) into eax
    movl    8(%ebp), %eax
        // the slot at the top of the stack now holds 8, the argument for g
    movl    %eax, (%esp)
    call    g
    leave
    ret

main:
.LFB2:
        // push ebp's value; esp automatically moves down one slot
    pushl   %ebp
        // esp and ebp now point to the same address: the frame is empty
    movl    %esp, %ebp
        // move esp down another four bytes
    subl    $4, %esp
        // put the argument 8 into the slot just reserved
    movl    $8, (%esp)
        // call f: equivalent to pushl %eip then movl $f, %eip, saving the address of the next instruction
    call    f
    addl    $1, %eax
    leave
    ret

Week 2 Assignment

When I used to explain recursion to others, I would say: treat the function as a single unit; each time it calls itself, it is pushed onto a stack, so in the end you have a stack holding many function bodies, and as the recursion unwinds they are popped one by one until the stack is empty. Through this week's study I now have a much clearer picture of the context switching and register changes during a function call; a function body is no longer a black box to me.

Later I studied operating systems from the dinosaur book. It's very thick, but there is so much to operating systems that even a book that size doesn't say much about process scheduling, and I never wrote code to simulate it myself. A while ago, doing LeetCode, I ran into the LRU Cache problem, which is a simplified version of an OS page-replacement algorithm. I've gradually come to feel that the kernel is full of elegant algorithms and data structures, and that they are worth studying properly.

This week was mainly about inline assembly and round-robin process scheduling in the OS: each process takes its turn, and unlike several other scheduling policies, no process can starve.

mypcb.h

#define MAX_TASK_NUM        4
#define KERNEL_STACK_SIZE   1024*8

/* CPU-specific state of this task */
struct Thread {
    //eip
    unsigned long       ip;
    //esp
    unsigned long       sp;
};


// PCB: process control block
typedef struct PCB{
    // uniquely identifies a process
    int pid;
    // process state: -1 unrunnable, 0 runnable, >0 stopped
    volatile long state;
    // the process's kernel stack; it grows from high addresses toward low
    char stack[KERNEL_STACK_SIZE];
    /* CPU-specific state of this task */
    struct Thread thread;
    // entry point
    unsigned long   task_entry;
    // next pointer
    struct PCB *next;
}tPCB;

void my_schedule(void);

mymain.c

#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/tty.h>
#include <linux/vmalloc.h>


#include "mypcb.h"

// an array of MAX_TASK_NUM PCBs, one per process
tPCB task[MAX_TASK_NUM];
// pointer to the current process, initialized to NULL
tPCB * my_current_task = NULL;
volatile int my_need_sched = 0;

void my_process(void);


void __init my_start_kernel(void)
{
    int pid = 0;
    int i;
    // initialize process 0, setting each field
    task[pid].pid = pid;
    task[pid].state = 0;
    // what the process will execute
    task[pid].task_entry = task[pid].thread.ip = (unsigned long)my_process;
    // top of the stack (highest address)
    task[pid].thread.sp = (unsigned long)&task[pid].stack[KERNEL_STACK_SIZE-1];
    // only one process so far, so it points at itself
    task[pid].next = &task[pid];
    // clone the remaining processes from process 0 in a loop
    for(i=1;i<MAX_TASK_NUM;i++)
    {
        memcpy(&task[i],&task[0],sizeof(tPCB));
        // pids are unique; assign them incrementally
        task[i].pid = i;
        task[i].state = -1;
        task[i].thread.sp = (unsigned long)&task[i].stack[KERNEL_STACK_SIZE-1];
        // link the new PCB into the circular list
        task[i].next = task[i-1].next;
        task[i-1].next = &task[i];
    }
    /* start process 0 by task[0] */
    pid = 0;
    my_current_task = &task[pid];
    // in the inline assembly below, %0 is the first operand, %1 the second
    asm volatile(
        // point the CPU's esp at the base of the process stack
        "movl %1,%%esp\n\t"     /* set task[pid].thread.sp to esp */
        // push the stack base; it will serve as the saved ebp
        "pushl %1\n\t"          /* push ebp */
        // push the entry point task[pid].thread.ip
        "pushl %0\n\t"          /* push task[pid].thread.ip */
        "ret\n\t"
        // the ret above pops thread.ip into eip: process 0 starts running
        /* pop task[pid].thread.ip to eip */
        "popl %%ebp\n\t"
        : 
        : "c" (task[pid].thread.ip),"d" (task[pid].thread.sp)   /* input c or d mean %ecx/%edx*/
    );
}   
void my_process(void)
{
    int i = 0;
    while(1)
    {
        i++;
        if(i%10000000 == 0)
        {
            printk(KERN_NOTICE "this is process %d -\n",my_current_task->pid);
            // schedule voluntarily
            if(my_need_sched == 1)
            {
                my_need_sched = 0;
                my_schedule();
            }
            printk(KERN_NOTICE "this is process %d +\n",my_current_task->pid);
        }     
    }
}

myinterrupt.c

#include <linux/types.h>
#include <linux/string.h>
#include <linux/ctype.h>
#include <linux/tty.h>
#include <linux/vmalloc.h>

#include "mypcb.h"

extern tPCB task[MAX_TASK_NUM];
extern tPCB * my_current_task;
extern volatile int my_need_sched;
volatile int time_count = 0;

/*
 * Called by timer interrupt.
 * it runs in the name of current running process,
 * so it use kernel stack of current running process
 */
void my_timer_handler(void)
{
#if 1
    if(time_count%1000 == 0 && my_need_sched != 1)
    {
        printk(KERN_NOTICE ">>>my_timer_handler here<<<\n");
        my_need_sched = 1;
    } 
    time_count ++ ;  
#endif
    return;     
}

void my_schedule(void)
{
    tPCB * next;
    // the current process
    tPCB * prev;

    if(my_current_task == NULL 
        || my_current_task->next == NULL)
    {
        return;
    }
    printk(KERN_NOTICE ">>>my_schedule<<<\n");
    /* schedule */
    next = my_current_task->next;
    prev = my_current_task;
    // use a different switch sequence depending on the next process's state
    // if the next process has run before
    if(next->state == 0)/* -1 unrunnable, 0 runnable, >0 stopped */
    {
        /* switch to next process */
        asm volatile(   
                // save the current process's ebp
            "pushl %%ebp\n\t"       /* save ebp */
            "movl %%esp,%0\n\t"     /* save esp */
                // switch esp over to the next process's stack
            "movl %2,%%esp\n\t"     /* restore  esp */
            "movl $1f,%1\n\t"       /* save eip */  
            "pushl %3\n\t" 
            "ret\n\t"               /* restore  eip */
            "1:\t"                  /* next process start here */
            "popl %%ebp\n\t"
            : "=m" (prev->thread.sp),"=m" (prev->thread.ip)
            : "m" (next->thread.sp),"m" (next->thread.ip)
        ); 
        my_current_task = next; 
        printk(KERN_NOTICE ">>>switch %d to %d<<<\n",prev->pid,next->pid);      
    }
    else
    {
        next->state = 0;
        my_current_task = next;
        printk(KERN_NOTICE ">>>switch %d to %d<<<\n",prev->pid,next->pid);
        /* switch to new process */
        asm volatile(   
            "pushl %%ebp\n\t"       /* save ebp */
            "movl %%esp,%0\n\t"     /* save esp */
            "movl %2,%%esp\n\t"     /* restore  esp */
            "movl %2,%%ebp\n\t"     /* restore  ebp */
            "movl $1f,%1\n\t"       /* save eip */  
            "pushl %3\n\t" 
            "ret\n\t"               /* restore  eip */
            : "=m" (prev->thread.sp),"=m" (prev->thread.ip)
            : "m" (next->thread.sp),"m" (next->thread.ip)
        );          
    }   
    return; 
}

The most important parts are those few assembly snippets. I hadn't learned assembly before; I've already picked up Wang Shuang's Assembly Language book and started reading it ~

There is also a good diagram at http://blog.csdn.net/zs634134578/article/details/9205287, copied here:
20130629235215718.jpeg

Screenshot from the Shiyanlou lab environment:
shiyanlou.png

Week 3 Assignment

An overview of the computer boot process

The first action of an x86 CPU at startup is to set CS:EIP = FFFF:0000H (physical address 000FFFF0H, since the 16-bit CPU has 20 address lines), which is where the BIOS code lives.
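
The real-mode translation behind that number is physical = segment * 16 + offset; a quick check:

#include <stdio.h>

int main(void)
{
    /* real-mode address translation: physical = (segment << 4) + offset */
    unsigned int seg = 0xFFFF, off = 0x0000;
    printf("%08X\n", (seg << 4) + off);   /* prints 000FFFF0 */
    return 0;
}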

After the BIOS routines have checked the hardware and performed the corresponding initialization, they look for a bootable medium, load its boot code into a designated memory area, and hand over control. Typically this means loading the MBR (the disk's first sector) and the active partition's boot program into memory, i.e. loading the BootLoader; once loading completes, control passes to the BootLoader.

The BootLoader is then responsible for setting up and starting the operating system. Starting the OS usually involves specifying the partition and directory of the kernel, the initrd, and root, e.g. root (hd0,0), kernel (hd0,0)/bzImage root=/dev/ram init=/bin/ash, initrd (hd0,0)/myinitrd4M.img.
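
Laid out as a GRUB legacy menu entry, those settings would read something like the following (the title line is a placeholder of mine):

title  my linux
root   (hd0,0)
kernel (hd0,0)/bzImage root=/dev/ram init=/bin/ash
initrd (hd0,0)/myinitrd4M.img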

Kernel startup divides at start_kernel: everything before it is initialization done in assembly; after it, OS initialization continues in C code, and it ends by executing the first user-mode process, init.

Booting generally happens in two stages: first the initrd in-memory file system is used, then the system switches to the on-disk file system to carry on. The initrd file has two main jobs: 1. providing driver modules that are needed at boot but are not built into the kernel image (vmlinuz); 2. mounting the root file system on disk and executing its /sbin/init program to keep the boot process going.

Kernel initialization starts at the start_kernel function and ends with the first user process, init, calling a long series of initialization functions for all the kernel components. The four functions start_kernel, rest_init, kernel_init and init_post form the main line of the whole initialization process.

Process 1, also known as the init process, is the ancestor of all user processes.

The init process is created when start_kernel calls rest_init. Its PID is 1 and it is an ordinary user-mode process; it is the joint between the kernel-mode and user-mode phases of system initialization. Everything before init runs is kernel-mode initialization, and the final act of that phase is to execute the /sbin/init executable.

Experiment screenshots:
QQ20150321-1@2x.png
QQ20150321-3@2x.png

Week 4 Assignment

This week was mainly about how to invoke system calls from code.

System calls are a set of interfaces the operating system provides so that user-mode processes can interact with the hardware; they spare programmers from dealing with low-level code and hardware directly, and make programs more portable.

When a user-mode process invokes a system call, the CPU switches to kernel mode and starts executing a kernel function. On 32-bit x86 Linux, this is triggered by executing int $0x80.

The table at http://codelab.shiyanlou.com/xref/linux-3.18.6/arch/x86/syscalls/syscall_32.tbl lists the system calls and their related information. I chose getpid() for the experiment; it returns the pid of the currently running process as an integer.

First, via the library function, which couldn't be simpler:

#include <stdio.h>
#include <unistd.h>

int main()
{
    printf("Process id: %d\n", getpid());
    return 0;
}

Then by hand, with int $0x80:

#include <stdio.h>
#include <unistd.h>

int main()
{
    int pid;
    asm volatile(
        "mov $0, %%ebx\n\t"
        "mov $20, %%eax\n\t"
        "int $0x80\n\t"
        "mov %%eax, %0\n\t"
        : "=r"(pid)
        :
        : "eax", "ebx"
     );

    printf("Process id: %d\n", pid);
    return 0;
}

Walking through it: mov $0, %%ebx sets ebx to 0; mov $20, %%eax loads getpid's system call number, 20, into eax; int $0x80 traps into the kernel; and mov %%eax, %0 copies the return value, which the kernel leaves in eax, into pid.

QQ20150326-1@2x.png

P.S. This screenshot looks really blurry on a retina display; Ubuntu's HiDPI support isn't great either ~
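
Besides the library wrapper and the raw int $0x80, glibc also offers the generic syscall(2) wrapper, which issues a system call by number; a minimal sketch:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
    /* syscall() traps into the kernel with the given call number */
    long pid = syscall(SYS_getpid);
    printf("Process id: %ld\n", pid);
    return 0;
}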

The format of inline assembly:

__asm__(
        assembly template,
        output operands,
        input operands,
        clobber list
        );

Inside inline assembly, %0, %1 and so on refer to the operands below in order, for example:

#include <stdio.h>

int main()
{
    int val1 = 0;
    int val2 = 3;
    int val3 = 9;

    asm volatile(
    // zero out eax
    "movl $0, %%eax\n\t"
    // %1 is val1
    "addl %1, %%eax\n\t"
    // %2 is val2
    "addl %2, %%eax\n\t"
    // %0 is val3: store the sum there
    "movl %%eax, %0\n\t"
    // "m" means a memory operand; "=" means write-only
    :"=m"(val3)
    // constraint letters pick registers: c -> ecx, d -> edx, and so on
    :"c"(val1), "d"(val2)
    // eax is modified by the template, so declare it clobbered
    :"eax"
    );
    printf("%d %d %d\n", val1, val2, val3);
    return 0;
}

Week 5 Assignment

This write-up is pretty thin; there is still a lot here that I fundamentally don't understand yet.

First, add the getpid code from last time to test.c, then make rootfs and run it under qemu.
QQ20150404-1@2x.png

QQ20150404-3@2x.png

But getpid seems to drop straight into the system call, and gdb cannot trace into it:
QQ20150404-5@2x.png

In glibc, getpid is declared as extern __pid_t __getpid (void);

Then, in getpid.c, the code looks like this:

pid_t
__getpid (void)
{
#ifdef NOT_IN_libc
  INTERNAL_SYSCALL_DECL (err);
  pid_t result = INTERNAL_SYSCALL (getpid, err, 0);
#else
  pid_t result = THREAD_GETMEM (THREAD_SELF, pid);
  if (__builtin_expect (result <= 0, 0))
    result = really_getpid (result);
#endif
  return result;
}

arch/x86/kernel/entry_32.S contains the implementation of system_call; my analysis is written in the comments:

ENTRY(system_call)
    RING0_INT_FRAME         # can't unwind into user space anyway
    ASM_CLAC
        # eax holds the system call number
    pushl_cfi %eax          # save orig_eax
        # SAVE_ALL saves the context, pushing the registers onto the stack
    SAVE_ALL
        # get the address of the current process descriptor's thread_info and keep it in ebp
    GET_THREAD_INFO(%ebp)
                    # system call tracing in operation / emulation
    testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
    jnz syscall_trace_entry
        # check that the system call number is in range
    cmpl $(NR_syscalls), %eax
        # if it is not, jump to the error handler
    jae syscall_badsys
syscall_call:
        # call the specific handler; its address is at sys_call_table + 4 * eax
    call *sys_call_table(,%eax,4)
syscall_after_call:
        # move the handler's return value, left in %eax, into its slot on the kernel stack
    movl %eax,PT_EAX(%esp)      # store the return value
syscall_exit:
    LOCKDEP_SYS_EXIT
        # disable interrupts
    DISABLE_INTERRUPTS(CLBR_ANY)    # make sure we don't miss an interrupt
                    # setting need_resched or sigpending
                    # between sampling and the iret
    TRACE_IRQS_OFF
        # get ready to return to where the system call was made
    movl TI_flags(%ebp), %ecx
    testl $_TIF_ALLWORK_MASK, %ecx  # current->work
    jne syscall_exit_work

restore_all:
    TRACE_IRQS_IRET
restore_all_notrace:
#ifdef CONFIG_X86_ESPFIX32
    movl PT_EFLAGS(%esp), %eax  # mix EFLAGS, SS and CS
    # Warning: PT_OLDSS(%esp) contains the wrong/random values if we
    # are returning to the kernel.
    # See comments in process.c:copy_thread() for details.
    movb PT_OLDSS(%esp), %ah
    movb PT_CS(%esp), %al
    andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax
    cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax
    CFI_REMEMBER_STATE
    je ldt_ss           # returning to user-space with LDT SS
#endif
restore_nocheck:
    RESTORE_REGS 4          # skip orig_eax/error_code
irq_return:
    INTERRUPT_RETURN
.section .fixup,"ax"
ENTRY(iret_exc)
    pushl $0            # no error code
    pushl $do_iret_error
    jmp error_code
.previous
    _ASM_EXTABLE(irq_return,iret_exc)

#ifdef CONFIG_X86_ESPFIX32
    CFI_RESTORE_STATE
ldt_ss:
#ifdef CONFIG_PARAVIRT
    /*
     * The kernel can't run on a non-flat stack if paravirt mode
     * is active.  Rather than try to fixup the high bits of
     * ESP, bypass this code entirely.  This may break DOSemu
     * and/or Wine support in a paravirt VM, although the option
     * is still available to implement the setting of the high
     * 16-bits in the INTERRUPT_RETURN paravirt-op.
     */
    cmpl $0, pv_info+PARAVIRT_enabled
    jne restore_nocheck
#endif

/*
 * Setup and switch to ESPFIX stack
 *
 * We're returning to userspace with a 16 bit stack. The CPU will not
 * restore the high word of ESP for us on executing iret... This is an
 * "official" bug of all the x86-compatible CPUs, which we can work
 * around to make dosemu and wine happy. We do this by preloading the
 * high word of ESP with the high word of the userspace ESP while
 * compensating for the offset by changing to the ESPFIX segment with
 * a base address that matches for the difference.
 */
#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
    mov %esp, %edx          /* load kernel esp */
    mov PT_OLDESP(%esp), %eax   /* load userspace esp */
    mov %dx, %ax            /* eax: new kernel esp */
    sub %eax, %edx          /* offset (low word is 0) */
    shr $16, %edx
    mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
    mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
    pushl_cfi $__ESPFIX_SS
    pushl_cfi %eax          /* new kernel esp */
    /* Disable interrupts, but do not irqtrace this section: we
     * will soon execute iret and the tracer was already set to
     * the irqstate after the iret */
    DISABLE_INTERRUPTS(CLBR_EAX)
    lss (%esp), %esp        /* switch to espfix segment */
    CFI_ADJUST_CFA_OFFSET -8
    jmp restore_nocheck
#endif
    CFI_ENDPROC
ENDPROC(system_call)