5.内核态处理流程
5.1 软中断处理
上节说到,产生软中断后CPU会进入内核态,并跳转到中断向量处执行。从下面的代码可以看到,内核通过指令ldrcc pc, [tbl, scno, lsl #2]执行系统调用,并通过ret_fast_syscall返回。
Code: Select all
ENTRY(vector_swi)
//执行系统调用前先保存用户态18个寄存器,PT_REGS_SIZE = 72,即sizeof(struct pt_regs),分别是r0-r15、cpsr、ORIG_r0
sub sp, sp, #PT_REGS_SIZE
stmia sp, {r0 - r12} @ Calling r0 - r12
ARM( add r8, sp, #S_PC )
ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr
THUMB( mov r8, sp )
THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr
mrs r8, spsr @ called from non-FIQ mode, so ok.
str lr, [sp, #S_PC] @ Save calling PC
//r8中是SPSR,即进入内核态之前用户态的CPSR;先保存到栈上,返回用户态时再据此恢复
str r8, [sp, #S_PSR] @ Save CPSR
str r0, [sp, #S_OLD_R0] @ Save OLD_R0
zero_fp
alignment_trap r10, ip, __cr_alignment
enable_irq
ct_user_exit
get_thread_info tsk
/*
* Get the system call number.
*/
#if defined(CONFIG_OABI_COMPAT)
...
#elif defined(CONFIG_AEABI)
/*
* Pure EABI user space always put syscall number into scno (r7).
*/
#elif defined(CONFIG_ARM_THUMB)
/* Legacy ABI only, possibly thumb mode. */
tst r8, #PSR_T_BIT @ this is SPSR from save_user_regs
addne scno, r7, #__NR_SYSCALL_BASE @ put OS number in
USER( ldreq scno, [lr, #-4] )
#else
…
#endif
uaccess_disable tbl
//加载系统调用表基地址
adr tbl, sys_call_table @ load syscall table pointer
#if defined(CONFIG_OABI_COMPAT)
/*
* If the swi argument is zero, this is an EABI call and we do nothing.
*
* If this is an old ABI call, get the syscall number into scno and
* get the old ABI syscall table address.
*/
…
#elif !defined(CONFIG_AEABI)
bic scno, scno, #0xff000000 @ mask off SWI op-code
eor scno, scno, #__NR_SYSCALL_BASE @ check OS number
#endif
local_restart:
ldr r10, [tsk, #TI_FLAGS] @ check for syscall tracing
stmdb sp!, {r4, r5} @ push fifth and sixth args
tst r10, #_TIF_SYSCALL_WORK @ are we tracing syscalls?
bne __sys_trace
cmp scno, #NR_syscalls @ check upper syscall limit
//将返回地址lr设置为__ret_fast_syscall,系统调用函数执行完后返回到此处
badr lr, __ret_fast_syscall @ return address
//通过系统调用表基地址tbl加上系统调用号scno对应的偏移(scno << 2),跳转执行系统调用函数
ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine
add r1, sp, #S_OFF
2: cmp scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
eor r0, scno, #__NR_SYSCALL_BASE @ put OS number back
bcs arm_syscall
mov why, #0 @ no longer a real syscall
b sys_ni_syscall @ not private func
ENDPROC(vector_swi)
5.2 系统调用返回
上一小节看到,系统调用执行完成返回到__ret_fast_syscall:
Code: Select all
ret_fast_syscall:
__ret_fast_syscall:
UNWIND(.fnstart )
UNWIND(.cantunwind )
disable_irq_notrace @ disable interrupts
ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing
tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
bne fast_work_pending
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr
restore_user_regs fast = 1, offset = S_OFF
UNWIND(.fnend )
ENDPROC(ret_fast_syscall)
fast_work_pending:
str r0, [sp, #S_R0+S_OFF]! @ returned r0
/* fall through to work_pending */
slow_work_pending:
mov r0, sp @ 'regs'
mov r2, why @ 'syscall'
bl do_work_pending //见下
cmp r0, #0
beq no_work_pending
movlt scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
ldmia sp, {r0 - r6} @ have to reload r0 - r6
b local_restart @ ... and off we go
no_work_pending:
asm_trace_hardirqs_on save = 0
/* perform architecture specific actions before user return */
arch_ret_to_user r1, lr //恢复用户态的寄存器
ct_user_enter save = 0
restore_user_regs fast = 0, offset = 0
在返回用户态前,do_work_pending主要检查是否需要重新调度,以及是否有待处理(pending)的信号需要处理。
Code: Select all
asmlinkage int
do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
{
/*
* The assembly code enters us with IRQs off, but it hasn't
* informed the tracing code of that for efficiency reasons.
* Update the trace code with the current status.
*/
trace_hardirqs_off();
do {
//检查是否需要重新调度
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
} else {
if (unlikely(!user_mode(regs)))
return 0;
local_irq_enable();
//有未处理的信号
if (thread_flags & _TIF_SIGPENDING) {
int restart = do_signal(regs, syscall);
if (unlikely(restart)) {
/*
* Restart without handlers.
* Deal with it without leaving
* the kernel space.
*/
return restart;
}
syscall = 0;
}
…
}
local_irq_disable();
thread_flags = current_thread_info()->flags;
} while (thread_flags & _TIF_WORK_MASK);
return 0;
}
5.3 系统调用处理
系统调用write实际调用的是sys_write,在内核代码中无法直接搜到,因为它是通过宏定义拼接的,跟踪宏展开中name字段就可以看到最终是sys_write函数,在内核编译生成的System.map中也可以搜到sys_write符号:
Code: Select all
#define __NR_write 64
__SYSCALL(__NR_write, sys_write)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(SyS##name)))); \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
sys_write函数的具体实现如下:
Code: Select all
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if(f.file) {
loff_t pos = file_pos_read(f.file);
ret = vfs_write(f.file, buf, count, &pos);
if(ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
}
return ret;
}
vfs_write函数调用如下:
Code: Select all
vfs_write
__vfs_write
file->f_op->write(file, p, count, pos);
//这里实际执行的函数是redirected_tty_write
5.4 stdout重定向到console
查看程序的fd,可以看到fd 0、1和2都是重定向到/dev/console。
Code: Select all
# 679为程序pid
ls /proc/679/fd
lrwx------ 1 64 2 -> /dev/console
lrwx------ 1 64 1 -> /dev/console
lrwx------ 1 64 0 -> /dev/console
内核启动时创建init进程(pid=1):
Code: Select all
start_kernel
rest_init
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
kernel_thread(kernel_init, NULL, CLONE_FS);
init进程打开/dev/console作为标准输入输出。
Code: Select all
kernel_init
kernel_init_freeable
/* Open the /dev/console on the rootfs, this should never fail */
if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) //stdin, fd = 0
pr_err("Warning: unable to open an initial console.\n");
(void) sys_dup(0); //stdout, fd = 1;
(void) sys_dup(0); //stderr, fd = 2;
Linux的用户态进程都是init进程的后代,默认会继承fd 0、1和2。因此,打印都被重定向到/dev/console上,执行write系统调用,实际执行的就是console的file_operations中的write函数。
在内核启动日志中,可以看到如下打印:console [ttyS0] enabled。
5.5 tty及sstar uart驱动
tty驱动初始化流程如下,创建字符设备并注册到/dev/console:
Code: Select all
__initcall_chr_dev_init5
chr_dev_init
tty_init
tty_init
cdev_init(&console_cdev, &console_fops);
if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) ||
register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0)
panic("Couldn't register /dev/console driver\n");
struct file_operations console_fops结构体如下:
Code: Select all
static const struct file_operations console_fops = {
.llseek = no_llseek,
.read = tty_read,
.write = redirected_tty_write,
.poll = tty_poll,
.unlocked_ioctl = tty_ioctl,
.compat_ioctl = tty_compat_ioctl,
.open = tty_open,
.release = tty_release,
.fasync = tty_fasync,
};
write系统调用最终会调用到redirected_tty_write:
Code: Select all
redirected_tty_write
tty_write
do_tty_write(ld->ops->write, tty, file, buf, count)
ld->ops->write();
//tty_register_ldisc 中通过tty_register_ldisc(N_TTY, &n_tty_ops);注册
//(见下文注释1),因此此处write函数实际为n_tty_write。
n_tty_write
c = tty->ops->write(tty, b, nr);
//uart_register_driver中通过tty_set_operations(normal, &uart_ops); 注册
//(见下文注释2),write回调函数为uart_write
uart_write
//将数据送到xmit环形缓冲区(队列大小:PAGE_SIZE)中,若环形队列满
//则不再拷贝
__uart_start
port->ops->start_tx(port);
//sstar平台在_ms_uart_console_prepare中通过
//console_port.port.ops=&ms_uart_ops; 注册
//此处调用的是ms_uart_start_tx
ms_uart_start_tx
//将xmit环形队列的数据拷贝到驱动的dma的tx_buf中
URDMA_StartTx
至此,write系统调用返回。