一文弄懂printf函数从用户态到内核态的执行流程
我们经常使用C库的printf函数,花时间整理一下从用户态到内核态的整个流程,涉及libc、系统调用、tty驱动、console等多个方面。
目录
1.简介
我们经常使用C库的printf函数,花时间整理一下从用户态到内核态的整个流程,涉及libc、系统调用、tty驱动、console等多个方面(其中,跟踪的驱动部分代码是sigmastar平台的,具体实现视用户实际使用的平台而定)。文章略长,请耐心阅读哈~
由于作者水平有限,如有纰漏,请帮忙指正,谢谢~
2.示例代码
使用最简单的代码作为示例。
#include <stdlib.h>
#include <stdio.h>
/* Minimal demo program: print one line through the C library's printf and exit with status 0. */
int main(void)
{
    printf("hello world!\n");
    return 0;
}
3.程序执行初探
gcc编译上述程序后,使用strace命令可以跟踪程序的系统调用流程。可以看到,程序执行需要依赖C库。整个执行流程大致如下:hello_world可执行程序通过execve加载到内存后,libc.so等动态库通过mmap加载到内存映射区,最终通过write系统调用将“hello world!”输出到屏幕,程序执行完成退出。
$ strace ./hello_world
execve("./hello_world", ["./hello_world"], [/* 44 vars */]) = 0
brk(0) = 0x12bd000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f48bfadc000
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=70625, ...}) = 0
mmap(NULL, 70625, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7f48bfaca000
close(3) = 0
open("/lib64/libc.so.6", O_RDONLY) = 3
read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0000\356\1\0\0\0\0\0"..., 832) = 832
fstat(3, {st_mode=S_IFREG|0755, st_size=1924768, ...}) = 0
mmap(NULL, 3750184, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7f48bf529000
mprotect(0x7f48bf6b4000, 2093056, PROT_NONE) = 0
mmap(0x7f48bf8b3000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x18a000) = 0x7f48bf8b3000
mmap(0x7f48bf8b9000, 14632, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7f48bf8b9000
close(3) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f48bfac9000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f48bfac8000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f48bfac7000
arch_prctl(ARCH_SET_FS, 0x7f48bfac8700) = 0
mprotect(0x7f48bf8b3000, 16384, PROT_READ) = 0
mprotect(0x7f48bfadd000, 4096, PROT_READ) = 0
munmap(0x7f48bfaca000, 70625) = 0
fstat(1, {st_mode=S_IFCHR|0620, st_rdev=makedev(136, 2), ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7f48bfadb000
write(1, "hello world!\n", 13hello world!
) = 13
exit_group(0) = ?
+++ exited with 0 +++
4.用户态处理流程
printf的实现是在C库,通过stdout打印。
/*
 * printf: format the variadic arguments according to `format` and send the
 * result to stdout by delegating to vfprintf.
 *
 * Returns the value produced by vfprintf.
 */
int printf(const char * __restrict format, ...)
{
    va_list ap;
    int written;

    va_start(ap, format);
    written = vfprintf(stdout, format, ap);
    va_end(ap);

    return written;
}
vfprintf函数主要是处理和校验打印格式,调用关系如下:
vfprintf:
PUTC-> putc_unlocked-> __PUTC_UNLOCKED->…-> __PUTC_UNLOCKED_MACRO:
__fputc_unlocked
__fputc_unlocked函数部分实现如下:
/* Excerpt from __fputc_unlocked: store one character `c` into `stream`. */
/* The stream has a buffer (its buffer size is non-zero). */
if (__STDIO_STREAM_BUFFER_SIZE(stream)) {
    /* Append the character to the stream buffer. */
    __STDIO_STREAM_BUFFER_ADD(stream, ((unsigned char) c));
    if (__STDIO_STREAM_IS_LBF(stream)) {
        /*
         * LBF stream (presumably "line buffered"): a '\n' triggers an
         * immediate commit of the buffer.
         */
        if ((((unsigned char) c) == '\n')
            && __STDIO_COMMIT_WRITE_BUFFER(stream)) {
            /* Commit failed! */
            __STDIO_STREAM_BUFFER_UNADD(stream); /* Undo the write! */
            goto BAD;
        }
    }
} else {
    /* Buffer size is zero (no buffer): write the single character out directly. */
    unsigned char uc = (unsigned char) c;
    if (! __stdio_WRITE(stream, &uc, 1)) {
        goto BAD;
    }
}
上文的__STDIO_COMMIT_WRITE_BUFFER 和__stdio_WRITE最终都会调用到write系统调用陷入到内核态继续执行。
static inline ssize_t __WRITE(FILE stream, const char buf, size_t bufsize)
{
__STDIO_STREAM_CUSTOM_WRITE_FUNC(stream, buf, bufsize);
return write(stream->__filedes, buf, bufsize);
}
这里的write调用实际是__libc_write,各种宏定义展开如下:
/*
 * Definition of __libc_write: PSEUDO emits the entry point and the system
 * call sequence for "write" (declared with 3 arguments), followed by `ret`;
 * PSEUDO_END closes the function.
 */
PSEUDO (__libc_write, write, 3)
ret
PSEUDO_END (__libc_write)
/*
 * PSEUDO(name, syscall_name, args): switch to the .text section, open the
 * function with ENTRY(name), issue the system call through DO_CALL, then
 * `cmn r0, $4096` compares r0 against -4096 to set the condition flags
 * (presumably so the code following the macro can detect an error return
 * value -- TODO confirm).
 */
#define PSEUDO(name, syscall_name, args) \
.text; \
ENTRY (name); \
DO_CALL (syscall_name, args); \
cmn r0, $4096;
#undef DO_CALL
#if defined(__ARM_EABI__)
/*
 * EABI variant of the system call sequence:
 *   - r7 is saved in ip, then loaded with the system call number;
 *   - `swi 0x0` raises the software interrupt that traps into the kernel;
 *   - r7 is restored from ip afterwards.
 * The per-instruction notes live in this header comment because nothing may
 * follow a line-continuation backslash inside a macro definition.
 */
#define DO_CALL(syscall_name, args) \
    DOARGS_##args \
    mov ip, r7; \
    ldr r7, =SYS_ify (syscall_name); \
    swi 0x0; \
    mov r7, ip; \
    UNDOARGS_##args
#else
/* Non-EABI variant: the system call number is encoded in the swi operand. */
#define DO_CALL(syscall_name, args) \
    DOARGS_##args \
    swi SYS_ify (syscall_name); \
    UNDOARGS_##args
#endif
/* Map a system call name to its number macro, e.g. write -> (__NR_write). */
#define SYS_ify(syscall_name) (__NR_##syscall_name)
// Defined in the kernel tree at src\arch\arm\include\uapi\asm\unistd.h:
// Thumb and EABI builds use a system call base of 0; every other build uses
// the OABI base.
#if defined(__thumb__) || defined(__ARM_EABI__)
#define __NR_SYSCALL_BASE 0
#else
#define __NR_SYSCALL_BASE __NR_OABI_SYSCALL_BASE
#endif
// write is system call number 4 relative to the base.
#define __NR_write (__NR_SYSCALL_BASE+ 4)
调用时先处理参数,接着通过r7记录系统调用号(我使用的内核支持__ARM_EABI__(#define CONFIG_AEABI 1),write的系统调用号为4),执行swi 0x0从用户态陷入到内核态。
至此,用户态流程处理完成。
5.内核态处理流程
5.1. 软中断处理
上节说到产生软中断后,内核态会跳转到中断向量处执行。可以看到,通过指令ldrcc pc, [tbl, scno, lsl #2]执行系统调用,通过ret_fast_syscall来返回。
ENTRY(vector_swi)
/*
 * SWI (system call) entry point.
 *
 * Before dispatching the system call, save the user-mode context into a
 * register frame on the stack: 18 words (PT_REGS_SIZE = 72 =
 * sizeof(struct pt_regs)) holding r0-r15, the saved program status register
 * (S_PSR) and the original r0 (S_OLD_R0), as stored below.
 */
	sub	sp, sp, #PT_REGS_SIZE
	stmia	sp, {r0 - r12}			@ Calling r0 - r12
 ARM(	add	r8, sp, #S_PC		)
 ARM(	stmdb	r8, {sp, lr}^		)	@ Calling sp, lr
 THUMB(	mov	r8, sp			)
 THUMB(	store_user_sp_lr r8, r10, S_SP	)	@ calling sp, lr
	mrs	r8, spsr			@ called from non-FIQ mode, so ok.
	str	lr, [sp, #S_PC]			@ Save calling PC
/*
 * r8 holds the SPSR read above, i.e. the caller's CPSR at the time of the
 * swi. It is stored in the frame (presumably to be restored on the way back
 * to user mode).
 */
	str	r8, [sp, #S_PSR]		@ Save CPSR
	str	r0, [sp, #S_OLD_R0]		@ Save OLD_R0
	zero_fp
	alignment_trap r10, ip, __cr_alignment
	enable_irq
	ct_user_exit
	get_thread_info tsk
	/*
	 * Get the system call number.
	 */
#if defined(CONFIG_OABI_COMPAT)
...
#elif defined(CONFIG_AEABI)
	/*
	 * Pure EABI user space always put syscall number into scno (r7).
	 */
#elif defined(CONFIG_ARM_THUMB)
	/* Legacy ABI only, possibly thumb mode. */
	tst	r8, #PSR_T_BIT			@ this is SPSR from save_user_regs
	addne	scno, r7, #__NR_SYSCALL_BASE	@ put OS number in
 USER(	ldreq	scno, [lr, #-4]		)
#else
…
#endif
	uaccess_disable tbl
	/* Load the base address of the system call table. */
	adr	tbl, sys_call_table		@ load syscall table pointer
#if defined(CONFIG_OABI_COMPAT)
	/*
	 * If the swi argument is zero, this is an EABI call and we do nothing.
	 *
	 * If this is an old ABI call, get the syscall number into scno and
	 * get the old ABI syscall table address.
	 */
…
#elif !defined(CONFIG_AEABI)
	bic	scno, scno, #0xff000000		@ mask off SWI op-code
	eor	scno, scno, #__NR_SYSCALL_BASE	@ check OS number
#endif
local_restart:
	ldr	r10, [tsk, #TI_FLAGS]		@ check for syscall tracing
	stmdb	sp!, {r4, r5}			@ push fifth and sixth args
	tst	r10, #_TIF_SYSCALL_WORK		@ are we tracing syscalls?
	bne	__sys_trace
	cmp	scno, #NR_syscalls		@ check upper syscall limit
	/* lr is set to __ret_fast_syscall, so the sys_* routine returns there. */
	badr	lr, __ret_fast_syscall		@ return address
	/*
	 * Dispatch: load pc from the table entry at tbl + scno * 4, i.e. jump to
	 * the handler for system call number scno. The cc condition means this
	 * only happens when the cmp above found scno below NR_syscalls.
	 */
	ldrcc	pc, [tbl, scno, lsl #2]		@ call sys_* routine
	add	r1, sp, #S_OFF
2:	cmp	scno, #(__ARM_NR_BASE - __NR_SYSCALL_BASE)
	eor	r0, scno, #__NR_SYSCALL_BASE	@ put OS number back
	bcs	arm_syscall
	mov	why, #0				@ no longer a real syscall
	b	sys_ni_syscall			@ not private func
ENDPROC(vector_swi)
5.2 系统调用返回
上一小节看到,系统调用执行完成返回到__ret_fast_syscall:
/*
 * Fast return path taken when a sys_* routine returns (vector_swi sets lr to
 * __ret_fast_syscall). With interrupts disabled it re-reads the thread
 * flags; if any of _TIF_SYSCALL_WORK | _TIF_WORK_MASK is set it branches to
 * fast_work_pending, otherwise it goes straight back to user mode.
 */
ret_fast_syscall:
__ret_fast_syscall:
 UNWIND(.fnstart	)
 UNWIND(.cantunwind	)
	disable_irq_notrace			@ disable interrupts
	ldr	r1, [tsk, #TI_FLAGS]		@ re-check for syscall tracing
	tst	r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK
	bne	fast_work_pending
	/* perform architecture specific actions before user return */
	arch_ret_to_user r1, lr
	restore_user_regs fast = 1, offset = S_OFF
 UNWIND(.fnend		)
ENDPROC(ret_fast_syscall)
fast_work_pending:
	str	r0, [sp, #S_R0+S_OFF]!		@ returned r0
	/* fall through to work_pending */
slow_work_pending:
	mov	r0, sp				@ 'regs'
	mov	r2, why				@ 'syscall'
	/*
	 * Call the C helper do_work_pending(regs, thread_flags, syscall):
	 * r0 = regs, r1 = thread flags loaded above, r2 = why.
	 */
	bl	do_work_pending
	cmp	r0, #0
	beq	no_work_pending
	movlt	scno, #(__NR_restart_syscall - __NR_SYSCALL_BASE)
	ldmia	sp, {r0 - r6}			@ have to reload r0 - r6
	b	local_restart			@ ... and off we go
no_work_pending:
	asm_trace_hardirqs_on save = 0
	/* perform architecture specific actions before user return */
	arch_ret_to_user r1, lr
	ct_user_enter save = 0
	/* Presumably reloads the saved user-mode registers and returns to user mode. */
	restore_user_regs fast = 0, offset = 0
在返回用户态前,do_work_pending主要检查是否需要重新调度(_TIF_NEED_RESCHED),以及是否有待处理(pending)的信号(_TIF_SIGPENDING)。
/*
 * do_work_pending - handle pending work before returning to user mode.
 * @regs:         saved register frame of the interrupted context.
 * @thread_flags: thread-info flags as read by the assembly caller.
 * @syscall:      passed on to do_signal(); cleared to 0 once a pending
 *                signal has been processed without requesting a restart.
 *
 * Loops while any _TIF_WORK_MASK flag is set, re-reading the flags with
 * interrupts disabled at the end of every iteration.
 *
 * Returns 0 when no work is left (or when @regs is not a user-mode frame),
 * or the non-zero value returned by do_signal() when a restart is requested.
 * Part of the loop body is elided in this excerpt.
 */
asmlinkage int
do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall)
{
/*
* The assembly code enters us with IRQs off, but it hasn't
* informed the tracing code of that for efficiency reasons.
* Update the trace code with the current status.
*/
trace_hardirqs_off();
do {
// A reschedule has been requested: call the scheduler.
if (likely(thread_flags & _TIF_NEED_RESCHED)) {
schedule();
} else {
// Only a user-mode frame gets the handling below.
if (unlikely(!user_mode(regs)))
return 0;
local_irq_enable();
// A signal is pending and has not been handled yet.
if (thread_flags & _TIF_SIGPENDING) {
int restart = do_signal(regs, syscall);
if (unlikely(restart)) {
/*
* Restart without handlers.
* Deal with it without leaving
* the kernel space.
*/
return restart;
}
syscall = 0;
}
…
}
// Re-read the flags with interrupts off before testing the loop condition.
local_irq_disable();
thread_flags = current_thread_info()->flags;
} while (thread_flags & _TIF_WORK_MASK);
return 0;
}
5.3 系统调用处理
系统调用write实际调用的是sys_write,在内核代码中无法直接搜到,因为它是通过宏定义拼接的,跟踪宏展开中name字段就可以看到最终是sys_write函数,在内核编译生成的System.map也可以搜到sys_write符号:
define __NR_write 64
__SYSCALL(__NR_write, sys_write)
/*
 * SYSCALL_DEFINE3(name, ...): define a 3-argument system call. Forwards to
 * SYSCALL_DEFINEx with x = 3 and the name prefixed by an underscore, so
 * "write" becomes "_write".
 */
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
/* SYSCALL_DEFINEx: emit SYSCALL_METADATA, then the definitions from __SYSCALL_DEFINEx. */
#define SYSCALL_DEFINEx(x, sname, ...) \
SYSCALL_METADATA(sname, x, __VA_ARGS__) \
__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
/* __PROTECT: thin wrapper around asmlinkage_protect. */
#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)
/*
 * __SYSCALL_DEFINEx(x, name, ...): with name = _write this produces
 *   - sys_write, declared as an alias of SyS_write;
 *   - SyS_write, a wrapper that calls the static inline SYSC_write with the
 *     arguments passed through __SC_CAST and returns its result;
 *   - the header of SYSC_write, left open at the end of the macro so that
 *     the braces following the SYSCALL_DEFINEx() invocation become its body.
 */
#define __SYSCALL_DEFINEx(x, name, ...) \
asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__)) \
__attribute__((alias(__stringify(SyS##name)))); \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__)); \
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
sys_write函数的具体实现如下:
/*
 * write system call: look up the file behind descriptor `fd`, read its
 * current position, pass `buf`/`count` to vfs_write, and store the updated
 * position back when the write did not fail.
 *
 * Returns the value from vfs_write, or -EBADF when the descriptor has no
 * file attached.
 */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
		size_t, count)
{
	struct fd target = fdget_pos(fd);
	loff_t offset;
	ssize_t written;

	/* No file behind this descriptor: nothing to release, report the error. */
	if (!target.file)
		return -EBADF;

	offset = file_pos_read(target.file);
	written = vfs_write(target.file, buf, count, &offset);
	/* Only a non-negative result updates the file position. */
	if (written >= 0)
		file_pos_write(target.file, offset);
	fdput_pos(target);

	return written;
}
vfs_write函数调用如下:
vfs_write
__vfs_write
file->f_op->write(file, p, count, pos);
//这里的实际执行函数是redirected_tty_write
5.4 stdout重定向到console
查看程序的fd,可以看到fd 0、1和2都是重定向到/dev/console。
# 679为程序pid
ls /proc/679/fd
lrwx------ 1 64 2 -> /dev/console
lrwx------ 1 64 1 -> /dev/console
lrwx------ 1 64 0 -> /dev/console
内核启动时创建init进程(pid=1):
start_kernel
rest_init
/*
* We need to spawn init first so that it obtains pid 1, however
* the init task will end up wanting to create kthreads, which, if
* we schedule it before we create kthreadd, will OOPS.
*/
kernel_thread(kernel_init, NULL, CLONE_FS);
init进程打开/dev/console作为标准输入输出。
kernel_init
kernel_init_freeable
/ Open the /dev/console on the rootfs, this should never fail /
if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0) //stdin, fd = 0
pr_err("Warning: unable to open an initial console.\n");
(void) sys_dup(0); //stdout, fd = 1;
(void) sys_dup(0); //stdout fd = 2;
Linux的所有进程都是由init进程创建的,继承fd 0、1和2。因此,打印都被重定向到/dev/console上,执行系统调用write函数,实际就是执行的console的file_operations的write函数。
在内核启动日志中,可以看到在打印:console [ttyS0] enabled。
5.5 tty及sstar uart驱动
tty驱动初始化流程如下,创建字符设备并注册到/dev/console:
__initcall_chr_dev_init5
chr_dev_init
tty_init
tty_init
cdev_init(&console_cdev, &console_fops);
if (cdev_add(&console_cdev, MKDEV(TTYAUX_MAJOR, 1), 1) ||
register_chrdev_region(MKDEV(TTYAUX_MAJOR, 1), 1, "/dev/console") < 0)
panic("Couldn't register /dev/console driver\n");
struct file_operations console_fops结构体如下:
/*
 * File operations of the console character device (the table handed to
 * cdev_init() for console_cdev in tty_init). A write() on this device is
 * dispatched to redirected_tty_write.
 */
static const struct file_operations console_fops = {
.llseek = no_llseek,
.read = tty_read,
.write = redirected_tty_write, /* entry point of the write path traced in this article */
.poll = tty_poll,
.unlocked_ioctl = tty_ioctl,
.compat_ioctl = tty_compat_ioctl,
.open = tty_open,
.release = tty_release,
.fasync = tty_fasync,
};
write系统调用最终会调用到redirected_tty_write
redirected_tty_write
tty_write
do_tty_write(ld->ops->write, tty, file, buf, count)
ld->ops->write();
//tty_register_ldisc 中通过tty_register_ldisc(N_TTY, &n_tty_ops);注册
//(见下文注释1),因此此处write函数实际为n_tty_write。
n_tty_write
c = tty->ops->write(tty, b, nr);
//uart_register_driver中通过tty_set_operations(normal, &uart_ops); 注册
//(见下文注释2),write回调函数为uart_write
uart_write
//将数据送到xmit环形缓冲区(队列大小:PAGE_SIZE)中,若环形队列满
//则不再拷贝
__uart_start
port->ops->start_tx(port);
//sstar平台在_ms_uart_console_prepare中通过
//console_port.port.ops=&ms_uart_ops; 注册
//此处调用的是ms_uart_start_tx
ms_uart_start_tx
//将xmit环形队列的数据拷贝到驱动的dma的tx_buf中
URDMA_StartTx
至此,write系统调用返回。
注释1:
n_tty_ops注册流程如下:
start_kernel
console_init
n_tty_init
tty_register_ldisc(N_TTY, &n_tty_ops);
/*
 * Line discipline operations table named "n_tty"; it is the table passed to
 * tty_register_ldisc(N_TTY, &n_tty_ops) in n_tty_init.
 */
static struct tty_ldisc_ops n_tty_ops = {
.magic = TTY_LDISC_MAGIC,
.name = "n_tty",
.open = n_tty_open,
.close = n_tty_close,
.flush_buffer = n_tty_flush_buffer,
.read = n_tty_read,
.write = n_tty_write, /* the ld->ops->write callback of this line discipline */
.ioctl = n_tty_ioctl,
.set_termios = n_tty_set_termios,
.poll = n_tty_poll,
.receive_buf = n_tty_receive_buf,
.write_wakeup = n_tty_write_wakeup,
.receive_buf2 = n_tty_receive_buf2,
};
注释2:
我这里跟踪的是sigmastar平台的uart驱动。
内核启动时会调用uart驱动模块的init函数,即ms_uart_module_init。
ms_uart_module_init
uart_register_driver(&ms_uart_driver);
tty_set_operations(normal, &uart_ops);
platform_driver_register(&ms_uart_platform_driver);
/*
 * UART driver descriptor registered through
 * uart_register_driver(&ms_uart_driver) in ms_uart_module_init.
 */
static struct uart_driver ms_uart_driver = {
.owner = THIS_MODULE,
.driver_name = "ms_uart",
.dev_name = "ttyS", /* device name prefix, cf. the boot log line "console [ttyS0] enabled" */
.nr = 8, /* presumably the number of ports the driver supports -- confirm */
.cons = &ms_uart_console, /* console attached to this driver */
};
/*
 * Console descriptor referenced by ms_uart_driver.cons. Its write and setup
 * callbacks are ms_uart_console_write and ms_uart_console_setup; the match
 * callback is only provided when CONSOLE_DMA is enabled.
 */
static struct console ms_uart_console =
{
.name = MS_CONSOLE_DEV,
.write = ms_uart_console_write,
.setup = ms_uart_console_setup,
.flags = CON_PRINTBUFFER,
.device = uart_console_device,
.data = &ms_uart_driver, /* back-pointer to the owning uart_driver */
.index = -1,
#if CONSOLE_DMA
.match = ms_uart_console_match,
#endif
};
/*
 * tty operations table of the serial core; it is the table installed with
 * tty_set_operations(normal, &uart_ops) during uart_register_driver.
 * The remaining members are elided in this excerpt.
 */
static const struct tty_operations uart_ops = {
.open = uart_open,
.close = uart_close,
.write = uart_write, /* the tty->ops->write callback reached from n_tty_write */
.put_char = uart_put_char,
.flush_chars = uart_flush_chars,
.write_room = uart_write_room,
…
};
/*
 * Platform driver registered through
 * platform_driver_register(&ms_uart_platform_driver); ms_uart_probe is its
 * probe callback and ms_uart_remove its remove callback. The remaining
 * members are elided in this excerpt.
 */
static struct platform_driver ms_uart_platform_driver = {
.remove = ms_uart_remove,
.probe = ms_uart_probe,
…
};
在platform_driver_register中,会调用probe函数ms_uart_probe注册console ttyS0,调用关系如下:
ms_uart_module_init
platform_driver_register—> __platform_driver_register
driver_register
bus_add_driver
driver_attach
bus_for_each_dev
__driver_attach
driver_probe_device
really_probe
ret = dev->bus->probe(dev);
//这里实际调用的就是上面注册的
ms_uart_probe
uart_add_one_port
uart_configure_port
register_console
print console [ttyS0] enabled
5.6 sstar uart dma发送线程
需要关注一个内核线程urdma_tx_thread:内核启动初始化platform设备时会调用ms_uart_probe,并在其中创建该tx线程。
ms_uart_probe
//设置DMA的tx和rx缓冲区(页对齐)
mp->urdma->rx_urdma_size = PAGE_ALIGN(UR2DMA_RX_BUF_LENGTH);
mp->urdma->tx_urdma_size = PAGE_ALIGN(UR2DMA_TX_BUF_LENGTH);
//启动一个内核线程输出打印
mp->urdma_task = kthread_run(urdma_tx_thread,(void *)&mp->port,"urdma_tx_thread");
ret = uart_add_one_port(&ms_uart_driver, &mp->port);
uart_configure_port
//boot参数dh_keyboard在此生效
register_console
ms_uart_console_setup //设置波特率等参数
urdma_tx_thread实现如下:
static int urdma_tx_thread(void *arg)
{
struct uart_port p = (struct uart_port )arg;
struct circ_buf *xmit;
while(!kthread_should_stop()){
//等待中断唤醒返回
wait_event_interruptible(urdma_wait, urdma_conditions);
urdma_conditions = 0;
xmit = &p->state->xmit;
if (uart_circ_empty(xmit) || uart_tx_stopped(p))
{
ms_uart_stop_tx(p);
}
if (uart_circ_chars_pending(xmit))
{
//环形缓冲区有数据,则将数据拷贝到驱动
URDMA_StartTx(p);
}else
{
//环形缓冲区数据满了
//调用n_tty_write_wakeup,发送SIGIO信号通知driver有output data
uart_write_wakeup(p);
}
}
return 0;
}
打开tty设备时会调用uart_ops中的open接口(uart_open),调用流程如下:
uart_open
tty_port_open
port->ops->activate(port, tty);
//实际为uart_port_activate
uart_port_activate
uart_startup
uart_port_startup
uport->ops->startup(uport)
//实际为ms_uart_startup
ms_uart_startup
//此处注册了uart的中断处理函数ms_uart_interrupt
request_irq(mp->urdma->urdma_irq, ms_uart_interrupt, IRQF_SHARED, "ms_serial_dma",p);
ms_uart_interrupt函数实现如下:
/*
 * ms_uart_interrupt - UART interrupt handler (the handler passed to
 * request_irq in ms_uart_startup). Parts of the body are elided in this
 * excerpt.
 *
 * In DMA mode it reads the DMA interrupt status. For a TX interrupt it
 * clears the interrupt, sets urdma_conditions to 1 and wakes the
 * urdma_wait queue.
 */
static irqreturn_t ms_uart_interrupt(s32 irq, void *dev_id)
{
…
if(mp->use_dma)
{
u8 status = URDMA_GetInterruptStatus(p);
if(status & URDMA_INTR_STATUS_RX)
{
…
}
else if(status & URDMA_INTR_STATUS_TX)
{
// TX (tx_mcu_intr) interrupt: clear it, set the condition flag and wake the waiter.
URDMA_TxClearInterrupt(p);
urdma_conditions = 1;
wake_up_interruptible(&urdma_wait);
}
…
}
当TX中断到来并唤醒urdma_wait等待队列时,urdma_tx_thread会被唤醒:如果环形缓冲区中还有待发送的数据,则调用URDMA_StartTx把数据交给DMA输出到串口上;如果已经没有待发送的数据,则调用uart_write_wakeup通知tty层可以继续写入。
至此,整个printf打印流程完成。
6 问:为什么printf打印不会卡?
答:printf打印不会卡最根本的原因在于printf打印是异步的。数据从用户态的C库中的缓冲区到内核态的write调用,接着拷贝到tty的xmit环形缓冲区,这个过程是同步的,执行完成即返回。在此过程中,主要涉及内存拷贝动作,没有其他耗时的操作。
剩下的过程是异步执行:当有tx_mcu_intr中断时,内核线程urdma_tx_thread被唤醒,把tty环形缓冲区中待发送的数据拷贝到驱动的DMA发送缓冲区(tx_buf),由DMA输出到串口;当环形缓冲区中的数据发送完后,调用uart_write_wakeup(最终到n_tty_write_wakeup)通知上层可以继续写入数据(对开启了异步通知的进程会发送SIGIO信号)。
7.参考文献
http://blog.chinaunix.net/uid-29401328-id-4866781.html
https://www.cnblogs.com/pengdonglin137/p/3878316.html
https://www.cnblogs.com/cslunatic/p/3655970.html
开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!
更多推荐
所有评论(0)