Linux 宕机时 Oops 分析及问题定位
摘要:本文通过一个人为制造野指针的内核模块示例,说明如何根据 Oops 信息(错误代码、栈回溯、PC 指针)并结合 objdump 反汇编,定位到出错的代码行。
以下面这个例子说明(下面这个例子就是造一个野指针所引发的错误):
/*
* test-debug-scr.c
*
* Copyright (C) 2012 - 2021 Reuuimlla Limited
*
* Adapt to support xxx
*/
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/ide.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/vmalloc.h>
/*-------------------------------------------------------------------------------*/
/* DEFINITION */
/*-------------------------------------------------------------------------------*/
/*
* @description : Module entry point. In this article it is used to provoke a
*                kernel Oops on purpose by writing through a wild pointer.
* @param : none
* @return : 0 if execution gets past the faulting store; the Oops log shown
*           in this article reports the fault inside this function instead
*           (PC is at test_init+0x1c).
*/
static int __init test_init(void)
{
/* Deliberate wild pointer: an arbitrary integer constant is used as an address. */
int *p = 0x1231223;
/* Store through the wild pointer; this is the statement the article identifies as the fault. */
*p = 0x1231223;
printk("module loaded.\n");
return 0;
}
/*
* @description : Module exit point; only logs that the module is being unloaded.
* @param : none
* @return : none
*/
static void __exit test_exit(void)
{
	printk("module unloaded.\n");
}
/* Register test_init / test_exit as the module's load and unload handlers. */
module_init(test_init);
module_exit(test_exit);
/* Module metadata: license, description and author. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("test-debug-scr generic test driver");
MODULE_AUTHOR("xxxx");
很明显,人为制造错误的地方就是test_init函数内的 “ *p = 0x1231223 ”语句。
然后,把这个模块编译出来,再用insmod来插入到内核空间,Oops出现了。
# insmod test-debug.ko
[ 61.715630] Internal error: Oops: 805 [#1] PREEMPT SMP ARM
[ 61.721656] Modules linked in: test_debug(O+) shb_uart(O) shb_lcd(O) adc(O) i2c(O)
[ 61.721904] CPU: 2 PID: 1177 Comm: insmod Tainted: G O 3.10.65 #493
[ 61.721904] task: e6265500 ti: e5d5a000 task.ti: e5d5a000
[ 61.721904] PC is at test_init+0x1c/0x44 [test_debug]
[ 61.721904] LR is at do_one_initcall+0xa8/0x144
[ 61.721904] pc : [<bf01a01c>] lr : [<c000a4d8>] psr: 600c0013
[ 61.721904] sp : e5d5be40 ip : e5d5be50 fp : e5d5be4c
[ 61.721904] r10: e5ff7124 r9 : 00000001 r8 : 00000000
[ 61.721904] r7 : bf0180d4 r6 : c0967100 r5 : bf01a000 r4 : e5d5a000
[ 61.721904] r3 : 01231000 r2 : 00000023 r1 : 00000012 r0 : bf018044
[ 61.721904] Flags: nZCv IRQs on FIQs on Mode SVC_32 ISA ARM Segment user
[ 61.721904] Control: 10c53c7d Table: 66b0c06a DAC: 00000015
[ 61.721904]
[ 61.721904] LR: 0xc000a458:
[ 61.721904] a458 e3c4403f e5d6301c e5948004 e3530000 0a000019 e594300c e1a01000 e59f00dc
[ 61.721904] a478 e593215c eb1754da e24b002c eb017781 e14b22dc e14b23f4 e12fff35 e1a07000
[ 61.721904] a498 e24b002c eb01777b e14b22dc e14b03d4 e0520000 e1a02520 e0c31001 e1a03521
[ 61.721904] a4b8 e59f009c e1822b01 e1a01005 e88d000c e1a02007 eb1754c6 ea000001 e12fff30
[ 61.721904] a4d8 e1a07000 e3a03000 e5c6301d e5943004 e1580003 0a000003 e59f1068 e59f0068
[ 61.721904] a4f8 eb09c6d5 e5848004 e10f3000 e3130080 0a000004 e3a02040 e59f1050 e59f0048
[ 61.721904] a518 eb09c895 f1080080 e5d6301d e3530000 0a000006 e59f3030 e30012c5 e59f2030
[ 61.721904] a538 e59f0030 e58d3000 e1a03005 eb0065a6 e1a00007 e24bd020 e89da9f0 c0967100
[ 61.721904]
[ 61.721904] SP: 0xe5d5bdc0:
[ 61.721904] bdc0 8040003f c00c4e38 ffffffff c00b262c 00000001 c00b2868 bf01a01c 600c0013
[ 61.721904] bde0 ffffffff e5d5be2c e5d5be4c e5d5bdf8 c05e5b98 c000a16c bf018044 00000012
[ 61.721904] be00 00000023 01231000 e5d5a000 bf01a000 c0967100 bf0180d4 00000000 00000001
[ 61.721904] be20 e5ff7124 e5d5be4c e5d5be50 e5d5be40 c000a4d8 bf01a01c 600c0013 ffffffff
[ 61.721904] be40 e5d5be8c e5d5be50 c000a4d8 bf01a00c c00f58cc e5d5bf48 00000001 bf01808c
[ 61.721904] be60 bf0180d4 e5ff7100 e5d5be8c e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100
[ 61.721904] be80 e5d5bf44 e5d5be90 c007b494 c000a43c bf018098 00007fff c0077394 e5d5bea8
[ 61.721904] bea0 00024dd6 bf0181f0 e80133ec e5d5bf48 c05eef2c c0078c30 e5ff7108 00000000
[ 61.721904]
[ 61.721904] IP: 0xe5d5bdd0:
[ 61.721904] bdd0 00000001 c00b2868 bf01a01c 600c0013 ffffffff e5d5be2c e5d5be4c e5d5bdf8
[ 61.721904] bdf0 c05e5b98 c000a16c bf018044 00000012 00000023 01231000 e5d5a000 bf01a000
[ 61.721904] be10 c0967100 bf0180d4 00000000 00000001 e5ff7124 e5d5be4c e5d5be50 e5d5be40
[ 61.721904] be30 c000a4d8 bf01a01c 600c0013 ffffffff e5d5be8c e5d5be50 c000a4d8 bf01a00c
[ 61.721904] be50 c00f58cc e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100 e5d5be8c e5d5bf48
[ 61.721904] be70 00000001 bf01808c bf0180d4 e5ff7100 e5d5bf44 e5d5be90 c007b494 c000a43c
[ 61.721904] be90 bf018098 00007fff c0077394 e5d5bea8 00024dd6 bf0181f0 e80133ec e5d5bf48
[ 61.721904] beb0 c05eef2c c0078c30 e5ff7108 00000000 bf018098 e5d5a000 e7ff8000 0001b43c
[ 61.721904]
[ 61.721904] FP: 0xe5d5bdcc:
[ 61.721904] bdcc c00b262c 00000001 c00b2868 bf01a01c 600c0013 ffffffff e5d5be2c e5d5be4c
[ 61.721904] bdec e5d5bdf8 c05e5b98 c000a16c bf018044 00000012 00000023 01231000 e5d5a000
[ 61.721904] be0c bf01a000 c0967100 bf0180d4 00000000 00000001 e5ff7124 e5d5be4c e5d5be50
[ 61.721904] be2c e5d5be40 c000a4d8 bf01a01c 600c0013 ffffffff e5d5be8c e5d5be50 c000a4d8
[ 61.721904] be4c bf01a00c c00f58cc e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100 e5d5be8c
[ 61.721904] be6c e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100 e5d5bf44 e5d5be90 c007b494
[ 61.721904] be8c c000a43c bf018098 00007fff c0077394 e5d5bea8 00024dd6 bf0181f0 e80133ec
[ 61.721904] beac e5d5bf48 c05eef2c c0078c30 e5ff7108 00000000 bf018098 e5d5a000 e7ff8000
[ 61.721904]
[ 61.721904] R4: 0xe5d59f80:
[ 61.721904] 9f80 5f617461 30313030 3138373a 3e3e3e20 646f6320 32333d65 20203333 35323120
[ 61.721904] 9fa0 5f39320a 38333431 e7203430 83e8b594 6d2f20bd 682f746e 2f736667 6f632f45
[ 61.721904] 9fc0 745f6564 61622f33 6d5f6573 72657465 2f33745f 2f637273 2e636969 20707063
[ 61.721904] 9fe0 5f706d63 61746164 3130305f 38373a30 3e3e2032 6f63203e 333d6564 20383332
[ 61.721904] a000 00000000 00000001 00000000 e6265500 c0919278 00000002 00000015 c1983e00
[ 61.721904] a020 e6265500 e687b740 e5d5a000 e611f6c0 c090f820 00000000 e5d5be7c e5d5bde8
[ 61.721904] a040 c05e4030 00000000 00000000 00000000 00000000 00000000 01000000 00000000
[ 61.721904] a060 b6f496d0 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 61.721904]
[ 61.721904] R6: 0xc0967080:
[ 61.721904] 7080 c09e2d3c c054e6a0 c054e7f0 c09e2d48 c054c0c0 c054c1a0 c09e2d54 c056b84c
[ 61.721904] 70a0 c056ba40 c09e2d54 c056a360 c056a568 c09e2d54 c0567424 c05677e4 c09e2d54
[ 61.721904] 70c0 c0560e94 c0560f54 c09e2d54 c054d858 c054d864 c09e2d54 c054bc34 c054bce8
[ 61.721904] 70e0 c09e2d60 c0578ca4 c0578cb4 c09e2d60 c054f768 c054f778 c09e2d60 00000000
[ 61.721904] 7100 00000000 c19754b8 c07c24ff 00000000 00000000 c1975280 c1975440 00000000
[ 61.721904] 7120 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 61.721904] 7140 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 61.721904] 7160 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 61.721904]
[ 61.721904] R10: 0xe5ff70a4:
[ 61.721904] 70a4 30936305 e5ce5b54 00000000 00000000 81240002 000024bc 00000000 00000001
[ 61.721904] 70c4 00000000 e63d17c0 e63d15c0 e63d1691 e5ff7090 e63d1890 00000000 00000000
[ 61.721904] 70e4 2fe0e316 e5ce5b78 00000000 00000000 81240002 000024bd 00000000 e5ff7140
[ 61.721904] 7104 00000001 e63d1640 00000124 00000024 bf018068 c0078b04 00000000 00000000
[ 61.721904] 7124 00000000 00000000 00000000 00000000 00000000 00000000 00000000 e5ff7180
[ 61.721904] 7144 e5ff7144 e5ff7144 bf0180d4 00000000 c092d01c e5ff7200 00000001 00000003
[ 61.721904] 7164 00000000 00000000 00000000 00000000 00000000 00000000 00000000 65746f6e
[ 61.721904] 7184 00000073 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 61.721904] Process insmod (pid: 1177, stack limit = 0xe5d5a238)
[ 61.721904] Stack: (0xe5d5be40 to 0xe5d5c000)
[ 61.721904] be40: e5d5be8c e5d5be50 c000a4d8 bf01a00c c00f58cc e5d5bf48 00000001 bf01808c
[ 61.721904] be60: bf0180d4 e5ff7100 e5d5be8c e5d5bf48 00000001 bf01808c bf0180d4 e5ff7100
[ 61.721904] be80: e5d5bf44 e5d5be90 c007b494 c000a43c bf018098 00007fff c0077394 e5d5bea8
[ 61.721904] bea0: 00024dd6 bf0181f0 e80133ec e5d5bf48 c05eef2c c0078c30 e5ff7108 00000000
[ 61.721904] bec0: bf018098 e5d5a000 e7ff8000 0001b43c 00001171 00000000 0b300007 00000000
[ 61.721904] bee0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
[ 61.721904] bf00: 00000000 00000000 00000000 00000000 00000000 00000000 000000e0 00000000
[ 61.721904] bf20: 00024dd6 00000003 0000017b c000fac8 e5d5a000 00000000 e5d5bfa4 e5d5bf48
[ 61.721904] bf40: c007bb94 c0079eec e7ff8000 0001b43c e8012e74 e800cae3 e800cf94 00000204
[ 61.721904] bf60: 000002c4 00000000 00000000 00000000 00000023 00000024 0000000f 00000000
[ 61.721904] bf80: 0000000c 00000000 00000000 00000000 00000000 00eb8d78 00000000 e5d5bfa8
[ 61.721904] bfa0: c000f880 c007bb30 00000000 00000000 00000003 00024dd6 00000000 00eb8d78
[ 61.721904] bfc0: 00000000 00000000 00eb8d78 0000017b 00eb8060 00000000 00000002 beb26d84
[ 61.721904] bfe0: beb26bb8 beb26ba8 0001e120 b6ea7da2 800c0030 00000003 fad1ffdf 7eb9fb6e
[ 61.721904] [<bf01a01c>] (test_init+0x1c/0x44 [test_debug]) from [<c000a4d8>] (do_one_initcall+0xa8/0x144)
[ 61.721904] [<c000a4d8>] (do_one_initcall+0xa8/0x144) from [<c007b494>] (load_module+0x15b4/0x1b68)
[ 61.721904] [<c007b494>] (load_module+0x15b4/0x1b68) from [<c007bb94>] (SyS_finit_module+0x70/0x78)
[ 61.721904] [<c007bb94>] (SyS_finit_module+0x70/0x78) from [<c000f880>] (ret_fast_syscall+0x0/0x30)
[ 61.721904] Code: e59f3028 e3a02023 e3a01012 e59f0020 (e5c32223)
[ 62.833595] ---[ end trace a662172d624f693f ]---
Segmentation fault
这里需要关注几个地方:
- PC指针的位置: [ 61.721904] PC is at test_init+0x1c/0x44 [test_debug]
- Oops的错误代码:[ 61.715630] Internal error: Oops: 805 [#1] PREEMPT SMP ARM
- 栈的回溯过程(最后几行):SyS_finit_module -> load_module -> do_one_initcall -> test_init
(1) 先说Oops的错误代码: Oops: 805 [#1]
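这里,805 是 Oops 的错误代码,[#1] 中的 1 是 Oops 计数,表示这是系统启动以来的第 1 次 Oops。错误代码的含义与体系结构有关:本例运行在 ARM 上,805 是以十六进制打印的 FSR(Fault Status Register)值,其中 bit 11 为 1 表示写操作,低 4 位 0x5 表示段(section)转换错误,也就是向一个没有映射的地址写数据,这与野指针写入的现象相符。下面列出的是 x86 架构页错误 error_code 的位定义,仅供对照(如果发现自己遇到的 Oops 和这些说明无法对应的话,最好去内核代码里查找):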
* error_code:
* bit 0 == 0 means no page found, 1 means protection fault
* bit 1 == 0 means read, 1 means write
* bit 2 == 0 means kernel, 1 means user-mode
* bit 3 == 0 means data, 1 means instruction
(2)栈的回溯过程: SyS_finit_module -> load_module -> do_one_initcall -> test_init
从栈的回溯顺序可以看到,最后出现错误的地方就是 “test_init”函数中。
(3)PC指针的位置: PC is at test_init+0x1c/0x44 [test_debug]
从栈的回溯过程可以得到出错的函数,但是具体出错在函数的哪一行还不知道。
但是从“test_init+0x1c/0x44” 这段信息可以知道:该函数长度为 0x44,错误发生在 test_init 函数的 0x1c 偏移处。
因此这里需要借用 objdump 工具,如果是交叉编译的,可以使用具体环境下的objdump工具:
我这里使用的是:arm-linux-gnueabihf-objdump
arm-linux-gnueabihf-objdump -S test-debug.o : 参数 -S 表示尽可能的把原来的代码和反汇编出来的代码一起呈现出来,-S 参数需要结合arm-linux-gcc编译参数 -g,才能达到反汇编时同时输出原来的代码。
通过对编译生成的 “.o” 文件进行反汇编,找出对应的代码偏移处。反汇编后的代码如下:
# arm-linux-gnueabihf-objdump -S test-debug.o
test-debug.o: file format elf32-littlearm
Disassembly of section .init.text:
00000000 <init_module>:
* @description : 驱动入口函数
* @param : 无
* @return : 无
*/
static int __init test_init(void)
{
0: e1a0c00d mov ip, sp
4: e92dd800 push {fp, ip, lr, pc}
8: e24cb004 sub fp, ip, #4
int *p = 0x1231223;
*p = 0x1231223;
c: e59f3028 ldr r3, [pc, #40] ; 3c <init_module+0x3c>
10: e3a02023 mov r2, #35 ; 0x23
14: e3a01012 mov r1, #18
printk("module loaded.\n");
18: e59f0020 ldr r0, [pc, #32] ; 40 <init_module+0x40>
* @return : 无
*/
static int __init test_init(void)
{
int *p = 0x1231223;
*p = 0x1231223;
1c: e5c32223 strb r2, [r3, #547] ; 0x223
20: e5c32225 strb r2, [r3, #549] ; 0x225
24: e3a02001 mov r2, #1
28: e5c31224 strb r1, [r3, #548] ; 0x224
2c: e5c32226 strb r2, [r3, #550] ; 0x226
printk("module loaded.\n");
30: ebfffffe bl 0 <printk>
return 0;
}
34: e3a00000 mov r0, #0
38: e89da800 ldm sp, {fp, sp, pc}
3c: 01231000 .word 0x01231000
40: 00000000 .word 0x00000000
Disassembly of section .exit.text:
00000000 <cleanup_module>:
* @description : 驱动入口函数
* @param : 无
* @return : 无
*/
static int __init test_init(void)
{
0: e1a0c00d mov ip, sp
4: e92dd800 push {fp, ip, lr, pc}
8: e24cb004 sub fp, ip, #4
int *p = 0x1231223;
*p = 0x1231223;
c: e59f0004 ldr r0, [pc, #4] ; 18 <cleanup_module+0x18>
10: ebfffffe bl 0 <printk>
14: e89da800 ldm sp, {fp, sp, pc}
printk("module loaded.\n");
18: 00000010 .word 0x00000010
重点观察 test_init 函数的 1c 处,即发生错误的行。
至此:
- 我们人为制造出的错误行: *p = 0x1231223;
- Oops给出的偏移位置: PC is at test_init+0x1c/0x44 [test_debug]
- 反汇编出的代码偏移位置: 1c: e5c32223 strb r2, [r3, #547] ; 0x223
三者基本吻合,由此可以断定发生的错误就在: *p = 0x1231223 处。
在Oops的帮助下我们很快就解决了问题。
=========================================================================
摘抄一段:
在Oops发生以后没有造成宕机的情况下,我们就可以从dmesg中查看到完整的信息。但更多
的情况是Oops发生的同时系统也会宕机,此时这些出错信息是来不及存入文件中的,关掉电源后
就无法再看到了,我们只能通过其他的方式来记录:手抄或者拍照。
还有更坏的情况,如果Oops信息过多的话,一页屏幕显示不全,我们怎么来查看完整的内容
呢?第1种方法,在grub里用vga参数指定更高的分辨率以使屏幕可以显示更多的内容。很明显,
这个方法其实解决不了太多的问题;第2种方法,使用两台机器,把调试机的Oops信息通过串口
打印到宿主机的屏幕上。但现在大部分的笔记本电脑是没有串口的,这个解决方法也有很大的局限
性;第3种方法,使用内核转储工具 kdump 把发生Oops时的内存和CPU寄存器的内容dump到一
个文件里,之后我们再用gdb来分析问题(重点在第3种方法)。
开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!
更多推荐
所有评论(0)