2008-10-14 07:08:15 Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others
Ian Jeffray (UNITED KINGDOM)
Message: 63641
I've been troubled by severe instability with the 2008R1.5-RC3 when using NFS. even an "ls" can sometimes kill the board or cause CPLB misses. I've observed this on the BF537-STAMP board, our custom BF537 board and custom BF527 board. The problem is almost impossible to reproduce with the previous 2008R1 release (I think I managed it once, after some persistant loading).
Using a vanilla build configuration for the BF537-STAMP (minus a few user space apps, so it fits in the flash!), I can reproduce the crash in just three commands: ifconfig, mount, shell tab expansion. The crash manifests as an error in strcmp.S from within uClibc, during a qsort from busybox. Why this should only happen when using NFS mounts, I'm not sure.
Below is a trace showing the problem. I ifconfig the interface, mount the NFS mount, cd to the mount point, then run busybox with strace. In busybox I type "cat sn" and then hit tab. That's when the crash happens. (There's a file called "snap.jpg" in my home directory which I expected to get tab expanded). This is 100% repeatable on a variety of boards.
I've spent a fair time trying to dig in to this, but I'm not really coming any closer to any resolution. Any suggestions/help would be hugely appreciated.
Ian.
root:/> ifconfig eth0 192.168.0.41 up
root:/> PHY: 0:01 - Link is Up - 100/Full
root:/> mount -o nolock 192.168.0.101:/home/ijeffray /home
root:/> cd /home
root:/home> strace sh
ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
getuid32() = 0
getgid32() = 0
setgid32(0) = 0
setuid32(0) = 0
mmap2(NULL, 90112, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x500000
close(10) = -1 EBADF (Bad file descriptor)
close(11) = -1 EBADF (Bad file descriptor)
close(12) = -1 EBADF (Bad file descriptor)
close(13) = -1 EBADF (Bad file descriptor)
close(14) = -1 EBADF (Bad file descriptor)
close(15) = -1 EBADF (Bad file descriptor)
close(16) = -1 EBADF (Bad file descriptor)
close(17) = -1 EBADF (Bad file descriptor)
close(18) = -1 EBADF (Bad file descriptor)
close(19) = -1 EBADF (Bad file descriptor)
getpid() = 120
geteuid32() = 0
ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
write(1, "\n\n", 2
) = 2
write(1, "BusyBox v1.4.1 (2008-10-13 22:57"..., 62BusyBox v1.4.1 (2008-10-13 22:57:12 BST) Built-in shell (msh)
) = 62
write(1, "Enter \'help\' for a list of built"..., 46Enter 'help' for a list of built-in commands.
) = 46
write(1, "\n", 1
) = 1
rt_sigaction(SIGQUIT, {SIG_IGN}, {SIG_DFL}, 8) = 0
rt_sigaction(SIGTERM, {0x4c27a4, [TERM], SA_RESTART}, {SIG_DFL}, 8) = 0
rt_sigaction(SIGINT, {SIG_IGN}, {SIG_DFL}, 8) = 0
rt_sigaction(SIGINT, {0x4c3af0, [INT], SA_RESTART}, {SIG_IGN}, 8) = 0
ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0
ioctl(0, SNDCTL_TMR_START or TCSETS, {B9600 opost -isig -icanon -echo ...}) = 0
ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost -isig -icanon -echo ...}) = 0
ioctl(0, TIOCGWINSZ, {ws_row=24, ws_col=80, ws_xpixel=0, ws_ypixel=0}) = 0
rt_sigaction(SIGWINCH, {0x4c8590, [WINCH], SA_RESTART}, {SIG_DFL}, 8) = 0
geteuid32() = 0
open("/etc/passwd", O_RDONLY) = 3
ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0x4f6a28) = -1 ENOTTY (Inappropriate ioctl for device)
mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x37ad000
read(3, "root:C7O0oWtNuGHNE:0:0:root:/roo"..., 256) = 66
close(3) = 0
getuid32() = 0
getgid32() = 0
mmap2(NULL, 8192, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x472000
getcwd("/hom", 4098) = 6
write(1, "root:/home> ", 12root:/home> ) = 12
read(0, "c", 1) = 1
write(1, "c", 1c) = 1
read(0, "a", 1) = 1
write(1, "a", 1a) = 1
read(0, "t", 1) = 1
write(1, "t", 1t) = 1
read(0, " ", 1) = 1
write(1, " ", 1 ) = 1
read(0, "s", 1) = 1
write(1, "s", 1s) = 1
read(0, "n", 1) = 1
write(1, "n", 1n) = 1
read(0, "\t", 1) = 1
open(".", O_RDONLY|O_NONBLOCK|O_DIRECTORY) = 3
fstat(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
mmap2(NULL, 36864, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x520000
getdents64(3, /* 56 entries */, 32768) = 1832
stat("./snap.jpg", {st_mode=S_IFREG|0666, st_size=244445, ...}) = 0
--- SIGBUS (Bus error) @ 0 (0) ---
+++ killed by SIGBUS +++
Process 120 detached
root:/home>
Data access CPLB miss
- Used by the MMU to signal a CPLB miss on a data access.
Defered Exception context
CURRENT PROCESS:
COMM=sh PID=120
TEXT = 0x00480040-0x004d2440 DATA = 0x004d2444-0x004e87e4
BSS = 0x004e87e4-0x004efe64 USER-STACK = 0x004f7f24
return address: [0x00492630]; contents of:
0x00492610: 05e5 0c01 e800 0003 3028 1832 0c00 185a
0x00492620: 3210 3228 6439 63e0 5581 6058 0a06 6060
0x00492630: [e414] ffff 0730 0a26 6fe5 102a 3006 64e0
0x00492640: 0984 1c19 e140 004e e100 7e84 e300 1262
SEQUENCER STATUS: Not tainted
SEQSTAT: 00060026 IPEND: 0030 SYSCFG: 0006
HWERRCAUSE: 0x18
EXCAUSE : 0x26
RETE: <0x00000000> /* Maybe null pointer? */
RETN: <0x037b0000> /* unknown address */
RETX: <0x00492630> [ sh + 0x125f0 ]
RETS: <0x004b876c> [ sh + 0x3872c ]
PC : <0x00492630> [ sh + 0x125f0 ]
DCPLB_FAULT_ADDR: <0x2e04001c> /* unknown address */
ICPLB_FAULT_ADDR: <0x00492630> [ sh + 0x125f0 ]
PROCESSOR STATE:
R0 : 0000000c R1 : 99b9bd97 R2 : fefefeff R3 : 00000002
R4 : 00472004 R5 : 2e040020 R6 : 99b9bd94 R7 : 99b9bd90
P0 : 004ee490 P1 : 00472044 P2 : 2e040020 P3 : 004f6b30
P4 : 004f69d8 P5 : 2e040020 FP : 004f6068 SP : 037aff24
LB0: 0048bc81 LT0: 0048bc80 LC0: 00000000
LB1: 0048aaed LT1: 0048aaec LC1: 00000000
B0 : 00000000 L0 : 00000000 M0 : 00000000 I0 : 005204a7
B1 : 00000000 L1 : 00000000 M1 : 00000000 I1 : 004f6b30
B2 : 00000000 L2 : 00000000 M2 : 00000000 I2 : 00000000
B3 : 00000000 L3 : 00000000 M3 : 00000000 I3 : 00000000
A0.w: 00000000 A0.x: 00000000 A1.w: 00000000 A1.x: 00000000
USP : 004f605c ASTAT: 02003004
No trace since you do not have CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled
Stack from 037aff04:
0003a33c ffa00760 00146570 00146570 00146568 c0000000 ffa009c6 00462874
00492630 00000030 00060026 00000000 037b0000 00492630 00492630 004b876c
0000000c 02003004 0048aaed 0048bc81 0048aaec 0048bc80 00000000 00000000
00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000
00000000 00000000 004f6b30 005204a7 004f605c 004f6068 2e040020 004f69d8
Call Trace:
QuoteReplyEditDelete
2008-10-14 07:09:24 Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others
Ian Jeffray (UNITED KINGDOM)
Message: 63642
Here's another set of output, when I don't use strace. The 2.9GB allocation request is very odd. 100% repeatable.
BusyBox v1.4.1 (2008-10-13 22:57:12 BST) Built-in shell (msh)
Enter 'help' for a list of built-in commands.
root:/> ifconfig eth0 192.168.0.41 up
root:/> PHY: 0:01 - Link is Up - 100/Full
root:/> mount -o nolock 192.168.0.101:/home/ijeffray /home
root:/> cd /home
root:/home> ls -l
drwxr-xr-x 2 1000 1000 4096 May 30 2008 Desktop
drwxr-xr-x 12 1000 1000 4096 Oct 13 2008 ev
drwx--x--x 3 1000 1000 4096 Aug 6 2008 public_html
-rw-rw-rw- 1 1000 1000 244445 Sep 15 2008 snap.jpg
drwxr-xr-x 6 1000 1000 4096 Oct 14 2008 tmps
root:/home> ps
PID Uid VSZ Stat Command
1 root 104 S /init
2 root SW< [kthreadd]
3 root SWN [ksoftirqd/0]
4 root SW< [events/0]
5 root SW< [khelper]
22 root SW< [kblockd/0]
34 root SW [pdflush]
35 root SW [pdflush]
36 root SW< [kswapd0]
37 root SW< [aio/0]
70 root SW< [mtdblockd]
72 root SW< [bfin-spi.0]
95 root 40 S inetd
97 root 580 S -/bin/sh
98 root 32 S /bin/watchdogd -f -s
99 root 480 S /sbin/syslogd -n
100 root 480 S /sbin/klogd -n
111 root SW< [rpciod/0]
113 root 484 R ps
root:/home> cat snAllocation of length 3088060416 from process 97 failed
DMA per-cpu:
CPU 0: Hot: hi: 18, btch: 3 usd: 16 Cold: hi: 6, btch: 1 usd: 0
Active:280 inactive:1295 dirty:0 writeback:0 unstable:0
free:11255 slab:441 mapped:0 pagetables:0 bounce:0
DMA free:45020kB min:952kB low:1188kB high:1428kB active:1120kB inactive:5180kB
present:56896kB pages_scanned:0 all_unreclaimable? no
lowmem_reserve[]: 0 0
DMA: 1*4kB 1*8kB 1*16kB 4*32kB 1*64kB 0*128kB 1*256kB 1*512kB 1*1024kB 1*2048kB
QuoteReplyEditDelete
2008-10-14 08:42:46 Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others
Robin Getz (UNITED STATES)
Message: 63648
Ian:
It looks like someone accidently removed a manditory option for nfs mounting.
https://docs.blackfin.uclinux.org/doku.php?id=uclinux-dist:nfs
Can you check that out? (rsize=1024,wsize=1024)
-Robin
QuoteReplyEditDelete
2008-10-14 09:15:04 Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others
Ian Jeffray (UNITED KINGDOM)
Message: 63651
On a 5 minute test, that does indeed appear to fix things. Thanks Robin!
I had read the recommendation to use TCP (which in itself made no difference) but not seen/noticed the buffer size limits before.
QuoteReplyEditDelete
2008-10-14 10:25:51 Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others
Robin Getz (UNITED STATES)
Message: 63656
Ian:
No problem. I updated the docs before I posted to make it more findable - that is likely why you didn't see it before.
Glad that fixed things.
QuoteReplyEditDelete