2008-10-14 07:08:15     Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others

Document created by Aaronwu Employee on Aug 8, 2013
Version 1Show Document
  • View in full screen mode

2008-10-14 07:08:15     Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others

Ian Jeffray (UNITED KINGDOM)

Message: 63641   

 

I've been troubled by severe instability with the 2008R1.5-RC3 when using NFS.   even an "ls" can sometimes kill the board or cause CPLB misses.   I've observed this on the BF537-STAMP board, our custom BF537 board and custom BF527 board.  The problem is almost impossible to reproduce with the previous 2008R1 release (I think I managed it once, after some persistant loading).

 

Using a vanilla build configuration for the BF537-STAMP (minus a few user space apps, so it fits in the flash!), I can reproduce the crash in just three commands:   ifconfig, mount, shell tab expansion.   The crash manifests as an error in strcmp.S from within uClibc, during a qsort from busybox.   Why this should only happen when using NFS mounts, I'm not sure.

 

Below is a trace showing the problem.   I ifconfig the interface, mount the NFS mount, cd to the mount point, then run busybox with strace.  In busybox I type "cat sn" and then hit tab.  That's when the crash happens.  (There's a file called "snap.jpg" in my home directory which I expected to get tab expanded).     This is 100% repeatable on a variety of boards.

 

I've spent a fair time trying to dig in to this, but I'm not really coming any closer to any resolution.    Any suggestions/help would be hugely appreciated.

 

Ian.

 

 

root:/> ifconfig eth0 192.168.0.41 up

root:/> PHY: 0:01 - Link is Up - 100/Full

 

root:/> mount -o nolock 192.168.0.101:/home/ijeffray /home

root:/> cd /home

root:/home> strace sh

ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

getuid32()                              = 0

getgid32()                              = 0

setgid32(0)                             = 0

setuid32(0)                             = 0

mmap2(NULL, 90112, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x500000

close(10)                               = -1 EBADF (Bad file descriptor)

close(11)                               = -1 EBADF (Bad file descriptor)

close(12)                               = -1 EBADF (Bad file descriptor)

close(13)                               = -1 EBADF (Bad file descriptor)

close(14)                               = -1 EBADF (Bad file descriptor)

close(15)                               = -1 EBADF (Bad file descriptor)

close(16)                               = -1 EBADF (Bad file descriptor)

close(17)                               = -1 EBADF (Bad file descriptor)

close(18)                               = -1 EBADF (Bad file descriptor)

close(19)                               = -1 EBADF (Bad file descriptor)

getpid()                                = 120

geteuid32()                             = 0

ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

ioctl(1, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

write(1, "\n\n", 2

 

)                     = 2

write(1, "BusyBox v1.4.1 (2008-10-13 22:57"..., 62BusyBox v1.4.1 (2008-10-13 22:57:12 BST) Built-in shell (msh)

) = 62

write(1, "Enter \'help\' for a list of built"..., 46Enter 'help' for a list of built-in commands.

) = 46

write(1, "\n", 1

)                       = 1

rt_sigaction(SIGQUIT, {SIG_IGN}, {SIG_DFL}, 8) = 0

rt_sigaction(SIGTERM, {0x4c27a4, [TERM], SA_RESTART}, {SIG_DFL}, 8) = 0

rt_sigaction(SIGINT, {SIG_IGN}, {SIG_DFL}, 8) = 0

rt_sigaction(SIGINT, {0x4c3af0, [INT], SA_RESTART}, {SIG_IGN}, 8) = 0

ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost isig icanon echo ...}) = 0

ioctl(0, SNDCTL_TMR_START or TCSETS, {B9600 opost -isig -icanon -echo ...}) = 0

ioctl(0, SNDCTL_TMR_TIMEBASE or TCGETS, {B9600 opost -isig -icanon -echo ...}) = 0

ioctl(0, TIOCGWINSZ, {ws_row=24, ws_col=80, ws_xpixel=0, ws_ypixel=0}) = 0

rt_sigaction(SIGWINCH, {0x4c8590, [WINCH], SA_RESTART}, {SIG_DFL}, 8) = 0

geteuid32()                             = 0

open("/etc/passwd", O_RDONLY)           = 3

ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0x4f6a28) = -1 ENOTTY (Inappropriate ioctl for device)

mmap2(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x37ad000

read(3, "root:C7O0oWtNuGHNE:0:0:root:/roo"..., 256) = 66

close(3)                                = 0

getuid32()                              = 0

getgid32()                              = 0

mmap2(NULL, 8192, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x472000

getcwd("/hom", 4098)                    = 6

write(1, "root:/home> ", 12root:/home> )            = 12

read(0, "c", 1)                         = 1

write(1, "c", 1c)                        = 1

read(0, "a", 1)                         = 1

write(1, "a", 1a)                        = 1

read(0, "t", 1)                         = 1

write(1, "t", 1t)                        = 1

read(0, " ", 1)                         = 1

write(1, " ", 1 )                        = 1

read(0, "s", 1)                         = 1

write(1, "s", 1s)                        = 1

read(0, "n", 1)                         = 1

write(1, "n", 1n)                        = 1

read(0, "\t", 1)                        = 1

open(".", O_RDONLY|O_NONBLOCK|O_DIRECTORY) = 3

fstat(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0

fcntl(3, F_SETFD, FD_CLOEXEC)           = 0

mmap2(NULL, 36864, PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS|0x4000000, 0, 0) = 0x520000

getdents64(3, /* 56 entries */, 32768)  = 1832

stat("./snap.jpg", {st_mode=S_IFREG|0666, st_size=244445, ...}) = 0

--- SIGBUS (Bus error) @ 0 (0) ---

+++ killed by SIGBUS +++

Process 120 detached

root:/home>

 

 

Data access CPLB miss

- Used by the MMU to signal a CPLB miss on a data access.

Defered Exception context

CURRENT PROCESS:

COMM=sh PID=120

TEXT = 0x00480040-0x004d2440        DATA = 0x004d2444-0x004e87e4

BSS = 0x004e87e4-0x004efe64  USER-STACK = 0x004f7f24

 

return address: [0x00492630]; contents of:

0x00492610:  05e5  0c01  e800  0003  3028  1832  0c00  185a

0x00492620:  3210  3228  6439  63e0  5581  6058  0a06  6060

0x00492630: [e414] ffff  0730  0a26  6fe5  102a  3006  64e0

0x00492640:  0984  1c19  e140  004e  e100  7e84  e300  1262

 

SEQUENCER STATUS:               Not tainted

SEQSTAT: 00060026  IPEND: 0030  SYSCFG: 0006

  HWERRCAUSE: 0x18

  EXCAUSE   : 0x26

RETE: <0x00000000> /* Maybe null pointer? */

RETN: <0x037b0000> /* unknown address */

RETX: <0x00492630> [ sh + 0x125f0 ]

RETS: <0x004b876c> [ sh + 0x3872c ]

PC  : <0x00492630> [ sh + 0x125f0 ]

DCPLB_FAULT_ADDR: <0x2e04001c> /* unknown address */

ICPLB_FAULT_ADDR: <0x00492630> [ sh + 0x125f0 ]

 

PROCESSOR STATE:

R0 : 0000000c    R1 : 99b9bd97    R2 : fefefeff    R3 : 00000002

R4 : 00472004    R5 : 2e040020    R6 : 99b9bd94    R7 : 99b9bd90

P0 : 004ee490    P1 : 00472044    P2 : 2e040020    P3 : 004f6b30

P4 : 004f69d8    P5 : 2e040020    FP : 004f6068    SP : 037aff24

LB0: 0048bc81    LT0: 0048bc80    LC0: 00000000

LB1: 0048aaed    LT1: 0048aaec    LC1: 00000000

B0 : 00000000    L0 : 00000000    M0 : 00000000    I0 : 005204a7

B1 : 00000000    L1 : 00000000    M1 : 00000000    I1 : 004f6b30

B2 : 00000000    L2 : 00000000    M2 : 00000000    I2 : 00000000

B3 : 00000000    L3 : 00000000    M3 : 00000000    I3 : 00000000

A0.w: 00000000   A0.x: 00000000   A1.w: 00000000   A1.x: 00000000

USP : 004f605c  ASTAT: 02003004

 

No trace since you do not have CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE enabled

 

Stack from 037aff04:

        0003a33c ffa00760 00146570 00146570 00146568 c0000000 ffa009c6 00462874

        00492630 00000030 00060026 00000000 037b0000 00492630 00492630 004b876c

        0000000c 02003004 0048aaed 0048bc81 0048aaec 0048bc80 00000000 00000000

        00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000

        00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000

        00000000 00000000 004f6b30 005204a7 004f605c 004f6068 2e040020 004f69d8

 

Call Trace:

 

 

 

QuoteReplyEditDelete

 

 

2008-10-14 07:09:24     Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others

Ian Jeffray (UNITED KINGDOM)

Message: 63642   

 

Here's another set of output, when I don't use strace.   The 2.9GB allocation request is very odd.  100% repeatable.

 

 

BusyBox v1.4.1 (2008-10-13 22:57:12 BST) Built-in shell (msh)

Enter 'help' for a list of built-in commands.

 

root:/> ifconfig eth0 192.168.0.41 up

root:/> PHY: 0:01 - Link is Up - 100/Full

 

root:/> mount -o nolock 192.168.0.101:/home/ijeffray /home

root:/> cd /home

root:/home> ls -l

drwxr-xr-x    2 1000     1000         4096 May 30  2008 Desktop

drwxr-xr-x   12 1000     1000         4096 Oct 13  2008 ev

drwx--x--x    3 1000     1000         4096 Aug  6  2008 public_html

-rw-rw-rw-    1 1000     1000       244445 Sep 15  2008 snap.jpg

drwxr-xr-x    6 1000     1000         4096 Oct 14  2008 tmps

root:/home> ps

  PID  Uid        VSZ Stat Command

    1 root        104 S   /init

    2 root            SW< [kthreadd]

    3 root            SWN [ksoftirqd/0]

    4 root            SW< [events/0]

    5 root            SW< [khelper]

   22 root            SW< [kblockd/0]

   34 root            SW  [pdflush]

   35 root            SW  [pdflush]

   36 root            SW< [kswapd0]

   37 root            SW< [aio/0]

   70 root            SW< [mtdblockd]

   72 root            SW< [bfin-spi.0]

   95 root         40 S   inetd

   97 root        580 S   -/bin/sh

   98 root         32 S   /bin/watchdogd -f -s

   99 root        480 S   /sbin/syslogd -n

  100 root        480 S   /sbin/klogd -n

  111 root            SW< [rpciod/0]

  113 root        484 R   ps

root:/home> cat snAllocation of length 3088060416 from process 97 failed

DMA per-cpu:

CPU    0: Hot: hi:   18, btch:   3 usd:  16   Cold: hi:    6, btch:   1 usd:   0

Active:280 inactive:1295 dirty:0 writeback:0 unstable:0

free:11255 slab:441 mapped:0 pagetables:0 bounce:0

DMA free:45020kB min:952kB low:1188kB high:1428kB active:1120kB inactive:5180kB

present:56896kB pages_scanned:0 all_unreclaimable? no

lowmem_reserve[]: 0 0

DMA: 1*4kB 1*8kB 1*16kB 4*32kB 1*64kB 0*128kB 1*256kB 1*512kB 1*1024kB 1*2048kB

 

 

QuoteReplyEditDelete

 

 

2008-10-14 08:42:46     Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others

Robin Getz (UNITED STATES)

Message: 63648   

 

Ian:

 

It looks like someone accidently removed a manditory option for nfs mounting.

 

https://docs.blackfin.uclinux.org/doku.php?id=uclinux-dist:nfs

 

Can you check that out? (rsize=1024,wsize=1024)

 

-Robin

QuoteReplyEditDelete

 

 

2008-10-14 09:15:04     Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others

Ian Jeffray (UNITED KINGDOM)

Message: 63651   

 

On a 5 minute test, that does indeed appear to fix things.  Thanks Robin!

 

I had read the recommendation to use TCP (which in itself made no difference) but not seen/noticed the buffer size limits before.

 

 

QuoteReplyEditDelete

 

 

2008-10-14 10:25:51     Re: Instability with NFS with 2008R1.5-RC3 on BF537-STAMP + others

Robin Getz (UNITED STATES)

Message: 63656   

 

Ian:

 

No problem. I updated the docs before I posted to make it more findable - that is likely why you didn't see it before.

 

Glad that fixed things.

QuoteReplyEditDelete

Attachments

    Outcomes