Hello,

My problem is that existing C code from the 21369 does not always work on the external SRAM memory of the 21489.

SRAM has a width of 16 bits.

These are the affected vectors in external memory (CS1) of the 21489:

Symbol | Demangled name | Address | Size | Binding |
---|---|---|---|---|
_FFT_Window_Hanning_4096 | FFT_Window_Hanning_4096 | 0x4004000 | 0x1000 | GLOBAL |
_FFT_Window_Hanning_512 | FFT_Window_Hanning_512 | 0x4005000 | 0x200 | GLOBAL |
_FPGA_Filter_0U | FPGA_Filter_0U | 0x4005200 | 0x4b5 | GLOBAL |
_fft_in.0 | fft_in.0 | 0x40056b6 | 0x1000 | LOCAL |
_fft_out.1 | fft_out.1 | 0x40066b6 | 0x1000 | LOCAL |
_fft_in.2 | fft_in.2 | 0x40076b6 | 0x200 | LOCAL |
_fft_out.3 | fft_out.3 | 0x40078b6 | 0x200 | LOCAL |

Relevant to the problem are '**_FFT_Window_Hanning_4096**', '**_fft_in.0**' and another vector, '**Knock_Demo_B.Sum_k**', located in internal memory.

This is the failing operation:

It just multiplies sampled data with the window-function.

// prepare FFT:

{

tUINT32 i = NUMBER_OF_SAMPLES;

const tREAL32* src = Knock_Demo_B.Sum_k; // internal memory

const tREAL32* win = FFT_Window_Hanning_4096; // external memory

tREAL32* dst = fft_in; // external memory

while (i)

{

*dst = *src * *win;

dst++;

src++;

win++;

i--;

}

}

I know that all of this is very slow in external memory, but it's fast enough for this application and we need all of the internal memory for other things.

But the really bad thing is:

It does not work at all! Only every second value reaches the destination vector 'fft_in'. All other values contain 0 after this loop.

(This code worked for years on 21369-systems)

On the 21489 I have to insert just one nop and it works:

// prepare FFT:

{

tUINT32 i = NUMBER_OF_SAMPLES;

const tREAL32* src = Knock_Demo_B.Sum_k;

const tREAL32* win = FFT_Window_Hanning_4096;

tREAL32* dst = fft_in;

while (i)

{

*dst = *src * *win;

asm("nop;");

dst++;

src++;

win++;

i--;

}

}

In the assembler code below you can see that this single nop has a big effect on the generated code.

But why do I have to insert this nop?

And do I have to insert more nops at other locations in my code?

Thank you for any help!!!

Assembler-Code for not working C-Code:

.P75L50:

r2=btgl r0 by r15, i4=modify(i5,m5);

if sz jump (pc,.P75L76) (db);

i3=_FFT_Window_Hanning_4096;

i2=_fft_in.0;

.P75L81:

r12=ashift r0 by -1;

s12=r12;

.P75L85:

bit set mode1 0x200000;

.P75L83:

.LN35:

// line 1015

nop; // Inserted to fix anomaly enter_simd.

r12=r12-1;

if eq jump (pc,.P75L105) (db);

r1=dm(i3,2);

// -- stall --

r2=dm(i4,2);

.P75L107:

lcntr=r12, do (pc,.P75L106_end) until lce;

.P75L106:

//-------------------------------------------------------------------

// Loop at "Knock_Demo_.c" line 1014 col 11

//-------------------------------------------------------------------

// This loop executes 2 iterations of the original loop in estimated 3

// cycles.

//-------------------------------------------------------------------

// Unknown Trip Count

// Successfully found modulo schedule with:

// Initiation Interval (II) = 3

// Stage Count (SC) = 2

// MVE Unroll Factor = 1

// Minimum initiation interval due to recurrences (rec MII) = 1

// Minimum initiation interval due to resources (res MII) = 3.00

//-------------------------------------------------------------------

// This loop's resource usage is:

// define dm dag used 3 out of 3 (100.0%)

// dm dag used 3 out of 3 (100.0%)

// pm dag used 3 out of 3 (100.0%)

// shift immediate used 3 out of 3 (100.0%)

// memory access used 3 out of 6 ( 50.0%)

// multifunction alu used 1 out of 3 ( 33.3%)

// multifunction float multiply used 1 out of 3 ( 33.3%)

// multifunction integer add sub used 1 out of 3 ( 33.3%)

// multifunction integer multiply used 1 out of 3 ( 33.3%)

// multifunction mult used 1 out of 3 ( 33.3%)

//-------------------------------------------------------------------

// Loop was vectorized by a factor of 2.

//-------------------------------------------------------------------

// Vectorization peeled 1 conditional iteration from the back of the loop

// because of an unknown trip count, possibly not a multiple of 2.

//

// Consider using pragma loop_count to specify the trip count or trip modulo

// in order to avoid conditional peeling.

//-------------------------------------------------------------------

f1=f2*f1, r2=dm(i4,2);

dm(i2,2)=r1;

.P75L106_end:

r1=dm(i3,2);

// end loop .P75L106;

//-------------------------------------------------------------------

// End Kernel for Loop L106

//-------------------------------------------------------------------

Assembler-Code for working C-Code:

.P75L52:

i4=modify(i5,m5);

i2=_fft_in.0;

i3=_FFT_Window_Hanning_4096;

lcntr=r2, do (pc,.P75L15_end) until lce;

.P75L15:

//-------------------------------------------------------------------

// Loop at "Knock_Demo_.c" line 1014 col 11

//-------------------------------------------------------------------

.LN35:

// line 1015

r1=dm(i4,m6);

r0=dm(i3,m6);

f1=f1*f0;

dm(i2,m6)=r1;

.P75L16:

.LN36:

// line 1022

// ASM STATEMENT

nop;

// END ASM STATEMENT

.P75L17:

.P75L46:

nop;

nop;

.P75L15_end:

nop;

.LN37:

// line 1014

// end loop .P75L15;

//-------------------------------------------------------------------

// End Loop L15

//-------------------------------------------------------------------

I had trouble with external SDRAM memory access when I ran a butterfly test. I also had a software interrupt combined with nested interrupts. I solved my problem this way: I raise my software interrupt by setting the bit directly, without using the function raise