Hi,
I have the following code, which doesn't work.
//---------------------------------------------------------------------------
// Buffers
//   xre / xim           : 8192-word work buffers (xim is read with pm() accesses)
//   Hre..Yim            : 4098-word buffers placed in seg_bank4
//   *_left_* / *_right_*: 4098-word buffers placed in seg_sdram
//---------------------------------------------------------------------------
.section/dm seg_dmda;
.var xre[8192];

.section/Data seg_pmda;
.var xim[8192];

.section/dm seg_bank4;
.var Hre[4098];
.var Him[4098];
.var Xre[4098];
.var Xim[4098];
.var Yre[4098];
.var Yim[4098];

.section/dm seg_sdram;
.var X4098_left_re[4098];
.var X4098_left_im[4098];
.var H_tail_left_re[4098];
.var H_tail_left_im[4098];
.var H_tail_right_re[4098];
.var H_tail_right_im[4098];
.var Y_tail_left_re[4098];
.var Y_tail_left_im[4098];

//---------------------------------------------------------------------------
// External-port DMA transfer control blocks (6 words each).
// Word 0 is patched at run time with the chain pointer to the next TCB;
// the remaining words pair an SDRAM buffer with its seg_bank4 buffer.
// NOTE(review): presumably laid out as CP, ext modify, ext index,
// int count, int modify, int index - confirm against the hardware reference.
//---------------------------------------------------------------------------
.section/dm seg_dmda;
.var tcb_x_left_1[6] = 0, 1, X4098_left_re, 4098, 1, Xre;
.var tcb_x_left_2[6] = 0, 1, X4098_left_im, 4098, 1, Xim;
.var tcb_y_left_1[6] = 0, 1, Y_tail_left_re, 4098, 1, Yre;
.var tcb_y_left_2[6] = 0, 1, Y_tail_left_im, 4098, 1, Yim;

// Block until the chained DMA on channel 0 has completely finished.
#define WAIT_CHAINED_DMAC0 call WaitDmaC0Finished
/* previous inline version:
     r2 = CHS + DMAS;
     r1 = dm(DMAC0);
     r1 = r1 and r2;
     if ne jump (pc, 0xfffffffe)
*/

// NOTE(review): no code-section directive is visible before _process in this
// excerpt - confirm that the routine is placed in a program-memory section.

//---------------------------------------------------------------------------
// _process
//   C-callable leaf routine. Moves the X and Y buffer pairs between
//   seg_bank4 and seg_sdram with chained external-port DMA on channel 0
//   while the core runs the surrounding compute loops.
//   Saves and restores DMAC0, MODE1, USTAT1-4 and the non-scratch DAG
//   registers it touches.
//---------------------------------------------------------------------------
_process:
leaf_entry;

//---------------------------------------------------------------------------
// save non-scratch registers
// scratch registers: r0, r1, r2, r4, r8, r12, i4, i12, i13, m4, m12, b4, b12, b13, PX, USTAT1, USTAT2
//---------------------------------------------------------------------------
save_reg;
r0 = dm(DMAC0); puts = r0;          // slot 21 on restore
puts = mode1;
puts = ustat1;
puts = ustat2;
puts = ustat3;
puts = ustat4;
r0 = i0;  puts = r0;
r0 = i1;  puts = r0;
r0 = i2;  puts = r0;
r0 = i3;  puts = r0;
r0 = i5;  puts = r0;
r0 = i8;  puts = r0;
r0 = i9;  puts = r0;
r0 = i14; puts = r0;
r0 = i15; puts = r0;
r0 = m0;  puts = r0;
r0 = m1;  puts = r0;
r0 = m8;  puts = r0;
r0 = b5;  puts = r0;
r0 = l4;  puts = r0;
r0 = l5;  puts = r0;

bit set mode1 CBUFEN;               // enable cyclic buffer access
nop; nop;

r0 = DEN | CHEN | OFCEN;
dm(DMAC0) = r0;
nop; nop; nop; nop;

// Fill Xre with 0,1,2,... and Xim with the negated values (test pattern).
f8 = 0.0;
f12 = 1.0;
f4 = -1.0;
i4 = Xre;
i5 = Xim;
m4 = 1;
lcntr = 4098, do(pc,loop_1) until lce;
    dm(i4,m4) = f8;
    f0 = f8;
    f1 = f0 * f4;
    dm(i5,m4) = f1;
    f8 = f8 + f12;
loop_1: nop;

// Start chain tcb_y_left_1 -> tcb_y_left_2 with bit 20 of the chain pointer cleared.
r2 = (tcb_y_left_2 + 5) & 0x7FFFF; r2 = bclr r2 by 20; dm(tcb_y_left_1) = R2;
r2 = (tcb_y_left_1 + 5) & 0x7FFFF; r2 = bclr r2 by 20; dm(CPEP0) = r2;

r12 = 13; r8 = xim; r4 = xre;
// CALL fft function
WAIT_CHAINED_DMAC0;

i0 = xre; i1 = Xre; i2 = Xim; i8 = xim;
m0 = 0; m4 = 2; m8 = 0; m12 = 2;
ENABLE_PEY;
lcntr = 4098 / 2, do(pc,loop_convolve_left_firsttail) until lce;
    f0 = dm(i0,m0), f1 = pm(i8,m8);     // load Re{X},Im{X}
    dm(i1,m4) = f0;
    dm(i2,m4) = f1;
loop_convolve_left_firsttail: dm(i0,m4) = f0, pm(i8,m12) = f1;
DISABLE_PEY;

// save X4098 to SDRAM (bit 20 of the chain pointer set)
r2 = (tcb_x_left_2 + 5) & 0x7FFFF; r2 = bset r2 by 20; dm(tcb_x_left_1) = r2;
r2 = (tcb_x_left_1 + 5) & 0x7FFFF; r2 = bset r2 by 20; dm(CPEP0) = r2;

r12 = 13; r8 = xim; r4 = xre;
// CALL ifft function
WAIT_CHAINED_DMAC0;

// read X4098 from SDRAM (bit 20 of the chain pointer cleared)
r2 = (tcb_x_left_2 + 5) & 0x7FFFF; r2 = bclr r2 by 20; dm(tcb_x_left_1) = R2;
r2 = (tcb_x_left_1 + 5) & 0x7FFFF; r2 = bclr r2 by 20; dm(CPEP0) = r2;
WAIT_CHAINED_DMAC0;

lcntr = 2, do(pc,loop_5) until lce;
    //------ load delayed spectrum (left) ------
    // NOTE(review): on the second pass DMAC0 still holds DEN | CHEN | TRAN
    // from the save step below (OFCEN no longer set). Confirm that the
    // cleared bit 20 in the chain pointer is still honoured in that mode;
    // otherwise this "load" may run as a write.
    r2 = (tcb_y_left_2 + 5) & 0x7FFFF; r2 = bclr r2 by 20; dm(tcb_y_left_1) = R2;
    r2 = (tcb_y_left_1 + 5) & 0x7FFFF; r2 = bclr r2 by 20; dm(CPEP0) = r2;

    i0 = Hre; i1 = Him; i4 = Xre; i5 = Xim;
    m0 = 0; m4 = 2;
    f0 = 1.0; s0 = 1.0;
    f1 = 1.0; s1 = 1.0;
    ENABLE_PEY;
    lcntr = 4098 / 2, do(pc,loop_2) until lce;
        /*f0 = dm(i0,m0);*/
        f4 = dm(i4,m4);
        f8 = f0 * f4 /*, f1 = dm(i1,m0)*/;
        f5 = dm(i5,m4);
        f9 = f1 * f5;
        dm(i0,m4) = f8;
    loop_2: dm(i1,m4) = f9;
    DISABLE_PEY;

    WAIT_CHAINED_DMAC0;

    // Yre/Yim += Hre/Him
    i0 = Hre; i1 = Him; i4 = Yre; i5 = Yim;
    m0 = 0; m4 = 2;
    ENABLE_PEY;
    lcntr = 4098 / 2, do(pc,loop_4) until lce;
        f8 = dm(i0,m4);
        f12 = dm(i4,m0);
        f8 = f8 + f12, f9 = dm(i1,m4);
        f13 = dm(i5,m0);
        f9 = f9 + f13, dm(i4,m4) = f8;
    loop_4: dm(i5,m4) = f9;
    DISABLE_PEY;

    //------ save delayed spectrum (left) ------
    r0 = DEN | CHEN | TRAN;         //OFCEN;
    dm(DMAC0) = r0;
    nop; nop; nop; nop;
    r2 = (tcb_y_left_2 + 5) & 0x7FFFF; r2 = bset r2 by 20; dm(tcb_y_left_1) = r2;
    r2 = (tcb_y_left_1 + 5) & 0x7FFFF; r2 = bset r2 by 20; dm(CPEP0) = r2;

    // Core work that overlaps the DMA write.
    i0 = xre; i1 = Yre;
    m0 = 0; m4 = 1;
    f12 = 1.0;
    lcntr = 4098, do(pc,loop_3) until lce;
        f8 = dm(i0,m0);
        f8 = f8 + f12;
        //dm(i0,m4) = f8;
    loop_3: nop;

    // Must not return before the SDRAM write has fully drained: the next
    // pass reprograms CPEP0 and the epilogue rewrites DMAC0.
    WAIT_CHAINED_DMAC0;
    nop;
loop_5: nop;

//---------------------------------------------------------------------------
// restore non-scratch registers
//---------------------------------------------------------------------------
l5 = gets(1);
l4 = gets(2);
b5 = gets(3);
m8 = gets(4);
m1 = gets(5);
m0 = gets(6);
i15 = gets(7);
i14 = gets(8);
i9 = gets(9);
i8 = gets(10);
i5 = gets(11);
i3 = gets(12);
i2 = gets(13);
i1 = gets(14);
i0 = gets(15);
ustat4 = gets(16);
ustat3 = gets(17);
ustat2 = gets(18);
ustat1 = gets(19);
mode1 = gets(20);
// address warning ea2547 by two nops
nop; nop;
r0 = gets(21);
dm(DMAC0) = r0;
alter(21);
restore_reg;
leaf_exit;
._process.end:
rts;

//---------------------------------------------------------------------------
// WaitDmaC0Finished
//   Spin until external-port DMA channel 0 is completely idle.
//   Besides the transfer (DMAS) and chain-load (CHS) status bits this also
//   waits for EXTS, the external-access-pending status. Without it, a
//   write towards SDRAM can be reported as finished while words are still
//   queued in the DMA FIFO; rewriting DMAC0 or CPEP0 at that point can
//   drop the tail of the transfer and leave the channel stuck for the
//   next call.
//   Clobbers: r0, r1 (both scratch registers).
//---------------------------------------------------------------------------
WaitDmaC0Finished:
    r1 = DMAS | CHS | EXTS;         // every "still busy" status bit of DMAC0
read_dmac0_status:
    r0 = dm(DMAC0);
    r0 = r0 and r1;
    if ne jump(pc, read_dmac0_status);
    rts;
I am calling the function process() from a C++ program that writes data to xre. When I run it the first time, I can see that data is written to Y_tail_left_re. But when I run process() a second time, the data in Y_tail_left_re is not changed. You can also see this if you measure the cycle count of process(): the second call takes 80000 fewer cycles than the first call.
Calling process() a third and fourth time does not change the behavior. I can see that the content of Yre has changed, but the content of the SDRAM has not.
Even more uncanny: if I set a breakpoint directly after the write to SDRAM in line 256, I can see that the content of the SDRAM changes and the cycle counts are stable, with no difference of 80000 cycles. If I set the breakpoint outside the function, the behavior described above is back.
What am I doing wrong here? This is really a showstopper for the 21489. I need help urgently!
Best regards,
Raphael
P.S. I am working on an EZKIT-21489, so I can rule out a hardware issue in our own hardware design.