I am using a SHARC (ADSP-21489) with VisualDSP++ 5.0 Update 10.

I think that multiply with cumulative add (multiply-accumulate) is one of the main features of digital signal processing, but I could not find any support for it in C/C++. Only using assembly saves the situation.

Could AD add a built-in function that supports multiply with cumulative add?

My code:

/*****************************************************************************

* MultiplyWithAdd.c

*****************************************************************************/

#include "stdfix.h"

/* Shorthand aliases for the fixed-point and integer types used below. */
#define fr32 fract

/* Expands to the same type as fr32. */
#define sfr32 fract

#define ufr32 unsigned fract

#define s32 int

#define u32 unsigned int

/* Memory-space qualifiers applied to the pointers inside FiltrA1st():
   SPACE1 expands to pm, SPACE2 expands to dm. */
#define SPACE1 pm

#define SPACE2 dm

/*
 * FiltrA1st - element-wise multiply with cumulative add over len elements.
 *
 * For each i in [0, len): out[i] += coeff[i] * in[i], with all three buffers
 * accessed as fract (sfr32) values.
 *
 * pSigOut  accumulator buffer; each element is read, updated and written back.
 * pSigIn   input signal; read through a dm-qualified (SPACE2) pointer.
 * pk       coefficients; read through a pm-qualified (SPACE1) pointer.
 * len      number of elements to process.
 *
 * Returns 1 unconditionally.
 */
s32 FiltrA1st(void *pSigOut, void *pSigIn, void *pk, u32 len)

{

//===================================================================

// Naming note: despite the "Dst" in their names, the next two pointers are
// only read in the loop; the "Src" pointer is the one that is written.

sfr32 SPACE2 *pDataDst_Sig = (sfr32 SPACE2 *)pSigIn;

sfr32 SPACE1 *pDataDst_Coeff = (sfr32 SPACE1 *)pk;

sfr32 *pDataSrc_Sig = (sfr32 *)pSigOut;

u32 i;

// Compiler loop hints; they are skipped when __MVS__ is defined.
// NOTE(review): loop_count(16, 64, 2) presumably states the minimum, maximum
// and a divisor of the trip count, yet main() calls this with len = 4 --
// confirm the hint matches the real callers.
// NOTE(review): all_aligned presumably promises suitably aligned buffers for
// SIMD access -- confirm the callers' buffers satisfy it.

#ifndef __MVS__

#pragma SIMD_for

#pragma all_aligned

#pragma vector_for

#pragma loop_count(16, 64, 2)

#endif

for (i = 0; i < len; i++)

{

// Accumulate the product into the output element, then advance all three
// pointers by one element.

*pDataSrc_Sig++ += (*pDataDst_Coeff++)*(*pDataDst_Sig++);

}

//===================================================================

// The return value is a constant; no failure path exists in this function.

return 1;

//===================================================================

}

/* Test buffers that main() hands to FiltrA1st(): input signal, coefficients
   and accumulated output, four elements each. They are declared as int but
   are accessed as fract inside FiltrA1st(). */
volatile int In[4];

/* NOTE(review): k is read through a pm-qualified pointer in FiltrA1st() but
   carries no memory-space qualifier here -- confirm the intended placement. */
volatile int k[4];

volatile int Out[4];

/* Entry point: runs FiltrA1st() once over the four-element test buffers. */
int main(void)
{
    /* Number of elements handed to the filter in this test. */
    enum { SAMPLE_COUNT = 4 };

    /* Begin adding your custom code here */

    /* The arrays decay to pointers to their first elements; the filter's
       return value is not used. */
    FiltrA1st((void *)Out, (void *)In, (void *)k, SAMPLE_COUNT);

    return 0;
}

The compiler generates the following assembly:

s32 FiltrA1st(void *pSigOut, void *pSigIn, void *pk, u32 len)

{

[124354] i7=modify (i7,0xfffffffa);

[124357] r2=i3;

[124359] dm(0xfffffffa,i6)=r2;

[12435B] r2=i5;

[12435D] r2=mr0f, dm(0xfffffffb,i6)=r2;

[124360] r2=mr1f, dm(0xfffffffc,i6)=r2;

[124363] r2=mr2f, dm(0xfffffffd,i6)=r2;

[124366] i12=0xb2150;

[124373] dm(0xfffffffe,i6)=r2;

[124375] bit set mode1 0x200000;

//===================================================================

sfr32 SPACE2 *pDataDst_Sig = (sfr32 SPACE2 *)pSigIn;

sfr32 SPACE1 *pDataDst_Coeff = (sfr32 SPACE1 *)pk;

sfr32 *pDataSrc_Sig = (sfr32 *)pSigOut;

u32 i;

#ifndef __MVS__

#pragma SIMD_for

#pragma all_aligned

#pragma vector_for

#pragma loop_count(16, 64, 2)

#endif

for (i = 0; i < len; i++)

{

*pDataSrc_Sig++ += (*pDataDst_Coeff++)*(*pDataDst_Sig++);

[124369] m12=0x2;

[12436B] m4=0x2;

[12436D] i3=0xb2130;

[124370] i5=0xb2170;

[124378] nop;

[124379] nop;

[12437A] r2=dm(i3,m4), r1=pm(i12,m12);

[12437C] MRF=r2*r1(ssf), i4=modify(i5,m5);

[12437F] r1=sat MRF(sf), r2=dm(i5,0x2);

[124382] lcntr=0xf, do (pc,0xf) until lce;

[124385] r2=r2+r1, r1=dm(i3,m4), r0=pm(i12,m12);

[124388] if av r2=ashift r2 by 0xffffffe1;

[12438B] if av r2=btgl r2 by 0x1f;

[12438E] MRF=r1*r0(ssf), dm(i4,0x2)=r2;

[124391] r1=sat MRF(sf), r2=dm(i5,0x2);

[124394] r2=r2+r1;

[124395] if av r2=ashift r2 by 0xffffffe1;

[124398] if av r2=btgl r2 by 0x1f;

[12439B] dm(i4,0x2)=r2;

[12439D] bit clr mode1 0x200000;

}

//===================================================================

return 1;

[1243A0] nop;

[1243A1] nop;

[1243A2] r2=dm(0xfffffffc,i6);

[1243A4] mr0f=r2, r2=dm(0xfffffffd,i6);

[1243A7] mr1f=r2, i12=dm(m7,i6);

[1243AA] r2=dm(0xfffffffe,i6);

[1243AC] mr2f=r2, r0=m6;

[1243AF] i3=dm(0xfffffffa,i6);

[1243B1] i5=dm(0xfffffffb,i6);

[1243B3] jump (m14,i12) (db);

[1243B5] rframe;

[1243B6] nop;

//===================================================================

}

I would write it as follows:

s32 FiltrA1st(void *pSigOut, void *pSigIn, void *pk, u32 len)

{

i7=modify (i7,0xfffffffa);

r2=i3;

dm(0xfffffffa,i6)=r2;

r2=i5;

r2=mr0f, dm(0xfffffffb,i6)=r2;

r2=mr1f, dm(0xfffffffc,i6)=r2;

r2=mr2f, dm(0xfffffffd,i6)=r2;

i12=0xb2150;

i3=0xb2130;

i5=0xb2170;

i4=i5;

dm(0xfffffffe,i6)=r2;

bit set mode1 0x200000;

nop;

nop;

mrf=0, r4=dm(i5,m6);

mr1f=r4, r2=dm(i3,m4), r1=pm(i12,m12);

r8 = mrf + r2*r1(ssf), r2=dm(i3,m4), r1=pm(i12,m12);

lcntr=0xf, do cycle_end until lce;

mrf=0, r4=dm(i5,m6);

mr1f=r4, dm(i4,m6)=r8;

cycle_end:

r8 = mrf + r2*r1(ssf), r2=dm(i3,m4), r1=pm(i12,m12);

dm(i4,m6)=r8;

bit clr mode1 0x200000;

[1243A0] nop;

[1243A1] nop;

[1243A2] r2=dm(0xfffffffc,i6);

[1243A4] mr0f=r2, r2=dm(0xfffffffd,i6);

[1243A7] mr1f=r2, i12=dm(m7,i6);

[1243AA] r2=dm(0xfffffffe,i6);

[1243AC] mr2f=r2, r0=m6;

[1243AF] i3=dm(0xfffffffa,i6);

[1243B1] i5=dm(0xfffffffb,i6);

[1243B3] jump (m14,i12) (db);

[1243B5] rframe;

[1243B6] nop;

Hi Eugene,

Thanks for your post. As you raised before, using fractional arithmetic with the 80-bit multiplier registers can be awkward from C, and we are looking at how to improve this situation for a coming update of VisualDSP++.

In your loop, you are doing

fract += fract * fract;

All arithmetic on fract types is defined to saturate on overflow, which includes the addition operation. This leads the compiler to produce the five-cycle kernel, in order to saturate the result of the addition. This saturation wouldn't happen in the MRF register, unless done explicitly. Therefore, although the compiler could have achieved a four-cycle loop (adding a SAT operation to your solution), it could not have managed the three-cycle one you propose as this does not saturate the result of the addition. We will look at trying to achieve the four-cycle loop solution in the next update of VisualDSP++.

If you do not require saturation of the addition result, it is possible to generate a three-cycle loop by using the int type and using the RxR built-in function, which performs a fractional multiply. For example:

(The example source is missing from this copy of the post.) It will generate an inner loop:

r2=r1+r2, r0=dm(i5,m4), r1=pm(i12,m12);

mrf=r1*r0 (SSFR), dm(i3,2)=r2;

.P33L14_end:

r2=SAT mrf (SF), r1=dm(i4,2);

// end loop .P33L14;

I hope this helps. All the best,

Michael.