📄 hhcfft4.asm

📁 adsp2116x的基4fft
💻 ASM
字号:

/**************************************************************

 		ADSP-21160 Radix-4 Complex Fast Fourier Transform 


File name:		HHCFFT4.ASM    

Author:     	Karl Schwarz & Raimund Meyer, Universitaet Erlangen Nuernberg       

Date Modified:  
				25-APR-91, RY
	    		18-JUN-91, RY
				05-MAR-99, RFG
				24-MAY-00, GGL

Description:

	This routine performs a single channel SIMD, complex, radix 4 Fast Fourier 
	Transform (FFT). The FFT length (N) must be a power of 4 and a minimum of 
	64 points. The real part of the input data is placed in DM and the complex 
	part in PM. This data is destroyed during the course of the computation. 
	The real and complex output of the FFT is placed in separate locations in DM.
	
	Since this routine takes care of all necessary address digit-reversals, the 
	input and output data are in normal order. The digit reversal is accomplished 
	by using a modified radix 4 butterfly throughout which swaps the inner two 
	nodes resulting with bit reversed data. The digit reversal is completed by 
	bit reversing the real data in the final stage and then bit reversing the 
	imaginary so that it ends up in DM.
	
	To implement an inverse FFT, you only have to (1) swap the incoming datas real
	and imaginary parts, (2) run the forward FFT, (3) swap the outgoing datas 
	real and imaginary parts and (4) scale the data by 1/N.
	
	

Calling Parameters:

	costwid table at DM : cosine    length 3*N/4                        
	sintwid table at PM : sine      length 3*N/4                        
	real  input  at DM : redata     length N, normal order              
	imag  input  at PM : imdata     length N, normal order        

Assumptions:

	All arrays must start on even address boundaries.        

Results:

    real  output at DM : refft      length N, normal order              
    imag  output at DM : imfft      length N, normal order     		

Registers Affected:
		
		All I, M, L and R registers.
    		Three levels of looping.

Notes: 
			Because the bit reversed addressing mode is used with the arrays
		    refft and imfft, they must start at addresses that are integer
		    multiples of the length (N) of the transform, (i.e. 0,N,2N,3N,...).
		    This is accomplished by specifying two segments starting at those addresses 
		    in the architecture file and placing the variables alone in their 
		    respective segments. These addresses must also be reflected in the 
		    preprocessor variables ORE and OIM in bit reversed format.)

			Circular Buffering Issue:
			Because SIMD or Long word access transfer two 32-bit words, programs
			must be careful when using these accesses in circular buffering.
			It is important that SIMD or Long word accesses do not cross a circular
		 	buffer boundary.  If a SIMD	mode access occurs using a circular buffer
			index register that points to the last location in the circular buffer
		 	(end of buffer), the resulting access transfers the last location in
		 	the circular buffer and the	first location outside the buffer (end of buffer + 1).
         

Benchmarks: radix-4, complex with digit reversal

    FFT Length   cycles   
    ----------   ------   
         64			725		
        256			3494	
       1024        17159   
       4096		   82472	
	 

    First Stage  -   8 cycles per radix-4 butterfly
    Other Stages -  14 cycles per radix-4 butterfly

Memory Usage:
    pm code = 200 words, pm data = 1.75*N words, dm data = 3.75*N words

**************************************************************/


#include    "def21160.h"				/* Symbol Definition File */

#define	N				1024			/*Number of points in FFT.  Must be a power of four, minimum of 64.*/
#define	STAGES			5				/*Set to log4(N) or (log(N)/log(4))*/
#define	OST				0x00400000		/*bitrev(32 bit N/2)*/
#define	ORE				0x0001A000		/*bitrev(32 bit addr of output real in dm), addr is 0,N,2N,3N,...*/
#define	OIM				0x00062000		/*bitrev(24 bit addr of output imag. in dm), addr is 0,N,2N,3N,...*/

.global first;
.global cosine;
.global redata;
.global sine;
.global imdata;
/* program memory code */
.section/pm seg_pmco;

/*_______first stage radix-4 butterfly without twiddles______*/
first:
		bit set mode1 RND32 | CBUFEN | PEYEN; /* enable 32 bit floating point, circular buffers and SIMD */
		nop; /* 1 cycle latency before SIMD is active */	
                                                f0=dm(i0,m0),   f1=pm(i8,m8);
                                                f2=dm(i2,m0),   f3=pm(i10,m8);
                f0=f0+f2,       f2=f0-f2,       f4=dm(i1,m0),   f5=pm(i9,m8);
                f1=f1+f3,       f3=f1-f3,       f6=dm(i3,m0),   f7=pm(i11,m8);
                f4=f6+f4,       f6=f6-f4;
                f5=f5+f7,       f7=f5-f7;
                f8=f0+f4,       f9=f0-f4;
                f10=f1+f5,      f11=f1-f5;

lcntr=N/8,      do fstage until lce;    /* do N/4 simple radix-4 butterflies */
                f12=f2+f7,      f13=f2-f7,      f0=dm(i0,m0),   f1=pm(i8,m8);
                f14=f3+f6,      f15=f3-f6,      f2=dm(i2,m0),   f3=pm(i10,m8);
                f0=f0+f2,       f2=f0-f2,       f4=dm(i1,m0),   f5=pm(i9,m8);
                f1=f1+f3,       f3=f1-f3,       f6=dm(i3,m0),   f7=pm(i11,m8);
                f4=f6+f4,       f6=f6-f4,       dm(i4,m0)=f8,   pm(i12,m8)=f10;
                f5=f5+f7,       f7=f5-f7,       dm(i5,m0)=f9,   pm(i13,m8)=f11;
                f8=f0+f4,       f9=f0-f4,       dm(i6,m0)=f12,  pm(i14,m8)=f14;
fstage:
                f10=f1+f5,      f11=f1-f5,      dm(i7,m0)=f13,  pm(i15,m8)=f15;

				bit clr mode1 PEYEN; /* added for testing SIMD */
				m0=1;
				m8=1;

/*_____________Middle stages with radix-4 main butterfly___________________*/

/* m0=1 and m8=1 is still preset */
                        m1=-2;          /* reverse step for twiddles */
                        m9=m1;  
                        m2=3;           /* forward step for twiddles */
                        m10=m2;
                        m5=4;           /* first there are 4 groups */
                        r2=N/16;        /* with N/16 butterflies in each group */
                        r3=N/16*3;      /* step to next group */
                                
lcntr=STAGES-2, do mstage until lce;    /* do STAGES-2 stages */

                        i7=cosine;      /* first real twiddle */
                        i15=sine;       /* first imag twiddle */

                        r8=redata;
                        r9=imdata;

                        i0=r8;          /* upper real      path */
        r10=r8+r2;      i8=r9;          /* upper imaginary path */
                        i1=r10;         /* second real input path */
        r10=r10+r2,     i4=r10;         /* second real output path */
                        i2=r10;         /* third real input path */
        r10=r10+r2,     i5=r10;         /* third real output path */
                        i3=r10;         /* fourth real input path */
        r10=r9+r2,      i6=r10;         /* fourth real output path */
                        i9=r10;         /* second imag input path */
        r10=r10+r2,     i12=r10;        /* second imag output path */
                        i10=r10;        /* third imag input path */
        r10=r10+r2,     i13=r10;        /* third imag output path */
                        i11=r10;        /* fourth imag input path */
                        i14=r10;        /* fourth imag output path */ 
                        m4=r3;
                        m12=r3;
        r4=r3+1,        m6=r2;
                        m3=r4;
        r2=r2-1,        m11=r4;
                        m7=r2;
        
lcntr=m5,       do mgroup until lce;    /* do m5 groups */
                                                f0=dm(i7,m0),   f5=pm(i9,m8);
f8=f0*f5,                                       f4=dm(i1,m0),   f1=pm(i15,m8);
f9=f0*f4;
f12=f1*f5,                                      f0=dm(i7,m0),   f5=pm(i11,m8);
f13=f1*f4,      f12=f9+f12,                     f4=dm(i3,m0),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13;
f13=f1*f5;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m1),   f5=pm(i10,m8);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m0),   f1=pm(i15,m9);
f11=f0*f4;
f13=f1*f5,                      f6=f9-f13;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i0,0);
f13=f1*f4,      f8=f11+f13,     f10=f11-f13;
           /*___________Do m7 radix-4 butterflies___________*/
lcntr=m7,       do mr4bfly until lce;
                                f13=f9-f13,     f4=dm(i1,m0),   f5=pm(i9,m8);
                f2=f2+f6,       f15=f2-f6,      f0=dm(i7,m0),   f1=pm(i15,m8);
f8=f0*f4,       f3=f8+f12,      f7=f8-f12,                      f9=pm(i8,0);
f12=f1*f5,      f9=f9+f13,      f11=f9-f13,     f13=f2;
f8=f0*f5,       f12=f8+f12,                     f0=dm(i7,m0),   f5=pm(i11,m8);
f13=f1*f4,      f9=f9+f13,      f6=f9-f13,      f4=dm(i3,m0),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13,      dm(i0,m0)=f3,   pm(i8,m8)=f9;
f13=f1*f5,      f11=f11+f14,    f7=f11-f14,     dm(i4,m0)=f7,   pm(i12,m8)=f6;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m1),   f5=pm(i10,m8);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m0),   f1=pm(i15,m9);
f11=f0*f4,      f3=f10+f15,     f8=f10-f15,                     pm(i13,m8)=f11;
f13=f1*f5,                      f6=f9-f13,      dm(i6,m0)=f8,   pm(i14,m8)=f7;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i0,0);
mr4bfly:
f13=f1*f4,      f8=f11+f13,     f10=f11-f13,    dm(i5,m0)=f3;
           /*___________End radix-4 butterfly_____________*/
/*                       dummy for address update      *               * */
                                f13=f9-f13,     f0=dm(i7,m2),   f1=pm(i15,m10);
                f2=f2+f6,       f15=f2-f6,      f0=dm(i1,m4),   f1=pm(i9,m12);
                f3=f8+f12,      f7=f8-f12,                      f9=pm(i8,0);
                f9=f9+f13,      f11=f9-f13,     f0=dm(i2,m4);
                f9=f9+f2,       f6=f9-f2,       f0=dm(i3,m4),   f1=pm(i10,m12);
                                                dm(i0,m3)=f3,   pm(i8,m11)=f9;
                f11=f11+f14,    f7=f11-f14,     dm(i4,m3)=f7,   pm(i12,m11)=f6;
                f3=f10+f15,     f8=f10-f15,                     pm(i13,m11)=f11;
                                                dm(i6,m3)=f8,   pm(i14,m11)=f7;
mgroup:                                         dm(i5,m3)=f3,   f1=pm(i11,m12);

                                r3=m4;
                                r1=m5;
                                r2=m6;
        r3=ashift r3 by -2;             /* groupstep/4 */
        r1=ashift r1 by 2;              /* groups*4 */
                                m5=r1;
mstage: r2=ashift r2 by -2;             /* butterflies/4 */
        
/*____________________Last radix-4 stage__________________________________*/
/* Includes bitreversal of the real data in dm */

        bit set mode1 BR0 | BR8;              /* bitreversal in i0 */
/* with: m0=m8=1 preset */
        i4=redata;                      /* input */
        i1=redata+1;
        i2=redata+2;
        i3=redata+3;
        i0=ORE;  /* real output array base must be an integer multiple of N */
        m2=OST;
        i7=cosine;
        i8=OIM;                      /* input */
        i9=imdata+1;
        i10=imdata+2;
        i11=imdata+3;
        i12=imdata;                     /* output */
        i15=sine;
        m1=4;
        m9=m1;
        m12=OST;
                                                f0=dm(i7,m0),   f5=pm(i9,m9);
f8=f0*f5,                                       f4=dm(i1,m1),   f1=pm(i15,m8);
f9=f0*f4;
f12=f1*f5,                                      f0=dm(i7,m0),   f5=pm(i11,m9);
f13=f1*f4,      f12=f9+f12,                     f4=dm(i3,m1),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13;
f13=f1*f5;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m0),   f5=pm(i10,m9);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m1),   f1=pm(i15,m8);
f11=f0*f4;
f13=f1*f5,                      f6=f9-f13;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i4,m1);
f13=f1*f4,      f8=f11+f13,     f10=f11-f13;
            /*________Do N/4-1 radix-4 butterflies_______*/
lcntr=N/4-1, do lstage until lce;
                                f13=f9-f13,     f4=dm(i1,m1),   f5=pm(i9,m9);
                f2=f2+f6,       f15=f2-f6,      f0=dm(i7,m0),   f1=pm(i15,m8);
f8=f0*f4,       f3=f8+f12,      f7=f8-f12,                      f9=pm(i12,m9);
f12=f1*f5,      f9=f9+f13,      f11=f9-f13,     f13=f2;
f8=f0*f5,       f12=f8+f12,                     f0=dm(i7,m0),   f5=pm(i11,m9);
f13=f1*f4,      f9=f9+f13,      f6=f9-f13,      f4=dm(i3,m1),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13,      dm(i0,m2)=f3,   pm(i8,m12)=f9;
f13=f1*f5,      f11=f11+f14,    f7=f11-f14,     dm(i0,m2)=f7,   pm(i8,m12)=f6;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m0),   f5=pm(i10,m9);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m1),   f1=pm(i15,m8);
f11=f0*f4,      f3=f10+f15,     f8=f10-f15,                     pm(i8,m12)=f11;
f13=f1*f5,                      f6=f9-f13,      dm(i0,m2)=f3,   pm(i8,m12)=f7;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i4,m1);
lstage:
f13=f1*f4,      f8=f11+f13,     f10=f11-f13,    dm(i0,m2)=f8;

                                f13=f9-f13;
                f2=f2+f6,       f15=f2-f6;
                f3=f8+f12,      f7=f8-f12,                      f9=pm(i12,m9);
                f9=f9+f13,      f11=f9-f13,     dm(i0,m2)=f3;
                f9=f9+f2,       f6=f9-f2,       dm(i0,m2)=f7;
                                                                pm(i8,m12)=f9;
                f11=f11+f14,    f7=f11-f14,                     pm(i8,m12)=f6;
                f3=f10+f15,     f8=f10-f15,                     pm(i8,m12)=f11;
                                                dm(i0,m2)=f3,   pm(i8,m12)=f7;
                rts(db); 
                bit clr mode1 BR0 | BR8 | RND32 | CBUFEN | PEYEN;	/* no bitreversal in i0, disable 32bit floating point, circular buffering and SIMD */                     
                                                dm(i0,m2)=f8;
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -