📄 hhcfft4.asm
字号:
/**************************************************************
ADSP-21160 Radix-4 Complex Fast Fourier Transform
File name: HHCFFT4.ASM
Author: Karl Schwarz & Raimund Meyer, Universitaet Erlangen Nuernberg
Date Modified:
25-APR-91, RY
18-JUN-91, RY
05-MAR-99, RFG
24-MAY-00, GGL
Description:
This routine performs a single channel SIMD, complex, radix 4 Fast Fourier
Transform (FFT). The FFT length (N) must be a power of 4 and a minimum of
64 points. The real part of the input data is placed in DM and the complex
part in PM. This data is destroyed during the course of the computation.
The real and complex output of the FFT is placed in separate locations in DM.
Since this routine takes care of all necessary address digit-reversals, the
input and output data are in normal order. The digit reversal is accomplished
by using a modified radix 4 butterfly throughout which swaps the inner two
nodes resulting with bit reversed data. The digit reversal is completed by
bit reversing the real data in the final stage and then bit reversing the
imaginary so that it ends up in DM.
To implement an inverse FFT, you only have to (1) swap the incoming datas real
and imaginary parts, (2) run the forward FFT, (3) swap the outgoing datas
real and imaginary parts and (4) scale the data by 1/N.
Calling Parameters:
costwid table at DM : cosine length 3*N/4
sintwid table at PM : sine length 3*N/4
real input at DM : redata length N, normal order
imag input at PM : imdata length N, normal order
Assumptions:
All arrays must start on even address boundaries.
Results:
real output at DM : refft length N, normal order
imag output at DM : imfft length N, normal order
Registers Affected:
All I, M, L and R registers.
Three levels of looping.
Notes:
Because the bit reversed addressing mode is used with the arrays
refft and imfft, they must start at addresses that are integer
multiples of the length (N) of the transform, (i.e. 0,N,2N,3N,...).
This is accomplished by specifying two segments starting at those addresses
in the architecture file and placing the variables alone in their
respective segments. These addresses must also be reflected in the
preprocessor variables ORE and OIM in bit reversed format.)
Circular Buffering Issue:
Because SIMD or Long word access transfer two 32-bit words, programs
must be careful when using these accesses in circular buffering.
It is important that SIMD or Long word accesses do not cross a circular
buffer boundary. If a SIMD mode access occurs using a circular buffer
index register that points to the last location in the circular buffer
(end of buffer), the resulting access transfers the last location in
the circular buffer and the first location outside the buffer (end of buffer + 1).
Benchmarks: radix-4, complex with digit reversal
FFT Length cycles
---------- ------
64 725
256 3494
1024 17159
4096 82472
First Stage - 8 cycles per radix-4 butterfly
Other Stages - 14 cycles per radix-4 butterfly
Memory Usage:
pm code = 200 words, pm data = 1.75*N words, dm data = 3.75*N words
**************************************************************/
#include "def21160.h" /* Symbol Definition File */
#define N 1024 /*Number of points in FFT. Must be a power of four, minimum of 64.*/
#define STAGES 5 /*Set to log4(N) or (log(N)/log(4))*/
#define OST 0x00400000 /*bitrev(32 bit N/2)*/
#define ORE 0x0001A000 /*bitrev(32 bit addr of output real in dm), addr is 0,N,2N,3N,...*/
#define OIM 0x00062000 /*bitrev(24 bit addr of output imag. in dm), addr is 0,N,2N,3N,...*/
.global first;
.global cosine;
.global redata;
.global sine;
.global imdata;
/* program memory code */
.section/pm seg_pmco;
/*_______first stage radix-4 butterfly without twiddles______*/
first:
bit set mode1 RND32 | CBUFEN | PEYEN; /* enable 32 bit floating point, circular buffers and SIMD */
nop; /* 1 cycle latency before SIMD is active */
f0=dm(i0,m0), f1=pm(i8,m8);
f2=dm(i2,m0), f3=pm(i10,m8);
f0=f0+f2, f2=f0-f2, f4=dm(i1,m0), f5=pm(i9,m8);
f1=f1+f3, f3=f1-f3, f6=dm(i3,m0), f7=pm(i11,m8);
f4=f6+f4, f6=f6-f4;
f5=f5+f7, f7=f5-f7;
f8=f0+f4, f9=f0-f4;
f10=f1+f5, f11=f1-f5;
lcntr=N/8, do fstage until lce; /* do N/4 simple radix-4 butterflies */
f12=f2+f7, f13=f2-f7, f0=dm(i0,m0), f1=pm(i8,m8);
f14=f3+f6, f15=f3-f6, f2=dm(i2,m0), f3=pm(i10,m8);
f0=f0+f2, f2=f0-f2, f4=dm(i1,m0), f5=pm(i9,m8);
f1=f1+f3, f3=f1-f3, f6=dm(i3,m0), f7=pm(i11,m8);
f4=f6+f4, f6=f6-f4, dm(i4,m0)=f8, pm(i12,m8)=f10;
f5=f5+f7, f7=f5-f7, dm(i5,m0)=f9, pm(i13,m8)=f11;
f8=f0+f4, f9=f0-f4, dm(i6,m0)=f12, pm(i14,m8)=f14;
fstage:
f10=f1+f5, f11=f1-f5, dm(i7,m0)=f13, pm(i15,m8)=f15;
bit clr mode1 PEYEN; /* added for testing SIMD */
m0=1;
m8=1;
/*_____________Middle stages with radix-4 main butterfly___________________*/
/* m0=1 and m8=1 is still preset */
m1=-2; /* reverse step for twiddles */
m9=m1;
m2=3; /* forward step for twiddles */
m10=m2;
m5=4; /* first there are 4 groups */
r2=N/16; /* with N/16 butterflies in each group */
r3=N/16*3; /* step to next group */
lcntr=STAGES-2, do mstage until lce; /* do STAGES-2 stages */
i7=cosine; /* first real twiddle */
i15=sine; /* first imag twiddle */
r8=redata;
r9=imdata;
i0=r8; /* upper real path */
r10=r8+r2; i8=r9; /* upper imaginary path */
i1=r10; /* second real input path */
r10=r10+r2, i4=r10; /* second real output path */
i2=r10; /* third real input path */
r10=r10+r2, i5=r10; /* third real output path */
i3=r10; /* fourth real input path */
r10=r9+r2, i6=r10; /* fourth real output path */
i9=r10; /* second imag input path */
r10=r10+r2, i12=r10; /* second imag output path */
i10=r10; /* third imag input path */
r10=r10+r2, i13=r10; /* third imag output path */
i11=r10; /* fourth imag input path */
i14=r10; /* fourth imag output path */
m4=r3;
m12=r3;
r4=r3+1, m6=r2;
m3=r4;
r2=r2-1, m11=r4;
m7=r2;
lcntr=m5, do mgroup until lce; /* do m5 groups */
f0=dm(i7,m0), f5=pm(i9,m8);
f8=f0*f5, f4=dm(i1,m0), f1=pm(i15,m8);
f9=f0*f4;
f12=f1*f5, f0=dm(i7,m0), f5=pm(i11,m8);
f13=f1*f4, f12=f9+f12, f4=dm(i3,m0), f1=pm(i15,m8);
f8=f0*f4, f2=f8-f13;
f13=f1*f5;
f9=f0*f5, f8=f8+f13, f0=dm(i7,m1), f5=pm(i10,m8);
f13=f1*f4, f12=f8+f12, f14=f8-f12, f4=dm(i2,m0), f1=pm(i15,m9);
f11=f0*f4;
f13=f1*f5, f6=f9-f13;
f9=f0*f5, f13=f11+f13, f11=dm(i0,0);
f13=f1*f4, f8=f11+f13, f10=f11-f13;
/*___________Do m7 radix-4 butterflies___________*/
lcntr=m7, do mr4bfly until lce;
f13=f9-f13, f4=dm(i1,m0), f5=pm(i9,m8);
f2=f2+f6, f15=f2-f6, f0=dm(i7,m0), f1=pm(i15,m8);
f8=f0*f4, f3=f8+f12, f7=f8-f12, f9=pm(i8,0);
f12=f1*f5, f9=f9+f13, f11=f9-f13, f13=f2;
f8=f0*f5, f12=f8+f12, f0=dm(i7,m0), f5=pm(i11,m8);
f13=f1*f4, f9=f9+f13, f6=f9-f13, f4=dm(i3,m0), f1=pm(i15,m8);
f8=f0*f4, f2=f8-f13, dm(i0,m0)=f3, pm(i8,m8)=f9;
f13=f1*f5, f11=f11+f14, f7=f11-f14, dm(i4,m0)=f7, pm(i12,m8)=f6;
f9=f0*f5, f8=f8+f13, f0=dm(i7,m1), f5=pm(i10,m8);
f13=f1*f4, f12=f8+f12, f14=f8-f12, f4=dm(i2,m0), f1=pm(i15,m9);
f11=f0*f4, f3=f10+f15, f8=f10-f15, pm(i13,m8)=f11;
f13=f1*f5, f6=f9-f13, dm(i6,m0)=f8, pm(i14,m8)=f7;
f9=f0*f5, f13=f11+f13, f11=dm(i0,0);
mr4bfly:
f13=f1*f4, f8=f11+f13, f10=f11-f13, dm(i5,m0)=f3;
/*___________End radix-4 butterfly_____________*/
/* dummy for address update * * */
f13=f9-f13, f0=dm(i7,m2), f1=pm(i15,m10);
f2=f2+f6, f15=f2-f6, f0=dm(i1,m4), f1=pm(i9,m12);
f3=f8+f12, f7=f8-f12, f9=pm(i8,0);
f9=f9+f13, f11=f9-f13, f0=dm(i2,m4);
f9=f9+f2, f6=f9-f2, f0=dm(i3,m4), f1=pm(i10,m12);
dm(i0,m3)=f3, pm(i8,m11)=f9;
f11=f11+f14, f7=f11-f14, dm(i4,m3)=f7, pm(i12,m11)=f6;
f3=f10+f15, f8=f10-f15, pm(i13,m11)=f11;
dm(i6,m3)=f8, pm(i14,m11)=f7;
mgroup: dm(i5,m3)=f3, f1=pm(i11,m12);
r3=m4;
r1=m5;
r2=m6;
r3=ashift r3 by -2; /* groupstep/4 */
r1=ashift r1 by 2; /* groups*4 */
m5=r1;
mstage: r2=ashift r2 by -2; /* butterflies/4 */
/*____________________Last radix-4 stage__________________________________*/
/* Includes bitreversal of the real data in dm */
bit set mode1 BR0 | BR8; /* bitreversal in i0 */
/* with: m0=m8=1 preset */
i4=redata; /* input */
i1=redata+1;
i2=redata+2;
i3=redata+3;
i0=ORE; /* real output array base must be an integer multiple of N */
m2=OST;
i7=cosine;
i8=OIM; /* input */
i9=imdata+1;
i10=imdata+2;
i11=imdata+3;
i12=imdata; /* output */
i15=sine;
m1=4;
m9=m1;
m12=OST;
f0=dm(i7,m0), f5=pm(i9,m9);
f8=f0*f5, f4=dm(i1,m1), f1=pm(i15,m8);
f9=f0*f4;
f12=f1*f5, f0=dm(i7,m0), f5=pm(i11,m9);
f13=f1*f4, f12=f9+f12, f4=dm(i3,m1), f1=pm(i15,m8);
f8=f0*f4, f2=f8-f13;
f13=f1*f5;
f9=f0*f5, f8=f8+f13, f0=dm(i7,m0), f5=pm(i10,m9);
f13=f1*f4, f12=f8+f12, f14=f8-f12, f4=dm(i2,m1), f1=pm(i15,m8);
f11=f0*f4;
f13=f1*f5, f6=f9-f13;
f9=f0*f5, f13=f11+f13, f11=dm(i4,m1);
f13=f1*f4, f8=f11+f13, f10=f11-f13;
/*________Do N/4-1 radix-4 butterflies_______*/
lcntr=N/4-1, do lstage until lce;
f13=f9-f13, f4=dm(i1,m1), f5=pm(i9,m9);
f2=f2+f6, f15=f2-f6, f0=dm(i7,m0), f1=pm(i15,m8);
f8=f0*f4, f3=f8+f12, f7=f8-f12, f9=pm(i12,m9);
f12=f1*f5, f9=f9+f13, f11=f9-f13, f13=f2;
f8=f0*f5, f12=f8+f12, f0=dm(i7,m0), f5=pm(i11,m9);
f13=f1*f4, f9=f9+f13, f6=f9-f13, f4=dm(i3,m1), f1=pm(i15,m8);
f8=f0*f4, f2=f8-f13, dm(i0,m2)=f3, pm(i8,m12)=f9;
f13=f1*f5, f11=f11+f14, f7=f11-f14, dm(i0,m2)=f7, pm(i8,m12)=f6;
f9=f0*f5, f8=f8+f13, f0=dm(i7,m0), f5=pm(i10,m9);
f13=f1*f4, f12=f8+f12, f14=f8-f12, f4=dm(i2,m1), f1=pm(i15,m8);
f11=f0*f4, f3=f10+f15, f8=f10-f15, pm(i8,m12)=f11;
f13=f1*f5, f6=f9-f13, dm(i0,m2)=f3, pm(i8,m12)=f7;
f9=f0*f5, f13=f11+f13, f11=dm(i4,m1);
lstage:
f13=f1*f4, f8=f11+f13, f10=f11-f13, dm(i0,m2)=f8;
f13=f9-f13;
f2=f2+f6, f15=f2-f6;
f3=f8+f12, f7=f8-f12, f9=pm(i12,m9);
f9=f9+f13, f11=f9-f13, dm(i0,m2)=f3;
f9=f9+f2, f6=f9-f2, dm(i0,m2)=f7;
pm(i8,m12)=f9;
f11=f11+f14, f7=f11-f14, pm(i8,m12)=f6;
f3=f10+f15, f8=f10-f15, pm(i8,m12)=f11;
dm(i0,m2)=f3, pm(i8,m12)=f7;
rts(db);
bit clr mode1 BR0 | BR8 | RND32 | CBUFEN | PEYEN; /* no bitreversal in i0, disable 32bit floating point, circular buffering and SIMD */
dm(i0,m2)=f8;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -