📄 fft_flp32.asm
字号:
/* fft32.asm
Prelim rev. October 19, 2003 - BL
Rev. 1.0 - added real inputs case - PM
This is assembly routine for the Complex radix-2 C-callable FFT on TigerSHARC
family of DSPs.
I. Description of Calling.
1. Inputs:
j4 -> input (ping-pong buffer 1)
j5 -> ping-pong buffer 1
j6 -> ping-pong buffer 2
j7 -> output
j27+0x18 -> N = Number of points
j27+0x19 -> REAL or COMPLEX
2. C-Calling Example:
fft32(&(input), &(ping_pong_buffer1), &(ping_pong_buffer2), &(output), N, COMPLEX);
3. Limitations:
a. All buffers must be aligned on memory boundary which is a multiple of 4.
b. N must be between 32 and MAX_FFT_SIZE.
c. If memory space savings are required and input does not have to be
preserved, ping_pong_buffer1 can be the same buffer as input.
d. If memory space savings are required, output can be the same buffer
as ping_pong_buffer2 if the number of FFT stages is even (i.e.
Log2(N) is even) or the same as ping_pong_buffer1 if the number of
FFT stages is odd (i.e. Log2(N) is odd).
4. MAX_FFT_SIZE can be selected via #define. Larger values allow for more choices
of N, but its twiddles will occupy more memory.
5. This C - callable function can process up to 64K blocks of data on TS201
(16K blocks on TS101) because C environment itself necessitates memory.
Therefore, if more input points are necessary, assembly language development
may become a must. On TS201, a block of memory is 128K words long, so
maximum N is 128K real points or 64K complex points. TS101 contains
only 2 blocks of data memory of 64K words and 4 buffers must be
accommodated. Therefore, maximum N is 32K real words or 16K complex words.
II. Description of the FFT algorithm.
1. The input data is treated as complex interleaved N-point.
2. Due to re-ordering, no stage can be done in-place.
3. The bit reversal and the first two stages are combined into
a single loop. This loop takes data from input and stores it
in the ping-pong buffer1.
4. Each subsequent stage ping-pongs the data between the two ping-pong
buffers. The last stage uses FFT output buffer for its output.
5. Although the FFT is designed to be called with any point size
N <= MAX_FFT_SIZE by subsampling the twiddle factors, for ADSP-TS20x
processors, the best cycle optimization is achieved when MAX_FFT_SIZE=N.
For ADSP-TS101 all choices of MAX_FFT_SIZE are equally optimal.
III. Description of the REAL FFT algorithm.
1. The input data is treated as complex interleaved N/2-point. The N/2 point complex
FFT will be computed first. Thus, N is halved, now number of points = N/2.
2. Details and source code of the N/2 point complex FFT are in II above.
3. Real re-combine:
Here the complex N/2-point FFT computed in the previous steps is recombined to
produce the N-point real FFT. If G is the complex FFT and F is the real FFT,
the formula for F is given by:
F(n) = 0.5*(G(n)+conj(G(N/2-n))-0.5*i*exp(-2*pi*i*n/N)*(G(n)-conj(G(N/2-n)).
From this the following can be derived:
conj(F(N/2-n)) = 0.5*(G(n)+conj(G(N/2-n))+0.5*i*exp(-2*pi*i*n/N)*(G(n)-conj(G(N/2-n)).
Thus, this can be computed in (n,N/2-n) pairs, as follows (dropping factor of 2):
G(n) ------------------------------->------------------------>--------> F(n)
\ +/ \ +/
\/ \/
/\ /\
conj / -\ exp(-2*pi*i*n)*i / -\ conj
G(N/2-n) -----> conj(G(N/2-n))------>------------------------>--------> F(N/2-n)
This is very efficient on the TigerSHARC architecture due to the add/subtract
instruction.
IV. For all additional details regarding this algorithm and code, see EE-218
application note, available from the ADI web site.
*/
//************************************ Includes **********************************
#include "FFTDef.h"
#include "defts201.h"
//************************* Externs *************************************
.extern _twiddles;
//********************************* FFT Routine *********************************
.section program;
.global _FFT32;
_FFT32:
//********************************** Prologue ***********************************
mENTER
mPUSHQ(xR31:28)
mPUSHQ(xR27:24)
mPUSHQ(yR31:28)
mPUSHQ(yR27:24)
//************************************ Setup *************************************
j17 = [j27 + 0x18];; //j17 = N
j11 = [j27 + 0x19];; // j11=COMPLEX or REAL, off the stack
comp(j11,COMPLEX);; // Complex or Real?
if jeq, jump _FFTStages1and2;;
j17=ashiftr j17;; // if Real, half N
//********************************************************************************
_FFTStages1and2:
j11 = j31 + j17;; // j11=N
xr3=j11; k7=k31+_twiddles;;
k1=j11; j8=lshiftr j11;; // k1=N, j8=N/2
j9=lshiftr j8; xr0=MAX_FFT_SIZE; xr3=LD0 r3;; // j9=N/4, compute the twiddle stride
k8=lshiftr k1; xr0=LD0 r0; xr1=j11;;
k8=lshiftr k8; xr1=LD0 r1; xr2=(31-3);; // k8=N/4, Compute Stages-3
k0=j4; k10=lshiftr k8; xr1=r1-r0; xr0=lshift r0 by -32;; // k0->input, xr1=bit difference between MAX and N
k10=lshiftr k10; xr0=bset r0 by r1; xr30=r2-r3;; // k10=N/16, xr30=Stages-3
k10=k10-1; xr0=lshift r0 by 2; LC1=xr30;; // k10=N/16-1, LC1=Stages-3
k9=xr0; k4=k31+(MAX_FFT_SIZE/4-1);;
k4=not k4; j10=lshiftr j9;; // initial twiddles pointer mask, j10=N/8
//****************** Bit Reverse and Stages 1 & 2 ******************************
k5=lshiftr k1;; // k5=N/2
j0=j31+j6; k6=k6-k6;; // j0->ping_pong_buffer2
j1=j0+j9; LC0=k10;; // j1->ping_pong_buffer2+N/4, LC0=N/16-1
j2=j1+j9; k1=k0+k5;; // j2->ping_pong_buffer2+N/2, k1->input+N/2
j3=j2+j9; k2=k1+k5;; // j3->ping_pong_buffer2+3N/4, k2->input+N
j12=j3+j9; k3=k2+k5;; // j12->ping_pong_buffer2+N, k3->input+3N/2
j13=j12+j9; k5=lshiftr k5;; // j13->ping_pong_buffer2+5N/4, k5=N/4
j14=j13+j9; r1:0=q[k0+k6];; // j14->ping_pong_buffer2+3N/2
j15=j14+j9; r3:2=q[k2+k6];; // j15->ping_pong_buffer2+7N/4
r5:4=q[k1+k6];;
r7:6=q[k3+k6];;
k6=k6+k5 (br); fr0=r0+r2, fr20=r0-r2;;
r9:8=q[k0+k6]; fr2=r1+r3, fr29=r1-r3;;
r11:10=q[k2+k6]; fr4=r4+r6, fr21=r4-r6;;
r13:12=q[k1+k6]; fr5=r5+r7, fr28=r5-r7;;
r15:14=q[k3+k6]; fr18=r8+r10, fr22=r8-r10;;
k6=k6+k5 (br); fr19=r9+r11, fr31=r9-r11;;
fr26=r12+r14, fr23=r12-r14;;
fr27=r13+r15, fr30=r13-r15;;
fr20=r20+r28, fr28=r20-r28;;
fr29=r29+r21, fr21=r29-r21;;
fr22=r22+r30, fr30=r22-r30;;
fr31=r31+r23, fr23=r31-r23;;
.align_code 4;
_Stages1and2Loop:
r1:0=q[k0+k6]; q[j2+=4]=yr23:20; fr16=r0+r4, fr24=r0-r4;;
r3:2=q[k2+k6]; q[j3+=4]=xr23:20; fr17=r2+r5, fr25=r2-r5;;
r5:4=q[k1+k6]; q[j14+=4]=yr31:28; fr18=r18+r26, fr26=r18-r26;;
r7:6=q[k3+k6]; q[j15+=4]=xr31:28; fr19=r19+r27, fr27=r19-r27;;
k6=k6+k5 (br); q[j0+=4]=yr19:16; fr0=r0+r2, fr20=r0-r2;;
r9:8=q[k0+k6]; q[j1+=4]=xr19:16; fr2=r1+r3, fr29=r1-r3;;
r11:10=q[k2+k6]; q[j12+=4]=yr27:24; fr4=r4+r6, fr21=r4-r6;;
r13:12=q[k1+k6]; q[j13+=4]=xr27:24; fr5=r5+r7, fr28=r5-r7;;
r15:14=q[k3+k6]; fr18=r8+r10, fr22=r8-r10;;
k6=k6+k5 (br); fr19=r9+r11, fr31=r9-r11;;
fr26=r12+r14, fr23=r12-r14;;
fr27=r13+r15, fr30=r13-r15;;
fr20=r20+r28, fr28=r20-r28;;
fr29=r29+r21, fr21=r29-r21;;
fr22=r22+r30, fr30=r22-r30;;
.align_code 4;
if NLC0E, jump _Stages1and2Loop;
fr31=r31+r23, fr23=r31-r23;;
q[j2+=4]=yr23:20; fr16=r0+r4, fr24=r0-r4;;
q[j3+=4]=xr23:20; fr17=r2+r5, fr25=r2-r5;;
q[j14+=4]=yr31:28; fr18=r18+r26, fr26=r18-r26;;
q[j15+=4]=xr31:28; fr19=r19+r27, fr27=r19-r27;;
q[j0+=4]=yr19:16;;
q[j1+=4]=xr19:16;;
q[j12+=4]=yr27:24;;
q[j13+=4]=xr27:24;;
//************************ Stages 3 to Log2(N)-1 *******************************
j0=j31+j6; k5=k31+0;;
.align_code 4;
_StageLoop:
yr3:0=q[j0+=4]; k3=k5 and k4;; // F1, K1
xr3:0=q[j0+=4]; r5:4=l[k7+k3];; // F2, K2
LC0=k10; k5=k5+k9;; // K3, M1
yr11:8=q[j0+=4]; k3=k5 and k4; fr6=r2*r4;; // F1+, K1+
xr11:8=q[j0+=4]; r13:12=l[k7+k3]; fr7=r3*r5;; // F2+, K2+, M2
fr14=r2*r5;; // M3
j1=j31+j5; k5=k5+k9; fr6=r10*r12; fr16=r6-r7;; // K3+, M1+, A1
yr23:20=q[j0+=4]; k3=k5 and k4; fr15=r3*r4;; // F1++, K1++, M4
xr23:20=q[j0+=4]; r5:4=l[k7+k3]; fr7=r11*r13;; // F2++, K2++, M2+
fr14=r10*r13; fr17=r14+r15;; // M3+, A2
j2=j1+j11; k5=k5+k9; fr6=r22*r4; fr18=r6-r7;; // K3++, M1++, A1+
yr31:28=q[j0+=4]; k3=k5 and k4; fr15=r11*r12; fr24=r0+r16, fr26=r0-r16;; // F1+++, K1+++, M4+, A3
xr31:28=q[j0+=4]; r13:12=l[k7+k3]; fr7=r23*r5; fr25=r1+r17, fr27=r1-r17;; // F2+++, K2+++, M2++, A4
q[j1+=4]=r25:24; fr14=r22*r5; fr19=r14+r15;; // S1, M3++, A2+
.align_code 4;
_BflyLoop:
q[j2+=4]=r27:26; k5=k5+k9; fr6=r30*r12; fr16=r6-r7;; // S2----,K3-, M1-, A1--
yr3:0=q[j0+=4]; k3=k5 and k4; fr15=r23*r4; fr24=r8+r18, fr26=r8-r18;; // F1, K1, M4--, A3---
xr3:0=q[j0+=4]; r5:4=l[k7+k3]; fr7=r31*r13; fr25=r9+r19, fr27=r9-r19;; // F2, K2, M2-, A4---
q[j1+=4]=r25:24; fr14=r30*r13; fr17=r14+r15;; // S1---, M3-, A2--
q[j2+=4]=r27:26; k5=k5+k9; fr6=r2*r4; fr18=r6-r7;; // S2---, K3, M1, A1-
yr11:8=q[j0+=4]; k3=k5 and k4; fr15=r31*r12; fr24=r20+r16, fr26=r20-r16;; // F1+, K1+, M4-, A3--
xr11:8=q[j0+=4]; r13:12=l[k7+k3]; fr7=r3*r5; fr25=r21+r17, fr27=r21-r17;; // F2+, K2+, M2, A4--
q[j1+=4]=r25:24; fr14=r2*r5; fr19=r14+r15;; // S1--, M3, A2-
q[j2+=4]=r27:26; k5=k5+k9; fr6=r10*r12; fr16=r6-r7;; // S2--, K3+, M1+, A1
yr23:20=q[j0+=4]; k3=k5 and k4; fr15=r3*r4; fr24=r28+r18, fr26=r28-r18;; // F1++, K1++, M4, A3-
xr23:20=q[j0+=4]; r5:4=l[k7+k3]; fr7=r11*r13; fr25=r29+r19, fr27=r29-r19;; // F2++, K2++, M2+, A4-
q[j1+=4]=r25:24; fr14=r10*r13; fr17=r14+r15;; // S1-, M3+, A2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -