// File: fft_fxp16.asm
#include "FFTDef.h";
//************************************ Macros ************************************
// Stack/frame helper macros. j27/k27 act as the stack pointers and j26/k26 as
// the frame pointers. Comments are kept outside the macro bodies so that the
// backslash line continuations stay intact.
//
// mPUSHQ(arg): store the quad-word register group `arg` at the address held in
// k27, then step k27 down by 4 words (post-modify by -4).
#define mPUSHQ(arg) \
Q[k27 += -4] = arg;;
// mPOPQ(arg): inverse of mPUSHQ. k27 is first stepped up by 4 words, then the
// quad word at k27 is loaded into `arg` (the "+= 0" modifier leaves k27 as is).
#define mPOPQ(arg) \
k27 = k27 + 4;; \
arg = Q[k27 += 0];;
// mENTER: function entry sequence.
//  - j26/k26 are set 0x40 words below the incoming j27/k27.
//  - cJMP is saved at the incoming j27 (mRETURN reloads it from j26 + 0x40,
//    which is that same address).
//  - j27 is then moved down by 12 words (0xFFFFFFF4 = -12) and k27 by 4 words.
#define mENTER \
j26 = j27 - 0x40; k26 = k27 - 0x40;; \
[j27 += 0xFFFFFFF4] = cJMP; k27 = k27 - 0x04;;
// mRETURN: function exit sequence.
//  - Reloads cjmp from the slot written by mENTER and jumps through it.
//  - In the same instruction line, j27:24 and k27:24 are reloaded from the quad
//    words at j26 + 0x44 and k26 + 0x44.
//  - NOTE(review): nothing in these macros writes those two quad words;
//    presumably the caller stores them there as part of the calling
//    convention - confirm against the run-time model documentation.
#define mRETURN \
cjmp = [j26 + 0x40];; \
cjmp(ABS)(NP); j27:24 = Q[j26 + 0x44]; k27:24 = Q[k26 + 0x44];;
//********************************* FFT Routine *********************************
// _FFT16: 16-bit fixed-point FFT entry point (exported; uses the external
// table _twiddles).
// Arguments as read by the code below:
//   j4 = pointer to the input buffer
//   j5 = pointer to the intermediate buffer (int_buff)
//   j6 = N (number of points)
//   j7 = log2(N)
// NOTE(review): the register assignment suggests a C-callable routine with
// four arguments - confirm the prototype in FFTDef.h / the caller.
.section program;
.global _FFT16;
.extern _twiddles;
_FFT16:
//********************************** Prologue ***********************************
// Build the frame, then save xR31:24 and yR31:24 on the k27 stack.
mENTER
mPUSHQ(xR31:28)
mPUSHQ(xR27:24)
mPUSHQ(yR31:28)
mPUSHQ(yR27:24)
/************************************************************************/
/************************************** Start of code *****************************************/
//pointers are initialized
// The arguments are copied into high-numbered J registers so that j4..j7 can
// be reused as strides/counters later. The "j31 + jN" form is used as a plain
// register copy (j31 is assumed to read as zero when used as an operand).
// Note that j27, which mENTER used as the stack pointer, is overwritten with
// N/2 here; mRETURN reloads j27:24 from the frame.
j25 = j31+j6;; //N
j12 = j31+j7;; //log2_N
j27 = lshiftr j6;; //N/2
j28 = j31+j4;; //input
j29 = j31+j5;; //int_buff
j30 = j31+_twiddles;; //base address of the twiddle table
/*******************************************/
//This part performs the Stages 1-2-3 including the bit reversal of the inputs
//the inputs are taken from input buffer (pointed by j0) and
//saved into int_buffer (pointed by k6, k7, k8, k9)
//
//The sample indices x(..h) and the constants 128/0x40/0xC0 quoted in the
//comments are the values for N=256; the code derives the actual values from N.
//
//The lines up to STG_1_2_3_LOOP only prime the software pipeline: they set up
//the constants, pointers and loop counter, issue the first eight bit-reversed
//quad loads and start the first stage-1 add/subtracts.
//
//Twiddle constants are packed 16-bit complex values; judging by the constants
//and their comments, the upper half-word is the imaginary part and the lower
//half-word the real part (e.g. 0x80000000 = 0 - j).
// r31=WN64=-j used as twiddle factor
j0 = j31 + j28; r31 = 0x80000000;; //pointer to input buffer
k6 = j29;; //one of the pointers used to save data into int_buffer
k0 = k6;;
k4 = j27;; //pointer used to compute the initial values of the pointers used
//to save data into int_buffer
//r30=N (temporary use of r30; it is reloaded with a twiddle constant below)
r30 = j25;;
//r29=WN3*32=exp(-3/4.pi.j)=-.707-j.707 as twiddle factor
r29 = 0xa57ea57e; xr30 = ashift r30 by -5;; //xr30=N/32
xr30 = dec r30;;//xr30=N/32-1
j21 = xr30;;
LC0 = j21;; //LC0 = N/32-1
//j4=N/2 (128 for N=256), modifier used for the bit-reversed reads
// r30=WN32=exp(-1/4.pi.j)=.707-j.707 as twiddle factor
j4 = j31+j27; R30 = 0xa57e5a82;;
//yr0 = x(00h)
//yr1 = x(01h)
//xr0 = x(02h)
//xr1 = x(03h)
//k0 reading is used only to compute a pointer used to save data into int_buff
//(the r3:2 value loaded through k0 is overwritten by the next line)
r1:0 = br q[j0+=j4]; r3:2 = br q[k0+=k4];;
//yr2 = x(80h)
//yr3 = x(81h)
//xr2 = x(82h)
//xr3 = x(83h)
r3:2 = br q[j0+=j4]; k8 = k0;;//k8=int_buf+N/2
//yr4 = x(40h)
//yr5 = x(41h)
//xr4 = x(42h)
//xr5 = x(43h)
//k0 reading is used only to compute a pointer used to save data into int_buff
//(the r7:6 value loaded through k0 is overwritten by the next line)
r5:4 = br q[j0+=j4]; r7:6 = br q[k0+=k4];;
//stage 1 begins here
//-------------------
//yr6 = x(c0h)
//yr7 = x(c1h)
//xr6 = x(c2h)
//xr7 = x(c3h) yr8=x(00h)+x(80h)=Xeeeeeee(0) yr9=x(00h)-x(80h)=Xeeeeeee(1)
// xr8=x(02h)+x(82h)=Xeoeeeee(0) xr9=x(02h)-x(82h)=Xeoeeeee(1)
r7:6 = br q[j0+=j4]; sr8=r0+r2,sr9=r0-r2; k7 = k0;;//k7=int_buf+0x40
//yr16 = x(20h)
//yr17 = x(21h)
//xr16 = x(22h)
//xr17 = x(23h) yr0=x(01h)+x(81h)=Xoeeeeee(0) yr1=x(01h)-x(81h)=Xoeeeeee(1)
// xr0=x(03h)+x(83h)=Xooeeeee(0) xr1=x(03h)-x(83h)=Xooeeeee(1)
//k0 reading is used only to compute a pointer used to save data into int_buff
//(the r19:18 value loaded through k0 is overwritten by the next line)
r17:16 = br q[j0+=j4]; sr0=r1+r3,sr1=r1-r3; r19:18 = br q[k0+=k4];;
//yr18 = x(a0h)
//yr19 = x(a1h)
//xr18 = x(a2h)
//xr19 = x(a3h) yr2=x(40h)+x(c0h)=Xeeeeeeo(0) yr3=x(40h)-x(c0h)=Xeeeeeeo(1)
// xr2=x(42h)+x(c2h)=Xeoeeeeo(0) xr3=x(42h)-x(c2h)=Xeoeeeeo(1)
r19:18 = br q[j0+=j4]; sr2=r4+r6,sr3=r4-r6; k9=k0;; //k9=int_buf+0xC0
//yr20 = x(60h)
//yr21 = x(61h)
//xr20 = x(62h)
//xr21 = x(63h) yr4=x(41h)+x(c1h)=Xoeeeeeo(0) yr5=x(41h)-x(c1h)=Xoeeeeeo(1)
// xr4=x(43h)+x(c3h)=Xooeeeeo(0) xr5=x(43h)-x(c3h)=Xooeeeeo(1)
r21:20 = br q[j0+=j4]; sr4=r5+r7,sr5=r5-r7;;
//yr22 = x(e0h)
//yr23 = x(e1h)
//xr22 = x(e2h)
//xr23 = x(e3h) yr6=x(20h)+x(a0h)=Xeeeeeoe(0) yr7=x(20h)-x(a0h)=Xeeeeeoe(1)
// xr6=x(22h)+x(a2h)=Xeoeeeoe(0) xr7=x(22h)-x(a2h)=Xeoeeeoe(1)
//This line is repeated verbatim (minus the load) in the loop-back instruction
//at the bottom of STG_1_2_3_LOOP: it starts the first (-j) multiplication.
r23:22 = br q[j0+=j4]; mr1:0+=r3**r31(C); sr6=r16+r18,sr7=r16-r18;;
.align_code 4;
// STG_1_2_3_LOOP: software-pipelined radix-2 stages 1, 2 and 3.
// Each pass finishes the butterflies for the 32 samples fetched during the
// previous pass (or during the priming code before the loop), stores the
// results through k6/k7/k8/k9, and at the same time fetches the next 32
// samples with bit-reversed addressing through j0.
// Reading notes:
//  - Instructions separated by ';' share one instruction line; ';;' ends the
//    line. A register that is both read and written in one line (e.g. r1:0 in
//    the line with the first store) is read before it is overwritten.
//  - Register names without an x/y prefix apply to both compute blocks; the
//    comments list the resulting yr and xr values separately.
//  - "rN=mr1:0,mr1:0+=..." reads back the previous product while starting the
//    next one. Where a comment says "multiplication repeated", the product is a
//    dummy: the instruction is issued only for the read-back, and the dummy
//    product does not reach later results because the following multiply also
//    carries the (C) option (understood to clear the accumulator first - see
//    the multiplier option table in the processor programming reference).
STG_1_2_3_LOOP:
//yr3=Xeeeeeeo(1)*(-j) yr16=x(21h)+x(a1h)=Xoeeeeoe(0) yr17=x(21h)-x(a1h)=Xoeeeeoe(1)
//xr3=Xeoeeeeo(1)*(-j) xr16=x(23h)+x(a3h)=Xooeeeoe(0) xr17=x(23h)-x(a3h)=Xooeeeoe(1)
r3=mr1:0,mr1:0+=r5**r31(C); sr16=r17+r19,sr17=r17-r19;;
//yr5=Xoeeeeeo(1)*(-j) yr18=x(60h)+x(e0h)=Xeeeeeoo(0) yr19=x(60h)-x(e0h)=Xeeeeeoo(1)
//xr5=Xooeeeeo(1)*(-j) xr18=x(62h)+x(e2h)=Xeoeeeoo(0) xr19=x(62h)-x(e2h)=Xeoeeeoo(1)
//the multiplication repeated only to have the transfer in r5
r5=mr1:0,mr1:0+=r5**r31(C); sr18=r20+r22,sr19=r20-r22;;
// yr20=x(61h)+x(e1h)=Xoeeeeoo(0) yr21=x(61h)-x(e1h)=Xoeeeeoo(1)
// xr20=x(63h)+x(e3h)=Xooeeeoo(0) xr21=x(63h)-x(e3h)=Xooeeeoo(1)
sr20=r21+r23,sr21=r21-r23;;
//stage 2 begins here
//-------------------
// yr9:8=Xeeeeee(1) Xeeeeee(0) yr3:2=Xeeeeee(3) Xeeeeee(2)
// xr9:8=Xeoeeee(1) Xeoeeee(0) xr3:2=Xeoeeee(3) Xeoeeee(2)
mr1:0+=r19**r31(C); sr9:8=r9:8+r3:2,sr3:2=r9:8-r3:2;;
//yr19=Xeeeeeoo(1)*(-j) yr1:0=Xoeeeee(1) Xoeeeee(0) yr5:4=Xoeeeee(3) Xoeeeee(2)
//xr19=Xeoeeeoo(1)*(-j) xr1:0=Xooeeee(1) Xooeeee(0) xr5:4=Xooeeee(3) Xooeeee(2)
r19=mr1:0,mr1:0+=r21**r31(C); sr1:0=r1:0+r5:4,sr5:4=r1:0-r5:4;;
//the multiplication repeated only to have the transfer in r21
//stall yr7:6=Xeeeeeo(1) Xeeeeeo(0) yr19:18=Xeeeeeo(3) Xeeeeeo(2)
//yr21=Xoeeeeoo(1)*(-j) xr7:6=Xeoeeeo(1) Xeoeeeo(0) xr19:18=Xeoeeeo(3) Xeoeeeo(2)
//xr21=Xooeeeoo(1)*(-j)
r21=mr1:0,mr1:0+=r21**r31(C); sr7:6=r7:6+r19:18,sr19:18=r7:6-r19:18;;
//stall yr17:16=Xoeeeeo(1) Xoeeeeo(0) yr21:20=Xoeeeeo(3) Xoeeeeo(2)
// xr17:16=Xooeeeo(1) Xooeeeo(0) xr21:20=Xooeeeo(3) Xooeeeo(2)
mr1:0+=r7**r30(C); sr17:16=r17:16+r21:20,sr21:20=r17:16-r21:20;;
//yr7=Xeeeeeo(1)*WN32
//xr7=Xeoeeeo(1)*WN32
r7=mr1:0,mr1:0+=r18**r31(C);;
//yr18=Xeeeeeo(2)*(-j)
//xr18=Xeoeeeo(2)*(-j)
r18=mr1:0,mr1:0+=r19**r29(C);;
//stage 3 begins here
//-------------------
//yr19=Xeeeeeo(3)*WN3*32 yr9:8=Xeeeee(1) Xeeeee(0) yr13:12=Xeeeee(5) Xeeeee(4)
//xr19=Xeoeeeo(3)*WN3*32 xr9:8=Xeoeee(1) Xeoeee(0) xr13:12=Xeoeee(5) Xeoeee(4)
r19=mr1:0,mr1:0+=r17**r30(C); sr9:8=r9:8+r7:6,sr13:12=r9:8-r7:6;;
//yr17=Xoeeeeo(1)*WN32
//xr17=Xooeeeo(1)*WN32
r17=mr1:0,mr1:0+=r20**r31(C);;
//yr20=Xoeeeeo(2)*(-j) yr11:10=Xeeeee(3) Xeeeee(2) yr15:14=Xeeeee(7) Xeeeee(6)
//xr20=Xooeeeo(2)*(-j) xr11:10=Xeoeee(3) Xeoeee(2) xr15:14=Xeoeee(7) Xeoeee(6)
r20=mr1:0,mr1:0+=r21**r29(C); sr11:10=r3:2+r19:18,sr15:14=r3:2-r19:18;;
//From here on the stores of this pass's results overlap with the fetches and
//stage-1 add/subtracts for the next pass.
//yr0=x(10h)
//yr1=x(11h)
//xr0=x(12h) yr21=Xoeeeeo(3)*WN3*32 yr17:16=Xoeeee(1) Xoeeee(0) yr25:24=Xoeeee(5) Xoeeee(4)
//xr1=x(13h) xr21=Xooeeeo(3)*WN3*32 xr17:16=Xooeee(1) Xooeee(0) xr25:24=Xooeee(5) Xooeee(4)
//the multiplication repeated only to have the transfer in r21
r1:0=br q[j0+=j4]; r21=mr1:0,mr1:0+=r21**r29(C); sr17:16=r1:0+r17:16,sr25:24=r1:0-r17:16; q[k6+=4]=yr11:8;;
//yr2=x(90h)
//yr3=x(91h)
//xr2=x(92h) yr19:18=Xoeeee(3) Xoeeee(2) yr27:26=Xoeeee(7) Xoeeee(6)
//xr3=x(93h) xr19:18=Xooeee(3) Xooeee(2) xr27:26=Xooeee(7) Xooeee(6)
//stall
r3:2=br q[j0+=j4]; sr19:18=r5:4+r21:20,sr27:26=r5:4-r21:20; q[k7+=4]=xr11:8;;
//yr4=x(50h)
//yr5=x(51h)
//xr4=x(52h)
//xr5=x(53h)
r5:4 = br q[j0+=j4]; q[k8+=4]=yr19:16;;
//yr6=x(d0h)
//yr7=x(d1h)
//xr6=x(d2h)
//xr7=x(d3h)
r7:6 = br q[j0+=j4]; q[k9+=4]=xr19:16;;
//yr16=x(30h)
//yr17=x(31h)
//xr16=x(32h)
//xr17=x(33h)
r17:16 = br q[j0+=j4];sr8=r0+r2,sr9=r0-r2; q[k8+=0x4]=yr27:24;;
//yr18=x(b0h)
//yr19=x(b1h)
//xr18=x(b2h)
//xr19=x(b3h)
r19:18 = br q[j0+=j4]; sr0=r1+r3,sr1=r1-r3; q[k9+=4]=xr27:24;;
//yr20=x(70h)
//yr21=x(71h)
//xr20=x(72h)
//xr21=x(73h)
r21:20 = br q[j0+=j4];sr2=r4+r6,sr3=r4-r6; q[k6+=4]=yr15:12;;
//yr22=x(f0h)
//yr23=x(f1h)
//xr22=x(f2h)
//xr23=x(f3h)
r23:22=br q[j0+=j4];sr4=r5+r7,sr5=r5-r7; q[k7+=4]=xr15:12;;
.align_code 4;
//Loop back while LC0 has not expired. The multiply and add/subtract in this
//line are the same operations as in the last priming line before the loop, so
//both entry paths reach the top of the loop in the same pipeline state.
if NLC0E, jump STG_1_2_3_LOOP; mr1:0+=r3**r31(C); sr6=r16+r18,sr7=r16-r18;;
//the loop above computes outputs that has been fetched one loop before.
//Executing this loop N/32 times, would have provoked non aligned
//quad readings due to the bit reversal behavior.
//Therefore, the loop is executed N/32-1 times and the following computations
//basically repeat the loop content without the fetches.
//(Pipeline drain: same instruction sequence as the body of STG_1_2_3_LOOP with
//every "br q[j0+=j4]" load and the next-pass stage-1 add/subtracts removed.)
//end of stage 1 for the last 32 samples
r3=mr1:0,mr1:0+=r5**r31(C); sr16=r17+r19,sr17=r17-r19;;
r5=mr1:0,mr1:0+=r5**r31(C); sr18=r20+r22,sr19=r20-r22;;
sr20=r21+r23,sr21=r21-r23;;
//stage 2
mr1:0+=r19**r31(C); sr9:8=r9:8+r3:2,sr3:2=r9:8-r3:2;;
r19=mr1:0,mr1:0+=r21**r31(C); sr1:0=r1:0+r5:4,sr5:4=r1:0-r5:4;;
r21=mr1:0,mr1:0+=r21**r31(C); sr7:6=r7:6+r19:18,sr19:18=r7:6-r19:18;;
mr1:0+=r7**r30(C); sr17:16=r17:16+r21:20,sr21:20=r17:16-r21:20;;
r7=mr1:0,mr1:0+=r18**r31(C);;
r18=mr1:0,mr1:0+=r19**r29(C);;
//stage 3
r19=mr1:0,mr1:0+=r17**r30(C); sr9:8=r9:8+r7:6,sr13:12=r9:8-r7:6;;
r17=mr1:0,mr1:0+=r20**r31(C);;
r20=mr1:0,mr1:0+=r21**r29(C); sr11:10=r3:2+r19:18,sr15:14=r3:2-r19:18;;
r21=mr1:0,mr1:0+=r21**r29(C); sr17:16=r1:0+r17:16,sr25:24=r1:0-r17:16; q[k6+=4]=yr11:8;;
sr19:18=r5:4+r21:20,sr27:26=r5:4-r21:20; q[k7+=4]=xr11:8;;
//store the remaining results, in the same pointer order as inside the loop
q[k8+=4]=yr19:16;;
q[k9+=4]=xr19:16;;
q[k8+=0x4]=yr27:24;;
q[k9+=4]=xr27:24;;
q[k6+=4]=yr15:12;;
q[k7+=4]=xr15:12;;
/*******************************************************************************/
/* The following is the loop for stages 4 and up. Note that two sets of
pointers are used to load (j0 and strides in JALU), and to store (k0 and strides
in KALU). The inputs are taken from int_buffer and saved into input buffer*/
/* Compute strides.
Bfly, group and twiddle strides are set initially to their stage 3 values,
so that on loop entry they are updated to their stage 4 values */
// Numeric values quoted in the comments of this section are for N=256.
k9 = j25;; //k9=N (256)
j1 = j31+0x4;; //bfly stride
j2 = j31+0x8;; //group stride
j4 = j31+0x1;; //bflies per group iteration count.
k9 = ashiftr k9;;
k9 = ashiftr k9;;
k9 = ashiftr k9;; //twiddle stride = N/8
k6 = ashiftr k9;; //k6=N/16
/* Compute iteration counts. */
LC1 = j4;; //LC1=1
// j5 = k9;; //j5=32, gps per stage iteration count.
k6 = k6 - 1;;//k6=N/16-1
/* stages iteration count */
j6 = j12 - 0x5;; //j6=log2_N-5 (3 for N=256)
/* j10 and k10 hold the pointers to the inputs buffer and internal buffer.
Every stage loop iteration they are swapped. Initialized here: */
// NOTE(review): the original comment on the next line said "j5=16", but j5
// receives k6, which is N/16-1 (15 for N=256) - confirm which was intended.
j5 = k6;;//j5=N/16-1, gps per stage iteration count
j10 = j31 + j28;; //j10 points to input buffer
k10 = j29;; //k10 points to int_buffer
//In the stage 4, BFLY_LOOP executed 1 time and GROUP_LOOP 8 times(32 Multiplications/time)
//In the stage 5, BFLY_LOOP executed 2 times and GROUP_LOOP 4 times(32 Multiplications/time)
//In the stage 6, BFLY_LOOP executed 4 times and GROUP_LOOP 2 times(32 Multiplications/time)
.align_code 4;
// STAGE_LOOP: per-stage setup for stages 4 and up. Each pass swaps the
// source/destination buffers, rewinds the twiddle pointer, doubles the
// butterfly and group strides and halves the group count and twiddle stride.
// (The end of this loop lies beyond the part of the file documented here.)
STAGE_LOOP:
//swap pointers
//(the comments give the values on the first pass; j10 and k10 are exchanged
//again on every later pass)
j11 = j31+j10;; //j11=input
j10 = k10;; //j10=int_buff
k10 = j11;; //k10=input
k8 = j30;; //k8=Pointer to twiddles
// NOTE(review): j5 enters the first pass as N/16-1, so the halved value is not
// the "8" quoted below for N=256 - confirm how the count is consumed.
j5 = ashiftr j5;;// divide by two gps per stage counter. Stage 4: 8, ...
// bfly stride. Stage 4: 8; stg 5: 16,... Need two strides in j1 and j7
j1 = j1 + j1;; //j1=8
j7 = j1 + 0x4;; //j7=12
k1 = j1;; //k1=8
k7 = j7;; //k7=12
j2 = j2 + j2;; // grp stride. Stg 4: 16; stg 5: 32,...
//0xfffffff8 is -8, so the first "j3 = j3+0x8" in BFLY_LOOP yields 0
j3 = 0xfffffff8;; // stride for next bfly after grp loop...
// ...Stg 4: 0; stg 5: 0,8; stg 6: 0,8,16 ...
k2 = j2;; //k2=16
// twiddles stride. Stg 4: 8; stg 5: 4,...
// NOTE(review): k9 enters the first pass as N/8, so it becomes N/16 here; that
// equals the "8" quoted above only for N=128 - confirm.
k9 = ashiftr k9;;
.align_code 4;
BFLY_LOOP:
//Set pointer to buffer where inputs are loaded, and outputs stored
//k0=input, j0=int_buf
j0 = j10+0; k0 = k10+0;;
// Advance j0 to next bfly after group loop
//j3=0
j3 = j3+0x8;;
j0 = j0+j3; k3 = j3;;
// load 8 twiddles
// yr4=WN0=exp(-0/256.pi.j)=1
yr22 = [k8+=k9];;
// yr5=WN32=exp(-32/256.pi.j)=exp(-pi.j/8)
yr23 = [k8+=k9];;
// xr4=WN64=exp(-64/256.pi.j)=exp(-pi.j/4)
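// NOTE: this copy of the file is truncated here. The text that followed was
// web-viewer keyboard-shortcut help (copy, search, full screen, theme, font
// size), not part of the source; the rest of BFLY_LOOP and of the routine is
// missing.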