⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fft_fxp16.asm

📁 TigerSharc TS201 32-bit floating point FFT routine
💻 ASM
📖 第 1 页 / 共 2 页
字号:
#include "FFTDef.h";

//************************************ Macros ************************************

// mPUSHQ(arg): save the quad register group `arg` on the stack addressed by k27.
// Expands to a single instruction line: a quad-word store through k27 with a
// modifier of -4, so k27 ends up 4 lower afterwards.
// NOTE(review): `+=` is understood to be the post-modify-with-update form
// (the store uses the value of k27 before the update) - confirm against the
// TS201 programming reference. mPOPQ is the matching restore.
#define mPUSHQ(arg) \
  Q[k27 += -4] = arg;;

// mPOPQ(arg): restore the quad register group `arg` from the stack addressed by k27.
// Expands to two instruction lines: first k27 is advanced by 4, then a quad
// word is loaded through k27 with a modifier of 0 (k27 keeps its new value).
// This mirrors mPUSHQ, which stores first and then moves k27 down by 4, so
// pops must be issued in the reverse order of the corresponding pushes.
#define mPOPQ(arg) \
  k27 = k27 + 4;;  \
  arg = Q[k27 += 0];;

// mENTER: function entry sequence (two instruction lines).
//   1) j26 = j27 - 0x40 and k26 = k27 - 0x40: record frame references derived
//      from the incoming j27/k27. mRETURN addresses everything relative to
//      j26/k26, so both must stay unchanged until mRETURN runs.
//   2) CJMP is stored through j27 and j27 is then modified by 0xFFFFFFF4
//      (-12 as a 32-bit two's-complement value); k27 is lowered by 4.
//      The slot written here is the one mRETURN reads back at [j26 + 0x40].
// NOTE(review): the 0x40 offset and the -12/-4 adjustments presumably follow
// the C run-time stack convention of the toolchain - confirm against the
// compiler manual.
#define mENTER \
  j26 = j27 - 0x40; k26 = k27 - 0x40;; \
  [j27 += 0xFFFFFFF4] = cJMP; k27 = k27 - 0x04;;

// mRETURN: function exit sequence (two instruction lines).
//   1) Reload CJMP from [j26 + 0x40], the slot mENTER stored it in
//      (j26 + 0x40 equals the value j27 had on entry).
//   2) In one instruction line: jump through CJMP with the (ABS)(NP) options,
//      and reload j27:24 from Q[j26 + 0x44] and k27:24 from Q[k26 + 0x44].
// Requires j26 and k26 to still hold the values set by mENTER.
// NOTE(review): nothing in the visible part of this file writes the quad words
// at j26 + 0x44 / k26 + 0x44; presumably the caller saved j27:24 and k27:24
// there before the call - confirm against the calling convention.
#define mRETURN \
  cjmp = [j26 + 0x40];; \
  cjmp(ABS)(NP); j27:24 = Q[j26 + 0x44]; k27:24 = Q[k26 + 0x44];;

//********************************* FFT Routine *********************************
.section program;
.global _FFT16;
.extern _twiddles;

_FFT16:

//********************************** Prologue ***********************************

	mENTER
    mPUSHQ(xR31:28)
	mPUSHQ(xR27:24)
	mPUSHQ(yR31:28)
	mPUSHQ(yR27:24)

/************************************************************************/


/************************************** Start of code *****************************************/
// Pointer and parameter setup: copy the incoming arguments and the address of
// the external twiddle table into the J registers that the rest of the routine
// reads from.
// Each "jX = j31+src" is used here as a plain register move.
// NOTE(review): this relies on j31 reading as zero when used as an IALU
// operand - confirm against the TS201 programming reference.
// NOTE(review): the mapping j4=input, j5=int_buff, j6=N, j7=log2_N is taken
// from the inline comments below - confirm against the C prototype of _FFT16.
// j27 (used by mENTER as the pointer for saving CJMP) is overwritten here with
// N/2; mRETURN reloads j27:24 from the frame addressed through j26, so j26 and
// k26 must not be modified by the code that follows.
  j25 = j31+j6;;          //N
  j12 = j31+j7;;          //log2_N
  j27 = lshiftr j6;;      //N/2
  j28 = j31+j4;;          //input
  j29 = j31+j5;;          //int_buff
  j30 = j31+_twiddles;;   //address of the external twiddle table

/*******************************************/
//This part performs the Stages 1-2-3 including the bit reversal of the inputs


//the inputs are taken from input buffer (pointed by j0) and
//saved into int_buffer (pointed by k6, k7, k8, k9)

//                r31=WN64=-j used as twiddle factor
  j0 = j31 + j28; r31 = 0x80000000;;  //pointer to input buffer
  k6 = j29;;        //one of the pointers used to save data into int_buffer
  k0 = k6;;
  k4 = j27;;        //pointer used to compute the initial values of the pointers used
                    //to save data into int_buffer

//r30=N
  r30 = j25;;

//r29=WN3*32=exp(-3/4.pi.j)=-.707-j.707 as twiddle factor
  r29 = 0xa57ea57e; xr30 = ashift r30 by -5;; //xr30=N/32
  xr30 = dec r30;;//xr30=N/32-1

  j21 = xr30;;
  LC0 = j21;;   //LC0 = N/32-1

//j4=128, mask for bit rev
//               r30=WN32=exp(-1/4.pi.j)=.707-j.707  as twiddle factor
  j4  = j31+j27; R30 = 0xa57e5a82;;

//yr0 = x(00h)
//yr1 = x(01h)
//xr0 = x(02h)
//xr1 = x(03h)
//k0 reading is used only to compute a pointer used to save data into int_buff
  r1:0 = br q[j0+=j4]; r3:2 = br q[k0+=k4];;

//yr2 = x(80h)
//yr3 = x(81h)
//xr2 = x(82h)
//xr3 = x(83h)
  r3:2 = br q[j0+=j4]; k8 = k0;;//k8=int_buf+N/2

//yr4 = x(40h)
//yr5 = x(41h)
//xr4 = x(42h)
//xr5 = x(43h)
//k0 reading is used only to compute a pointer used to save data into int_buff
  r5:4 = br q[j0+=j4]; r7:6 = br q[k0+=k4];;

//stage 1 begins here
//-------------------

//yr6 = x(c0h)
//yr7 = x(c1h)
//xr6 = x(c2h)
//xr7 = x(c3h)         yr8=x(00h)+x(80h)=Xeeeeeee(0) yr9=x(00h)-x(80h)=Xeeeeeee(1)
//                     xr8=x(02h)+x(82h)=Xeoeeeee(0) xr9=x(02h)-x(82h)=Xeoeeeee(1)
  r7:6 = br q[j0+=j4]; sr8=r0+r2,sr9=r0-r2; k7 = k0;;//k7=int_buf+0x40

//yr16 = x(20h)
//yr17 = x(21h)
//xr16 = x(22h)
//xr17 = x(23h)          yr0=x(01h)+x(81h)=Xoeeeeee(0) yr1=x(01h)-x(81h)=Xoeeeeee(1)
//                       xr0=x(03h)+x(83h)=Xooeeeee(0) xr1=x(03h)-x(83h)=Xooeeeee(1)
//k0 reading is used only to compute a pointer used to save data into int_buff
  r17:16 = br q[j0+=j4]; sr0=r1+r3,sr1=r1-r3; r19:18 = br q[k0+=k4];;

//yr18 = x(a0h)
//yr19 = x(a1h)
//xr18 = x(a2h)
//xr19 = x(a3h)          yr2=x(40h)+x(c0h)=Xeeeeeeo(0) yr3=x(40h)-x(c0h)=Xeeeeeeo(1)
//                       xr2=x(42h)+x(c2h)=Xeoeeeeo(0) xr3=x(42h)-x(c2h)=Xeoeeeeo(1)
  r19:18 = br q[j0+=j4]; sr2=r4+r6,sr3=r4-r6; k9=k0;; //k9=int_buf+0xC0

//yr20 = x(60h)
//yr21 = x(61h)
//xr20 = x(62h)
//xr21 = x(63h)          yr4=x(41h)+x(c1h)=Xoeeeeeo(0) yr5=x(41h)-x(c1h)=Xoeeeeeo(1)
//                       xr4=x(43h)+x(c3h)=Xooeeeeo(0) xr5=x(43h)-x(c3h)=Xooeeeeo(1)
  r21:20 = br q[j0+=j4]; sr4=r5+r7,sr5=r5-r7;;

//yr22 = x(e0h)
//yr23 = x(e1h)
//xr22 = x(e2h)
//xr23 = x(e3h)          yr6=x(20h)+x(a0h)=Xeeeeeoe(0) yr7=x(20h)-x(a0h)=Xeeeeeoe(1)
//                       xr6=x(22h)+x(a2h)=Xeoeeeoe(0) xr7=x(22h)-x(a2h)=Xeoeeeoe(1)
  r23:22 = br q[j0+=j4]; mr1:0+=r3**r31(C); sr6=r16+r18,sr7=r16-r18;;

.align_code 4;
STG_1_2_3_LOOP:



//yr3=Xeeeeeeo(1)*(-j)  yr16=x(21h)+x(a1h)=Xoeeeeoe(0) yr17=x(21h)-x(a1h)=Xoeeeeoe(1)
//xr3=Xeoeeeeo(1)*(-j)  xr16=x(23h)+x(a3h)=Xooeeeoe(0) xr17=x(23h)-x(a3h)=Xooeeeoe(1)
  r3=mr1:0,mr1:0+=r5**r31(C); sr16=r17+r19,sr17=r17-r19;;

//yr5=Xoeeeeeo(1)*(-j)  yr18=x(60h)+x(e0h)=Xeeeeeoo(0) yr19=x(60h)-x(e0h)=Xeeeeeoo(1)
//xr5=Xooeeeeo(1)*(-j)  xr18=x(62h)+x(e2h)=Xeoeeeoo(0) xr19=x(62h)-x(e2h)=Xeoeeeoo(1)
//the multiplication repeated only to have the transfer in r5
  r5=mr1:0,mr1:0+=r5**r31(C); sr18=r20+r22,sr19=r20-r22;;

//                       yr20=x(61h)+x(e1h)=Xoeeeeoo(0) yr21=x(61h)-x(e1h)=Xoeeeeoo(1)
//                       xr20=x(63h)+x(e3h)=Xooeeeoo(0) xr21=x(63h)-x(e3h)=Xooeeeoo(1)
                                sr20=r21+r23,sr21=r21-r23;;

//stage 2 begins here
//-------------------

//                            yr9:8=Xeeeeee(1) Xeeeeee(0) yr3:2=Xeeeeee(3) Xeeeeee(2)
//                            xr9:8=Xeoeeee(1) Xeoeeee(0) xr3:2=Xeoeeee(3) Xeoeeee(2)
             mr1:0+=r19**r31(C); sr9:8=r9:8+r3:2,sr3:2=r9:8-r3:2;;

//yr19=Xeeeeeoo(1)*(-j)        yr1:0=Xoeeeee(1) Xoeeeee(0) yr5:4=Xoeeeee(3) Xoeeeee(2)
//xr19=Xeoeeeoo(1)*(-j)        xr1:0=Xooeeee(1) Xooeeee(0) xr5:4=Xooeeee(3) Xooeeee(2)
  r19=mr1:0,mr1:0+=r21**r31(C); sr1:0=r1:0+r5:4,sr5:4=r1:0-r5:4;;

//the multiplication repeated only to have the transfer in r21
//stall                       yr7:6=Xeeeeeo(1) Xeeeeeo(0) yr19:18=Xeeeeeo(3) Xeeeeeo(2)
//yr21=Xoeeeeoo(1)*(-j)       xr7:6=Xeoeeeo(1) Xeoeeeo(0) xr19:18=Xeoeeeo(3) Xeoeeeo(2)
//xr21=Xooeeeoo(1)*(-j)
  r21=mr1:0,mr1:0+=r21**r31(C); sr7:6=r7:6+r19:18,sr19:18=r7:6-r19:18;;


//stall                       yr17:16=Xoeeeeo(1) Xoeeeeo(0) yr21:20=Xoeeeeo(3) Xoeeeeo(2)
//                            xr17:16=Xooeeeo(1) Xooeeeo(0) xr21:20=Xooeeeo(3) Xooeeeo(2)
           mr1:0+=r7**r30(C); sr17:16=r17:16+r21:20,sr21:20=r17:16-r21:20;;


//yr7=Xeeeeeo(1)*WN32
//xr7=Xeoeeeo(1)*WN32
  r7=mr1:0,mr1:0+=r18**r31(C);;

//yr18=Xeeeeeo(2)*(-j)
//xr18=Xeoeeeo(2)*(-j)
  r18=mr1:0,mr1:0+=r19**r29(C);;

//stage 3 begins here
//-------------------

//yr19=Xeeeeeo(3)*WN3*32       yr9:8=Xeeeee(1) Xeeeee(0) yr13:12=Xeeeee(5) Xeeeee(4)
//xr19=Xeoeeeo(3)*WN3*32       xr9:8=Xeoeee(1) Xeoeee(0) xr13:12=Xeoeee(5) Xeoeee(4)
  r19=mr1:0,mr1:0+=r17**r30(C); sr9:8=r9:8+r7:6,sr13:12=r9:8-r7:6;;

//yr17=Xoeeeeo(1)*WN32
//xr17=Xooeeeo(1)*WN32
  r17=mr1:0,mr1:0+=r20**r31(C);;

//yr20=Xoeeeeo(2)*(-j)         yr11:10=Xeeeee(3) Xeeeee(2) yr15:14=Xeeeee(7) Xeeeee(6)
//xr20=Xooeeeo(2)*(-j)         xr11:10=Xeoeee(3) Xeoeee(2) xr15:14=Xeoeee(7) Xeoeee(6)
  r20=mr1:0,mr1:0+=r21**r29(C); sr11:10=r3:2+r19:18,sr15:14=r3:2-r19:18;;

//yr0=x(10h)
//yr1=x(11h)
//xr0=x(12h)         yr21=Xoeeeeo(3)*WN3*32      yr17:16=Xoeeee(1) Xoeeee(0) yr25:24=Xoeeee(5) Xoeeee(4)
//xr1=x(13h)         xr21=Xooeeeo(3)*WN3*32      xr17:16=Xooeee(1) Xooeee(0) xr25:24=Xooeee(5) Xooeee(4)
//the multiplication repeated only to have the transfer in r21
  r1:0=br q[j0+=j4]; r21=mr1:0,mr1:0+=r21**r29(C); sr17:16=r1:0+r17:16,sr25:24=r1:0-r17:16; q[k6+=4]=yr11:8;;

//yr2=x(90h)
//yr3=x(91h)
//xr2=x(92h)        yr19:18=Xoeeee(3) Xoeeee(2) yr27:26=Xoeeee(7) Xoeeee(6)
//xr3=x(93h)        xr19:18=Xooeee(3) Xooeee(2) xr27:26=Xooeee(7) Xooeee(6)
//stall
  r3:2=br q[j0+=j4]; sr19:18=r5:4+r21:20,sr27:26=r5:4-r21:20; q[k7+=4]=xr11:8;;

//yr4=x(50h)
//yr5=x(51h)
//xr4=x(52h)
//xr5=x(53h)
  r5:4 = br q[j0+=j4];                                       q[k8+=4]=yr19:16;;

//yr6=x(d0h)
//yr7=x(d1h)
//xr6=x(d2h)
//xr7=x(d3h)
  r7:6 = br q[j0+=j4];                                       q[k9+=4]=xr19:16;;

//yr16=x(30h)
//yr17=x(31h)
//xr16=x(32h)
//xr17=x(33h)
  r17:16 = br q[j0+=j4];sr8=r0+r2,sr9=r0-r2;                 q[k8+=0x4]=yr27:24;;

//yr18=x(b0h)
//yr19=x(b1h)
//xr18=x(b2h)
//xr19=x(b3h)
  r19:18 = br q[j0+=j4]; sr0=r1+r3,sr1=r1-r3;                q[k9+=4]=xr27:24;;

//yr20=x(70h)
//yr21=x(71h)
//xr20=x(72h)
//xr21=x(73h)
  r21:20 = br q[j0+=j4];sr2=r4+r6,sr3=r4-r6;                 q[k6+=4]=yr15:12;;

//yr22=x(f0h)
//yr23=x(f1h)
//xr22=x(f2h)
//xr23=x(f3h)
  r23:22=br q[j0+=j4];sr4=r5+r7,sr5=r5-r7;                   q[k7+=4]=xr15:12;;

.align_code 4;
if NLC0E, jump STG_1_2_3_LOOP; mr1:0+=r3**r31(C); sr6=r16+r18,sr7=r16-r18;;

//The loop above computes outputs from inputs that were fetched one iteration earlier.
//Executing this loop N/32 times would have caused non-aligned
//quad reads because of the bit-reversed addressing.
//Therefore, the loop is executed N/32-1 times, and the following computations
//basically repeat the loop body without the fetches.

  r3=mr1:0,mr1:0+=r5**r31(C); sr16=r17+r19,sr17=r17-r19;;
  r5=mr1:0,mr1:0+=r5**r31(C); sr18=r20+r22,sr19=r20-r22;;
                               sr20=r21+r23,sr21=r21-r23;;
           mr1:0+=r19**r31(C); sr9:8=r9:8+r3:2,sr3:2=r9:8-r3:2;;
  r19=mr1:0,mr1:0+=r21**r31(C); sr1:0=r1:0+r5:4,sr5:4=r1:0-r5:4;;
  r21=mr1:0,mr1:0+=r21**r31(C); sr7:6=r7:6+r19:18,sr19:18=r7:6-r19:18;;
           mr1:0+=r7**r30(C); sr17:16=r17:16+r21:20,sr21:20=r17:16-r21:20;;
  r7=mr1:0,mr1:0+=r18**r31(C);;
  r18=mr1:0,mr1:0+=r19**r29(C);;
  r19=mr1:0,mr1:0+=r17**r30(C); sr9:8=r9:8+r7:6,sr13:12=r9:8-r7:6;;
  r17=mr1:0,mr1:0+=r20**r31(C);;
  r20=mr1:0,mr1:0+=r21**r29(C); sr11:10=r3:2+r19:18,sr15:14=r3:2-r19:18;;
  r21=mr1:0,mr1:0+=r21**r29(C); sr17:16=r1:0+r17:16,sr25:24=r1:0-r17:16; q[k6+=4]=yr11:8;;
                                sr19:18=r5:4+r21:20,sr27:26=r5:4-r21:20; q[k7+=4]=xr11:8;;
                                                                         q[k8+=4]=yr19:16;;
                                                                         q[k9+=4]=xr19:16;;
                                                                         q[k8+=0x4]=yr27:24;;
                                                                         q[k9+=4]=xr27:24;;
                                                                         q[k6+=4]=yr15:12;;
                                                                         q[k7+=4]=xr15:12;;

/*******************************************************************************/
/* The following is the loop for stages 4 and up. Note that two sets of
pointers are used to load (j0 and strides in JALU), and to store (k0 and strides
in KALU). The inputs are taken from int_buffer and saved into input buffer*/

/* Compute strides.
Bfly, group and twiddle strides are set initially to their stage 3 values,
so that on loop entry they are updated to their stage 4 values */

  k9 = j25;;          //k9=256
  j1 = j31+0x4;;      //bfly stride
  j2 = j31+0x8;;      //group stride
  j4 = j31+0x1;;      //bflies per group iteration count.
  k9 = ashiftr k9;;
  k9 = ashiftr k9;;
  k9 = ashiftr k9;;   //twiddle stride = N/8

  k6 = ashiftr k9;;
/* Compute iteration counts. */

  LC1 = j4;; //LC1=1

//  j5 = k9;;                 //j5=32, gps per stage iteration count.

  k6 = k6 - 1;;//k6=N/16-1

/* stages iteration count */

  j6 = j12 - 0x5;; //j6=3


/* j10 and k10 hold the pointers to the inputs buffer and internal buffer.
  Every stage loop iteration they are swapped. Initialized here: */

  j5 = k6;;//j5=16, gps per stage iteration count

  j10 = j31 + j28;; //j10 points to input buffer
  k10 = j29;;       //k10 points to int_buffer

//In the stage 4, BFLY_LOOP executed 1 time and GROUP_LOOP 8 times(32 Multiplications/time)
//In the stage 5, BFLY_LOOP executed 2 times and GROUP_LOOP 4 times(32 Multiplications/time)
//In the stage 6, BFLY_LOOP executed 4 times and GROUP_LOOP 2 times(32 Multiplications/time)


.align_code 4;
STAGE_LOOP:

//swap pointers
  j11 = j31+j10;; //j11=input
  j10 = k10;;     //j10=int_buff
  k10 = j11;;     //k10=input

  k8  = j30;;     //k8=Pointer to twiddles

  j5 = ashiftr j5;;// divide by two gps per stage counter. Stage 4: 8, ...

// bfly stride. Stage 4:  8; stg 5: 16,... Need two strides in j1 and j7
  j1 = j1 + j1;; //j1=8



  j7 = j1 + 0x4;; //j7=12
  k1 = j1;;       //k1=8
  k7 = j7;;       //k7=12

  j2 = j2 + j2;;  // grp stride. Stg 4: 16; stg 5: 32,...
  j3  = 0xfffffff8;;  // stride for next bfly after grp loop...
                      // ...Stg 4: 0; stg 5: 0,8; stg 6: 0,8,16 ...
  k2 = j2;;           //k2=16

// twiddles stride. Stg 4:  8; stg 5:  4,...
  k9 = ashiftr k9;;

.align_code 4;
BFLY_LOOP:

//Set pointer to buffer where inputs are loaded, and outputs stored
//k0=input, j0=int_buf
  j0 = j10+0; k0 = k10+0;;

// Advance j0 to next bfly after group loop
//j3=0
  j3 = j3+0x8;;
  j0 = j0+j3; k3 = j3;;

// load 8 twiddles
//                             yr4=WN0=exp(-0/256.pi.j)=1
                               yr22 =  [k8+=k9];;

//                             yr5=WN32=exp(-32/256.pi.j)=exp(-pi.j/8)
                               yr23 =  [k8+=k9];;

//                             xr4=WN64=exp(-64/256.pi.j)=exp(-pi.j/4)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -