fft_fxp16.asm

来自「TigerSharc TS201 32-bit floating point F」· 汇编 代码 · 共 739 行 · 第 1/2 页

ASM
739
字号
                               xr22 =  [k8+=k9];;
//reset gps cntr
  k0 = k0+k3; LC0 = j5;;//k0=input, LC0=16

//                             xr23=WN96=exp(-96/256.pi.j)=exp(-3/8.pi.j)
                               xr23 =  [k8+=k9];;

//                             yr24=WN128=exp(-128/256.pi.j)=-j
                               yr24 =  [k8+=k9];;

//                             yr25=WN160=exp(-160/256.pi.j)=exp(-5/8.pi.j)
                               yr25 =  [k8+=k9];;

//yr1:0=Xeeeeo(5) Xeeeeo(4)
//xr1:0=Xeeeeo(7) Xeeeeo(6)    xr24=WN192=exp(-192/256.pi.j)=exp(-3/4.pi.j)
  r1:0=q[j0+j7];               xr24 =  [k8+=k9];;

//yr3:2=Xeeeeo(1) Xeeeeo(0)
//xr3:2=Xeeeeo(3) Xeeeeo(2)    xr25=WN224=exp(-224/256.pi.j)=exp(-7/8.pi.j)
  r3:2=q[j0+j1];               xr25 =  [k8+=k9];;

//yr5:4=Xeeeee(5) Xeeeee(4)
//xr5:4=Xeeeee(7) Xeeeee(6)
//                  ymr1:0=Xeeeeo(4)*(-j)
//                  xmr1:0=Xeeeeo(6)*exp(-3/4.pi.j)
  r5:4 = q[j0+4];  MR1:0 += R0  ** R24 (C);;

//yr7:6=Xeeeee(1) Xeeeee(0)
//xr7:6=Xeeeee(3) Xeeeee(2)
//                  yr8=Xeeeeo(4)*(-j)
//                  xr8=Xeeeeo(6)*exp(-3/4.pi.j)
//                            ymr1:0=Xeeeeo(5)*exp(-5/8.pi.j)
//                            xmr1:0=Xeeeeo(7)*exp(-7/8.pi.j)
  r7:6 = q[j0+=j2]; R8=MR1:0, MR1:0+=R1**R25(C);;

//begin reading the next 16 numbers Xeeeoe(7:0) and Xeeeoo(7:0)
//continue to process the data already fetched
  r1:0=q[j0+j7];    R9=MR1:0, MR1:0+=R2**R22(C);;

  r3:2=q[j0+j1];  R10=MR1:0, MR1:0+=R3**R23(C);;

//                                            first butterfly of the first 16 fetches
  r5:4=q[j0+4]; R11=MR1:0, MR1:0+=R0**R24(C); sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8;;

//                                              begin saving the outputs of this stage
  r17:16 = q[j0+=j2];r8=mr1:0,MR1:0+=R1**R25(C);q[k0+4]= r13:12;;

.align_code 4;
GROUP_LOOP:
// Steady state of the software-pipelined group loop.
// One pass through the 8 instruction lines below issues 8 quad loads through j0,
// 8 complex multiplies by the twiddles held in r22..r25, 4 add/subtract
// butterfly pairs and 8 quad stores through k0. j0 and k0 are each post-modified
// twice per pass (by j2 and k2 respectively).
// The "even" operand of the butterflies alternates between r7:6 (first half)
// and r17:16 (second half) so that a new load can overlap the previous butterfly.
// Each line reads back the product started on the previous line (rN=mr1:0) while
// starting the next one; the (C) option presumably clears MR first, so every
// product stands alone - confirm against the TigerSHARC programming reference.

//begin reading the next 16 numbers Xeeoee(7:0) and Xeeoeo(7:0)
//begin multiplications for the second set of fetches
//perform the second butterfly of the first set of fetches
//continue saving the outputs of this stage
  r1:0 = q[j0+j7]; r9=mr1:0,MR1:0+=R2**R22(C); sr19:18=r7:6+r11:10, sr21:20=r7:6-r11:10; q[k0+k7]= r15:14;;

  r3:2=q[j0+j1];  r10=mr1:0,MR1:0+=R3**r23(C);q[k0+k1] = r21:20;;
//the first butterfly of the second set of fetches
//begin multiplications for the third set of fetches
  r5:4 = q[j0+4]; r11=mr1:0,MR1:0+=R0**R24(C);sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8; q[k0+=k2] = r19:18;;

  r7:6 = q[j0+=j2];r8=mr1:0,mr1:0+=R1**R25(C);q[k0+4]= r13:12;;


//begin reading the 4th set of fetches
//execute the second butterfly of the second set of fetches
  r1:0 = q[j0+j7]; r9=mr1:0,MR1:0+=R2**R22(C); sr19:18=r17:16+r11:10, sr21:20=r17:16-r11:10;q[k0+k7]= r15:14;;

  r3:2=q[j0+j1]; r10=mr1:0,MR1:0+=R3**r23(C); q[k0+k1] = r21:20;;

//last saving of the first set of outputs
  r5:4=q[j0+4];R11=MR1:0, MR1:0+=R0**R24(C); sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8;q[k0+=k2] = r19:18;;

//loop back while loop counter 0 has not expired; the rest of this line is the
//last line of the loop body (load into r17:16, read back r8, store r13:12)
.align_code 4;
  if NLC0E, jump GROUP_LOOP; r17:16 = q[j0+=j2];r8=mr1:0,MR1:0+=R1**R25(C);q[k0+4]= r13:12;;


//because the loop has been executed N/32-1 times, now compute
//the last set of outputs. In this manner, unused fetches are avoided
//(the lines below repeat the loop body with all loads from j0 removed)
                r9=mr1:0,MR1:0+=R2**R22(C); sr19:18=r7:6+r11:10, sr21:20=r7:6-r11:10; q[k0+k7]= r15:14;;

               r10=mr1:0,MR1:0+=R3**r23(C);q[k0+k1] = r21:20;;

//the multiplication on this line only accompanies the transfer of mr1:0 into r11;
//its own product is never read back in the lines that follow
               r11=mr1:0,MR1:0+=R3**R23(C);sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8; q[k0+=k2] = r19:18;;

                                           q[k0+4]= r13:12;;

//last butterfly of this pass, using the r17:16 data loaded by the final loop line
                                           sr19:18=r17:16+r11:10, sr21:20=r17:16-r11:10;q[k0+k7]= r15:14;;

                                           q[k0+k1] = r21:20;;

                                           q[k0+=k2] = r19:18;;


.align_code 4;
  if NLC1E, jump BFLY_LOOP;;

// double the butterfly counter for the next stage (stage 4: 2, stage 5: 4, ...)
  j4 = j4 + j4;; //j4=2
  j6 = j6 - 0x1;; //j6=2


//the program then moves on to stage 5, which reads from the input buffer and
//saves its results into int_buffer.
//Stage 6 is performed next; it reads from int_buffer and
//saves its results back into the input buffer
.align_code 4;
  if NJEQ, jump STAGE_LOOP; LC1 = j4;;//LC1=2, 4, etc

/*************************************************************/
final_2_stages:
//  Final two stages (7 and 8)
//the inputs are read from input buffer and saved into  int_buffer
//
// This section swaps the buffer pointers, derives the N/4, N/2 and 3N/4 offsets
// and the twiddle pointers, loads LC0, and then primes the software pipeline
// that the loop at label LAST keeps running.

/*at this point, input buffer contains the following:

      0     Xee(0:63)
      1
      ...
      63
      64    Xeo(0:63)
      65
      ...
      127
      128   Xoe(0:63)
      129
      ...
      191
      192   Xoo(0:63)
      193
      ...
      255
*/

/*do a last swap of pointers*/
// j11 is used as the temporary for the swap; j31 is presumably the
// always-zero IALU operand, making the first line a plain copy of j10 - confirm
  j11 = j31+j10;; //j11=int_buffer
  j10 = k10;;     //j10=input
  k10 = j11;;     //k10=int_buffer

/* In this loop, twiddle pointers are in JALU to be parallel with stores */

  j2 = j25;; //j2=256

  xr16 = j25;;//xr16=256

  k8 = j30;;  //k8=twiddles

  k7 = j25;;  //k7=N=256
  k7 = ashiftr k7;;
  k7 = ashiftr k7;;  //k7=N/4=64
  k7 = k8 + k7;;//k7=twiddles+64
  k9 = k8;;     //k9=twiddles

// Three twiddle pointers are now in use:
//   k8 - read with quad loads, post-modified by 4 (feeds r1:0)
//   k9 - read with long loads, post-modified by 2 (feeds r7 / r17)
//   k7 - read with long loads, post-modified by 2, starts at twiddles+64
//        (feeds r6 / r16 / r22)

// Set pointer to buffer where inputs are loaded,
//and outputs stored
  j0 = j10+0; k0 = k10+0;; //k0=int_buffer, j0=input

  j2 = ashiftr j2;; //j2=N/2=128
  j1 = ashiftr j2;; //j1=N/4=64
  j3 = j1+j2;;      //j3=3N/4=192
  k1 = j1;;         //k1=N/4=64
  k2 = j2;;         //k2=N/2=128
  k3 = j3;;         //k3=3N/4=192

//the loop is performed N/16-2 times
  xr16 = ashift r16 by -4; xr17 = 2;;
  xr16 = r16 - r17;;
  j16 = xr16;;
  LC0 = j16;;      //LC0=N/16-2=14

// ---- pipeline prologue: the lines from here up to label LAST fetch and ----
// ---- partly process the first inputs so that the loop starts "full"    ----

//yr0=WN0 xr0=WN2 yr2=Xoo(0)
//yr1=WN1 xr1=WN3 xr2=Xoo(1)
//from these 4 fetches, only WN0 and WN2 will be used.
//It is only a method to load WN0 and WN2 in r0
  r1:0=q[k8+=4];  r2=l[j0+j3];;

//yr6=WN64       yr3=Xeo(0)
//xr6=WN65       xr3=Xeo(1)
  r6  =l[k7+=2]; r3=l[j0+j1];;

//yr7=WN0        yr4=Xoe(0)           ymr1:0=Xoo(0)*WN0
//xr7=WN1        xr4=Xoe(1)           xmr1:0=Xoo(1)*WN2
  r7  =l[k9+=2]; r4=l[j0+j2];         mr1:0+=r2**r0(C);;

//other 8 inputs are fetched now
//yr0=WN4 xr0=WN6 yr5=Xee(0) yr2=Xoo(0)*WN0 ymr1:0=Xeo(0)*WN0
//yr1=WN5 xr1=WN7 xr5=Xee(1) xr2=Xoo(1)*WN2 xmr1:0=Xeo(1)*WN2
//from these 4 fetches, only WN4 and WN6 will be used.
//It is only a method to load WN4 and WN6 in r0
  r1:0=q[k8+=4]; r5=l[j0+=2];r2=mr1:0,mr1:0+=r3**r0(C);;

//yr16=WN66     yr12=Xoo(2)  yr3=Xeo(0)*WN0
//xr16=WN67     xr12=Xoo(3)  xr3=Xeo(1)*WN2
//the multiplication is performed again only to have the transfer from mr1:0 into r3
  r16 =l[k7+=2];r12=l[j0+j3];r3=mr1:0,mr1:0+=r3**r0(C);;

//yr17=WN2      yr13=Xeo(2)  yr8=Xo(0) yr9=Xo(64)
//xr17=WN3      xr13=Xeo(3)  xr8=Xo(1) xr9=Xo(65)
  r17 =l[k9+=2];r13=l[j0+j1];sr8=r4+r2,sr9=r4-r2;;

//              yr14=Xoe(2)            ymr1:0=Xoo(2)*WN4 yr10=Xe(0) yr11=Xe(64)
//              xr14=Xoe(3)            xmr1:0=Xoo(3)*WN6 xr10=Xe(1) xr11=Xe(65)
                r14=l[j0+j2];          mr1:0+=r12**r0(C);sr10=r5+r3,sr11=r5-r3;;

//other 8 inputs are fetched now
//yr0=WN8 xr0=WN10 yr15=Xee(2) yr12=Xoo(2)*WN4 ymr1:0=Xeo(2)*WN4
//yr1=WN9 xr1=WN11 xr15=Xee(3) xr12=Xoo(3)*WN6 xmr1:0=Xeo(3)*WN6
//from these 4 fetches, only WN8 and WN10 will be used.
//It is only a method to load WN8 and WN10 in r0
  r1:0=q[k8+=4];r15=l[j0+=2];r12=mr1:0,mr1:0+=r13**r0(C);;

//yr7=WN4        yr2=Xoo(4)  yr13=Xeo(2)*WN4  ymr1:0=Xo(0)*WN0
//xr7=WN5        xr2=Xoo(5)  xr13=Xeo(3)*WN6  xmr1:0=Xo(1)*WN1
  r7  =l[k9+=2]; r2=l[j0+j3];r13=mr1:0,mr1:0+=r8**r7(C);;

//yr6=WN68       yr3=Xeo(4)   yr8=Xo(0)*WN0  ymr1:0=Xo(64)*WN64
//xr6=WN69       xr3=Xeo(5)   xr8=Xo(1)*WN1  xmr1:0=Xo(65)*WN65
//                                                      yr26=Xo(2)   yr27=Xo(66)
//                                                      xr26=Xo(3)   xr27=Xo(67)
//the commented-out line below is the earlier form that put these sums in r18/r19;
//the active line uses r26/r27 instead, which is what the loop at LAST multiplies
//  r6  =l[k7+=2]; r3=l[j0+j1]; r8=mr1:0,mr1:0+=r9**r6(C);sr18=r14+r12,sr19=r14-r12;;
  r6  =l[k7+=2]; r3=l[j0+j1]; r8=mr1:0,mr1:0+=r9**r6(C);sr26=r14+r12,sr27=r14-r12;;

.align_code 4;
LAST:
// Steady state of the software-pipelined loop for the final two stages.
// Inputs are read through j0 (offsets j1, j2, j3; post-modified by 2 twice per
// pass), outputs are written through k0 (offsets k1, k2, k3; post-modified by 2
// twice per pass), and twiddles are read through k7, k8 and k9.
// The index values quoted in the comments (WN70, Xoe(4), X(0), ...) are those of
// the first pass through the loop; later passes advance with the pointers.

//yr22=WN70     yr4=Xoe(4)   yr9=Xo(64)*WN64 ymr1:0=Xoo(4)*WN8
//xr22=WN71     xr4=Xoe(5)   xr9=Xo(65)*WN65 xmr1:0=Xoo(5)*WN10
//                                                     yr20=Xe(2)   yr21=Xe(66)
//                                                     xr20=Xe(3)   xr21=Xe(67)
  r22 =l[k7+=2];r4=l[j0+j2]; r9=mr1:0,mr1:0+=r2**r0(C);sr20=r15+r13,sr21=r15-r13;;

//other 8 inputs are fetched now
//yr0=WN12 xr0=WN14 yr5=Xee(4) yr2=Xoo(4)*WN8  ymr1:0=Xeo(4)*WN8
//yr1=WN13 xr1=WN15 xr5=Xee(5) xr2=Xoo(5)*WN10 xmr1:0=Xeo(5)*WN10
//from these 4 fetches, only WN12 and WN14 will be used.
//It is only a method to load WN12 and WN14 in r0
  r1:0=q[k8+=4];r5=l[j0+=2]; r2=mr1:0,mr1:0+=r3**r0(C);;

//yr17=WN6      yr12=Xoo(6)  yr3=Xeo(4)*WN8  ymr1:0=Xo(2)*WN2
//xr17=WN7      xr12=Xoo(7)  xr3=Xeo(5)*WN10 xmr1:0=Xo(3)*WN3
//                                                       yr8=X(0) yr9=X(64) yr10=X(128) yr11=X(192)
//                                                       xr8=X(1) xr9=X(65) xr10=X(129) xr11=X(193)
//the multiply on this line is meant to use the r17 value loaded on an earlier
//line (WN2 on the first pass), not the one being loaded here
  r17 =l[k9+=2];r12=l[j0+j3];r3=mr1:0,mr1:0+=r26**r17(C);sr9:8=r11:10+r9:8,sr11:10=r11:10-r9:8;;

//here begins to be saved the first 8 outputs
//              yr13=Xeo(6)  yr18=Xo(2)*WN2 ymr1:0=Xo(66)*WN66
//              xr13=Xeo(7)  xr18=Xo(3)*WN3 xmr1:0=Xo(67)*WN67
//                                                        X(64), X(65) are saved
                r13=l[j0+j1];r18=mr1:0,mr1:0+=r27**r16(C);l[k0+k1]=r9;;

// yr16=WN70    yr14=Xoe(6)  yr19=Xo(66)*WN66 ymr1:0=Xoo(6)*WN12
// xr16=WN71    xr14=Xoe(7)  xr19=Xo(67)*WN67 xmr1:0=Xoo(7)*WN14
//                                                        X(128), X(129) are saved
//r16 takes over the twiddle that was loaded into r22 on the first line of the loop
   r16=pass r22;r14=l[j0+j2];r19=mr1:0,mr1:0+=r12**r0(C); l[k0+k2]=r10;;

//              yr15=Xee(6)  yr12=Xoo(6)*WN12 ymr1:0=Xeo(6)*WN12
//              xr15=Xee(7)  xr12=Xoo(7)*WN14 xmr1:0=Xeo(7)*WN14
//                                                         X(192), X(193) are saved
                r15=l[j0+=2];r12=mr1:0,mr1:0+=r13**r0(C); l[k0+k3]=r11;;

//              yr13=Xeo(6)*WN12            yr18=X(2) yr19=X(66) yr20=X(130) yr21=X(194)
//              xr13=Xeo(7)*WN14            xr18=X(3) xr19=X(67) xr20=X(131) xr21=X(195)
//the multiplication done only to move the content of mr1:0 into r13
//                                                                                      X(0),X(1) are saved
                r13=mr1:0,mr1:0+=r13**r0(C);sr19:18=r21:20+r19:18,sr21:20=r21:20-r19:18;l[k0+=2]=r8;;

//other 8 inputs are fetched now
//yr0=WN16 xr0=WN18
//yr1=WN17 xr1=WN19
//from these 4 fetches, only WN16 and WN18 will be used.
//It is only a method to load WN16 and WN18 in r0
  r1:0=q[k8+=4];;

//the second group of outputs (X(66),X(67), X(130),X(131), ...) starts to be saved here
//                                    yr8=Xo(4) yr9=Xo(68)
//                                    xr8=Xo(5) xr9=Xo(69)
//                                                        X(66),X(67) are saved
                                      sr8=r4+r2,sr9=r4-r2;l[k0+k1]=r19;;

//                                   yr10=Xe(4) yr11=Xe(68)
//                                   xr10=Xe(5) xr11=Xe(69)
//                                                         X(130),X(131) are saved
                                     sr10=r5+r3,sr11=r5-r3;l[k0+k2]=r20;;

//               yr2=Xoo(8)  ymr1:0=Xo(4)*WN4
//               xr2=Xoo(9)  xmr1:0=Xo(5)*WN5 X(194),X(195) are saved
                 r2=l[j0+j3];mr1:0+=r8**r7(C);l[k0+k3]=r21;;

//               yr3=Xeo(8)  yr8=Xo(4)*WN4 ymr1:0=Xo(68)*WN68
//               xr3=Xeo(9)  xr8=Xo(5)*WN5 xmr1:0=Xo(69)*WN69
//                                                     yr26=Xo(6)   yr27=Xo(70)
//                                                     xr26=Xo(7)   xr27=Xo(71)  X(2),X(3) are saved
                 r3=l[j0+j1];r8=mr1:0,mr1:0+=r9**r6(C);sr26=r14+r12,sr27=r14-r12;l[k0+=2]=r18;;

//yr6=WN(72)
//xr6=WN(73)
  r6  =l[k7+=2];;

//                      yr7=WN16
//                      xr7=WN17
//at this point, everything is in place to jump to LAST and repeat the loop
//(the jump is taken while loop counter 0 has not expired)
.align_code 4;
  if NLC0E, jump LAST; r7  =l[k9+=2];;


//the loop is performed N/16-2 times. This is necessary to avoid fetches that are meaningless
//Now it is only a problem to repeat one more time the loop without any futile fetches
//(the next seven lines are identical to the first seven lines of the loop body)
  r22 =l[k7+=2];r4=l[j0+j2]; r9=mr1:0,mr1:0+=r2**r0(C);sr20=r15+r13,sr21=r15-r13;;

  r1:0=q[k8+=4];r5=l[j0+=2]; r2=mr1:0,mr1:0+=r3**r0(C);;

  r17 =l[k9+=2];r12=l[j0+j3];r3=mr1:0,mr1:0+=r26**r17(C);sr9:8=r11:10+r9:8,sr11:10=r11:10-r9:8;;

                r13=l[j0+j1];r18=mr1:0,mr1:0+=r27**r16(C);l[k0+k1]=r9;;

   r16=pass r22;r14=l[j0+j2];r19=mr1:0,mr1:0+=r12**r0(C); l[k0+k2]=r10;;

                r15=l[j0+=2];r12=mr1:0,mr1:0+=r13**r0(C); l[k0+k3]=r11;;

                r13=mr1:0,mr1:0+=r13**r0(C);sr19:18=r21:20+r19:18,sr21:20=r21:20-r19:18;l[k0+=2]=r8;;

//from this point forward, the loop instructions are repeated without fetches
                                      sr8=r4+r2,sr9=r4-r2;l[k0+k1]=r19;;

                                     sr10=r5+r3,sr11=r5-r3;l[k0+k2]=r20;;


                          mr1:0+=r8**r7(C);l[k0+k3]=r21;;

                 r8=mr1:0,mr1:0+=r9**r6(C);sr26=r14+r12,sr27=r14-r12;l[k0+=2]=r18;;


//at this point one loop iteration has finished and we pass through it one more time
//executing only some of the mathematical operations that must still be performed

//the multiplication here is repeated only to move the content of mr1:0 into r9
                 r9=mr1:0,mr1:0+=r9**r6(C);sr20=r15+r13,sr21=r15-r13;;

                          mr1:0+=r26**r17(C);sr9:8=r11:10+r9:8,sr11:10=r11:10-r9:8;;

                r18=mr1:0,mr1:0+=r27**r16(C);l[k0+k1]=r9;;

//the multiplication here is repeated only to move the content of mr1:0 into r19
                r19=mr1:0,mr1:0+=r27**r16(C);l[k0+k2]=r10;;

                                             l[k0+k3]=r11;;

//last butterfly of the transform; the remaining lines only store results
                          sr19:18=r21:20+r19:18,sr21:20=r21:20-r19:18;l[k0+=2]=r8;;

                                             l[k0+k1]=r19;;

                                             l[k0+k2]=r20;;

                                             l[k0+k3]=r21;;

                                             l[k0+=2]=r18;;

//******************************** Epilogue **********************************
// NOTE(review): the _FFT16.end label sits before the epilogue rather than after
// mRETURN - confirm this is the intended extent of _FFT16 for the tools.
_FFT16.end:
_FFTEpilogue:
// mPOPQ and mRETURN are macros defined outside this chunk. Presumably each mPOPQ
// restores one quad of compute-block registers saved by the routine's prologue,
// in the reverse of the order they were pushed - verify against the prologue.
	mPOPQ(yR27:24)
	mPOPQ(yR31:28)
 	mPOPQ(xR27:24)
    mPOPQ(xR31:28)

// return to the caller
	mRETURN

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?