// fft_fxp16.asm
// From "TigerSharc TS201 32-bit floating point F" · assembly code · 739 lines total · page 1 of 2
// ASM
// 739 lines
xr22 = [k8+=k9];;
//reset gps cntr
k0 = k0+k3; LC0 = j5;;//k0=input, LC0=16
// xr23=WN96=exp(-96/256.pi.j)=exp(-3/8.pi.j)
xr23 = [k8+=k9];;
// yr24=WN128=exp(-128/256.pi.j)=-j
yr24 = [k8+=k9];;
// yr25=WN160=exp(-160/256.pi.j)=exp(-5/8.pi.j)
yr25 = [k8+=k9];;
//yr1:0=Xeeeeo(5) Xeeeeo(4)
//xr1:0=Xeeeeo(7) Xeeeeo(6) xr24=WN192=exp(-192/256.pi.j)=exp(-3/4.pi.j)
r1:0=q[j0+j7]; xr24 = [k8+=k9];;
//yr3:2=Xeeeeo(1) Xeeeeo(0)
//xr3:2=Xeeeeo(3) Xeeeeo(2) xr25=WN224=exp(-224/256.pi.j)=exp(-7/8.pi.j)
r3:2=q[j0+j1]; xr25 = [k8+=k9];;
//yr5:4=Xeeeee(5) Xeeeee(4)
//xr5:4=Xeeeee(7) Xeeeee(6)
// ymr1:0=Xeeeeo(4)*(-j)
// xmr1:0=Xeeeeo(6)*exp(-3/4.pi.j)
r5:4 = q[j0+4]; MR1:0 += R0 ** R24 (C);;
//yr7:6=Xeeeee(1) Xeeeee(0)
//xr7:6=Xeeeee(3) Xeeeee(2)
// yr8=Xeeeeo(4)*(-j)
// xr8=Xeeeeo(6)*exp(-3/4.pi.j)
// ymr1:0=Xeeeeo(5)*exp(-5/8.pi.j)
// xmr1:0=Xeeeeo(7)*exp(-7/8.pi.j)
r7:6 = q[j0+=j2]; R8=MR1:0, MR1:0+=R1**R25(C);;
//begin reading the next 16 numbers Xeeeoe(7:0) and Xeeeoo(7:0)
//continue to process the data already fetched
r1:0=q[j0+j7]; R9=MR1:0, MR1:0+=R2**R22(C);;
r3:2=q[j0+j1]; R10=MR1:0, MR1:0+=R3**R23(C);;
// first butterfly of the first 16 fetches
r5:4=q[j0+4]; R11=MR1:0, MR1:0+=R0**R24(C); sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8;;
// begin saving the outputs of this stage
r17:16 = q[j0+=j2];r8=mr1:0,MR1:0+=R1**R25(C);q[k0+4]= r13:12;;
// GROUP_LOOP: software-pipelined inner loop of the current FFT stage.
// Every instruction line (terminated by ";;") packs up to four parallel slots:
//   - a quad-word load through j0 (r1:0 / r3:2 feed the multiplier,
//     r5:4 / r7:6 / r17:16 feed the add/subtract pairs directly),
//   - a transfer of the previous product out of MR1:0 together with a new
//     complex multiply (**) by one of the twiddle registers r22..r25,
//   - an add/subtract pair (the butterfly) on results from earlier lines,
//   - a quad-word store of finished outputs through k0.
// The body is unrolled twice; the two halves differ only in which register
// pair (r7:6 or r17:16) supplies the un-multiplied butterfly inputs.
// NOTE(review): the (C) option on the multiply-accumulates appears to be
// "clear MR before accumulating" on TigerSHARC, which is why dummy multiplies
// are harmless here - confirm against the programming reference.
.align_code 4;
GROUP_LOOP:
//begin reading the next 16 numbers Xeeoee(7:0) and Xeeoeo(7:0)
//begin multiplications for the second set of fetches
//perform the second butterfly of the first set of fetches
//continue saving the outputs of this stage
r1:0 = q[j0+j7]; r9=mr1:0,MR1:0+=R2**R22(C); sr19:18=r7:6+r11:10, sr21:20=r7:6-r11:10; q[k0+k7]= r15:14;;
r3:2=q[j0+j1]; r10=mr1:0,MR1:0+=R3**r23(C);q[k0+k1] = r21:20;;
//the first butterfly of the second set of fetches
//begin multiplications for the third set of fetches
//k0 is advanced by k2 with this store
r5:4 = q[j0+4]; r11=mr1:0,MR1:0+=R0**R24(C);sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8; q[k0+=k2] = r19:18;;
//j0 is advanced by j2 with this load
r7:6 = q[j0+=j2];r8=mr1:0,mr1:0+=R1**R25(C);q[k0+4]= r13:12;;
//begin reading the 4th set of fetches
//execute the second butterfly of the second set of fetches
//(this half uses r17:16, loaded by the last line of the previous pass)
r1:0 = q[j0+j7]; r9=mr1:0,MR1:0+=R2**R22(C); sr19:18=r17:16+r11:10, sr21:20=r17:16-r11:10;q[k0+k7]= r15:14;;
r3:2=q[j0+j1]; r10=mr1:0,MR1:0+=R3**r23(C); q[k0+k1] = r21:20;;
//last saving of the first set of outputs
r5:4=q[j0+4];R11=MR1:0, MR1:0+=R0**R24(C); sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8;q[k0+=k2] = r19:18;;
//loop back while loop counter 0 has not expired; the remaining slots of this
//line still belong to the loop body
.align_code 4;
if NLC0E, jump GROUP_LOOP; r17:16 = q[j0+=j2];r8=mr1:0,MR1:0+=R1**R25(C);q[k0+4]= r13:12;;
//because the loop has been executed N/32-1 times, now compute
//the last set of outputs. In this manner, unused fetches are avoided
//(drain: same sequence as the loop body with the loads removed)
r9=mr1:0,MR1:0+=R2**R22(C); sr19:18=r7:6+r11:10, sr21:20=r7:6-r11:10; q[k0+k7]= r15:14;;
r10=mr1:0,MR1:0+=R3**r23(C);q[k0+k1] = r21:20;;
//the multiplication on this line is a dummy: MR1:0 is not read again in the
//drain, so it only serves to move the last product from mr1:0 into r11
r11=mr1:0,MR1:0+=R3**R23(C);sr13:12=r5:4+r9:8,sr15:14=r5:4-r9:8; q[k0+=k2] = r19:18;;
q[k0+4]= r13:12;;
sr19:18=r17:16+r11:10, sr21:20=r17:16-r11:10;q[k0+k7]= r15:14;;
q[k0+k1] = r21:20;;
q[k0+=k2] = r19:18;;
//repeat for the next butterfly group while loop counter 1 has not expired
//(BFLY_LOOP is defined earlier in the file)
.align_code 4;
if NLC1E, jump BFLY_LOOP;;
// multiply by two bfly cntr. stg 4: 2; stg 5: 4,...
j4 = j4 + j4;; //j4=2
//j6 is the remaining-stage counter
//NOTE(review): presumably this decrement sets the J-ALU flag tested by NJEQ below - confirm
j6 = j6 - 0x1;; //j6=2
//the program passes at stage 5 in which it reads from input buffer and
//saves the results into int_buffer
//Then stage 6 is performed in which it reads from int_buffer and
//saves the results into input buffer
//LC1 is reloaded with the doubled butterfly count on the same line as the jump
.align_code 4;
if NJEQ, jump STAGE_LOOP; LC1 = j4;;//LC1=2, 4, etc
/*************************************************************/
final_2_stages:
// Final two stages (7 and 8)
// Both stages are computed in a single pass: Xoo/Xeo/Xoe/Xee are combined
// into Xo/Xe and these are combined into the final X outputs.
// This section swaps the buffer pointers, derives the N/4, N/2, 3N/4 offsets
// and the loop count, then primes the software pipeline for the LAST loop.
//the inputs are read from input buffer and saved into int_buffer
/*at this point, input buffer contains the following:
0 Xee(0:63)
1
...
63
64 Xeo(0:63)
65
...
127
128 Xoe(0:63)
129
...
191
192 Xoo(0:63)
193
...
255
*/
/*do a last swap of pointers*/
j11 = j31+j10;; //j11=int_buffer
j10 = k10;; //j10=input
k10 = j11;; //k10=int_buffer
/* In this loop, twiddle pointers are in JALU to be parallel with stores */
// NOTE(review): the twiddle pointers set up below are k8/k7/k9, not J registers,
// and the stores also go through k0 - the statement above looks stale; confirm
j2 = j25;; //j2=256
xr16 = j25;;//xr16=256
k8 = j30;; //k8=twiddles
k7 = j25;; //k7=N=256
k7 = ashiftr k7;;
k7 = ashiftr k7;; //k7=N/4=64
k7 = k8 + k7;;//k7=twiddles+64
k9 = k8;; //k9=twiddles
// Three twiddle read pointers are now live:
//   k8 - stepped by 4 (quad loads into r1:0),
//   k9 - stepped by 2 from the table start (long loads into r7/r17),
//   k7 - stepped by 2 from twiddles+64 (long loads into r6/r16/r22)
// Set pointer to buffer where inputs are loaded,
//and outputs stored
j0 = j10+0; k0 = k10+0;; //k0=int_buffer, j0=input
j2 = ashiftr j2;; //j2=N/2=128
j1 = ashiftr j2;; //j1=N/4=64
j3 = j1+j2;; //j3=3N/4=192
k1 = j1;; //k1=N/4=64
k2 = j2;; //k2=N/2=128
k3 = j3;; //k3=3N/4=192
//the loop is performed N/16-2 times
//(xr16 still holds N from above: shift right by 4, then subtract 2)
xr16 = ashift r16 by -4; xr17 = 2;;
xr16 = r16 - r17;;
j16 = xr16;;
LC0 = j16;; //LC0=N/16-2=14
// ---- pipeline priming for the LAST loop starts here ----
//yr0=WN0 xr0=WN2 yr2=Xoo(0)
//yr1=WN1 xr1=WN3 xr2=Xoo(1)
//from these 4 fetches, only WN0 and WN2 will be used.
//It is only a method to load WN0 and WN2 in r0
r1:0=q[k8+=4]; r2=l[j0+j3];;
//yr6=WN64 yr3=Xeo(0)
//xr6=WN65 xr3=Xeo(1)
r6 =l[k7+=2]; r3=l[j0+j1];;
//yr7=WN0 yr4=Xoe(0) ymr1:0=Xoo(0)*WN0
//xr7=WN1 xr4=Xoe(1) xmr1:0=Xoo(1)*WN2
r7 =l[k9+=2]; r4=l[j0+j2]; mr1:0+=r2**r0(C);;
//other 8 inputs are fetched now
//yr0=WN4 xr0=WN6 yr5=Xee(0) yr2=Xoo(0)*WN0 ymr1:0=Xeo(0)*WN0
//yr1=WN5 xr1=WN7 xr5=Xee(1) xr2=Xoo(1)*WN2 xmr1:0=Xeo(1)*WN2
//from these 4 fetches, only WN4 and WN6 will be used.
//It is only a method to load WN4 and WN6 in r0
r1:0=q[k8+=4]; r5=l[j0+=2];r2=mr1:0,mr1:0+=r3**r0(C);;
//yr16=WN66 yr12=Xoo(2) yr3=Xeo(0)*WN0
//xr16=WN67 xr12=Xoo(3) xr3=Xeo(1)*WN2
//the multiplication is performed again only to have the transfer from mr1:0 into r3
r16 =l[k7+=2];r12=l[j0+j3];r3=mr1:0,mr1:0+=r3**r0(C);;
//yr17=WN2 yr13=Xeo(2) yr8=Xo(0) yr9=Xo(64)
//xr17=WN3 xr13=Xeo(3) xr8=Xo(1) xr9=Xo(65)
r17 =l[k9+=2];r13=l[j0+j1];sr8=r4+r2,sr9=r4-r2;;
// yr14=Xoe(2) ymr1:0=Xoo(2)*WN4 yr10=Xe(0) yr11=Xe(64)
// xr14=Xoe(3) xmr1:0=Xoo(3)*WN6 xr10=Xe(1) xr11=Xe(65)
r14=l[j0+j2]; mr1:0+=r12**r0(C);sr10=r5+r3,sr11=r5-r3;;
//other 8 inputs are fetched now
//yr0=WN8 xr0=WN10 yr15=Xee(2) yr12=Xoo(2)*WN4 ymr1:0=Xeo(2)*WN4
//yr1=WN9 xr1=WN11 xr15=Xee(3) xr12=Xoo(3)*WN6 xmr1:0=Xeo(3)*WN6
//from these 4 fetches, only WN8 and WN10 will be used.
//It is only a method to load WN8 and WN10 in r0
r1:0=q[k8+=4];r15=l[j0+=2];r12=mr1:0,mr1:0+=r13**r0(C);;
//yr7=WN4 yr2=Xoo(4) yr13=Xeo(2)*WN4 ymr1:0=Xo(0)*WN0
//xr7=WN5 xr2=Xoo(5) xr13=Xeo(3)*WN6 xmr1:0=Xo(1)*WN1
//(per the values listed above, the multiply uses WN0/WN1, i.e. the r7
// contents from before this line's load of WN4/WN5)
r7 =l[k9+=2]; r2=l[j0+j3];r13=mr1:0,mr1:0+=r8**r7(C);;
//yr6=WN68 yr3=Xeo(4) yr8=Xo(0)*WN0 ymr1:0=Xo(64)*WN64
//xr6=WN69 xr3=Xeo(5) xr8=Xo(1)*WN1 xmr1:0=Xo(65)*WN65
// yr26=Xo(2) yr27=Xo(66)
// xr26=Xo(3) xr27=Xo(67)
//superseded variant kept for reference: it wrote Xo(2)/Xo(66) into r18/r19;
//the live instruction below uses r26/r27 instead
// r6 =l[k7+=2]; r3=l[j0+j1]; r8=mr1:0,mr1:0+=r9**r6(C);sr18=r14+r12,sr19=r14-r12;;
r6 =l[k7+=2]; r3=l[j0+j1]; r8=mr1:0,mr1:0+=r9**r6(C);sr26=r14+r12,sr27=r14-r12;;
// LAST: software-pipelined loop for the combined final two stages.
// Each pass consumes 8 new inputs per column (Xoo, Xeo, Xoe, Xee for two
// index pairs) and stores 16 outputs at offsets 0, k1, k2 and k3 from k0.
// Register comments give the contents for the first pass; later passes
// advance the indices accordingly.
.align_code 4;
LAST:
//yr22=WN70 yr4=Xoe(4) yr9=Xo(64)*WN64 ymr1:0=Xoo(4)*WN8
//xr22=WN71 xr4=Xoe(5) xr9=Xo(65)*WN65 xmr1:0=Xoo(5)*WN10
// yr20=Xe(2) yr21=Xe(66)
// xr20=Xe(3) xr21=Xe(67)
r22 =l[k7+=2];r4=l[j0+j2]; r9=mr1:0,mr1:0+=r2**r0(C);sr20=r15+r13,sr21=r15-r13;;
//other 8 inputs are fetched now
//yr0=WN12 xr0=WN14 yr5=Xee(4) yr2=Xoo(4)*WN8 ymr1:0=Xeo(4)*WN8
//yr1=WN13 xr1=WN15 xr5=Xee(5) xr2=Xoo(5)*WN10 xmr1:0=Xeo(5)*WN10
//from these 4 fetches, only WN12 and WN14 will be used.
//It is only a method to load WN12 and WN14 in r0
r1:0=q[k8+=4];r5=l[j0+=2]; r2=mr1:0,mr1:0+=r3**r0(C);;
//yr17=WN6 yr12=Xoo(6) yr3=Xeo(4)*WN8 ymr1:0=Xo(2)*WN2
//xr17=WN7 xr12=Xoo(7) xr3=Xeo(5)*WN10 xmr1:0=Xo(3)*WN3
// yr8=X(0) yr9=X(64) yr10=X(128) yr11=X(192)
// xr8=X(1) xr9=X(65) xr10=X(129) xr11=X(193)
//(per the values listed above, the multiply uses WN2/WN3, i.e. the r17
// contents from before this line's load of WN6/WN7)
r17 =l[k9+=2];r12=l[j0+j3];r3=mr1:0,mr1:0+=r26**r17(C);sr9:8=r11:10+r9:8,sr11:10=r11:10-r9:8;;
//the first 8 outputs start being saved here
// yr13=Xeo(6) yr18=Xo(2)*WN2 ymr1:0=Xo(66)*WN66
// xr13=Xeo(7) xr18=Xo(3)*WN3 xmr1:0=Xo(67)*WN67
// X(64), X(65) are saved
r13=l[j0+j1];r18=mr1:0,mr1:0+=r27**r16(C);l[k0+k1]=r9;;
// yr16=WN70 yr14=Xoe(6) yr19=Xo(66)*WN66 ymr1:0=Xoo(6)*WN12
// xr16=WN71 xr14=Xoe(7) xr19=Xo(67)*WN67 xmr1:0=Xoo(7)*WN14
// X(128), X(129) are saved
//r16 takes over the twiddle fetched into r22 at the top of the loop, so it is
//in place for the r27**r16 multiply of the next pass
r16=pass r22;r14=l[j0+j2];r19=mr1:0,mr1:0+=r12**r0(C); l[k0+k2]=r10;;
// yr15=Xee(6) yr12=Xoo(6)*WN12 ymr1:0=Xeo(6)*WN12
// xr15=Xee(7) xr12=Xoo(7)*WN14 xmr1:0=Xeo(7)*WN14
// X(192), X(193) are saved
r15=l[j0+=2];r12=mr1:0,mr1:0+=r13**r0(C); l[k0+k3]=r11;;
// yr13=Xeo(6)*WN12 yr18=X(2) yr19=X(66) yr20=X(130) yr21=X(194)
// xr13=Xeo(7)*WN14 xr18=X(3) xr19=X(67) xr20=X(131) xr21=X(195)
//the multiplication done only to move the content of mr1:0 into r13
// X(0),X(1) are saved
r13=mr1:0,mr1:0+=r13**r0(C);sr19:18=r21:20+r19:18,sr21:20=r21:20-r19:18;l[k0+=2]=r8;;
//other 8 inputs are fetched now
//yr0=WN16 xr0=WN18
//yr1=WN17 xr1=WN19
//from these 4 fetches, only WN16 and WN18 will be used.
//It is only a method to load WN16 and WN18 in r0
r1:0=q[k8+=4];;
//the next 8 outputs start being saved here
// yr8=Xo(4) yr9=Xo(68)
// xr8=Xo(5) xr9=Xo(69)
// X(66),X(67) are saved
sr8=r4+r2,sr9=r4-r2;l[k0+k1]=r19;;
// yr10=Xe(4) yr11=Xe(68)
// xr10=Xe(5) xr11=Xe(69)
// X(130),X(131) are saved
sr10=r5+r3,sr11=r5-r3;l[k0+k2]=r20;;
// yr2=Xoo(8) ymr1:0=Xo(4)*WN4
// xr2=Xoo(9) xmr1:0=Xo(5)*WN5 X(194),X(195) are saved
r2=l[j0+j3];mr1:0+=r8**r7(C);l[k0+k3]=r21;;
// yr3=Xeo(8) yr8=Xo(4)*WN4 ymr1:0=Xo(68)*WN68
// xr3=Xeo(9) xr8=Xo(5)*WN5 xmr1:0=Xo(69)*WN69
// yr26=Xo(6) yr27=Xo(70)
// xr26=Xo(7) xr27=Xo(71) X(2),X(3) are saved
r3=l[j0+j1];r8=mr1:0,mr1:0+=r9**r6(C);sr26=r14+r12,sr27=r14-r12;l[k0+=2]=r18;;
//yr6=WN(72)
//xr6=WN(73)
r6 =l[k7+=2];;
// yr7=WN8
// xr7=WN9
// (k9 has delivered WN0, WN2, WN4, WN6 so far, so this load is WN8/WN9,
//  which is what the Xo(8)*r7 multiply of the next pass needs)
//at this point, everything is in place to jump to LAST and repeat the loop
.align_code 4;
if NLC0E, jump LAST; r7 =l[k9+=2];;
//the loop is performed N/16-2 times. This is necessary to avoid fetches that are meaningless
//Now it is only a problem to repeat one more time the loop without any futile fetches
//(first half of the loop body, unchanged: these fetches are still needed)
r22 =l[k7+=2];r4=l[j0+j2]; r9=mr1:0,mr1:0+=r2**r0(C);sr20=r15+r13,sr21=r15-r13;;
r1:0=q[k8+=4];r5=l[j0+=2]; r2=mr1:0,mr1:0+=r3**r0(C);;
r17 =l[k9+=2];r12=l[j0+j3];r3=mr1:0,mr1:0+=r26**r17(C);sr9:8=r11:10+r9:8,sr11:10=r11:10-r9:8;;
r13=l[j0+j1];r18=mr1:0,mr1:0+=r27**r16(C);l[k0+k1]=r9;;
r16=pass r22;r14=l[j0+j2];r19=mr1:0,mr1:0+=r12**r0(C); l[k0+k2]=r10;;
r15=l[j0+=2];r12=mr1:0,mr1:0+=r13**r0(C); l[k0+k3]=r11;;
r13=mr1:0,mr1:0+=r13**r0(C);sr19:18=r21:20+r19:18,sr21:20=r21:20-r19:18;l[k0+=2]=r8;;
//from this point forward, the loop instructions are repeated without fetches
sr8=r4+r2,sr9=r4-r2;l[k0+k1]=r19;;
sr10=r5+r3,sr11=r5-r3;l[k0+k2]=r20;;
mr1:0+=r8**r7(C);l[k0+k3]=r21;;
r8=mr1:0,mr1:0+=r9**r6(C);sr26=r14+r12,sr27=r14-r12;l[k0+=2]=r18;;
//at this point one loop iteration has finished and we pass through it one more time
//executing only some of the mathematical operations that must still be performed
//the multiplication here is repeated only to move the content of mr1:0 into r9
r9=mr1:0,mr1:0+=r9**r6(C);sr20=r15+r13,sr21=r15-r13;;
mr1:0+=r26**r17(C);sr9:8=r11:10+r9:8,sr11:10=r11:10-r9:8;;
r18=mr1:0,mr1:0+=r27**r16(C);l[k0+k1]=r9;;
//the multiplication here is repeated only to move the content of mr1:0 into r19
r19=mr1:0,mr1:0+=r27**r16(C);l[k0+k2]=r10;;
l[k0+k3]=r11;;
sr19:18=r21:20+r19:18,sr21:20=r21:20-r19:18;l[k0+=2]=r8;;
//final four stores: the last outputs at offsets k1, k2, k3 and 0 from k0
l[k0+k1]=r19;;
l[k0+k2]=r20;;
l[k0+k3]=r21;;
l[k0+=2]=r18;;
//******************************** Epilogue **********************************
// End of the _FFT16 routine: pop four register quads (yR27:24, yR31:28,
// xR27:24, xR31:28) and return to the caller.
// NOTE(review): mPOPQ and mRETURN are macros defined outside this excerpt;
// presumably they undo matching pushes in the routine's prologue and the pop
// order mirrors the push order - confirm against the prologue.
_FFT16.end:
_FFTEpilogue:
mPOPQ(yR27:24)
mPOPQ(yR31:28)
mPOPQ(xR27:24)
mPOPQ(xR31:28)
mRETURN
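// Keyboard shortcuts
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -
// Show shortcuts: ?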