📄 fftspx_h.asm
字号:
; ---------------------------------------------------------------------------
; Symbolic register aliases (".asg <register>, <symbol>").
; Only part of the alias table is visible here; symbols such as B_SP, A_i,
; B_stride or A_w0 that the code below uses are assigned elsewhere.
;
; Several symbols in this list map to the SAME physical register, so they
; must never be live at the same time in the schedule:
;   B4 : B_h2_0, B_p0          B1 : B_p00, B_p2        B3 : B_p1, B_p20
;   A6 : A_h2_0, A_p4          A0 : A_p5,  A_pa0       A3 : A_p7, A_p8, A_pa
;
; NOTE(review): the "new x[...]" index comments are carried over unchanged.
; A_p80 and A_pa0 both claim x[3*N/2 +i*2+1]; by analogy with the other
; pairs (p00/p20, p40/p60) one of them is presumably the "+i*2" element --
; confirm against the reference C code.
; ---------------------------------------------------------------------------
.asg B4, B_h2_0 ;index into data N/4
.asg A6, A_h2_0 ;index into data N/4
.asg B4, B_p0 ;intermediate butterfly calculation
.asg B3, B_p1 ;intermediate butterfly calculation
.asg B1, B_p00 ;new x[N +i*2 ]
.asg B1, B_p2 ;intermediate butterfly calculation
.asg B6, B_p3 ;intermediate butterfly calculation
.asg B3, B_p20 ;new x[N +i*2+1]
.asg A6, A_p4 ;intermediate butterfly calculation
.asg A0, A_p5 ;intermediate butterfly calculation
.asg A11, A_p40 ;new x[N/2 +i*2 ]
.asg A7, A_p6 ;intermediate butterfly calculation
.asg A3, A_p7 ;intermediate butterfly calculation
.asg A10, A_p60 ;new x[N/2 +i*2+1]
.asg A3, A_p8 ;intermediate butterfly calculation
.asg B12, B_p9 ;intermediate butterfly calculation
.asg A14, A_p80 ;new x[3*N/2 +i*2+1]
.asg A3, A_pa ;intermediate butterfly calculation
.asg B10, B_pb ;intermediate butterfly calculation
.asg A0, A_pa0 ;new x[3*N/2 +i*2+1]
.asg B2, B_return ;early return decision variable
* ============================ PIPE LOOP PROLOG ============================== *
; ---------------------------------------------------------------------------
; Function prolog: save context, disable interrupts, set up the first stage.
;
; * B_SP is copied to A_SP so the .D1 and .D2 units can each issue one store
;   per cycle. Values go to word slots 1..20 at negative offsets from the
;   stack pointer; the stack pointer itself is not adjusted in this code.
; * Slots 10..17 hold the symbolic arguments/working values (n_max, offset,
;   radix, brev, ptr_y, ptr_w, ptr_x, n); the "save Bxx/Axx" comments name
;   the register each symbol is presumably assigned to (the .asg lines for
;   them are not visible here).
; * CSR is read, saved in slot 18, and written back with bit 0 cleared
;   (AND with -2), which the symbol A_csr_no_gie identifies as the GIE bit.
; * The last packets start the first loop iteration early: tw_offset = 0,
;   stride = n >> 2, four LDDW loads of the first butterfly inputs, and
;   i = n as the down-counter.
;
; NOTE(review): fft_jmp is computed here with a multiplier of 12, while
; LOOP_WHILE uses 3 for A_fft_jmp; both comments say "3*stride". The value
; of 12 is only consumed through ADDAH (halfword-scaled) and tw_offset, so
; it looks like a pre-scaled form -- confirm before changing either.
; ---------------------------------------------------------------------------
MV .L1X B_SP, A_SP ;copy stack pointer
|| STW .D2T1 A15, *-B_SP[1] ;save A15
STW .D2T2 B14, *-B_SP[2] ;save B14
|| STW .D1T1 A14, *-A_SP[3] ;save A14
STW .D2T2 B13, *-B_SP[4] ;save B13
|| STW .D1T1 A13, *-A_SP[5] ;save A13
STW .D2T2 B12, *-B_SP[6] ;save B12
|| STW .D1T1 A12, *-A_SP[7] ;save A12
STW .D2T2 B11, *-B_SP[8] ;save B11
|| STW .D1T1 A11, *-A_SP[9] ;save A11
STW .D2T2 B_n_max, *-B_SP[10] ;save B10
|| STW .D1T1 A_offset, *-A_SP[11] ;save A10
STW .D2T2 B_radix, *-B_SP[12] ;save B8
|| STW .D1T1 A_brev, *-A_SP[13] ;save A8
|| MVC .S2 CSR, B_csr ;read control status register
STW .D2T2 B_ptr_y, *-B_SP[14] ;save B6
|| STW .D1T1 A_ptr_w, *-A_SP[15] ;save A6
|| AND .L1X B_csr, -2, A_csr_no_gie ;clear bit 0 (interrupt enable) in CSR copy
STW .D2T2 B_ptr_x, *-B_SP[16] ;save B4
|| STW .D1T1 A_n, *-A_SP[17] ;save A4
|| MVC .S2X A_csr_no_gie, CSR ;disable interrupts
|| MV .L1X B_csr, A_csr ;copy csr
; Last context store shares its packet with the first loop-setup instructions.
STW .D2T2 B3, *-B_SP[19] ;save B3
|| STW .D1T1 A_csr, *-A_SP[18] ;save original CSR
|| ZERO .S1 A_tw_offset ;[2,0]tw_offset=0
|| MV .S2X A_n, B_stride ;[2,0]stride=n
|| MV .L2 B_ptr_x, B_x_ ;[6,0]x_ = ptr_x
; Preload the four butterfly inputs, each one stride apart (pre-increment on B_x_).
LDDW .D2T2 *B_ptr_x[0],B_xp1:B_xp0 ;[3,1]x[2*i], x[2*i+1]
|| MV .L2 B_ptr_x, B_x ;[3,1]x = ptr_x
|| SHRU .S2 B_stride, 2, B_stride ;[6,0]stride>>=2
LDDW .D2T1 *++B_x_[B_stride],A_xh2p1:A_xh2p0;[4,1]X[i+N/4]
LDDW .D2T2 *++B_x_[B_stride],B_xl1p1:B_xl1p0;[5,1]X[i+N/2]
|| MPYSU .M1X 12, B_stride, A_fft_jmp ;[4,0]fft_jmp =3*stride
LDDW .D2T1 *++B_x_[B_stride],A_xl2p1:A_xl2p0;[6,1]X[3N/4+i]
|| ADDAH .D1 A_ptr_w, A_tw_offset,A_w0 ;[5,0]w=ptr_w+tw_offset
MPYSU .M2 12, B_stride, B_fft_jmp ;[7,1]fft_jmp = 3*stride
|| SUB .S1 A_n, 0, A_i ;[6,0]for(i=0; i < n;){
|| ADD .L1 A_tw_offset,A_fft_jmp,A_tw_offset;[6,0]tw_offset+=fft_jmp
; Updated tw_offset is parked in slot 20 (the epilog reloads it from there).
STW .D2T1 A_tw_offset, *-B_SP[20] ;save tw_offset
; ---------------------------------------------------------------------------
; LOOP_WHILE: per-stage entry point and software-pipeline prolog.
;
; Runs the early cycles of the first butterfly (the [10..20,1] instructions)
; and begins the second one ([1..7,2]); j is reset to 0 for the stage.
;
; "Prolog collapse": instead of a separate copy of the remaining prolog
; cycles, six back-to-back branches (one per execute packet) each enter one
; kernel packet in turn: PREF8, PREF9, PREF10, PREF11, PREF12, and finally
; LOOP_FOR_A itself. The "+ 8" / "+ 4" byte offsets match the number of
; 4-byte STW instructions that lead the target packet (two at PREF8, one at
; PREF9/PREF11/PREF12, none at PREF10), so those stores -- which write
; results of an iteration that has not run yet -- are presumably skipped on
; this first pass. Reordering or resizing the kernel packets breaks these
; offsets.
;
; Comments tagged [c,k] give the pipeline cycle c of loop iteration k.
; ---------------------------------------------------------------------------
LOOP_WHILE:
MPYSU .M1X 3, B_stride, A_fft_jmp ;[1,1]fft_jmp=3*stride
ADDSP .L2 B_xp1, B_xl1p1, B_xh1 ;[10,1]xh1=x[1]+x[l1+1]
|| ZERO .D1 A_j ;[6,0]j=0
ADDSP .L1 A_xh2p1, A_xl2p1, A_xh21 ;[11,1]xh21=x[h2+1]+x[l2+1]
|| ADDSP .L2 B_xp0, B_xl1p0, B_xh0 ;[11,1]xh0=x[0]+x[l1]
; predj_ = fft_jmp - j, then predj = predj_ - 3; predj == 0 triggers the
; x += fft_jmp jump below.
SUB .S1 A_fft_jmp, A_j, A_prj_ ;[12,1]predj=(j-fft_jmp)
|| SUBSP .L1 A_xh2p0, A_xl2p0, A_xl20 ;[12,1]xl20=x[h2]-x[l2]
|| SUBSP .L2 B_xp1, B_xl1p1, B_xl1 ;[12,1]xl1=x[1]-x[l1+1]
SUB .D1 A_prj_, 3, A_prj ;[13,1]predj = predj_
|| SUBSP .L1 A_xh2p1, A_xl2p1, A_xl21 ;[13,1]xl21=x[h2+1]-x[l2+1]
|| SUBSP .L2 B_xp0, B_xl1p0, B_xl0 ;[13,1]xl0=x[0]-x[l1]
; First twiddle pair is loaded from w0[j]; w0 is then bumped by 8 bytes so
; the next LDDW with the same index fetches the following doubleword.
ADDSP .L1 A_xh2p0, A_xl2p0, A_xh20 ;[14,1]xh20=x[h2]+x[l2]
|| MV .S2 B_x, B_ptr_x0 ;[14,1]x = ptr_x0
|| LDDW .D1T1 *A_w0[A_j],A_si10:A_co10 ;[14,1]si1=w[j+1]co1=w[j]
|| ADD .S1 A_w0, 8, A_w0 ;[14,1]j+=1
||[!A_prj]ADDAH.D2 B_x, B_fft_jmp, B_x ;[14,1]if(!predj)x+=fft_jmp
|| MPYSU .M1X 3, B_stride, A_fft_jmp ;[1,2]fft_jmp = 3*stride
; From here on every packet also issues one branch of the prolog-collapse chain.
ADD .S2 B_x, 8, B_x ;[15,1]x+=2
|| LDDW .D1T2 *A_w0[A_j],B_si20:B_co20 ;[15,1]si2=w[j+3]co2=w[j+2]
|| B .S1 PREF8 + 8 ;prolog collapse
LDDW .D2T2 *B_x[0], B_xp1:B_xp0 ;[3,2]X[i]
|| MV .S2 B_x, B_x_ ;[3,2]x_ = x
|| B .S1 PREF9 + 4 ;prolog collapse
SUBSP .L2X B_xh1, A_xh21, B_yt0 ;[17,1]yt0=xh1-xh21
|| ADDSP .L1X A_xl21, B_xl0, A_xt1 ;[17,1]xt1=xl0+xl21
|| LDDW .D2T1 *++B_x_[B_stride],A_xh2p1:A_xh2p0;[4,2]X[N/4 + i]
|| B .S1 PREF10 ;prolog collapse
ADDSP .L1X A_xl20, B_xl1, A_yt2 ;[18,1]yt2=xl1+xl20
|| SUBSP .L2X B_xl1, A_xl20, B_yt1 ;[18,1]yt1=xl1-xl20
|| LDDW .D2T2 *++B_x_[B_stride],B_xl1p1:B_xl1p0;[ 5,2]x[N/2+i]
|| B .S1 PREF11 + 4 ;prolog collapse
SUBSP .L2X B_xh0, A_xh20, B_xt0 ;[19,1]xt0=xh0-xh20
|| LDDW .D2T1 *++B_x_[B_stride],A_xl2p1:A_xl2p0;[ 6,2]X[3N/4+i]
|| B .S1 PREF12 + 4 ;prolog collapse
SUBSP .L2X B_xl0, A_xl21, B_xt2 ;[20,1]xt2=xl0-xl21
|| MPYSU .M2 12, B_stride, B_fft_jmp ;[ 7,2] fft_jmp = 3*stride
|| B .S2 LOOP_FOR_A ;prolog collapse
* ============================ PIPE LOOP KERNEL ============================== *
; ---------------------------------------------------------------------------
; LOOP_FOR_A: software-pipelined radix-4 butterfly kernel.
;
; The kernel is 13 execute packets long (a line without a leading "||"
; starts a new packet). Comments tagged [c,k] give pipeline cycle c of
; iteration k; instructions of up to three iterations share one packet
; (e.g. [26,1], [13,2] and [1,3]), so one pass over the kernel advances
; three overlapping butterflies by 13 cycles each.
;
; Per butterfly: four LDDW loads of complex inputs one stride apart, the
; add/sub stage (xh*, xl*), the cross terms (xt*, yt*), three complex
; twiddle multiplies (p0..pb -> p00, p20, p40, p60, p80, pa0) and eight
; STW stores back into x.
;
; Loop control: A_i is decremented by 4 per pass and the branch is [A_i]
; predicated. Instructions belonging to later iterations are also [A_i]
; predicated while the stores of the oldest iteration are not, so once A_i
; reaches 0 the final pass only drains the last butterfly's stores.
;
; Twiddle addressing: all three LDDWs use the same index A_j; A_w0 is moved
; by +8, +8 and then -16 bytes around them so they fetch three consecutive
; doublewords, and A_j itself advances by 3 per butterfly. The "j += 1"
; comments on the A_w0 adjustments describe that effect, not the literal
; operation.
;
; The PREF8..PREF12 labels are entry points for the prolog-collapse branches
; in LOOP_WHILE, which jump a fixed number of bytes past the leading STW
; instruction(s) of each packet -- keep those STWs first and in place.
; ---------------------------------------------------------------------------
LOOP_FOR_A:
; Packet 1: undo the two +8 bumps of A_w0; store y0 to x[i].
ADD .S1 A_w0, -16, A_w0 ;[26,1] j += 1
|| STW .D2T2 B_y0, *B_ptr_x0[0] ;[26,1]write x[i]
|| SUB .D1 A_prj_, 3, A_prj ;[13,2]prj = prj_
|| SUBSP .L1 A_xh2p1, A_xl2p1, A_xl21 ;[13,2]xl21=x[h2+1]-x[l2+1]
|| SUBSP .L2 B_xp0, B_xl1p0, B_xl0 ;[13,2]xl0=x[0]-x[l1]
; Packet 2: h2_0 = 2*stride is the word index step used by the stores below.
MPYSU .M2 2, B_stride, B_h2_0 ;[27,1]h2_0=stride
|| SUBSP .L2 B_p2, B_p3, B_p20 ;[27,1]co20*yt0-si20*xt0
|| ADDSP .L1 A_xh2p0, A_xl2p0, A_xh20 ;[14,2]xh20=x[h2]+x[l2]
|| MV .S2 B_x, B_ptr_x0 ;[14,2]ptr_x0 = x
|| LDDW .D1T1 *A_w0[A_j],A_si10:A_co10 ;[14,2]si1=w[j+1]co1=w[j]
|| ADD .S1 A_w0, 8, A_w0 ;[14,2]j += 1
||[!A_prj]ADDAH.D2 B_x, B_fft_jmp, B_x ;[14,2]if(!predj)x+=fft_jmp
|| MPYSU .M1X 3, B_stride, A_fft_jmp ;[ 1,3]fft_jmp = 3*stride
; Packet 3: A-side store pointer starts one word below ptr_x1 so that the
; pre-incremented stores interleave with the B-side ones.
ADD .S1X B_ptr_x1, -4, A_ptr_x1 ;[28,1]ptr_x1 -= 1
|| MPYSP .M1 A_co30, A_yt2, A_pa ;[28,1]pa = co30*yt2
|| MPYSP .M2X A_si30, B_xt2, B_pb ;[28,1]pb = si30*xt2
|| SUBSP .L1 A_p6, A_p7, A_p60 ;[28,1]co10*yt1-si10*xt1
|| ADDSP .L2 B_p0, B_p1, B_p00 ;[28,1]si20*yt0+co20*xt0
|| ADD .S2 B_x, 8, B_x ;[15,2]x +=2;
|| LDDW .D1T2 *A_w0[A_j],B_si20:B_co20 ;[15,2]si2=w[j+3]co2=w[j+2]
; Packet 4.
MPYSP .M2X B_xt2, A_co30, B_p9 ;[29,1]p9=xt2*co30
|| MPYSP .M1 A_yt2, A_si30, A_p8 ;[29,1]p8=yt2*si30
|| ADDSP .L1 A_p4, A_p5, A_p40 ;[29,1](si10*yt1+co10*xt1)
|| MV .S1X B_h2_0, A_h2_0 ;[29,1]copy h2_0
|| LDDW .D2T2 *B_x[0], B_xp1:B_xp0 ;[ 3,3]X[i]
|| MV .S2 B_x, B_x_ ;[ 3,3]x_ = x
; Packets 5-7: cross terms of iteration k, input loads of iteration k+1.
SUBSP .L2X B_xh1, A_xh21, B_yt0 ;[17,2]yt0=xh1-xh21
|| ADDSP .L1X A_xl21, B_xl0, A_xt1 ;[17,2]xt1=xl0+xl21
|| LDDW .D2T1 *++B_x_[B_stride],A_xh2p1:A_xh2p0;[ 4,3]X[N/4+i]
ADDSP .L1X A_xl20, B_xl1, A_yt2 ;[18,2]yt2=xl1+xl20
|| SUBSP .L2X B_xl1, A_xl20, B_yt1 ;[18,2]yt1=xl1-xl20
|| LDDW .D2T2 *++B_x_[B_stride],B_xl1p1:B_xl1p0;[ 5,3]x[N/2+i]
; The counter runs downward: A_i -= 4 (the "i+=4" comment is the C view).
SUB .D1 A_i, 4, A_i ;[32,1]i+=4
|| SUBSP .L1X A_pa, B_pb, A_pa0 ;[32,1]co30*yt2-si30*xt2
|| SUBSP .L2X B_xh0, A_xh20, B_xt0 ;[19,2]xt0=xh0-xh20
|| LDDW .D2T1 *++B_x_[B_stride],A_xl2p1:A_xl2p0;[ 6,3]x[3N/4+i]
; Packet 8: loop-back branch; the five packets PREF8..PREF12 that follow
; still execute before it takes effect.
[ A_i]B .S2 LOOP_FOR_A ;[33,1]}/* end for */
|| ADDSP .L1X B_p9, A_p8, A_p80 ;[33,1]si30*yt2+co30*xt2
|| STW .D1T2 B_p00, *++A_ptr_x1[A_h2_0] ;[33,1]save x[N/4+i]
||[ A_i]SUBSP .L2X B_xl0, A_xl21, B_xt2 ;[20,2]xt2=xl0-xl21
||[ A_i]MPYSU .M2 12, B_stride, B_fft_jmp ;[ 7,3]fft_jmp = 3*stride
; Packet 9 (prolog entry at PREF8 + 8 skips the two stores).
PREF8:
STW .D2T2 B_p20, *++B_ptr_x1[B_h2_0] ;[34,1]save x[N/4+i+1]
|| STW .D1T1 A_p40, *++A_ptr_x1[A_h2_0] ;[34,1]save x[N/2+i]
||[ A_i]MPYSP .M2 B_co20, B_yt0, B_p2 ;[21,2]p2=co20*yt0
||[ A_i]ADDSP .L2X B_xh1, A_xh21, B_y1 ;[21,2]y1=xh1+xh21
||[ A_i]MPYSP .M1 A_co10, A_xt1, A_p5 ;[21,2]p5 = co10*xt1
; Packet 10 (prolog entry at PREF9 + 4 skips the store).
PREF9:
STW .D2T1 A_p60, *++B_ptr_x1[B_h2_0] ;[35,1]x[3*N/4+i]
||[ A_i]ADD .D1 A_w0, 8, A_w0 ;[22,2]j += 1
||[ A_i]ADDSP .L2X B_xh0, A_xh20, B_y0 ;[22,2]y0=xh0+xh20
||[ A_i]MPYSP .M2 B_si20, B_yt0, B_p0 ;[22,2]p0=si20*yt0
||[ A_i]MPYSP .M1X A_si10, B_yt1, A_p4 ;[22,2]p4=si10*yt1
; Packet 11 (no store; prolog enters at PREF10 directly). Third twiddle
; load; A_j advances by 3 in parallel with it.
PREF10:
[ A_i]MPYSP .M2 B_si20, B_xt0, B_p3 ;[23,2]p3 = si20*xt0
||[ A_i]ADD .S1 A_j, 3, A_j ;[23,2]j += 1
||[ A_i]LDDW .D1T1 *A_w0[A_j],A_si30:A_co30 ;[23,2]si3=w[j+5]co3=w[j+4]
||[ A_i]MPYSP .M1X A_co10, B_yt1, A_p6 ;[23,2]p6 = co10*yt1
||[ A_i]ADDSP .L2 B_xp1, B_xl1p1, B_xh1 ;[10,3]xh1=x[1]+x[l1+1]
; Packet 12 (prolog entry at PREF11 + 4 skips the store). j is reset when
; predj reached 0, i.e. together with the x += fft_jmp jump.
PREF11:
STW .D2T1 A_pa0, *++B_ptr_x1[B_h2_0] ;[37,1]save x[3*N/4+i+1]
||[ A_i]MPYSP .M2 B_co20, B_xt0, B_p1 ;[24,2]p1=co20*xt0
||[!A_prj]ZERO.S1 A_j ;[24,2]if(!predj)j = 0
||[ A_i]MPYSP .M1 A_si10, A_xt1, A_p7 ;[24,2]p7=si10*xt1
||[ A_i]ADDSP .L1 A_xh2p1, A_xl2p1, A_xh21 ;[11,3]xh21=x[h2p1]+x[l2p1]
||[ A_i]ADDSP .L2 B_xp0, B_xl1p0, B_xh0 ;[11,3]xh0=x[0]+x[l1]
; Packet 13 (prolog entry at PREF12 + 4 skips the store).
PREF12:
STW .D1T1 A_p80, *++A_ptr_x1[A_h2_0] ;[38,1]save x[3*N/4+i]
||[ A_i]ADD .S2 B_ptr_x0, 4, B_ptr_x1 ;[25,2]ptr_x1 = ptr_x + 1
||[ A_i]STW .D2T2 B_y1, *B_ptr_x0[1] ;[25,2]save x[i+1]
||[ A_i]SUB .S1 A_fft_jmp, A_j, A_prj_ ;[12,3]predj=j-fft_jmp
||[ A_i]SUBSP .L1 A_xh2p0, A_xl2p0, A_xl20 ;[12,3]xl20=x[h2]-x[l2]
||[ A_i]SUBSP .L2 B_xp1, B_xl1p1, B_xl1 ;[12,3]xl1=x[1]-x[l1+1]
* ============================ PIPE LOOP EPILOG ============================== *
; Reached when the kernel's loop-back branch is not taken. Reloads the
; per-stage working values from the stack slots the prolog wrote them to
; (12 = radix, 15 = ptr_w, 16 = ptr_x, 17 = n, 20 = tw_offset).
; Only the first packets of the epilog are visible here; what it does with
; the reloaded values is not shown.
MV .L1X B_SP, A_SP ;copy stack pntr
|| LDW .D2T2 *-B_SP[12],B_radix ;restore radix (slot 12)
LDW .D2T2 *-B_SP[16],B_ptr_x ;restore ptr_x
LDW .D1T1 *-A_SP[17],A_n ;restore A_n
LDW .D1T1 *-A_SP[20],A_tw_offset ;restore tw_offset
LDW .D1T1 *-A_SP[15],A_ptr_w ;restore ptr_w
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -