📄 fft32x32.asm
字号:
; ---------------------------------------------------------------------------
; Entry setup (fragment: the execute packet that the first two lines belong
; to starts above this excerpt).
; Visible work: spill register pairs A11:A10, A13:A12, A15:A14, B11:B10 and
; B13:B12 to the stack frame, pick the radix (4, or 2 when bit 0 of B_radix2
; is set), and derive the initial data pointer, twiddle pointer, stride and
; B_fft_jmp values consumed by the SPLOOP that follows.
; Frame offsets: STDW indices are doubleword-scaled and STW indices are
; word-scaled, so equal index numbers on STDW and STW are different slots.
; NOTE(review): assumes A_SP and B_SP refer to the same frame - confirm.
; ---------------------------------------------------------------------------
|| MV .D1 A_ptr_x, A_x ; Restore A_ptr_x in A_x
|| STW .D2T2 B_n, *+B_SP[1] ; Store B_n in word slot 1 of the frame
STDW .D1T1 A11:A10, *+A_SP[5] ; Save A11,A10
|| STDW .D2T2 B13:B12, *+B_SP[6] ; Save B13,B12
|| AND .L2 B_radix2, 1, B_radix2 ; Keep only bit 0 of B_radix2 (used as a predicate below)
|| MV .L1 A_ptr_w, A_w0 ; Restore A_ptr_w in A_w0
|| MVK .S1 4, A_radix ; Default A_radix = 4
|| MVC .S2 B_i, RILC ; Write B_i to the RILC control register
STDW .D1T1 A13:A12, *+A_SP[7] ; Save A13,A12
|| STDW .D2T2 B11:B10, *+B_SP[4] ; Save B11,B10
||[B_radix2] MVK.S1 2, A_radix ; Bit 0 set: override A_radix with 2
|| MV .L2 B_n, B_stride ; B_stride starts out equal to B_n
|| ADD .L1 A_x, -16, A_x ; Back A_x up 16 bytes; the loop's first load pre-increments (*++A_x[2])
STDW .D2T1 A15:A14, *+B_SP[3] ; Save A15,A14
|| MVK .S2 1, B_wh ; B_wh = 1; B_wh predicates the SPLOOP that follows
|| MPYSU .M2 6, B_stride, B_fft_jmp ; B_fft_jmp = 6 * B_stride
|| ADDAH .D1 A_w0, A_tw_offset, A_w0 ; A_w0 += A_tw_offset, halfword-scaled by ADDAH
|| MV .L1 A_ptr_x, A11 ; A11 = A_ptr_x (A11 was spilled above); written to the frame in the loop prolog
; ---------------------------------------------------------------------------
; Software-pipelined butterfly loop: [B_wh] SPLOOP 12 ... SPKERNEL 0, 0.
;
; Per iteration, as visible below:
;   * loads 8 doublewords of input, four through A_x and four through B_x,
;     stepping by A_h2 / B_h2 doublewords between them;
;   * loads 6 doublewords of co/si values through A_w and B_w;
;   * combines the inputs with ADDSUB/ADD/SUB, multiplies the xt*/yt* terms
;     by the co/si values with SMPY32, and sums/differences the products;
;   * stores 8 doublewords of output through A_y and B_y with the same
;     A_h2 / B_h2 stepping.
; A_j / B_j advance by 6 per iteration; when they reach A_fft_jmp the
; counter is cleared and the input (A_x) / output (A_y) pointer is bumped by
; B_fft_jmp.
;
; The trailing ";[c,i]" tags look like [cycle, iteration] schedule
; annotations; the three STAGE banners group cycles 1-12, 13-24 and 25-35,
; matching the SPLOOP operand of 12.
; NOTE(review): instructions flagged "^" in an SPMASK packet are taken to be
; the masked ones (one-time setup folded into the loop prolog) - confirm
; against the C64x+ CPU guide.
; ---------------------------------------------------------------------------
[B_wh] SPLOOP 12
|| ZERO .L1 A_j ; A_j = 0
|| STW .D2T2 B_ret, *+B_SP[4] ; Save B_ret in word slot 4 of the frame
|| SHRU .S2 B_stride, 2, B_h2 ; B_h2 = B_stride >> 2
|| MV .L2 B_SP, B11 ; B11 = copy of B_SP (B11 was spilled earlier); used as a store base below
|| STW .D1T1 A_ptr_w, *+A_SP[0] ; Save A_ptr_w in word slot 0; reloaded at LOOP_WHILE
* =========================== STAGE 0 ============================ *
SPMASK
|| ADDAD .D1 A_w0, A_j, A_w ;[ 1,1]
||^ MV .S2X A_radix, B_radix ; (masked) B_radix = A_radix
||^ SHRU .S1X B_fft_jmp, 3, A_fft_jmp ; (masked) A_fft_jmp = B_fft_jmp >> 3
||^ STW .D2T2 B_ptr_y, *+B11[3] ; Save B_ptr_y
SPMASK
||^ MV .D1 B_h2, A_h2 ; (masked) A_h2 = B_h2
||^ SHL .S2 B_radix, 2, B_radix ; Adjust B_radix
SPMASK
|| MV .S2X A_x, B_x ;[ 3,1]
||^ STW .D1T1 A11, *+A_SP[2] ; (masked) Save A11 (= A_ptr_x) in word slot 2; reloaded at LOOP_WHILE
SPMASK
|| LDDW .D1T2 *++A_x[2], B_x_1:B_x_0 ;[ 4,1]
||^ SHRU .S2 B_stride, 2, B_stride ; (masked) B_stride >>= 2
||^ MV .L1 A_x, A_y ; (masked) A_y = A_x
; B_2h2 = B_h2 << 4: the byte distance covered by two B_h2-doubleword steps.
SHL .S2 B_h2, 4, B_2h2 ;[ 5,1]
|| LDDW .D1T2 *++A_x[A_h2], B_xh2_1i:B_xh2_0i ;[ 5,1]
|| LDDW .D2T1 *++B_x[3], A_x_3:A_x_2 ;[ 5,1]
SPMASK
|| LDDW .D1T2 *++A_x[A_h2], B_xl1_1i:B_xl1_0i ;[ 6,1]
|| LDDW .D2T1 *++B_x[B_h2], A_xh2_3i:A_xh2_2i ;[ 6,1]
||^ SHRU .S2 B_fft_jmp, 1, B_fft_jmp ; (masked) halve B_fft_jmp; shifted back left a few cycles later
||^ ZERO .L2 B_j ; (masked) B_j = 0
SPMASK
|| ADD .S1 6, A_j, A_j ;[ 7,1]
|| LDDW .D1T2 *A_x[A_h2], B_xl2_1i:B_xl2_0i ;[ 7,1]
|| LDDW .D2T1 *++B_x[B_h2], A_xl1_3i:A_xl1_2i ;[ 7,1]
||^ ADD .L1X A_tw_offset, B_fft_jmp, A_tw_offset ; (masked) A_tw_offset += the halved B_fft_jmp
; Rewind A_x by B_2h2 bytes, undoing the two *++A_x[A_h2] pre-increments.
SUB .S1X A_x, B_2h2, A_x ;[ 8,1]
|| LDDW .D2T1 *B_x[B_h2], A_xl2_3i:A_xl2_2i ;[ 8,1]
SPMASK
|| SUB .L1 A_j, A_fft_jmp, A_ifj ;[ 9,1]
||^ SHL .S2 B_fft_jmp, 1, B_fft_jmp ; (masked) shift B_fft_jmp back left by 1
MV .S2X A_w, B_w ;[10,1]
|| LDDW .D1T2 *A_w[0], B_co10:B_si10 ;[10,1]
; When A_ifj == 0 (A_j reached A_fft_jmp): bump A_x by B_fft_jmp, then clear A_j.
ADDSUB .L2 B_x_0, B_xl1_0i, B_xh0_0:B_xl0_0 ;[11,1]
||[!A_ifj] ADD .S1X A_x, B_fft_jmp, A_x ;[11,1]
|| LDDW .D2T1 *B_w[3], A_co11:A_si11 ;[11,1]
|| LDDW .D1T2 *A_w[2], B_co30:B_si30 ;[11,1]
ADDSUB .L1 A_x_2, A_xl1_2i, A_xh0_1:A_xl0_1 ;[12,1]
|| ADDSUB .L2 B_x_1, B_xl1_1i, B_xh1_0:B_xl1_0 ;[12,1]
||[!A_ifj] ZERO .S1 A_j ;[12,1]
|| LDDW .D2T1 *B_w[5], A_co31:A_si31 ;[12,1]
|| LDDW .D1T2 *A_w[4], B_co21:B_si21 ;[12,1]
* =========================== STAGE 1 ============================ *
; Remaining sum/difference terms, plus the SMPY32 products p0..p17 / pa..pf
; of the xt*/yt* terms with the loaded co/si values.
ADDSUB .L1 A_xh2_2i, A_xl2_2i, A_xh20_1:A_xl20_1;[13,1]
|| ADDSUB .L2 B_xh2_1i, B_xl2_1i, B_xh21_0:B_xl21_0;[13,1]
|| LDDW .D2T1 *B_w[1], A_co20:A_si20 ;[13,1]
ADD .D2 6, B_j, B_j ;[14,1]
|| ADDSUB .L2 B_xl0_0, B_xl21_0, B_xt1_0:B_xt2_0 ;[14,1]
|| ADDSUB .L1 A_xh2_3i, A_xl2_3i, A_xh21_1:A_xl21_1;[14,1]
ADDSUB .L2 B_xh2_0i, B_xl2_0i, B_xh20_0:B_xl20_0;[15,1]
|| ADDSUB .L1 A_x_3, A_xl1_3i, A_xh1_1:A_xl1_1 ;[15,1]
|| SMPY32 .M2 B_si10, B_xt1_0, B_p3 ;[15,1]
ADDSUB .L1 A_xl0_1, A_xl21_1, A_xt1_1:A_xt2_1 ;[16,1]
|| ADDSUB .L2 B_xl1_0, B_xl20_0, B_yt2_0:B_yt1_0 ;[16,1]
|| SUB .S1 A_xh1_1, A_xh21_1, A_yt0_1 ;[16,1]
|| SMPY32 .M2 B_co10, B_xt1_0, B_p1 ;[16,1]
ADDSUB .L1 A_xl1_1, A_xl20_1, A_yt2_1:A_yt1_1 ;[17,1]
|| SUB .L2 B_xh0_0, B_xh20_0, B_xt0_0 ;[17,1]
|| ADD .S1 A_xh0_1, A_xh20_1, A_x_2o ;[17,1]
|| SMPY32 .M1 A_co11, A_xt1_1, A_p5 ;[17,1]
|| SMPY32 .M2 B_co10, B_yt1_0, B_p2 ;[17,1]
SUB .S1 A_xh0_1, A_xh20_1, A_xt0_1 ;[18,1]
|| ADD .L2 B_xh0_0, B_xh20_0, B_x_0o ;[18,1]
|| SMPY32 .M1 A_si11, A_yt1_1, A_p4 ;[18,1]
|| SMPY32 .M2 B_si10, B_yt1_0, B_p0 ;[18,1]
ADD .S2 B_xh1_0, B_xh21_0, B_x_1o ;[19,1]
|| SMPY32 .M2 B_si30, B_xt2_0, B_p13 ;[19,1]
|| SMPY32 .M1 A_si11, A_xt1_1, A_p7 ;[19,1]
SUB .S2 B_xh1_0, B_xh21_0, B_yt0_0 ;[20,1]
|| SMPY32 .M1 A_si31, A_xt2_1, A_p17 ;[20,1]
|| SMPY32 .M2X A_co20, B_xt0_0, B_p9 ;[20,1]
SUB .S2 B_p2, B_p3, B_xh2_1o ;[21,1]
|| SMPY32 .M2X B_si21, A_xt0_1, B_pf ;[21,1]
|| SMPY32 .M1 A_co11, A_yt1_1, A_p6 ;[21,1]
ADD .L1 A_p4, A_p5, A_xh2_2o ;[22,1]
|| ADD .L2 B_p0, B_p1, B_xh2_0o ;[22,1]
|| ADD .S1 A_xh1_1, A_xh21_1, A_x_3o ;[22,1]
|| SMPY32 .M2 B_si30, B_yt2_0, B_p10 ;[22,1]
|| SMPY32 .M1X A_si20, B_yt0_0, A_p8 ;[22,1]
SMPY32 .M1 A_co31, A_yt2_1, A_p16 ;[23,1]
|| SMPY32 .M2X A_co20, B_yt0_0, B_pa ;[23,1]
; B_ifj = B_j - A_fft_jmp: the output-side twin of A_ifj.
SUB .S2X B_j, A_fft_jmp, B_ifj ;[24,1]
|| SMPY32 .M1 A_si31, A_yt2_1, A_p14 ;[24,1]
|| SMPY32 .M2 B_co30, B_xt2_0, B_p11 ;[24,1]
* =========================== STAGE 2 ============================ *
; Combine the products into the *o output registers and store them through
; A_y / B_y, using the same doubleword stepping as the loads.
[!B_ifj] ZERO .S2 B_j ;[25,1]
|| SUB .S1 A_p6, A_p7, A_xh2_3o ;[25,1]
|| SMPY32 .M2X B_co21, A_xt0_1, B_pd ;[25,1]
|| SMPY32 .M1X B_si21, A_yt0_1, A_pc ;[25,1]
SHL .S1 A_h2, 4, A_2h2 ;[26,1]
|| STDW .D1T2 B_x_1o:B_x_0o, *++A_y[2] ;[26,1]
|| MV .S2X A_y, B_y ;[26,1]
|| SMPY32 .M2 B_co30, B_yt2_0, B_p12 ;[26,1]
|| SMPY32 .M1X A_si20, B_xt0_0, A_pb ;[26,1]
STDW .D1T2 B_xh2_1o:B_xh2_0o, *++A_y[A_h2] ;[27,1]
|| STDW .D2T1 A_x_3o:A_x_2o, *++B_y[3] ;[27,1]
|| SUB .S1 A_p16, A_p17, A_xl2_3o ;[27,1]
|| SMPY32 .M1X B_co21, A_yt0_1, A_pe ;[27,1]
STDW .D2T1 A_xh2_3o:A_xh2_2o, *++B_y[B_h2] ;[28,1]
|| ADD .S2X A_p8, B_p9, B_xl1_0o ;[28,1]
|| SMPY32 .M1 A_co31, A_xt2_1, A_p15 ;[28,1]
; No instruction carries a [29,1] tag; this NOP fills that cycle.
NOP
SUB .S2 B_p12, B_p13, B_xl2_1o ;[30,1]
|| ADD .L1X A_pc, B_pd, A_xl1_2o ;[30,1]
SUB .L1X A_pe, B_pf, A_xl1_3o ;[31,1]
|| SUB .L2X B_pa, A_pb, B_xl1_1o ;[31,1]
STDW .D1T2 B_xl1_1o:B_xl1_0o, *++A_y[A_h2] ;[32,1]
|| ADD .L1 A_p14, A_p15, A_xl2_2o ;[32,1]
|| ADD .L2 B_p10, B_p11, B_xl2_0o ;[32,1]
; Rewind A_y by A_2h2 bytes, undoing the two *++A_y[A_h2] pre-increments.
SUB .S1 A_y, A_2h2, A_y ;[33,1]
|| STDW .D1T2 B_xl2_1o:B_xl2_0o, *A_y[A_h2] ;[33,1]
|| STDW .D2T1 A_xl1_3o:A_xl1_2o, *++B_y[B_h2] ;[33,1]
STDW .D2T1 A_xl2_3o:A_xl2_2o, *B_y[B_h2] ;[34,1]
; End of the loop body. When B_ifj == 0, bump the output pointer by B_fft_jmp.
SPKERNEL 0, 0
||[!B_ifj] ADD .L1X A_y, B_fft_jmp, A_y ;[35,1]
* =========================== END STAGE 2 ============================ *
; ---------------------------------------------------------------------------
; LOOP_WHILE: outer-loop turnaround between passes (fragment: the excerpt
; ends before this section does).
; Visible work: reload the twiddle base (frame word 0) and data base (frame
; word 2), snapshot the finishing pass's B_fft_jmp / h2 / A_y / B_y into
; *_old and *_old_2 registers, recompute B_h2, B_fft_jmp and A_fft_jmp from
; B_stride, and re-issue the late-stage stores (tagged [26,1]..[32,1]) using
; those *_old copies while the next pass is being set up.
; NOTE(review): SPMASKR is assumed to be the SPMASK form tied to the SPLOOP
; reload - confirm exact semantics against the C64x+ CPU guide.
; ---------------------------------------------------------------------------
LOOP_WHILE:
* ============ STAGES I,II (epilog) + Outer Loop ======================= *
NOP 4
LDW .D1T1 *+A_SP[0], A_w0 ; Restore A_ptr_w in A_w0
LDW .D1T1 *+A_SP[2], A_x ; Restore A_ptr_x in A_x
ZERO .D1 A_j ; A_j = 0 for the next pass
NOP
MV .L2 B_fft_jmp, B_fft_jmp_old ; Snapshot B_fft_jmp before it is recomputed
ADDAH .D1 A_w0, A_tw_offset, A_w0 ; A_w0 += A_tw_offset, halfword-scaled by ADDAH
|| MV .S2 B_h2, B_h2_old ; Snapshot B_h2 before it is recomputed
MV .S1 A_h2, A_h2_old ; Snapshot A_h2
|| SHRU .S2 B_stride, 2, B_h2 ; B_h2 = B_stride >> 2
|| ADD .D1 A_x, -16, A_x ; Back A_x up 16 bytes ahead of the pre-incrementing first load
SPMASKR
|| MV .S1 B_h2, A_h2 ; A_h2 = new B_h2
|| MV .L1 A_y, A_y_old ; Snapshot the output pointer of the finishing pass
* ============ STAGE 0 (prolog)+STAGE II (epilog) + Outer Loop =======================*
MV .L1 A_x, A_y ; A_y = A_x for the next pass
; The "^" instructions below repeat the [26,1]..[32,1] store-side operations
; of the loop body, rewritten to use the *_old / *_old_2 registers.
SPMASK
|| ZERO .D2 B_j ; B_j = 0 for the next pass
||^ SHL .S1 A_h2_old, 4, A_2h2 ;[26,1]
||^ STDW .D1T2 B_x_1o:B_x_0o, *++A_y_old[2] ;[26,1]
||^ MV .S2X A_y_old, B_y ;[26,1]
SPMASK
|| MPYSU .M2 6, B_stride, B_fft_jmp ; B_fft_jmp = 6 * B_stride
||^ STDW .D1T2 B_xh2_1o:B_xh2_0o, *++A_y_old[A_h2_old] ;[27,1]
SPMASK
||^ STDW .D2T1 A_xh2_3o:A_xh2_2o, *++B_y[B_h2_old] ;[28,1]
|| MV .L1 A_y_old, A_y_old_2 ; Second-level copy of the old output pointer
|| MV .L2 B_h2_old, B_h2_old_2 ; Second-level copy of the old B_h2
SHRU .S1X B_fft_jmp, 3, A_fft_jmp ; A_fft_jmp = B_fft_jmp >> 3
|| MV .L2 B_y, B_y_old ; Snapshot B_y
|| MV .L1 A_h2_old, A_h2_old_2 ; Second-level copy of the old A_h2
NOP
MV .S2 B_fft_jmp_old, B_fft_jmp_old_2 ; Second-level copy of the old B_fft_jmp
SPMASK
|| SHRU .S2 B_fft_jmp, 1, B_fft_jmp_temp ; B_fft_jmp_temp = B_fft_jmp >> 1
||^ STDW .D1T2 B_xl1_1o:B_xl1_0o, *++A_y_old_2[A_h2_old_2];[32,1]
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -