📄 dsp_fft32x32_sa.sa
字号:
; the sub-table                                               ;
;-------------------------------------------------------------;
        MV      .2      B_n, B_stride
        ZERO    .1      A_tw_offset

LOOP_WHILE:
;-------------------------------------------------------------;
; "j" is used as an index into the sub-table of twiddle       ;
; factors. Since the pointer to the sub-table of twiddle      ;
; factors resets with every iteration of the outer loop,      ;
; the index within the sub-table is also reset to zero.       ;
; Copies are made so that it exists in both data paths.       ;
;-------------------------------------------------------------;
        ZERO    .1      A_j
        ZERO    .2      B_j

;-------------------------------------------------------------;
; "fft_jmp" relates the offset between data elements that     ;
; use the same twiddle factor. As coded here:                 ;
;   B_fft_jmp = 6 * B_stride    (added to A_x on a wrap)      ;
;   A_fft_jmp = B_fft_jmp >> 3  (wrap limit for j)            ;
; Both quarter, as does stride, with every iteration of       ;
; the outer loop.                                             ;
; NOTE(review): an earlier note gave the units as "6*stride   ;
; halfwords, 1.5*stride double words" -- confirm the units    ;
; for this 32x32 variant.                                     ;
;-------------------------------------------------------------;
        MPYSU   .2      6, B_stride, B_fft_jmp
        SHRU    .1      B_fft_jmp, 3, A_fft_jmp

;-------------------------------------------------------------;
; h2 = stride >> 2 is the index step between the four         ;
; input legs of a butterfly; a copy is kept on each data      ;
; path. A_x and A_y both start at A_ptr_x - 16 bytes so       ;
; that the first pre-incrementing load "*++A_x[2]" (index     ;
; scaled by 8 for LDDW) lands on A_ptr_x itself.              ;
; A_w0 = A_ptr_w + A_tw_offset halfwords (ADDAH) selects      ;
; the twiddle sub-table; A_tw_offset then advances by         ;
; B_fft_jmp >> 1 and stride is quartered.                     ;
;-------------------------------------------------------------;
        SHRU    .2      B_stride, 2, B_h2
        MV      .1      B_h2, A_h2
        ADD     .1      A_ptr_x, -16, A_x
        ADD     .1      A_ptr_x, -16, A_y
        ADDAH   .1      A_ptr_w, A_tw_offset, A_w0
        SHRU    .1      B_fft_jmp, 1, A_fft_jmp_1
        ADD     .1      A_tw_offset, A_fft_jmp_1, A_tw_offset
        SHRU    .2      B_stride, 2, B_stride

;-------------------------------------------------------------;
; Adjustment for BDEC, as it iterates till 0: the loop        ;
; counter B_i is set to (B_n >> 3) - 1.                       ;
; NOTE(review): an earlier comment gave the trip count as     ;
; N/4, but the shift is by 3 (N/8) -- presumably because      ;
; each LOOP_Y pass covers two butterflies. Confirm.           ;
;-------------------------------------------------------------;
        SHRU    .2      B_n, 3, B_i
        SUB     .2      B_i, 1, B_i

;-------------------------------------------------------------;
; Since the stride amount across iterations is variable,      ;
; it is tough to put an exact stride. However for this        ;
; loop stride is guaranteed to be greater than or equal       ;
; to 16 complex samples, 32 half words. Since this stride     ;
; is wider than the bank width, of all the banks, stride      ;
; is specified as zero.                                       ;
;-------------------------------------------------------------;
LOOP_Y: .trip 8
;-------------------------------------------------------------;
; Load six consecutive doublewords of twiddle factors,        ;
; starting j doublewords past A_w0 (ADDAD). Register          ;
; pairs as loaded below (w[] indexed in doublewords):         ;
;   w[0] -> co10:si10        w[3] -> co11:si11                ;
;   w[1] -> co20:si20        w[4] -> co21:si21                ;
;   w[2] -> co30:si30        w[5] -> co31:si31                ;
; NOTE(review): an earlier table gave the word order as       ;
; si10, co10, si11, co11, si20, ... and named si21 twice;     ;
; it does not match these loads. Confirm table layout.        ;
;-------------------------------------------------------------;
        ADDAD   .1      A_w0, A_j, A_w
        MV      .2      A_w, B_w

        LDDW    .D1T2   *A_w[0], B_co10:B_si10
        LDDW    .D2T1   *B_w[1], A_co20:A_si20
        LDDW    .D1T2   *A_w[2], B_co30:B_si30
        LDDW    .D2T1   *B_w[3], A_co11:A_si11
        LDDW    .D1T2   *A_w[4], B_co21:B_si21
        LDDW    .D2T1   *B_w[5], A_co31:A_si31

;-------------------------------------------------------------;
; Load four words from each of the four butterfly legs;       ;
; successive legs are h2 doublewords apart:                   ;
;   x[0]      x[1]      x[2]      x[3]                        ;
;   x[h2+0]   x[h2+1]   x[h2+2]   x[h2+3]                     ;
;   x[l1+0]   x[l1+1]   x[l1+2]   x[l1+3]                     ;
;   x[l2+0]   x[l2+1]   x[l2+2]   x[l2+3]                     ;
; A_x fetches words 0-1 of each leg into B registers; B_x,    ;
; one doubleword further on, fetches words 2-3 into A         ;
; registers. The trailing SUB undoes the two h2 steps         ;
; (h2 << 4 bytes), so A_x ends 16 bytes past its start.       ;
;-------------------------------------------------------------;
        MV      .2      A_x, B_x

        LDDW    .D1T2   *++A_x[2], B_x_1:B_x_0
        LDDW    .D2T1   *++B_x[3], A_x_3:A_x_2
        LDDW    .D1T2   *++A_x[A_h2], B_xh2_1i:B_xh2_0i
        LDDW    .D2T1   *++B_x[B_h2], A_xh2_3i:A_xh2_2i
        LDDW    .D1T2   *++A_x[A_h2], B_xl1_1i:B_xl1_0i
        LDDW    .D2T1   *++B_x[B_h2], A_xl1_3i:A_xl1_2i
        LDDW    .D1T2   *A_x[A_h2], B_xl2_1i:B_xl2_0i
        LDDW    .D2T1   *B_x[B_h2], A_xl2_3i:A_xl2_2i

        SHL     .2      B_h2, 4, B_2h2
        SUB     .1      A_x, B_2h2, A_x

;-------------------------------------------------------------;
; j += 6;          six twiddle doublewords were consumed      ;
; if (j == A_fft_jmp)                                         ;
; {                                                           ;
;     x += B_fft_jmp;                                         ;
;     j  = 0;                                                 ;
; }                                                           ;
;-------------------------------------------------------------;
        ADD     .1      6, A_j, A_j
        SUB     .1      A_j, A_fft_jmp, A_ifj
 [!A_ifj]ADD    .1      A_x, B_fft_jmp, A_x
 [!A_ifj]ZERO   .1      A_j

;-------------------------------------------------------------;
; xh0_0 = x[0] + x[l1];    xh1_0 = x[1] + x[l1+1]             ;
; xh0_1 = x[2] + x[l1+2];  xh1_1 = x[3] + x[l1+3]             ;
; xl0_0 = x[0] - x[l1];    xl1_0 = x[1] - x[l1+1]             ;
; xl0_1 = x[2] - x[l1+2];  xl1_1 = x[3] - x[l1+3]             ;
;-------------------------------------------------------------;
        ADDSUB  .2      B_x_0, B_xl1_0i, B_xh0_0:B_xl0_0
        ADDSUB  .2      B_x_1, B_xl1_1i, B_xh1_0:B_xl1_0
        ADDSUB  .1      A_x_2, A_xl1_2i, A_xh0_1:A_xl0_1
        ADDSUB  .1      A_x_3, A_xl1_3i, A_xh1_1:A_xl1_1

;-------------------------------------------------------------;
; xh20_0 = x[h2  ] + x[l2  ];  xh21_0 = x[h2+1] + x[l2+1]     ;
; xh20_1 = x[h2+2] + x[l2+2];  xh21_1 = x[h2+3] + x[l2+3]     ;
; xl20_0 = x[h2  ] - x[l2  ];  xl21_0 = x[h2+1] - x[l2+1]     ;
; xl20_1 = x[h2+2] - x[l2+2];  xl21_1 = x[h2+3] - x[l2+3]     ;
;-------------------------------------------------------------;
        ADDSUB  .2      B_xh2_0i, B_xl2_0i, B_xh20_0:B_xl20_0
        ADDSUB  .2      B_xh2_1i, B_xl2_1i, B_xh21_0:B_xl21_0
        ADDSUB  .1      A_xh2_2i, A_xl2_2i, A_xh20_1:A_xl20_1
        ADDSUB  .1      A_xh2_3i, A_xl2_3i, A_xh21_1:A_xl21_1

;-------------------------------------------------------------;
; x0[0] = xh0_0 + xh20_0       x0[1] = xh1_0 + xh21_0         ;
; x0[2] = xh0_1 + xh20_1       x0[3] = xh1_1 + xh21_1         ;
;-------------------------------------------------------------;
        ADD     .2      B_xh0_0, B_xh20_0, B_x_0o
        ADD     .2      B_xh1_0, B_xh21_0, B_x_1o
        ADD     .1      A_xh0_1, A_xh20_1, A_x_2o
        ADD     .1      A_xh1_1, A_xh21_1, A_x_3o

;-------------------------------------------------------------;
; xt0_0 = xh0_0 - xh20_0       yt0_0 = xh1_0 - xh21_0         ;
; xt0_1 = xh0_1 - xh20_1       yt0_1 = xh1_1 - xh21_1         ;
;-------------------------------------------------------------;
        SUB     .2      B_xh0_0, B_xh20_0, B_xt0_0
        SUB     .2      B_xh1_0, B_xh21_0, B_yt0_0
        SUB     .1      A_xh1_1, A_xh21_1, A_yt0_1
        SUB     .1      A_xh0_1, A_xh20_1, A_xt0_1

;-------------------------------------------------------------;
; xt1_0 = xl0_0 + xl21_0       yt2_0 = xl1_0 + xl20_0         ;
; xt1_1 = xl0_1 + xl21_1       yt2_1 = xl1_1 + xl20_1         ;
; xt2_0 = xl0_0 - xl21_0       yt1_0 = xl1_0 - xl20_0         ;
; xt2_1 = xl0_1 - xl21_1       yt1_1 = xl1_1 - xl20_1         ;
;-------------------------------------------------------------;
        ADDSUB  .2      B_xl0_0, B_xl21_0, B_xt1_0:B_xt2_0
        ADDSUB  .2      B_xl1_0, B_xl20_0, B_yt2_0:B_yt1_0
        ADDSUB  .1      A_xl0_1, A_xl21_1, A_xt1_1:A_xt2_1
        ADDSUB  .1      A_xl1_1, A_xl20_1, A_yt2_1:A_yt1_1

;-------------------------------------------------------------;
; x2[h2  ] = (si10 * yt1_0 + co10 * xt1_0) >> 15              ;
; x2[h2+1] = (co10 * yt1_0 - si10 * xt1_0) >> 15              ;
; x2[h2+2] = (si11 * yt1_1 + co11 * xt1_1) >> 15              ;
; x2[h2+3] = (co11 * yt1_1 - si11 * xt1_1) >> 15              ;
; NOTE(review): ">> 15" looks inherited from a 16-bit         ;
; variant; the products below come from SMPY32 with no        ;
; explicit shift. Confirm the effective scaling.              ;
;-------------------------------------------------------------;
        SMPY32  .2      B_si10, B_yt1_0, B_p0
        SMPY32  .2      B_co10, B_xt1_0, B_p1
        ADD     .2      B_p0, B_p1, B_xh2_0o

        SMPY32  .2      B_co10, B_yt1_0, B_p2
        SMPY32  .2      B_si10, B_xt1_0, B_p3
        SUB     .2      B_p2, B_p3, B_xh2_1o

        SMPY32  .1      A_si11, A_yt1_1, A_p4
        SMPY32  .1      A_co11, A_xt1_1, A_p5
        ADD     .1      A_p4, A_p5, A_xh2_2o

        SMPY32  .1      A_co11, A_yt1_1, A_p6
        SMPY32  .1      A_si11, A_xt1_1, A_p7
        SUB     .1      A_p6, A_p7, A_xh2_3o

;-------------------------------------------------------------;
; x2[l1  ] = (si20 * yt0_0 + co20 * xt0_0) >> 15              ;
; x2[l1+1] = (co20 * yt0_0 - si20 * xt0_0) >> 15              ;
; x2[l1+2] = (si21 * yt0_1 + co21 * xt0_1) >> 15              ;
; x2[l1+3] = (co21 * yt0_1 - si21 * xt0_1) >> 15              ;
;-------------------------------------------------------------;
        SMPY32  .1      A_si20, B_yt0_0, A_p8
        SMPY32  .2      A_co20, B_xt0_0, B_p9
        ADD     .2      A_p8, B_p9, B_xl1_0o

        SMPY32  .2      A_co20, B_yt0_0, B_pa
        SMPY32  .1      A_si20, B_xt0_0, A_pb
        SUB     .2      B_pa, A_pb, B_xl1_1o

        SMPY32  .1      B_si21, A_yt0_1, A_pc
        SMPY32  .2      B_co21, A_xt0_1, B_pd
        ADD     .1      A_pc, B_pd, A_xl1_2o

        SMPY32  .1      B_co21, A_yt0_1, A_pe
        SMPY32  .2      B_si21, A_xt0_1, B_pf
        SUB     .1      A_pe, B_pf, A_xl1_3o

;-------------------------------------------------------------;
; x2[l2  ] = (si30 * yt2_0 + co30 * xt2_0) >> 15              ;
; x2[l2+1] = (co30 * yt2_0 - si30 * xt2_0) >> 15              ;
; x2[l2+2] = (si31 * yt2_1 + co31 * xt2_1) >> 15              ;
; x2[l2+3] = (co31 * yt2_1 - si31 * xt2_1) >> 15              ;
;-------------------------------------------------------------;
        SMPY32  .2      B_si30, B_yt2_0, B_p10
        SMPY32  .2      B_co30, B_xt2_0, B_p11
        ADD     .2      B_p10, B_p11, B_xl2_0o

        SMPY32  .2      B_co30, B_yt2_0, B_p12
        SMPY32  .2      B_si30, B_xt2_0, B_p13
        SUB     .2      B_p12, B_p13, B_xl2_1o

        SMPY32  .1      A_si31, A_yt2_1, A_p14
        SMPY32  .1      A_co31, A_xt2_1, A_p15
        ADD     .1      A_p14, A_p15, A_xl2_2o

        SMPY32  .1      A_co31, A_yt2_1, A_p16
        SMPY32  .1      A_si31, A_xt2_1, A_p17
        SUB     .1      A_p16, A_p17, A_xl2_3o

;-------------------------------------------------------------;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -