📄 example 3-34.asm
字号:
.global _r2fft32
.bss stack,104,4 ; reserve space for stack
.bss csr_stack,4,4 ; reserve space to store csr
.sect ".text:hand"
; _r2fft32 -- exported entry point (declared with .global above).
; This entry sequence:
;   1. copies CSR to B9, saves that copy in csr_stack, and writes CSR back
;      with bit 0 cleared (the original comments identify that bit as GIE);
;   2. points A0 at the private "stack" buffer and stores A10-A15, B10-B15
;      and B3 there.
; NOTE(review): the code after B_START reads A4 as &x[0], B4 as n and A6 as
; the base of w -- presumably the C arguments; confirm against the prototype.
_r2fft32:
MVC .S2 CSR, B9 ;B9 = current CSR
MVKL .S2 csr_stack, B0 ;B0 = &csr_stack (low half)
MVKH .S2 csr_stack, B0 ;B0 = &csr_stack (high half)
STW .D2 B9, *B0 ;save the unmodified CSR in csr_stack
|| AND .S2 B9, -2, B9 ;DISABLE GIE bit (clear bit 0 of the copy)
MVC .S2 B9, CSR ;DISABLE Interrupts
MVKL .S1 stack, A0 ;new stack pointer in A0 (low half)
MVKH .S1 stack, A0 ;new stack pointer in A0 (high half)
* SAVE C RUNTIME ENVIRONMENT **********
STW .D1 A15, *A0++[1] ;**push A15 onto stack
STW .D1 B15, *A0++[1] ;**push B15 onto stack
; B15 has been saved, so it is reused as the B-side store pointer. It starts
; one word past A0; both pointers then step by two words, so the A-side and
; B-side saves below land in alternating words of the buffer.
ADD .S2X A0, 4, B15
STW .D2 B14, *B15++[2] ;**push B14 onto stack
|| STW .D1 A14, *A0++[2] ;**push A14 onto stack
STW .D2 B13, *B15++[2] ;**push B13 onto stack
|| STW .D1 A13, *A0++[2] ;**push A13 onto stack
STW .D2 B12, *B15++[2] ;**push B12 onto stack
|| STW .D1 A12, *A0++[2] ;**push A12 onto stack
STW .D2 B11, *B15++[2] ;**push B11 onto stack
|| STW .D1 A11, *A0++[2] ;**push A11 onto stack
; From here B15 advances one word at a time: after this pair the A-side
; saves are finished and only B15 is used for the remaining pushes.
STW .D2 B10, *B15++[1] ;**push B10 onto stack
|| STW .D1 A10, *A0++[2] ;**push A10 onto stack
STW .D2 B3, *B15++[1] ;**push B3 onto stack
*** BEGIN Benchmark Timing ***
; B_START: one-time setup executed before the first pass through K_LOOP.
; Derives from n (in B4): n2, ictr = n2/2, ro = (n2-1)*8 and the stage
; counter k = log2(n)-1; also initialises ie to 1.
B_START:
MVK .S1 1, A5 ;ie = 1
|| MV .L1X B4, A14 ;n2 = n
SUB .D1 A14, 1,A10 ;ro = n2 - 1
|| SHR .S1 A14, 1, A13 ;ictr = n2>>1
LMBD .L1X 1, b4, A2 ;k = 31-log2(n) (zero bits left of the leading 1 of n)
|| MVK .S1 31-1, A12 ;last stage hard coded (A12 = 30)
SUB .D1 A12, A2, A2 ;k = log2(n)-1
|| SHL .S1 A10,3,A10 ;ro <<= 3 (SHL is a left shift: ro = (n2-1)*8)
; K_LOOP: per-stage setup followed by the software-pipeline prolog of I_LOOP.
; The instructions from the first xt1/yt1 loads down to I_LOOP repeat the
; loads, pointer updates and 16x16 multiplies of the kernel so that the
; pipeline is already full when the kernel is entered. Interleaved with them,
; n2, &x, &w, ictr and k are pushed through B15 because their registers
; (A14, A4, A6, A13, A2) are reused as temporaries further down.
; NOTE(review): the branch back to K_LOOP is not in this part of the listing;
; presumably this label is re-entered once per FFT stage -- confirm.
; Do not reorder: '||' groups execute in one cycle and results of LDW/MPY
; arrive in later cycles, so operand values depend on the exact packet order.
K_LOOP:
MV .S1 A5, A1 ;wctr = ie
|| ADD .S2X A6, 4, B0 ;wptr_s = &w[1]
SHL .S2X A5, 1, B2 ;ie2b = ie<<1
|| LDW .D2 *B0--, A8 ;s = w[1]; pointer steps back to &w[0]
ADD .L2 B2, 1, B2 ;ie2b += 1 (twiddle pointer stride, in words)
|| SUB .S2X A13, 2, B1 ;i = ictr - 2: compensate for prolog
MV .S1 A4, A3 ;load_re = &x[0]
|| ADD .S2X A4, 4, B8 ;load_im = &x[1]
MV .L1 A14, A7 ;n1a = n2
|| MV .L2X A14, B5 ;n1b = n2
|| SHR .S1 A14,1,A14 ;n2 >>= 1
|| LDW .D2 *B0++[B2], B6 ;c = w[0]; pointer advances ie2b words
STW .D2 A14, *B15++[1] ;**push n2 (already halved) onto stack; A14 later holds t6a
[!A1] SUB .S1 A3, A10, A3 ;load_re -= ro
||[!A1] SUB .S2X B8, A10, B8 ;load_im -= ro
|| STW .D2 A4, *B15++[1] ;**push &x onto stack; A4 is overwritten in I_LOOP
LDW .D1 *A3++[A7], A9 ;xt1
|| LDW .D2 *B8++[B5], B3 ;yt1
LDW .D1 *A3++[A7], A0 ;xt2
|| LDW .D2 *B8++[B5], B12 ;yt2
STW .D2 A6, *B15++[1] ;**push &w onto stack; A6 is overwritten by SUBAW below
[!A1] LDW .D2 *B0--, A8 ;update s
[!A1] MV .S1 A5, A1 ;wctr = ie
||[!A1] LDW .D2 *B0++[B2], B6 ;update c
SUB .S1 A1, 1, A1 ;wctr--
|| STW .D2 A13, *B15++[1] ;**push ictr onto stack; A13 later holds t4a
[!A1] SUB .S1 A3, A10, A3 ;load_re -= ro
||[!A1] SUB .S2X B8, A10, B8 ;load_im -= ro
|| SUBAW .D1 A3, A7, A6 ;copy: A6 = load_re - n1a words (store pointer)
|| ADD .L2 B12, B3, B9 ;yt3=yt1+yt2
|| STW .D2 A2, *B15++[1] ;**push k onto stack; A2 holds xt from the next packet on
SUB .S1 A0, A9, A2 ;xt=xt2-xt1
|| SUB .S2 B12, B3, B10 ;yt=yt2-yt1
|| LDW .D1 *A3++[A7], A9 ;xt1
|| LDW .D2 *B8++[B5], B3 ;yt1
ADD .S1 A0, A9, A0 ;xt3=xt1+xt2
|| MV .S2X A8, B11 ;copy of s
|| LDW .D1 *A3++[A7], A0 ;xt2
|| LDW .D2 *B8++[B5], B12 ;yt2
MPYHSLU .M1X B6, A2, A12 ;t3a=hi(c)*lo(xt)
|| MPYHSLU .M2 B6, B10, B13 ;t3b=hi(c)*lo(yt)
|| STW .D1 A0, *--A6[A7] ;store x[2i]
MPYLUHS .M1X B6, A2, A13 ;t4a=lo(c)*hi(xt)
|| MPYLUHS .M2 B6, B10, B14 ;t4b=lo(c)*hi(yt)
|| STW .D1 B9, *+A6[1] ;store x[2i+1]
||[!A1] LDW .D2 *B0--, A8 ;update s
SHR .S1 A12, 15, A12 ;t3a >>= 15
|| SHR .S2 B13, 15, B13 ;t3b >>= 15
|| MPYHSLU .M1X A8, B10, A14 ;t6a=hi(s)*lo(yt)
|| MPYHSLU .M2X B11, A2, B4 ;t6b=hi(s)*lo(xt)
||[!A1] MV .D1 A5, A1 ;wctr = ie
||[!A1] LDW .D2 *B0++[B2], B6 ;update c
SUB .L1 A1, 1, A1 ;wctr--
||[B1] SUB .L2 B1, 1, B1 ;i--
|| SHR .S1 A13, 15, A13 ;t4a >>= 15
|| SHR .S2 B14, 15, B14 ;t4b >>= 15
|| MPYLUHS .M1X A8, B10, A15 ;t7a=lo(s)*yt(hi)
|| MPYLUHS .M2X B11, A2, B7 ;t7b=lo(s)*hi(xt)
;----INNER LOOP KERNEL STARTS HERE-------------------------
; I_LOOP: steady-state kernel of the software-pipelined inner loop.
; The kernel is seven execute packets long. The conditional branch sits in
; the second packet; a C6000 branch has five delay slots, so it takes effect
; right after the seventh packet. B1 is the loop counter (i) and is only
; decremented while non-zero. A1 (wctr) counts down to the next twiddle
; reload: when it reaches zero the [!A1] instructions rewind the load
; pointers by ro, fetch a new s/c pair through B0 and reset wctr to ie.
; Several registers (for example A9, B3, B12, A0) are written by both an
; ALU/multiply instruction and a load issued in the same packet; the results
; land in different cycles, so the packet order must not be changed.
I_LOOP
; -- packet 1 --
[!A1] SUB .S1 A3, A10, A3 ;load_re -= ro
||[!A1] SUB .L2X B8, A10, B8 ;load_im -= ro
|| SUBAW .D1 A3, A7, A6 ;store_re = load_re - n1a words
|| SHR .S2 B4, 15, B4 ;t6b >>= 15
|| SMPYH .M1X B6, A2, A11 ;t1a=hi(c)*hi(xt)
|| SMPYH .M2 B6, B10, B3 ;t1b=hi(c)*yt(hi)
|| MV .L1 A6, A4 ;split long life of store_re
|| ADD .D2 B12, B3, B9 ;yt3 = yt1+yt2
; -- packet 2 (holds the loop branch) --
SUB .L1 A0, A9, A2 ;xt = xt2-xt1
|| SUB .L2 B12, B3, B10 ;yt = yt2-yt1
|| SHR .S1 A14, 15, A14 ;t6a >>=15
||[B1] B .S2 I_LOOP ;branch to loop start
|| SMPYH .M1X A8, B10, A9 ;t2a=hi(s)*yt(hi)
|| SMPYH .M2X B11, A2, B12 ;t2b=hi(s)*hi(xt)
|| LDW .D1 *A3++[A7], A9 ;load xt1
|| LDW .D2 *B8++[B5], B3 ;load yt1
; -- packet 3 --
ADD .L1 A0, A9, A0 ;xt3 = xt1+xt2
|| MV .L2X A8, B11 ;copy s to B-side
|| SHR .S1 A15, 15, A15 ;t7a >>= 15
|| SHR .S2 B7, 15, B7 ;t7b >>= 15
|| LDW .D1 *A3++[A7], A0 ;load xt2
|| LDW .D2 *B8++[B5], B12 ;load yt2
; -- packet 4 --
ADD .L1 A14, A15, A13 ;t8a = t6a+t7a
|| ADD .D2 B4, B7, B14 ;t8b = t6b+t7b
|| ADD .S1 A12, A13, A12 ;t5a = t3a+t4a
|| ADD .L2 B13, B14, B13 ;t5b = t3b+t4b
|| MPYHSLU .M1X B6, A2, A12 ;t3a = hi(c)*lo(xt)
|| MPYHSLU .M2 B6, B10, B13 ;t3b = hi(c)*lo(yt)
|| STW .D1 A0, *--A6[A7] ;store x[2i]
|| ADD .S2X A4, 4, B7 ;initialize store_im
; -- packet 5 --
ADD .S1 A11, A9, A0 ;t9a = t1a+t2a
|| SUB .S2 B3, B12, B9 ;t9b = t1b-t2b
|| ADD .L1 A12, A13, A14 ;t10a = t5a+t8a
|| SUB .L2 B13, B14, B14 ;t10b = t5b-t8b
|| MPYLUHS .M1X B6, A2, A13 ;t4a = lo(c)*hi(xt)
|| MPYLUHS .M2 B6, B10, B14 ;t4b = lo(c)*hi(yt)
|| STW .D1 B9, *+A6[1] ;store x[2i+1]
||[!A1] LDW .D2 *B0--, A8 ;update s
; -- packet 6 --
ADD .L1 A0, A14, A15 ;t11a = t9a+t10a
|| ADD .L2 B9, B14, B12 ;t11b = t9b+t10b
|| SHR .S1 A12, 15, A12 ;t3a >>= 15
|| SHR .S2 B13, 15, B13 ;t3b >>= 15
|| MPYHSLU .M1X A8, B10, A14 ;t6a = hi(s)*lo(yt)
|| MPYHSLU .M2X B11, A2, B4 ;t6b = hi(s)*lo(xt)
||[!A1] MV .D1 A5, A1 ;reset wctr
||[!A1] LDW .D2 *B0++[B2], B6 ;load c
; -- packet 7 (last delay slot of the branch) --
SUB .L1 A1, 1, A1 ;wctr--
||[B1] SUB .L2 B1, 1, B1 ;i--
|| SHR .S1 A13, 15, A13 ;t4a >>= 15
|| SHR .S2 B14, 15, B14 ;t4b >>= 15
|| MPYLUHS .M1X A8, B10, A15 ;t7a = lo(s)*hi(yt)
|| MPYLUHS .M2X B11, A2, B7 ;t7b = lo(s)*hi(xt)
|| STW .D1 A15, *++A4[A7] ;store x[2(i+n2)]
|| STW .D2 B12, *++B7[B5] ;store x[2(i+n2)+1]
;----INNER LOOP KERNEL ENDS HERE---------------------------------
SUBAW .D1 A3, A7, A6 ;reset load_re
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -