📄 discrete cosine transform.txt
字号:
; Register aliases created with .asg: each symbol on the right stands for the
; register on the left wherever it appears in the code that follows.
; NOTE: A_F5t and A_F6t both name A10. In the visible kernel A_F6t is written
; at v_loop and stored at v_loop_4, while A_F5t is written at v_loop_5 and
; stored at v_loop_7.
.asg A10, A_F5t ; Truncated result for F5 (shares A10 with A_F6t)
.asg A10, A_F6t ; Truncated result for F6 (shares A10 with A_F5t)
.asg B5, B_F7t ; Truncated result for F7 (B5 also snapshots IRP during init)
.asg B2, B_i ; Inner loop counter #1 (B2 also holds the GIE-cleared CSR during init)
.asg A1, A_i ; Inner loop counter #2 (A1 also carries the IRP copy during init)
.asg B1, B_o ; Outer loop counter (num_fdcts * 8, less 1, set at function entry)
.asg A2, A_c ; Prolog collapse counter (gates the kernel's [!A_c] instructions)
; ========================================================================= ;
* ========================================================================= *
* Initialization code / Stack Management *
* *
* This code is responsible for saving registers to the stack, disabling *
* interrupts, and setting up for the vertical loop. *
* *
* This function requires 16 words of stack. A10...A15, B10...B14, B3 *
* (the return address), CSR, IRP, and the loop count derived from *
* 'num_fdcts' are all pushed on the stack. For speed, this code uses *
* twin stack pointers to offload registers onto the stack as quickly *
* as possible. *
* *
* The majority of the code in this function is not interruptible. *
* Therefore, interrupts are disabled almost immediately after entry *
* into the function, and the previous interruptibility state is restored *
* on exit. The previous value of CSR is pushed on the stack and *
* restored on exit. *
* *
* Since all 32 registers are used by the vertical loop, the stack *
* pointer is saved in the IRP register. The previous contents of IRP *
* are also pushed on the stack. *
* *
* Initialization for constants (cosine terms, etc.) is overlapped with *
* the prolog of the vertical loop to save time. Pointer setup for the *
* output pointer is also hidden in the prolog. *
* *
* Early exit code suppresses most of the function's activity (including *
* most of the stack accesses) if num_fdcts (in B4) is zero. It is not *
* possible to exit the function faster. *
* ========================================================================= *
;-
; Function entry: open the stack frame, save registers, mask interrupts, and
; take the early exit when there is no work. num_fdcts arrives in B4 and the
; return address in B3.
;
; A15 receives the entry-time B15 in the same packet that post-decrements B15
; by 16 words, so *-A15[k] and *+B15[16-k] name the same frame word. The SP[n]
; notes give the word index relative to the new B15.
STW .D2T1 A15, * B15--[16] ; Save A15, get stk frame
|| MV .L1X B15, A15 ; Twin Stack Pointer
|| SHL .S2 B4, 3, B_o ; iters == num_fdcts * 8
; Through the end of this block every instruction is predicated on B_o:
; [ B_o] forms run only when there is work, [!B_o] forms only on the early
; exit. The packets after this one occupy the delay slots of the B below.
[ B_o]STW .D1T2 B14, *-A15 [ 2] ; Save B14 (SP[14])
||[ B_o]ADD .L2 B_o, -1, B_o ; Adj. for parallel iters
||[ B_o]ADDK .S1 48, A_i_ptr ; Point to row 3, col 0
||[!B_o]B .S2 B3 ; Abort if num_fdcts == 0
||[!B_o]LDW .D2T1 *++B15[16], A15 ; Restore A15 on abort
; ===== Interrupts masked by branch delay slots =====
;-
[ B_o]STW .D1T1 A13, *-A15 [ 3] ; Save A13 (SP[13])
||[ B_o]STW .D2T2 B11, *+B15 [ 8] ; Save B11 (SP[ 8])
||[ B_o]MVC .S2 CSR, B0 ; Snapshot CSR
[ B_o]STW .D1T1 A12, *-A15 [ 5] ; Save A12 (SP[11])
||[ B_o]STW .D2T2 B12, *+B15 [10] ; Save B12 (SP[10])
[ B_o]STW .D1T2 B13, *-A15 [ 7] ; Save B13 (SP[ 9])
||[ B_o]STW .D2T1 A11, *+B15 [12] ; Save A11 (SP[12])
||[ B_o]MVC .S2 IRP, B5 ; Snapshot IRP
;-
[ B_o]STW .D1T1 A10, *-A15 [ 9] ; Save A10 (SP[ 7])
||[ B_o]STW .D2T2 B10, *+B15 [ 6] ; Save B10 (SP[ 6])
||[ B_o]AND .L2 B0, -2, B2 ; Clear GIE bit in CSR
||[ B_o]MV .L1X B5, A1 ; Partitioning MV (IRP copy to A side)
[ B_o]STW .D2T2 B3, *+B15 [ 5] ; Save return address (SP[ 5])
||[ B_o]STW .D1T1 A1, *-A15 [12] ; Save IRP (SP[ 4])
||[ B_o]MV .L1X B0, A0 ; Partitioning MV (CSR copy, stored later)
||[ B_o]MVC .S2 B2, CSR ; Mask interrupts
; ===== Branch Occurs =====
; =========================== PIPE LOOP PROLOG ============================ ;
; Starts the first column's loads while finishing setup: it builds the packed
; cosine constants, saves the stack pointer in IRP, derives the output
; pointer, and stores the last saved values (A14, loop count, CSR).
;
; LDH offsets in [] are in halfwords. The row numbers noted below assume
; 8 halfwords per row, which is what "row 3, col 0" = +48 bytes in the entry
; code implies. The [c,i] tags are kept from the original scheduling notes.
;
; Packed constants: MVKL fills the low half and MVKLH the high half, so e.g.
; A_c1c7 holds C1 in its upper 16 bits and C7 in its lower 16 bits.
;
; NOTE(review): the "#1"/"#2" labels on the A_i and B_i comments below are
; swapped relative to the .asg list (B_i = #1, A_i = #2) -- confirm which
; numbering is intended.
LDH .D1T1 *+A_i_ptr [ 8], A_f4 ;[ 1,1] row 4 (A_i_ptr is at row 3)
|| MVC .S2 B15, IRP ; Save Stack Pointer
LDH .D1T2 *-A_i_ptr [16], B_f1 ;[ 2,1] row 1
|| MVK .S1 4, A_i ; Inner loop counter #1
LDH .D1T1 * A_i_ptr++[ 1], A_f3 ;[ 3,1] row 3, then step to next column
|| MVKL .S1 cst_c7, A_c1c7 ; Cosine term C7
LDH .D1T1 *-A_i_ptr [25], A_f0 ;[ 4,1] row 0 (pointer is one column ahead)
|| MVKL .S1 cst_c0, A_k1c0 ; Cosine term C0
;-
LDH .D1T2 *+A_i_ptr [15], B_f5 ;[ 5,1] row 5
|| MVKL .S1 cst_c6, A_c2c6 ; Cosine term C6
|| MVKL .S2 cst_c6, B_c2c6 ; Cosine term C6
|| MV .L2X A_c1c7, B_c1c7 ; Twin constant register
LDH .D1T2 *-A_i_ptr [ 9], B_f2 ;[ 6,1] row 2
|| MVKLH .S1 cst_c2, A_c2c6 ; Cosine term C2
|| SUB .L1 A_i, 2, A_c ; Prolog collapse cnt = 2
|| ADD .L2X A_i_ptr, -2, B_o_ptr ; Output ptr = input ptr less the one-halfword step
;-
LDH .D1T2 *+A_i_ptr [23], B_f6 ;[ 7,1] row 6
|| MVKLH .S2 cst_c1, B_c1c7 ; Cosine term C1
LDH .D1T1 *+A_i_ptr [31], A_f7 ;[ 8,1] row 7
|| MVKLH .S1 cst_c1, A_c1c7 ; Cosine term C1
|| MVKLH .S2 cst_c2, B_c2c6 ; Cosine term C2
MVKL .S2 cst_c5, B_c3c5 ; Cosine term C5
|| MVKLH .S1 1, A_k1c0 ; Constant: 0x0001
|| STW .D2T1 A14, *+B15 [15] ; Save A14 (SP[15])
;-
SUB .L1 A_f3, A_f4, A_q1 ;[ 9,1] q1=g2
|| ADD .S1 A_f3, A_f4, A_h0 ;[10,1]
|| MVKLH .S2 cst_c3, B_c3c5 ; Cosine term C3
|| STW .D2T2 B_o, *+B15 [ 3] ; Spill horiz loop count (SP[ 3])
|| STW .D1T1 A0, *-A15 [14] ; Save CSR (SP[ 2])
LDH .D1T1 *+A_i_ptr [ 8], A_f4 ;[ 1,2] row 4 of the next column
|| MV .L2X A_k1c0, B_k1c0 ; Twin constant register
|| MVK .S2 16, B_i ; Inner loop counter #2
;-
; =========================== PIPE LOOP KERNEL ============================ ;
; Each v_loop* label heads one execute packet (the lines joined by ||).
; Only v_loop .. v_loop_7 are visible here; the kernel continues past this
; point, so the notes below cover only what these packets show.
;
; - A_c is counted down to zero at v_loop. Every [!A_c] instruction (the
;   stores, plus the SHR and MPY at v_loop_3) is skipped until it reaches 0.
; - B_o is counted down at v_loop_6, and the branch at v_loop_4 is taken
;   while B_o is non-zero.
; - STH offsets in [] are in halfwords relative to B_o_ptr. B_o_ptr is
;   post-incremented at v_loop_6, so the +15 at v_loop_7 is relative to the
;   advanced pointer.
; - The [c,i] tags are kept from the original scheduling notes.
v_loop:
SHR .S1 A_F6, 13, A_F6t ;[22,1] F6 >> 13 (A_F6t is A10)
|| MPY .M2 B_S0, B_c3c5, B_c5S0 ;[22,1]
|| MPY .M1X A_Q0, B_c3c5, A_c5Q0 ;[22,1]
|| ADD .D2 B_f1, B_f6, B_g1 ;[12,2]
|| SUB .S2 B_f2, B_f5, B_g3 ;[12,2]
|| SUB .L2 B_f1, B_f6, B_h3 ;[12,2]
|| LDH .D1T2 *-A_i_ptr [16], B_f1 ;[ 2,3]
||[ A_c]ADD .L1 A_c, -1, A_c ;pro. collapse (count A_c down to 0)
;-
v_loop_1:
[!A_c]STH .D2T2 B_F4, *+B_o_ptr [ 8] ;[23,1] store F4
|| MPY .M2 B_S1, B_c1c7, B_c7S1 ;[23,1]
|| MPYLH .M1X A_Q0, B_c3c5, A_c3Q0 ;[23,1]
|| ADD .L2 B_h3, B_g3, B_s0a ;[13,2]
|| SUB .S2 B_h3, B_g3, B_q0a ;[13,2]
|| SUB .S1 A_f0, A_f7, A_s1 ;[13,2] s1=h2
|| ADD .L1 A_f0, A_f7, A_g0 ;[13,2]
|| LDH .D1T1 * A_i_ptr++[ 1], A_f3 ;[ 3,3] load, then step input pointer
;-
v_loop_2:
[!A_c]STH .D2T2 B_F0, *-B_o_ptr [24] ;[24,1] store F0
|| SUB .S2X B_c3S0, A_c5Q0, B_F3 ;[24,1]
|| MPYLH .M2 B_S1, B_c1c7, B_c1S1 ;[24,1]
|| ADD .L2 B_f2, B_f5, B_h1 ;[14,2]
|| SUB .S1 A_g0, A_h0, A_r0 ;[14,2]
|| ADD .L1 A_g0, A_h0, A_p0 ;[14,2]
|| MPYSU .M1X B_q0a, A_k1c0, A_q0b ;[14,2] q0a times low half of k1c0 (C0)
|| LDH .D1T1 *-A_i_ptr [25], A_f0 ;[ 4,3]
;-
v_loop_3:
[!A_c]SHR .S1 A_F2, 13, A_F2t ;[25,1] F2 >> 13
||[!A_c]MPY .M1 A_i, 4, A_i ;[25,1] A_i = low half of A_i times 4
|| SHR .S2 B_F3, 13, B_F3t ;[25,1] F3 >> 13
|| SUB .L2X B_c7S1, A_c1Q1, B_F7 ;[25,1]
|| ADD .L1X A_c3Q0, B_c5S0, A_F5 ;[25,1]
|| SUB .D2 B_g1, B_h1, B_r1 ;[15,2]
|| MPYSU .M2 B_s0a, B_k1c0, B_s0b ;[15,2] s0a times low half of k1c0 (C0)
|| LDH .D1T2 *+A_i_ptr [15], B_f5 ;[ 5,3]
;-
v_loop_4:
ADD .L1X A_c7Q1, B_c1S1, A_F1 ;[26,1]
||[ B_o]B .S2 v_loop ;[26,1] loop back while B_o != 0
||[!A_c]STH .D2T1 A_F6t, *+B_o_ptr [24] ;[26,1] store truncated F6
|| ADD .L2 B_g1, B_h1, B_p1 ;[16,2]
|| ADDK .S1 07FFFh, A_q0b ;[16,2] bias before the high half is taken at v_loop_5
|| MPY .M1 A_r0, A_c2c6, A_c6r0 ;[16,2]
|| MPY .M2 B_i, 4, B_i ;[ 6,3] B_i = low half of B_i times 4
|| LDH .D1T2 *-A_i_ptr [ 9], B_f2 ;[ 6,3]
;-
v_loop_5:
[!A_c]STH .D2T1 A_F2t, *-B_o_ptr [ 8] ;[27,1] store truncated F2
|| SHR .S1 A_F5, 13, A_F5t ;[27,1] F5 >> 13 (A_F5t is A10)
|| MPY .M2 B_r1, B_c2c6, B_c6r1 ;[17,2]
|| SUB .L2X A_p0, B_p1, B_F4 ;[17,2]
|| ADDK .S2 07FFFh, B_s0b ;[17,2] bias before the high half is taken at v_loop_6
|| MPYH .M1 A_q0b, A_k1c0, A_q0 ;[17,2] high half of q0b times high half of k1c0 (0x0001)
|| LDH .D1T2 *+A_i_ptr [23], B_f6 ;[ 7,3]
;-
v_loop_6:
[!A_c]STH .D2T2 B_F3t, * B_o_ptr++[ 1] ;[28,1] store truncated F3, then step output pointer
|| SHR .S1 A_F1, 13, A_F1t ;[28,1] F1 >> 13
|| ADD .L2X A_p0, B_p1, B_F0 ;[18,2]
|| MPYLH .M1 A_r0, A_c2c6, A_c2r0 ;[18,2]
|| MPYH .M2 B_s0b, B_k1c0, B_s0 ;[18,2] high half of s0b times high half of k1c0 (0x0001)
||[ B_o]SUB .S2 B_o, 1, B_o ;[18,2] count B_o down, stopping at 0
|| LDH .D1T1 *+A_i_ptr [31], A_f7 ;[ 8,3]
v_loop_7:
;-
SHR .S2 B_F7, 13, B_F7t ;[29,1] F7 >> 13
||[!A_c]STH .D2T1 A_F5t, *+B_o_ptr [15] ;[29,1] store truncated F5 (pointer already stepped)
|| MPYLH .M2 B_r1, B_c2c6, B_c2r1 ;[19,2]
|| SUB .L1 A_q1, A_q0, A_Q0 ;[19,2]
|| ADD .D1 A_q1, A_q0, A_Q1 ;[19,2]
|| SUB .S1 A_f3, A_f4, A_q1 ;[ 9,3] q1=g2
;-
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -