; File: discrete cosine transform.txt
; (The "font size" label that followed this title was web-viewer residue, not source.)
; v_loop_8: one execute packet of the vertical loop's software-pipelined
; kernel.  The first instruction and every "||" line below it issue in the
; same cycle.  A "[!R]" prefix predicates the instruction on register R being
; zero.  Bracketed STH/LDH offsets are scaled by the halfword size.
; The trailing ";[a,b]" tags are the original author's schedule annotations
; (presumably [cycle within iteration, iteration in flight] -- TODO confirm).
; NOTE(review): A_c, B_i and the vertical-loop register bindings are defined
; outside this view; A_c presumably suppresses stores while the pipe fills.
v_loop_8:
[!A_c]STH .D2T1 A_F1t, *-B_o_ptr [17] ;[30,1] if A_c == 0: store F1t at B_o_ptr - 17 halfwords
|| ADD .L1X B_c6r1, A_c2r0, A_F2 ;[20,2] F2 = c6r1 + c2r0 (B operand via cross path)
|| SUB .L2X A_s1, B_s0, B_S0 ;[20,2] S0 = s1 - s0
|| MPYLH .M1 A_Q1, A_c1c7, A_c1Q1 ;[20,2] c1Q1 = low16(Q1) * high16(c1c7)
|| ADD .D1 A_f3, A_f4, A_h0 ;[10,3] h0 = f3 + f4
||[!B_i]ADD .S2 B_i, 4, B_i ;[10,3] if B_i == 0: B_i = 4 (reload counter)
||[!B_i]ADDK .S1 112, A_i_ptr ;[10,3] if B_i == 0: A_i_ptr += 112 bytes
;-
; v_loop_9: the next execute packet of the vertical loop's kernel (all "||"
; lines issue together with the first instruction).  "[!R]" means "execute
; only when R is zero"; bracketed memory offsets are in halfwords.
; NOTE(review): where A_i / A_c are decremented is not visible in this chunk.
v_loop_9:
[!A_c]STH .D2T2 B_F7t, *+B_o_ptr [31] ;[31,1] if A_c == 0: store F7t at B_o_ptr + 31 halfwords
||[!A_i]ADDK .S2 112, B_o_ptr ;[31,1] if A_i == 0: B_o_ptr += 112 bytes
||[!A_i]ADD .S1 A_i, 4, A_i ;[31,1] if A_i == 0: A_i = 4 (reload counter)
|| SUB .L1X A_c6r0, B_c2r1, A_F6 ;[21,2] F6 = c6r0 - c2r1
|| MPYLH .M2 B_S0, B_c3c5, B_c3S0 ;[21,2] c3S0 = low16(S0) * high16(c3c5)
|| ADD .L2X A_s1, B_s0, B_S1 ;[21,2] S1 = s1 + s0
|| MPY .M1 A_Q1, A_c1c7, A_c7Q1 ;[21,2] c7Q1 = low16(Q1) * low16(c1c7)
|| LDH .D1T1 *+A_i_ptr [ 8], A_f4 ;[ 1,4] f4 <- halfword at A_i_ptr + 8 (next iteration's input)
; =========================== PIPE LOOP EPILOG ============================ ;
* ========================================================================= *
* Epilog / Inter-loop / Prolog Code *
* *
* The code from the vertical loop's epilog has been interscheduled *
* with inter-loop code and prolog code for the horizontal loop. *
* This allows hiding some of the overhead as we pipe-down one loop and *
* pipe-up the next. *
* *
* Notably, we restore B15 and IRP here (rather than after the loop) *
* and unspill our loop trip count from the stack, all in parallel with *
* the prolog and epilog code. Also, the epilog of the first loop has *
* been heavily overlapped with the prolog of the second loop. Since *
* a handful of symbolic names have been assigned to different registers, *
* and others have conflicting names between the two loops, we use a set *
* of intermediate symbolic names that bridge the transition. *
* *
* To save a STH/LDH pair, the value of "F7t" from the first loop is *
* forwarded directly to the input "f7" of the second loop. (The last *
* FDCT performed by the vertical loop overlaps the first FDCT performed *
* by the second loop.) This is done through a "sign extension", to *
* exactly mimic the overflow behavior of the original C code. *
* *
* For speed, we twin the stack pointer in a spare slot here so that the *
* stack restore after the loop can proceed as quickly as possible. *
* ========================================================================= *
; Bridging register aliases used while the vertical loop drains and the
; horizontal loop fills.  "Av_"/"Bv_" names carry vertical-loop values,
; "Ah_" names carry horizontal-loop values.  Several aliases deliberately
; share one physical register (A8: Av_c7Q1/Av_F1/Av_F1t, A5: Av_F2/Av_F2t,
; A9: Av_F5/Av_F5t, B4: Bv_F3/Bv_F7, B5: Bv_c7S1/Bv_F7t).
; NOTE(review): the epilog writes some results through the vertical loop's
; own names (e.g. B_c7S1, A_F1, B_F3t) and reads them back through these
; aliases, so each pair must name the same register -- the vertical-loop
; .asg list is outside this view; confirm against it.
.asg A4, Ah_io_ptr ; Horiz Input/output pointer
.asg A14, Av_c1Q1 ; Vert: Intermediate c1 * Q1
.asg A6, Av_c3Q0 ; Vert: Intermediate c3 * Q0
.asg A8, Av_c7Q1 ; Vert: Intermediate c7 * Q1
.asg B6, Bv_c1S1 ; Vert: Intermediate c1 * S1
.asg B3, Bv_c5S0 ; Vert: Intermediate c5 * S0
.asg B5, Bv_c7S1 ; Vert: Intermediate c7 * S1
.asg A8, Av_F1 ; Vert: Freq. domain term F1
.asg A5, Av_F2 ; Vert: Freq. domain term F2
.asg B4, Bv_F3 ; Vert: Freq. domain term F3
.asg A9, Av_F5 ; Vert: Freq. domain term F5
.asg B4, Bv_F7 ; Vert: Freq. domain term F7
.asg A8, Av_F1t ; Vert: Trunc. result for F1
.asg A5, Av_F2t ; Vert: Trunc. result for F2
.asg B7, Bv_F3t ; Vert: Trunc. result for F3
.asg A9, Av_F5t ; Vert: Trunc. result for F5
.asg B5, Bv_F7t ; Vert: Trunc. result for F7
;-
; Vertical-loop epilog: four execute packets that drain the last iteration
; (all tagged [..,4]).  A line without "||" starts a new packet.  Stores here
; are unconditional.  Results are truncated with an arithmetic right shift
; by 13 before being stored as halfwords.  Two housekeeping instructions
; (MVC from IRP, LDW from the stack) are slotted into otherwise free units.
; --- packet 1 ---
SHR .S1 A_F6, 13, A_F6t ;[22,4] F6t = F6 >> 13
|| MPYLH .M2 B_S1, B_c1c7, Bv_c1S1 ;[24,4] c1S1 = low16(S1) * high16(c1c7), into bridge reg B6
|| MPY .M1X A_Q0, B_c3c5, A_c5Q0 ;[22,4] c5Q0 = low16(Q0) * low16(c3c5)
|| STH .D2T2 B_F4, *+B_o_ptr [ 8] ;[23,4] store F4 at B_o_ptr + 8 halfwords
; --- packet 2 ---
MPY .M2 B_S0, B_c3c5, Bv_c5S0 ;[22,4] c5S0 = low16(S0) * low16(c3c5), into bridge reg B3
|| MPYLH .M1X A_Q0, B_c3c5, Av_c3Q0 ;[23,4] c3Q0 = low16(Q0) * high16(c3c5), into bridge reg A6
|| STH .D2T1 A_F6t, *+B_o_ptr [24] ;[26,4] store F6t at B_o_ptr + 24 halfwords
|| MVC .S2 IRP, B15 ; B15 = IRP (per the banner above: restores B15 from its twin)
;-
; --- packet 3 ---
STH .D2T2 B_F0, *-B_o_ptr [24] ;[24,4] store F0 at B_o_ptr - 24 halfwords
|| MPY .M2 B_S1, B_c1c7, B_c7S1 ;[23,4] c7S1 = low16(S1) * low16(c1c7)
|| SUB .S2X B_c3S0, A_c5Q0, B_F3 ;[24,4] F3 = c3S0 - c5Q0
|| ADD .L1X A_c7Q1, Bv_c1S1, A_F1 ;[26,4] F1 = c7Q1 + c1S1
|| ADDK .S1 -54, Ah_io_ptr ; Adjust pointer: A4 -= 54 bytes for the horizontal loop
; --- packet 4 ---
SHR .S2 B_F3, 13, B_F3t ;[25,4] F3t = F3 >> 13
|| LDW .D2T1 *B15[4], A2 ; Load IRP's value (word at B15 + 4 words). NOTE(review): A2 is later aliased A_o -- confirm which value this is
|| SHR .S1 A_F1, 13, Av_F1t ;[28,4] F1t = F1 >> 13, into bridge reg A8
;-
; ========================================================================= ;
; =============== SYMBOLIC REGISTER ASSIGNMENTS: HORIZ LOOP =============== ;
; Register bindings for the horizontal loop.  These .asg directives re-bind
; many names that were also used by the vertical loop above, so from this
; point on a name such as A_F1 or B_S0 refers to the register listed here.
; Many symbols share one physical register (for example A0 is bound to A_f4,
; A_g0, A_r0, A_q0c, A_c1Q1 and A_F5t); the sharing presumably relies on
; non-overlapping lifetimes in the loop schedule, which is not fully visible
; in this chunk.  Symbol names differing only in case (A_q0 / A_Q0,
; A_q1 / A_Q1) are distinct entries in this list.
.asg A14, A_c3c5 ; Cosine terms c3, c5 (packed)
.asg B1, B_k_rnd ; Rounding constant 0x7FFF
.asg A12, A_k_rnd ; Rounding constant 0x7FFF
.asg B2, B_io_ptr; Input/output pointer
.asg A4, A_io_ptr; Input/output pointer
.asg A7, A_f0 ; Spatial domain sample f0
.asg B13, B_f1 ; Spatial domain sample f1
.asg B3, B_f2 ; Spatial domain sample f2
.asg A10, A_f3 ; Spatial domain sample f3
.asg A0, A_f4 ; Spatial domain sample f4
.asg B7, B_f5 ; Spatial domain sample f5
.asg B9, B_f6t ; Spatial domain sample f6 (tmp)
.asg B10, B_f6 ; Spatial domain sample f6
.asg A8, A_f7 ; Spatial domain sample f7
.asg A0, A_g0 ; Node g0 in flow graph
.asg B3, B_g1 ; Node g1 in flow graph
.asg B8, B_h1 ; Node h1 in flow graph
.asg A9, A_h0 ; Node h0 in flow graph
.asg A1, A_s1 ; Node s1 in flow graph
.asg B13, B_h3 ; Node h3 in flow graph
.asg B10, B_g3 ; Node g3 in flow graph
.asg A10, A_q1 ; Node q1 in flow graph
.asg A5, A_p0 ; Node p0 in flow graph
.asg B4, B_p1 ; Node p1 in flow graph
.asg B4, B_s0a ; Node s0 intermediate value
.asg B5, B_s0b ; Node s0 intermediate value
.asg B5, B_s0c ; Node s0 intermediate value
.asg B9, B_s0 ; Node s0 in flow graph
.asg A0, A_r0 ; Node r0 in flow graph
.asg B7, B_r1 ; Node r1 in flow graph
.asg B5, B_q0a ; Node q0 intermediate value
.asg A3, A_q0b ; Node q0 intermediate value
.asg A0, A_q0c ; Node q0 intermediate value
.asg A6, A_q0 ; Node q0 in flow graph
.asg A9, A_Q1 ; Node Q1 in flow graph
.asg B8, B_S1 ; Node S1 in flow graph
.asg A6, A_Q0 ; Node Q0 in flow graph
.asg B5, B_S0 ; Node S0 in flow graph
.asg A0, A_c1Q1 ; Intermediate value c1 * Q1
.asg A5, A_c2r0 ; Intermediate value c2 * r0
.asg A3, A_c3Q0 ; Intermediate value c3 * Q0
.asg A9, A_c5Q0 ; Intermediate value c5 * Q0
.asg A3, A_c6r0 ; Intermediate value c6 * r0
.asg A3, A_c7Q1 ; Intermediate value c7 * Q1
.asg B7, B_c1S1 ; Intermediate value c1 * S1
.asg B3, B_c2r1 ; Intermediate value c2 * r1
.asg B4, B_c3S0 ; Intermediate value c3 * S0
.asg B10, B_c5S0 ; Intermediate value c5 * S0
.asg B4, B_c6r1 ; Intermediate value c6 * r1
.asg B4, B_c7S1 ; Intermediate value c7 * S1
.asg B5, B_F0 ; Frequency domain term F0
.asg A6, A_F1 ; Frequency domain term F1
.asg B9, B_F2 ; Frequency domain term F2
.asg B3, B_F3 ; Frequency domain term F3
.asg A1, A_F4 ; Frequency domain term F4
.asg A9, A_F5 ; Frequency domain term F5
.asg A3, A_F6 ; Frequency domain term F6
.asg B7, B_F7 ; Frequency domain term F7
.asg B5, B_F0r ; Rounded value for F0
.asg A6, A_F1r ; Rounded value for F1
.asg B3, B_F2r ; Rounded value for F2
.asg B3, B_F3r ; Rounded value for F3
.asg A7, A_F4r ; Rounded value for F4
.asg A9, A_F5r ; Rounded value for F5
.asg A3, A_F6r ; Rounded value for F6
.asg B5, B_F7r ; Rounded value for F7
.asg B6, B_F0t ; Truncated result for F0
.asg A8, A_F1t ; Truncated result for F1
.asg B6, B_F2t ; Truncated result for F2
.asg B4, B_F3t ; Truncated result for F3
.asg A7, A_F4t ; Truncated result for F4
.asg A0, A_F5t ; Truncated result for F5
.asg A5, A_F6t ; Truncated result for F6
.asg B13, B_F7t ; Truncated result for F7
.asg A2, A_o ; Outer loop counter
.asg B0, B_c ; Prolog collapse counter
.asg A1, A_c ; Prolog collapse counter copy
; ========================================================================= ;
* ========================================================================= *
* (Instructions marked "(v)" in the prolog below are from the vertical *
* loop's epilog.) *
* ========================================================================= *
; =========================== PIPE LOOP PROLOG ============================ ;
; Horizontal-loop prolog, interleaved with the tail of the vertical loop's
; epilog (lines marked "(v)").  A line without "||" starts a new execute
; packet.  From the addressing below, f0..f6 of the first row are read from
; halfword offsets -6..0 relative to the value A_io_ptr has on entry here;
; f7 is not loaded but forwarded from the vertical loop's F7 (see SHL below).
; NOTE(review): several loads target registers that "(v)" instructions still
; read in later packets (e.g. B3 = B_f2 / Bv_c5S0, B7 = B_f5 / Bv_F3t); this
; presumably relies on the LDH delay slots -- confirm before rescheduling.
; NOTE(review): the listing is truncated after the last line of this block,
; so the final execute packet may have further parallel instructions.
LDH .D1T2 *-A_io_ptr [ 4], B_f2 ;[ 1,1] f2 <- halfword at A_io_ptr - 4
|| SUB A_io_ptr, 12, B_io_ptr ; B_io_ptr = A_io_ptr - 12 bytes (6 halfwords)
|| STH .D2T1 Av_F1t, *-B_o_ptr [16] ;[30,4] (v) store F1t at B_o_ptr - 16 halfwords
|| SHR .S1 Av_F2, 13, Av_F2t ;[25,4] (v) F2t = F2 >> 13
LDH .D1T1 *-A_io_ptr [ 3], A_f3 ;[ 2,1] f3 <- halfword at A_io_ptr - 3
|| LDH .D2T2 *+B_io_ptr [ 5], B_f5 ;[ 2,1] f5 <- halfword at B_io_ptr + 5
|| SUB .L2X Bv_c7S1, Av_c1Q1, Bv_F7 ;[25,4] (v) F7 = c7S1 - c1Q1
|| ADD .L1X Av_c3Q0, Bv_c5S0, Av_F5 ;[25,4] (v) F5 = c3Q0 + c5S0
;-
LDH .D2T1 *+B_io_ptr [ 4], A_f4 ;[ 3,1] f4 <- halfword at B_io_ptr + 4
|| LDH .D1T2 * A_io_ptr--[ 7], B_f6t ;[ 3,1] f6t <- halfword at A_io_ptr, then A_io_ptr -= 7 halfwords
|| MVK .S1 0x7FFF, A_k_rnd ; Rounding value
|| MVK .S2 0x7FFF, B_k_rnd ; Rounding value
LDH .D2T1 * B_io_ptr--[ 8], A_f0 ;[ 5,1] f0 <- halfword at B_io_ptr, then B_io_ptr -= 8 halfwords
|| LDH .D1T2 *+A_io_ptr [ 2], B_f1 ;[ 4,1] f1 <- halfword at (already decremented) A_io_ptr + 2
|| SHR .S1 Av_F5, 13, Av_F5t ;[27,4] (v) F5t = F5 >> 13
;-
SHL .S1X Bv_F7, 3, A_f7 ;[29,4] (v) f7 = F7 << 3; presumably shifted right by 16 later to give sign-extended F7 >> 13
|| STH .D2T2 Bv_F3t, * B_o_ptr [ 0] ;[28,4] (v) store F3t at B_o_ptr
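; ---- Web code-viewer residue below (keyboard-shortcut help; not part of the source) ----
; Copy code:          Ctrl + C
; Search code:        Ctrl + F
; Full-screen mode:   F11
; Toggle theme:       Ctrl + Shift + D
; Show shortcuts:     ?
; Increase font size: Ctrl + =
; Decrease font size: Ctrl + -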