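* scale_vert_h.asm  (excerpt captured from a web code viewer; the viewer's
* "font size" toolbar label that followed the file name is not source text)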
; Tail of the setup code (its beginning lies above this excerpt).
; Each ADDAH below derives the next input-line pointer from the previous one
; by adding n_x scaled as a halfword count, and the filter start (mod_hh) is
; copied into hh and filter for use by the loop.
[!B_j]MV .S2 A_l_hh, B_j ;if(!j) j = l_hh
|| ADDAH .D2 B_ptr_ln1_x, B_n_x, B_ptr_ln2_x ;B: ptr_ln2_x = ptr_ln1_x + n_x halfwords
|| ADDAH .D1 A_ptr_ln0_x, A_n_x, A_ptr_ln1_x ;A: ptr_ln1_x = ptr_ln0_x + n_x halfwords
;=
[ B_k]SUB .L2 B_k, 1, B_k ;if(k) k--  (SUB of 1: a decrement, not k++)
|| ADDAH .D2 B_ptr_ln2_x, B_n_x, B_ptr_ln3_x ;B: ptr_ln3_x = ptr_ln2_x + n_x halfwords
|| ADDAH .D1 A_ptr_ln1_x, A_n_x, A_ptr_ln2_x ;A: ptr_ln2_x = ptr_ln1_x + n_x halfwords
|| MV .S1 A_mod_hh, A_hh ;hh = mod_hh (remember filter start)
STH .D2T2 B_hh_i, *B_mod_hh++ ;store rotated value, advance B-side mod_hh
|| ADDAH .D1 A_ptr_ln2_x, A_n_x, A_ptr_ln3_x ;A: ptr_ln3_x = ptr_ln2_x + n_x halfwords
|| MPY .M2 0, B_zero, B_zero ;B_zero = 0 (multiply by constant 0)
|| MV .L1 A_mod_hh, A_filter ;filter = mod_hh
; BRANCH OCCURS
*==============================================================================*
; Pipe loop prolog: issues the first set of line loads, clears the eight
; accumulators (y0..y7), initialises the loop counters/flags, and enters the
; kernel through four branches issued in consecutive packets.
; The branch targets LOOPX7+20, LOOPX8+16 and LOOPX9+16 point INTO the labelled
; execute packets. Assuming 4-byte opcodes in the listed order, each entry skips
; the leading store/DOTP2/ADD instructions of that packet and runs only the
; trailing pack/index instructions - NOTE(review): confirm against the
; assembled layout before reordering anything inside those packets.
LDDW .D2T2 *B_ptr_ln0_x[0], B_x07x06:B_x05x04 ;x7654=*(ptr_ln0_x+0)
|| LDDW .D1T1 *A_ptr_ln0_x[0], A_x03x02:A_x01x00 ;x3210=*(ptr_ln0_x+0)
|| SHRU .S2 B_block, 2, B_block ;block >>= 2 (original note: double words)
LDDW .D2T2 *B_ptr_ln2_x[0], B_x27x26:B_x25x24 ;x7654=*(ptr_ln2_x+0)
|| LDDW .D1T1 *A_ptr_ln3_x[0], A_x33x32:A_x31x30 ;x3210=*(ptr_ln3_x+0)
|| B .S1 LOOPX7+20 ;collapse prolog: enter LOOPX7 packet part-way
|| SUB .S2 B_block, 2, B_block ;block -= 2 (original note said +=)
|| MPY .M2 B_l_hh, B_n_x, B_i ;i = l_hh * n_x; original note read n_y*l_hh - NOTE(review): confirm what B_n_x holds here
;=
LDDW .D2T2 *B_ptr_ln1_x[0], B_x17x16:B_x15x14 ;x7654=*(ptr_ln1_x+0)
|| LDDW .D1T1 *A_ptr_ln2_x[0], A_x23x22:A_x21x20 ;x3210=*(ptr_ln2_x+0)
|| B .S2 LOOPX8+16 ;collapse prolog: enter LOOPX8 packet part-way
LDDW .D2T2 *B_ptr_ln3_x[0], B_x37x36:B_x35x34 ;x7654=*(ptr_ln3_x+0)
|| B .S1 LOOPX9+16 ;collapse prolog: enter LOOPX9 packet part-way
|| ADD .S2 B_ptr_plane_y, 8, B_ptr_ln_y ;B: ptr_ln_y = ptr_plane_y + 8
|| ZERO .L2 B_y7:B_y6 ;y7 = y6 = 0x0
|| ZERO .L1 A_y3:A_y2 ;y3 = y2 = 0x0
;=
LDDW .D1T1 *A_ptr_ln1_x[0], A_x13x12:A_x11x10 ;x3210=*(ptr_ln1_x+0)
|| B .S1 LOOPX ;collapse prolog: then start at the kernel top
|| MV .L1X B_ptr_plane_y, A_ptr_ln_y ;A: ptr_ln_y = ptr_plane_y
MVK .S1 1, A_taps_ ;taps_ = 1 (non-zero: the [!A_taps_] store/clear ops stay off)
|| MPY .M1 0, A_ka, A_ka ;ka = 0 (A side, via multiply by 0)
|| ZERO .L1 A_y1:A_y0 ;y0 = y1 = 0x0;
|| ZERO .L2 B_y5:B_y4 ;y4 = y5 = 0x0;
|| SHRU .S2 B_i, 5, B_i ;i = i/32
;=
LDDW .D1T1 *A_filter++[1], A_h3h2:A_h1h0;h3h2h1h0=*filter++
|| SUB .S1 A_l_hh, 4, A_taps ;taps_count = l_hh - 4
|| MV .L1 A_n_x, A_n_x_ ;n_x_ = n_x (copy used for the A-side ka update)
|| ZERO .D2 B_ka ;ka = 0 (B side)
|| MVK .S2 1, B_pro ;pro = 1: [!B_pro] accumulates are off until the kernel clears it
|| SUB .L2 B_i, 1, B_i ;i-- (SUB of 1, not i++)
*============================== PIPE LOOP KERNEL ==============================*
; Kernel of the software-pipelined loop: 10 execute packets per pass.
; Per pass, each side loads one double word from each of the four input lines
; (A side -> x..3..0, B side -> x..7..4) and the A side loads one double word
; of filter taps (h3h2:h1h0). PACK2/PACKH2 pair up samples from two lines,
; DOTP2 multiplies each pair by h1h0 or h3h2, the two partial products per
; column are summed (p20x) and added into the accumulators y0..y7.
; Ops guarded by [!B_pro] are suppressed while B_pro is non-zero (first pass).
; Ops guarded by [!A_taps_] run only when A_taps_ is zero: they store the
; packed results and clear the accumulators.
; Instruction order inside each packet matters: the prolog branches into
; LOOPX7/LOOPX8/LOOPX9 at byte offsets.
LOOPX:
; -- packet 1 --
[!B_pro]ADD .L2 B_y7, B_p207, B_y7 ;y7+=p207
|| PACK2 .S2 B_x17x16, B_x07x06, B_x16x06 ;transpose 2x2 data
|| PACK2 .S1 A_x13x12, A_x03x02, A_x12x02 ;transpose 2x2 data
|| PACK2 .L1 A_x31x30, A_x21x20, A_x30x20 ;transpose 2x2 data
|| LDDW .D2T2 *B_ptr_ln0_x[B_ka],B_x07x06:B_x05x04;x7654=*(ptr_ln0_x+ka)
|| LDDW .D1T1 *A_ptr_ln0_x[A_ka],A_x03x02:A_x01x00;x3210=*(ptr_ln0_x+ka)
; -- packet 2 --
[!A_taps_]MPY2.M2 B_zero, B_zero, B_y7:B_y6 ;if(!sample)y7y6=0:0 (cleared via multiply of zeros)
|| ADD .S2 B_p26, B_p06, B_p206 ;4 pt filter 6 sum
|| ADD .L2 B_p24, B_p04, B_p204 ;4 pt filter 4 sum
|| ADD .L1 A_p20, A_p00, A_p200 ;4 pt filter 0 sum
|| DOTP2 .M1 A_x12x02, A_h1h0, A_p02 ;x[1,2]x[0,2]'h[1]h[0]
|| PACK2 .S1 A_x11x10, A_x01x00, A_x10x00 ;transpose 2x2 data
|| LDDW .D2T2 *B_ptr_ln2_x[B_ka],B_x27x26:B_x25x24;x7654=*(ptr_ln2_x+ka)
|| LDDW .D1T1 *A_ptr_ln3_x[A_ka],A_x33x32:A_x31x30;x3210=*(ptr_ln3_x+ka)
; -- packet 3 --
[!B_pro]ADD .L2 B_y6, B_p206, B_y6 ;col 6 accumulator
||[!B_pro]ADD .L1 A_y0, A_p200, A_y0 ;col 0 accumulator
|| DOTP2 .M2X B_x36x26, A_h3h2, B_p26 ;x[3,6]x[2,6]'h[3]h[2]
|| PACK2 .S2 B_x15x14, B_x05x04, B_x14x04 ;transpose 2x2 data
|| PACKH2 .S1 A_x33x32, A_x23x22, A_x33x23 ;transpose 2x2 data
|| DOTP2 .M1 A_x10x00, A_h1h0, A_p00 ;x[1,0]x[0,0]'h[1]h[0]
|| LDDW .D2T2 *B_ptr_ln1_x[B_ka],B_x17x16:B_x15x14;x7654=*(ptr_ln1_x+ka)
|| LDDW .D1T1 *A_ptr_ln2_x[A_ka],A_x23x22:A_x21x20;x3210=*(ptr_ln2_x+ka)
; -- packet 4 --
ADD .S1 A_p23, A_p03, A_p203 ;4 pt filter 3 sum
|| ADD .D1 A_p21, A_p01, A_p201 ;4 pt filter 1 sum
|| PACKH2 .S2 B_x37x36, B_x27x26, B_x37x27 ;transpose 2x2 block
|| DOTP2 .M2X B_x16x06, A_h1h0, B_p06 ;x[1,6]x[0,6]'h[1]h[0]
|| PACK2 .L2 B_x35x34, B_x25x24, B_x34x24 ;transpose 2x2 block
|| DOTP2 .M1 A_x33x23, A_h3h2, A_p23 ;x[3,3]x[2,3]'h[3]h[2]
|| PACK2 .L1 A_x33x32, A_x23x22, A_x32x22 ;transpose 2x2 block
|| LDDW .D2T2 *B_ptr_ln3_x[B_ka],B_x37x36:B_x35x34;x7654=*(ptr_ln3_x+ka)
; -- packet 5: loop-back branch is issued here; the five packets below fill its delay slots --
BDEC .S2 LOOPX, B_i ;} branch to LOOPX and decrement i while i has not run out
|| PACKH2 .L2 B_y7, B_y6, B_t_y76 ;high halves of y7,y6 packed, ready for store
|| ADD .D2 B_p25, B_p05, B_p205 ;4 pt filter sum 5
||[!B_pro]ADD .L1 A_y1, A_p201, A_y1 ;col 1 accumulator
|| DOTP2 .M2X B_x37x27, A_h3h2, B_p27 ;x[3,7]x[2,7]'h[3]h[2]
|| DOTP2 .M1 A_x32x22, A_h3h2, A_p22 ;x[3,2]x[2,2]'h[3]h[2]
|| PACKH2 .S1 A_x11x10, A_x01x00, A_x11x01 ;transpose 2x2 block
|| LDDW .D1T1 *A_ptr_ln1_x[A_ka],A_x13x12:A_x11x10;x3210=*(ptr_ln1_x+ka)
; -- packet 6 --
PACKH2 .S1 A_y1, A_y0, A_t_y10 ;high halves of y1,y0 packed, ready for store
||[!B_pro]ADD .L2 B_y5, B_p205, B_y5 ;col 5 accumulator
||[!B_pro]ADD .D2 B_y4, B_p204, B_y4 ;col 4 accumulator
||[!B_pro]ADD .D1 A_y3, A_p203, A_y3 ;col 3 accumulator
|| DOTP2 .M2X B_x17x07, A_h1h0, B_p07 ;x[1,7]x[0,7]'h[1]h[0]
||[!A_taps]MV .L1 A_l_hh:A_hh, A_taps:A_filter;if(!taps) taps:filter = l_hh:hh (restart tap count and filter)
|| MVD .M1 A_taps, A_taps_ ;sample=taps_count (copy made through the M unit)
; -- packet 7 --
[!A_taps_]ZERO.L2 B_y5:B_y4 ;if(!sample)y5y4=0:0
||[!A_taps_]ZERO.L1 A_y1:A_y0 ;if(!sample)y1y0=0:0
|| ZERO .D2 B_pro ;pro = 0: enables the [!B_pro] accumulates from now on
|| PACKH2 .S2 B_y5, B_y4, B_t_y54 ;high halves of y5,y4 packed, ready for store
|| DOTP2 .M2X B_x34x24, A_h3h2, B_p24 ;x[3,4]x[2,4]'h[3]h[2]
|| DOTP2 .M1 A_x30x20, A_h3h2, A_p20 ;x[3,0]x[2,0]'h[3]h[2]
|| ADD .S1 A_taps, -4, A_taps ;taps_count -= 4;
|| LDDW .D1T1 *A_filter++[1], A_h3h2:A_h1h0 ;h3h2h1h0=*filter++
; -- packet 8: the prolog enters at LOOPX7+20, past the first five instructions --
LOOPX7:
[!A_taps_]STDW.D2T2 B_t_y76:B_t_y54, *B_ptr_ln_y++[2] ;if(!samp)*ln_y++=y7-4
||[!A_taps_]ZERO.L1 A_y3:A_y2 ;if(!samp)y3y2 = 0:0
|| PACKH2 .S1 A_y3, A_y2, A_t_y32 ;high halves of y3,y2 packed, ready for store
|| DOTP2 .M2X B_x14x04, A_h1h0, B_p04 ;x[1,4]x[0,4]'h[1]h[0]
|| DOTP2 .M1 A_x31x21, A_h3h2, A_p21 ;x[3,1]x[2,1]'h[3]h[2]
|| PACKH2 .L2 B_x17x16, B_x07x06, B_x17x07 ;transpose 2x2 block
||[!A_taps]SUB .S2 B_ka, B_block, B_ka ;if(!taps) ka -= block (original note: block+8 - NOTE(review): confirm)
; -- packet 9: the prolog enters at LOOPX8+16, past the first four instructions --
LOOPX8:
[!A_taps_]STDW.D1T1 A_t_y32:A_t_y10, *A_ptr_ln_y++[2] ;if(!samp)*ln_y++=y3210
|| DOTP2 .M2X B_x15x05, A_h1h0, B_p05 ;x[1,5]x[0,5]'h[1]h[0]
|| ADD .L1 A_p22, A_p02, A_p202 ;4 pt filter sum col 2
|| DOTP2 .M1 A_x11x01, A_h1h0, A_p01 ;x[1,1]x[0,1]'h[1]h[0]
|| PACK2 .L2 B_x37x36, B_x27x26, B_x36x26 ;transpose 2x2 block
|| PACKH2 .S2 B_x15x14, B_x05x04, B_x15x05 ;transpose 2x2 block
|| PACKH2 .S1 A_x31x30, A_x21x20, A_x31x21 ;transpose 2x2 block
; -- packet 10: the prolog enters at LOOPX9+16, past the first four instructions --
LOOPX9:
ADD .L2 B_p27, B_p07, B_p207 ;4 pt filter sum col 7
|| DOTP2 .M2X B_x35x25, A_h3h2, B_p25 ;x[3,5]x[2,5]'h[3]h[2]
|| ADD .D1 A_y2, A_p202, A_y2 ;col 2 accumulator (not guarded by B_pro)
|| DOTP2 .M1 A_x13x03, A_h1h0, A_p03 ;x[1,3]x[0,3]'h[1]h[0]
|| PACKH2 .S2 B_x35x34, B_x25x24, B_x35x25 ;transpose 2x2 block
|| PACKH2 .S1 A_x13x12, A_x03x02, A_x13x03 ;transpose 2x2 block
|| ADD .D2 B_ka, B_n_x, B_ka ;B: ka += n_x (original note: 4*n_x - NOTE(review): confirm scaling of B_n_x)
|| ADD .L1X A_n_x_, B_ka, A_ka ;A: ka = n_x_ + B-side ka as read in this same packet
*============================= PIPE LOOP EPILOG ===============================*
; Function exit: reloads the return address, the saved CSR value and the
; registers A10-A15 / B10-B14 from the stack frame, releases the frame, and
; returns. The A-side loads use a copy of the stack pointer so both data
; paths can load in the same cycle.
LDDW .D2T2 *+B_SP[1], B_ret:B_csr ; Get rtn, CSR
|| MV B_SP, A_SP ; Twin Stack Ptr
LDDW .D1T1 *+A_SP[2], A11:A10 ; Restore A11, A10
|| LDDW .D2T2 *+B_SP[3], B11:B10 ; Restore B11, B10
LDDW .D1T1 *+A_SP[4], A13:A12 ; Restore A13, A12
|| LDDW .D2T2 *+B_SP[5], B13:B12 ; Restore B13, B12
LDDW .D1T1 *+A_SP[6], A15:A14 ; Restore A15, A14
LDW .D2T2 *++B_SP[14],B14 ; Restore B14 and pre-increment SP by 14 words,
; ...which releases the stack frame
BNOP .S2 B_ret, 4 ; Return to caller (branch plus 4 NOP cycles)
MVC .S2 B_csr, CSR ; Restore CSR before the branch lands
; ===== Branch Occurs =====
; ===== Interruptibility state restored here =====
* ========================================================================= *
* End of file: scale_vert_h.asm *
* ------------------------------------------------------------------------- *
* Copyright (c) 2000 Texas Instruments, Incorporated. *
* All Rights Reserved. *
* ========================================================================= *
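* ------------------------------------------------------------------------- *
* The lines below are keyboard-shortcut help text from the web code viewer  *
* this listing was captured from; they are not part of the assembly source. *
*   Copy code ............ Ctrl + C                                         *
*   Search code .......... Ctrl + F                                         *
*   Full-screen mode ..... F11                                              *
*   Toggle theme ......... Ctrl + Shift + D                                 *
*   Show shortcuts ....... ?                                                *
*   Increase font size ... Ctrl + =                                         *
*   Decrease font size ... Ctrl + -                                         *
* ------------------------------------------------------------------------- *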