📄 skl_mb_sse.asm
字号:
%endif
  movq [ecx+edx+%2], mm0
%endmacro

;//////////////////////////////////////////////////////////////////////
; Half-pel (HH) copy/add entry points, rounding variant 0.
;
; Every routine below has the same shape:
;   PROLOG0                       (macro defined outside this chunk;
;                                  presumably loads dst->ecx, src->eax,
;                                  stride->edx -- TODO confirm)
;   mm7 <- [Mask]
;   HH_SETUP once per 8-byte column (offset 0, and 8 for 16-wide blocks)
;   COPY_ADD_HH_RND0 <add?>,<offset>, <regs> repeated, with eax and ecx
;   both advanced by 2*edx between invocations.
; The "Add" flavours pass 1 as the first macro argument, the "Copy"
; flavours pass 0.
; Stray 'nop's are kept exactly where the original had them.
;//////////////////////////////////////////////////////////////////////

align 16
Skl_Add_8x4_HH_Rnd0_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0

  COPY_ADD_HH_RND0 1,0, mm2,mm3
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  COPY_ADD_HH_RND0 1,0, mm2,mm3
  ret

align 16
Skl_Copy_8x4_HH_Rnd0_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0

  COPY_ADD_HH_RND0 0,0, mm2,mm3
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  COPY_ADD_HH_RND0 0,0, mm2,mm3
  ret

align 16
Skl_Add_8x8_HH_Rnd0_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0

%rep 3                          ; three "process + advance" steps...
  COPY_ADD_HH_RND0 1,0, mm2,mm3
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
%endrep
  COPY_ADD_HH_RND0 1,0, mm2,mm3 ; ...and a final one without advance
  ret

align 16
Skl_Copy_8x8_HH_Rnd0_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0
    ; 55c  (original author's note, presumably a cycle count)
  nop

%rep 3
  COPY_ADD_HH_RND0 0,0, mm2,mm3
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
%endrep
  COPY_ADD_HH_RND0 0,0, mm2,mm3
  ret

align 16
Skl_Add_16x8_HH_Rnd0_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0          ; left 8-byte column
  HH_SETUP mm4, mm5, 8          ; right 8-byte column

%rep 3
  COPY_ADD_HH_RND0 1,0, mm2,mm3
  COPY_ADD_HH_RND0 1,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
%endrep
  COPY_ADD_HH_RND0 1,0, mm2,mm3
  COPY_ADD_HH_RND0 1,8, mm4,mm5
  ret

align 16
Skl_Copy_16x8_HH_Rnd0_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0          ; left 8-byte column
  HH_SETUP mm4, mm5, 8          ; right 8-byte column
    ; 103c  (original author's note, presumably a cycle count)
  nop

  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]

  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]

  nop

  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]

  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  ret

  ; Both labels below name the same entry point.
align 16
Skl_HV_Pass_2Taps_SSE:
Skl_Copy_16x16_HH_Rnd0_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0          ; left 8-byte column
  HH_SETUP mm4, mm5, 8          ; right 8-byte column

%rep 3                          ; each pass: nop, then two process+advance steps
  nop
  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
%endrep

  nop                           ; last pass: no advance after the final step
  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  COPY_ADD_HH_RND0 0,0, mm2,mm3
  COPY_ADD_HH_RND0 0,8, mm4,mm5
  ret

;//////////////////////////////////////////////////////////////////////
; Half-pel (HH) copy entry points, rounding variant 1.
; Same structure as the Rnd0 routines, built on COPY_ADD_HH_RND1
; (macro defined outside this chunk).
;//////////////////////////////////////////////////////////////////////

align 16
Skl_Copy_8x4_HH_Rnd1_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0

  COPY_ADD_HH_RND1 0,0, mm2,mm3
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
  COPY_ADD_HH_RND1 0,0, mm2,mm3
  ret

align 16
Skl_Copy_8x8_HH_Rnd1_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0
    ; 53c  (original author's note, presumably a cycle count)

%rep 3
  COPY_ADD_HH_RND1 0,0, mm2,mm3
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
%endrep
  COPY_ADD_HH_RND1 0,0, mm2,mm3
  ret

align 16
Skl_Copy_16x8_HH_Rnd1_SSE:
  PROLOG0
  movq mm7, [Mask]
  HH_SETUP mm2, mm3, 0          ; left 8-byte column
  HH_SETUP mm4, mm5, 8          ; right 8-byte column
    ; 103c  (original author's note, presumably a cycle count)

%rep 3
  COPY_ADD_HH_RND1 0,0, mm2,mm3
  COPY_ADD_HH_RND1 0,8, mm4,mm5
  lea eax, [eax+2*edx]
  lea ecx, [ecx+2*edx]
%endrep
  COPY_ADD_HH_RND1 0,0, mm2,mm3
  COPY_ADD_HH_RND1 0,8, mm4,mm5
  ret

;//////////////////////////////////////////////////////////////////////
;//
;// 8x8 -> 16x16 upsampling (16b)
;//
;//////////////////////////////////////////////////////////////////////

  ; MUL_PACK: horizontal filter step.
  ;   %1 = %1*%3 + mm4*%4 + [Cst2]
  ;   %2 = %2*%3 + mm5*%4 + [Cst2]
  ; mm4/mm5 are overwritten with their products.
%macro MUL_PACK 4   ; %1/%2: regs   %3/%4: Up13/Up31
  pmullw %1,  %3    ; [Up13]
  pmullw mm4, %4    ; [Up31]
  pmullw %2,  %3    ; [Up13]
  pmullw mm5, %4    ; [Up31]
  paddsw %1, [Cst2]
  paddsw %2, [Cst2]
  paddsw %1, mm4
  paddsw %2, mm5
%endmacro

  ; COL03: gather the shuffled source words needed for the left half of
  ; one row. Source rows are 16 bytes apart (8 words of 16 bits).
%macro COL03 3      ; %1/%2: regs, %3: row   -trashes mm4/mm5
  movq %2, [edx+%3*16+0*2]                             ; <- 0|1|2|3
  pshufw %1,  %2, (0+0*4+0*16+1*64)                    ; %1 = 0|0|0|1
  pshufw mm4, %2, (0+1*4+1*16+2*64)                    ; mm4= 0|1|1|2
  pshufw %2,  %2, (1+2*4+2*16+3*64)                    ; %2 = 1|2|2|3
  pshufw mm5, [edx+%3*16+2*2], (0+1*4+1*16+2*64)       ; mm5 = 2|3|3|4
%endmacro

  ; COL47: same as COL03, for the right half of the row.
%macro COL47 3      ; %1-%2: regs, %3: row   -trashes mm4/mm5
  pshufw %1, [edx+%3*16+2*2], (1+2*4+2*16+3*64)        ; 3|4|4|5
  movq mm5, [edx+%3*16+2*4]                            ; <- 4|5|6|7
  pshufw mm4, mm5, (0+1*4+1*16+2*64)                   ; 4|5|5|6
  pshufw %2,  mm5, (1+2*4+2*16+3*64)                   ; 5|6|6|7
  pshufw mm5, mm5, (2+3*4+3*16+3*64)                   ; 6|7|7|7
%endmacro

  ; MIX_ROWS: vertical filter step between the previous and current row.
%macro MIX_ROWS 4   ; %1/%2:prev %3/4:cur (preserved)  mm4/mm5: output
  ; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
  movq mm4, [Cst3]
  movq mm5, [Cst3]
  pmullw mm4, %3
  pmullw mm5, %4
  paddsw mm4, %1    ; must read the old %1/%2 before they are rescaled below
  paddsw mm5, %2
  pmullw %1, [Cst3]
  pmullw %2, [Cst3]
  paddsw %1, %3
  paddsw %2, %4
%endmacro

;//////////////////////////////////////////////////////////////////////

  ; Note: we can use ">>2" instead of "/4" here, since we
  ; are (supposed to be) averaging positive values

  ; STORE_1: scale one row by >>2, pack to unsigned bytes, write 8 bytes
  ; at [ecx]. Destroys %1 and %2; does not advance ecx.
%macro STORE_1 2
  psraw %1, 2
  psraw %2, 2
  packuswb %1,%2
  movq [ecx], %1
%endmacro

  ; STORE_2: scale two rows by >>4, pack, write them at [ecx] and
  ; [ecx+eax], then advance ecx by 2*eax.
%macro STORE_2 2    ; pack and store (%1,%2) + (mm4,mm5)
  psraw %1, 4
  psraw %2, 4
  psraw mm4, 4
  psraw mm5, 4
  packuswb %1,%2
  packuswb mm4, mm5
  movq [ecx], %1
  movq [ecx+eax], mm4
  lea ecx, [ecx+2*eax]
%endmacro

;----------------------------------------------------------------------
; Skl_Copy_Upsampled_8x8_16To8_SSE
; Stack arguments (as annotated by the original author):
;   [esp+4]  = Dst
;   [esp+8]  = Src   (read as 16-bit words, 16 bytes per row)
;   [esp+12] = BpS   (destination stride)
; Writes 16 rows of 16 bytes: first the left 8 columns (COL03), then the
; right 8 columns (COL47). The first and last output rows are stored
; with STORE_1, the 14 rows in between pairwise with STORE_2.
; Uses eax, ecx, edx and mm0-mm7; no emms is issued here.
;----------------------------------------------------------------------
align 16
Skl_Copy_Upsampled_8x8_16To8_SSE:  ; 315c

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

  movq mm6, [Up13]  ; keep both coefficient vectors in registers
  movq mm7, [Up31]

    ; ---- left half ----

  COL03 mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0     ; STORE_1 destroys its operands and row 0 is
  movq mm5, mm1     ; still needed for the next MIX_ROWS: store a copy
  STORE_1 mm4, mm5
  add ecx, eax

  COL03 mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03 mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03 mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03 mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03 mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL03 mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL03 mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  STORE_1 mm2, mm3  ; last output row

    ; ---- right half ----

  mov ecx, [esp+4]  ; reload Dst and step 8 bytes to the right
  add ecx, 8

  COL47 mm0, mm1, 0
  MUL_PACK mm0,mm1, mm6, mm7
  movq mm4, mm0
  movq mm5, mm1
  STORE_1 mm4, mm5
  add ecx, eax

  COL47 mm2, mm3, 1
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47 mm0, mm1, 2
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47 mm2, mm3, 3
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47 mm0, mm1, 4
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47 mm2, mm3, 5
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  COL47 mm0, mm1, 6
  MUL_PACK mm0,mm1, mm6, mm7
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_2 mm2, mm3

  COL47 mm2, mm3, 7
  MUL_PACK mm2,mm3, mm6, mm7
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_2 mm0, mm1

  STORE_1 mm2, mm3  ; last output row

  ret

;//////////////////////////////////////////////////////////////////////

  ; Note: grrr... the 'pcmpgtw' stuff are the "/4" and "/16" operators
  ; implemented with ">>2" and ">>4" using:
  ;       x/4  = ( (x-(x<0))>>2 ) + (x<0)
  ;       x/16 = ( (x-(x<0))>>4 ) + (x<0)

  ; STORE_ADD_1: signed /4 of one row, add to the 8 destination bytes at
  ; [ecx], saturate-pack and write back. Trashes mm6/mm7 and %1/%2.
%macro STORE_ADD_1 2
    ; We subtract the rounder '2' for corner pixels,
    ; since when 'x' is negative, (x*4 + 2)/4 is *not*
    ; equal to 'x'. In fact, the correct relation is:
    ;         (x*4 + 2)/4 = x - (x<0)
    ; So, better revert to (x*4)/4 = x.

  psubsw %1, [Cst2000]
  psubsw %2, [Cst0002]

  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, %1   ; mm6/mm7 = 0xffff (-1) in lanes where x<0, else 0
  pcmpgtw mm7, %2
  paddsw %1, mm6    ; x - (x<0)
  paddsw %2, mm7
  psraw %1, 2
  psraw %2, 2
  psubsw %1, mm6    ; ... + (x<0)
  psubsw %2, mm7

    ; mix with destination [ecx]
  movq mm6, [ecx]
  movq mm7, [ecx]
  punpcklbw mm6, [Cst0]   ; widen the low/high 4 bytes to words
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7
  packuswb %1,%2
  movq [ecx], %1
%endmacro

  ; STORE_ADD_2: signed /16 of two rows, (%1,%2) and (mm4,mm5), added to
  ; the destination bytes at [ecx] and [ecx+eax]; then ecx += 2*eax.
  ; Trashes mm6/mm7.
%macro STORE_ADD_2 2
    ; first row: (%1,%2) / 16
  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, %1
  pcmpgtw mm7, %2
  paddsw %1, mm6
  paddsw %2, mm7
  psraw %1, 4
  psraw %2, 4
  psubsw %1, mm6
  psubsw %2, mm7

    ; second row: (mm4,mm5) / 16
  pxor mm6, mm6
  pxor mm7, mm7
  pcmpgtw mm6, mm4
  pcmpgtw mm7, mm5
  paddsw mm4, mm6
  paddsw mm5, mm7
  psraw mm4, 4
  psraw mm5, 4
  psubsw mm4, mm6
  psubsw mm5, mm7

    ; mix with destination
  movq mm6, [ecx]
  movq mm7, [ecx]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw %1, mm6
  paddsw %2, mm7

  movq mm6, [ecx+eax]
  movq mm7, [ecx+eax]
  punpcklbw mm6, [Cst0]
  punpckhbw mm7, [Cst0]
  paddsw mm4, mm6
  paddsw mm5, mm7

  packuswb %1,%2
  packuswb mm4, mm5

  movq [ecx], %1
  movq [ecx+eax], mm4

  lea ecx, [ecx+2*eax]
%endmacro

;----------------------------------------------------------------------
; Skl_Add_Upsampled_8x8_16To8_SSE
; Same traversal as Skl_Copy_Upsampled_8x8_16To8_SSE, but each result is
; added to the existing destination bytes (STORE_ADD_1 / STORE_ADD_2).
; Stack arguments (as annotated by the original author):
;   [esp+4]  = Dst
;   [esp+8]  = Src   (read as 16-bit words, 16 bytes per row)
;   [esp+12] = BpS   (destination stride)
; The STORE_ADD_* macros use mm6/mm7 as scratch, so [Up13]/[Up31] are
; passed to MUL_PACK as memory operands instead of being cached.
; Uses eax, ecx, edx and mm0-mm7; no emms is issued here.
;----------------------------------------------------------------------
align 16
Skl_Add_Upsampled_8x8_16To8_SSE:  ; 549c

  mov ecx, [esp+4]  ; Dst
  mov edx, [esp+8]  ; Src
  mov eax, [esp+12] ; BpS

    ; ---- left half ----

  COL03 mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0     ; work on a copy: row 0 is needed again by MIX_ROWS
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL03 mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03 mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03 mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03 mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03 mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL03 mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL03 mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3  ; last output row

    ; ---- right half ----

  mov ecx, [esp+4]  ; reload Dst and step 8 bytes to the right
  add ecx, 8

  COL47 mm0, mm1, 0
  MUL_PACK mm0,mm1, [Up13], [Up31]
  movq mm4, mm0
  movq mm5, mm1
  STORE_ADD_1 mm4, mm5
  add ecx, eax

  COL47 mm2, mm3, 1
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47 mm0, mm1, 2
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47 mm2, mm3, 3
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47 mm0, mm1, 4
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47 mm2, mm3, 5
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  COL47 mm0, mm1, 6
  MUL_PACK mm0,mm1, [Up13], [Up31]
  MIX_ROWS mm2, mm3, mm0, mm1
  STORE_ADD_2 mm2, mm3

  COL47 mm2, mm3, 7
  MUL_PACK mm2,mm3, [Up13], [Up31]
  MIX_ROWS mm0, mm1, mm2, mm3
  STORE_ADD_2 mm0, mm1

  STORE_ADD_1 mm2, mm3  ; last output row

  ret

;//////////////////////////////////////////////////////////////////////
  ; pfeewwww... Never Do That On Stage Again. :)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -