📄 deblock-a.asm
字号:
; NOTE(review): everything down to "%endmacro ; DEBLOCK_LUMA" is the tail of a
; definition whose beginning lies above this chunk.  It is reproduced unchanged
; apart from restoring one statement per line.
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
    %assign pad 0x78-(stack_offset&15)
    SUB    esp, pad
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    ; The five PUSHes build the argument list for the callee on the stack;
    ; they are released again by "ADD esp, 20" below.
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   x264_deblock_%2_luma_%1
%ifidn %2, v8
    ; The v8 variant is called a second time with the first stacked argument
    ; advanced by 8 and the fifth one advanced by 2.
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   x264_deblock_%2_luma_%1
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2
    lea    r1, [r0+r4]

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4_STORE  PASS8ROWS(r0, r1, r3, r4)

    ADD    esp, pad
    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX
DEBLOCK_LUMA mmxext, v8, 8
INIT_XMM
DEBLOCK_LUMA sse2, v, 16

%endif ; ARCH



;-----------------------------------------------------------------------------
; LUMA_INTRA_P012 dst_p0, dst_p1, dst_p2, src_p3
;
; Computes the filtered p0', p1' and p2' for one side of the edge and stores
; them to the memory operands %1, %2 and %3.  %4 is the memory operand p3 is
; loaded from.
;
; Reads:    p0, p1, p2, q0, q1, mask0, mask1p, mpb_00, mpb_01
;           (all are %defines set up by the invoking macro)
; Clobbers: t0, t1, t2, t3, t4, t5
;
; Each result is first approximated with chained pavgb, then corrected by one
; LSB using the low bit of the exactly-summed value (the psrlw / pavgb mpb_00 /
; pxor / pand mpb_01 / psubb sequences).
;-----------------------------------------------------------------------------
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1 ; keep (p0+q0+1)/2 for the p0'a computation below
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
    paddb t2, t3
    mova  t3, t2 ; copy of the byte sum, reused for p0'a
    mova  t4, t2 ; copy of the byte sum, reused for p2'
    psrlw t2, 1
    pavgb t2, mpb_00
    pxor  t2, t0
    pand  t2, mpb_01
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_01
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_00
    pxor  t3, t1
    pand  t3, mpb_01
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    mova  t3, p0
    mova  t2, p0
    pxor  t3, q1
    pavgb t2, q1
    pand  t3, mpb_01
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4

    ; Bitwise select without branches:
    ;   result = p0 ^ ((p0'b ^ p0) & mask0) ^ ((p0'a ^ p0'b) & mask1p)
    ; i.e. p0'a where mask1p is set, p0'b where only mask0 is set, and the
    ; unfiltered p0 elsewhere (the callers AND mask1p with mask0 beforehand).
    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    mova  t2, t1
    pavgb t1, p2
    paddb t2, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_00
    pxor  t2, t1
    pand  t2, mpb_01
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    ; Keep the filtered p1'/p2' only where mask1p is set, otherwise write back
    ; the original p1/p2 (xor / and / xor select).
    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro

; Re-point the p*/q* names (and mask1p) at the opposite side of the edge so
; that a second LUMA_INTRA_P012 invocation filters the q side.
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

;-----------------------------------------------------------------------------
; DEBLOCK_LUMA_INTRA cpu_suffix, direction
;   %1 = suffix appended to the generated symbol names (sse2 / mmxext below)
;   %2 = direction tag of the vertical-filter entry point (v / v8 below)
;
; Emits x264_deblock_%2_luma_intra_%1 and x264_deblock_h_luma_intra_%1.
;
; On x86_64 every temporary lives in m8..m15 except mask1q, which is kept at
; [rsp-24], i.e. below the stack pointer.
; NOTE(review): that relies on memory below rsp staying intact while the
; function runs - confirm this holds for every ABI this file is built for.
; On x86_32 the temporaries that do not fit in m0..m7 are spilled to the
; 0x60 bytes reserved on the stack (spill(n)) and p2/q2 are read from memory.
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA 2
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%ifdef ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
    %define mask1q [rsp-24]
    %define mpb_00 m14
    %define mpb_01 m15
%else
    %define spill(x) [esp+16*x+((stack_offset+4)&15)]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_00 [pb_00 GLOBAL]
    %define mpb_01 [pb_01 GLOBAL]
%endif

;-----------------------------------------------------------------------------
; void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_%2_luma_intra_%1, 4,6,16
%ifndef ARCH_X86_64
    sub     esp, 0x60
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end            ; alpha-1 < 0: skip the filter entirely
    neg     r4
    dec     r3d        ; beta-1
    jl .end            ; beta-1 < 0: skip the filter entirely
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%ifdef ARCH_X86_64
    pxor    mpb_00, mpb_00
    mova    mpb_01, [pb_01 GLOBAL]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_00
    pavgb   t5, mpb_01 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_00 GLOBAL]
    pavgb   m4, [pb_01 GLOBAL] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    ; p side: writes p0', p1', p2' to pix-1*stride, pix-2*stride, pix-3*stride
    ; and reads p3 from pix-4*stride (r4 = pix-4*stride, r5 = 3*stride).
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    ; q side: same computation with the p/q names swapped.
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
%ifndef ARCH_X86_64
    add     esp, 0x60
%endif
    RET

INIT_MMX
%ifdef ARCH_X86_64
;-----------------------------------------------------------------------------
; void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
; Transposes the pixels around the edge into a temporary buffer on the stack,
; runs the vertical intra filter on that buffer with a stride of 0x10, then
; transposes the result back.  r2/r3 (alpha/beta) are not written by this
; wrapper before the call.
cglobal x264_deblock_h_luma_intra_%1, 4,7
    movsxd r10, r1d        ; stride, sign-extended to 64 bits
    lea    r11, [r10*3]    ; 3*stride
    lea    r6, [r0-4]
    lea    r5, [r0-4+r11]
    sub    rsp, 0x88
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r10*8]
    lea    r5, [r5+r10*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r10, r11), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0, [pix_tmp+0x40]
    mov    r1, 0x10
    call   x264_deblock_v_luma_intra_%1

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    ; r6 still addresses the second group of 8 rows here, so that group is
    ; written back first; r6/r5 are then moved back by 8*stride for the first.
    lea    r5, [r6+r11]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    shl    r10, 3
    sub    r6,  r10
    sub    r5,  r10
    shr    r10, 3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r10, r11)
    add    rsp, 0x88
    RET
%else
; x86_32 variant of the horizontal wrapper: same transpose / filter / transpose
; structure, with the callee's arguments passed on the stack.
cglobal x264_deblock_h_luma_intra_%1, 2,4
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
%assign pad 0x8c-(stack_offset&15)
    SUB    rsp, pad
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    ; Four stacked arguments for the callee, released by "ADD esp, 16" below.
    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   x264_deblock_%2_luma_intra_%1
%ifidn %2, v8
    ; The v8 variant is called a second time with the first stacked argument
    ; advanced by 8.
    add    dword [rsp], 8 ; pix_tmp+0x48
    call   x264_deblock_%2_luma_intra_%1
%endif
    ADD    esp, 16

    ; Reload stride and pix from the caller's arguments and rebuild the row
    ; pointers used for the first transpose.
    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    ADD    rsp, pad
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM
DEBLOCK_LUMA_INTRA sse2, v
%ifndef ARCH_X86_64
INIT_MMX
DEBLOCK_LUMA_INTRA mmxext, v8
%endif



INIT_MMX

; Common prologue of the vertical chroma filters:
; r2d = alpha-1, r3d = beta-1, t5 = pix - 2*stride.
%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

; Common prologue of the horizontal chroma filters:
; r2d = alpha-1, r3d = beta-1, t5 = pix-2, t6 = 3*stride, r0 = pix-2 + 3*stride.
%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

; Scratch registers used by the two prologue macros in the 5-argument (inter)
; entry points; they are redefined further down for the 4-argument ones.
%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_mmxext, 5,6
    CHROMA_V_START
    movq  m0, [t5]      ; row at pix-2*stride
    movq  m1, [t5+r1]   ; row at pix-1*stride
    movq  m2, [r0]      ; row at pix
    movq  m3, [r0+r1]   ; row at pix+1*stride
    call chroma_inter_body_mmxext
    ; only the two rows adjacent to the edge are written back
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_mmxext, 5,7
    ; buf0/buf1 hold m0 and m3 across the call so they can be restored for the
    ; store transpose.
    ; NOTE(review): on x86_64 they sit below rsp; on x86_32 they reuse the
    ; stack slots of incoming arguments - confirm both are safe to overwrite.
%ifdef ARCH_X86_64
    %define buf0 [rsp-24]
    %define buf1 [rsp-16]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    call chroma_inter_body_mmxext
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

; Shared body of the two inter chroma filters above.
; In: m0..m3 = the four pixel vectors loaded by the caller, r2d = alpha-1,
;     r3d = beta-1, r4 = tc0.
; The callers store m1 and m2 afterwards.
ALIGN 16
chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6   ; duplicate each tc0 byte into two adjacent bytes
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret



; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
; clobbers: m4
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_01 GLOBAL] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

; The intra entry points below are declared with 4 arguments, so the scratch
; registers used by CHROMA_V_START / CHROMA_H_START move down to r4 / r5.
%define t5 r4
%define t6 r5

;-----------------------------------------------------------------------------
; void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_v_chroma_intra_mmxext, 4,5
    CHROMA_V_START
    movq  m0, [t5]      ; row at pix-2*stride
    movq  m1, [t5+r1]   ; row at pix-1*stride
    movq  m2, [r0]      ; row at pix
    movq  m3, [r0+r1]   ; row at pix+1*stride
    call chroma_intra_body_mmxext
    ; only the two rows adjacent to the edge are written back
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal x264_deblock_h_chroma_intra_mmxext, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  PASS8ROWS(t5, r0, r1, t6)
    call chroma_intra_body_mmxext
    TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

; Shared body of the two intra chroma filters above.
; In:  m0..m3 = the four pixel vectors loaded by the caller,
;      r2d = alpha-1, r3d = beta-1.
; Out: m1, m2 = filtered values where the mask in m7 is set, original values
;      elsewhere.
ALIGN 16
chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1       ; keep the unfiltered values for the masked blend
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    ; blend: result = orig + ((filtered - orig) & mask)
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -