;*****************************************************************************
;* quant-a.asm (fragment): x264 SIMD quantization helpers
;* denoise_dct, decimate_score, coeff_last, coeff_level_run
;* NASM/yasm syntax; relies on x264's x86inc.asm macro framework
;* (cglobal, INIT_MMX/INIT_XMM, mova, GLOBAL, DECLARE_REG_TMP, ...)
;*****************************************************************************
; Tail of the DEQUANT_DC section: instantiate the (not-visible-here) macro
; for MMX and SSE2. NOTE(review): DEQUANT_DC is defined earlier in the file.
INIT_MMX
DEQUANT_DC mmxext
INIT_XMM
DEQUANT_DC sse2

;-----------------------------------------------------------------------------
; void x264_denoise_dct_mmx( int16_t *dct, uint32_t *sum, uint16_t *offset, int size );
;-----------------------------------------------------------------------------
; For each coefficient (walking backwards from r3=size):
;   sum[i] += abs(dct[i])                        (32-bit accumulators)
;   dct[i]  = sign(dct[i]) * max(abs(dct[i]) - offset[i], 0)
; The DC coefficient dct[0] is saved up front and restored at the end,
; so it is accumulated into sum[] but never denoised.
; %1 = cpu suffix, %2 = xmm register count for cglobal (default 0).
%macro DENOISE_DCT 1-2 0
cglobal x264_denoise_dct_%1, 4,5,%2
    movzx     r4d, word [r0] ; backup DC coefficient
    pxor      m6, m6         ; zero, for widening words to dwords
.loop:
    sub       r3, mmsize
    mova      m2, [r0+r3*2+0*mmsize]
    mova      m3, [r0+r3*2+1*mmsize]
    PABSW     m0, m2
    PABSW     m1, m3
    mova      m4, m0         ; keep abs values for the sum[] accumulation
    mova      m5, m1
    psubusw   m0, [r2+r3*2+0*mmsize] ; unsigned saturating: max(abs-offset,0)
    psubusw   m1, [r2+r3*2+1*mmsize]
    PSIGNW    m0, m2         ; reapply original sign
    PSIGNW    m1, m3
    mova      [r0+r3*2+0*mmsize], m0
    mova      [r0+r3*2+1*mmsize], m1
    mova      m2, m4
    mova      m3, m5
    punpcklwd m4, m6         ; zero-extend u16 abs values to u32
    punpckhwd m2, m6
    punpcklwd m5, m6
    punpckhwd m3, m6
    paddd     m4, [r1+r3*4+0*mmsize]
    paddd     m2, [r1+r3*4+1*mmsize]
    paddd     m5, [r1+r3*4+2*mmsize]
    paddd     m3, [r1+r3*4+3*mmsize]
    mova      [r1+r3*4+0*mmsize], m4
    mova      [r1+r3*4+1*mmsize], m2
    mova      [r1+r3*4+2*mmsize], m5
    mova      [r1+r3*4+3*mmsize], m3
    jg .loop                 ; flags still set by the sub at the loop top
    mov       [r0], r4w      ; restore DC coefficient
    RET
%endmacro

%define PABSW  PABSW_MMX
%define PSIGNW PSIGNW_MMX
%ifndef ARCH_X86_64
INIT_MMX
DENOISE_DCT mmx
%endif
INIT_XMM
DENOISE_DCT sse2, 7
%define PABSW  PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
DENOISE_DCT ssse3, 7

;-----------------------------------------------------------------------------
; int x264_decimate_score( int16_t *dct );
;-----------------------------------------------------------------------------

; DECIMATE_MASK %1=mask_eq0, %2=mask_gt1, %3=src, %4=pb_1 vector, %5=cpu, %6=tmp
; Produces two 16-bit bitmasks over 16 coefficients at %3:
;   %1: bit set where coeff == 0
;   %2: bit set where abs(coeff) > 1
%macro DECIMATE_MASK_SSE2 6
%ifidn %5, ssse3
    pabsw    xmm0, [%3+ 0]
    pabsw    xmm1, [%3+16]
%else
    movdqa   xmm0, [%3+ 0]
    movdqa   xmm1, [%3+16]
    ABS2_MMX xmm0, xmm1, xmm3, xmm4
%endif
    packsswb xmm0, xmm1
    pxor     xmm2, xmm2
    pcmpeqb  xmm2, xmm0
    pcmpgtb  xmm0, %4
    pmovmskb %1, xmm2
    pmovmskb %2, xmm0
%endmacro

; MMX version: two 8-byte halves, merged into 16-bit masks via shl/or.
%macro DECIMATE_MASK_MMX 6
    movq      mm0, [%3+ 0]
    movq      mm1, [%3+ 8]
    movq      mm2, [%3+16]
    movq      mm3, [%3+24]
    ABS2_MMX  mm0, mm1, mm4, mm5
    ABS2_MMX  mm2, mm3, mm4, mm5
    packsswb  mm0, mm1
    packsswb  mm2, mm3
    pxor      mm4, mm4
    pxor      mm5, mm5
    pcmpeqb   mm4, mm0
    pcmpeqb   mm5, mm2
    pcmpgtb   mm0, %4
    pcmpgtb   mm2, %4
    pmovmskb  %6, mm4
    pmovmskb  %1, mm5
    shl       %1, 8
    or        %1, %6
    pmovmskb  %6, mm0
    pmovmskb  %2, mm2
    shl       %2, 8
    or        %2, %6
%endmacro

cextern x264_decimate_table4
cextern x264_decimate_table8

; %1 = number of coeffs (15 or 16), %2 = cpu suffix.
; Returns 9 immediately if any abs(coeff) > 1; otherwise sums per-run scores
; from the LUTs. NOTE(review): decimate_mask_table4 is defined elsewhere in
; this file and pairs runs of zeros with their score contribution.
%macro DECIMATE4x4 2
;A LUT is faster than bsf on AMD processors, and no slower on Intel
;This is not true for score64.
cglobal x264_decimate_score%1_%2, 1,3
%ifdef PIC
    lea r10, [x264_decimate_table4 GLOBAL]
    lea r11, [decimate_mask_table4 GLOBAL]
    %define table r10
    %define mask_table r11
%else
    %define table x264_decimate_table4
    %define mask_table decimate_mask_table4
%endif
    DECIMATE_MASK edx, eax, r0, [pb_1 GLOBAL], %2, ecx
    xor   edx, 0xffff        ; edx: bit set where coeff != 0
    je   .ret                ; all zero -> score 0 (eax already 0)
    test  eax, eax
    jne  .ret9               ; any |coeff| > 1 -> score 9
%if %1==15
    shr   edx, 1             ; drop the DC slot for the 15-coeff variant
%endif
    movzx ecx, dl
    movzx eax, byte [mask_table + rcx]
    cmp   edx, ecx
    je   .ret                ; all nonzero coeffs in the low byte: LUT is final
    bsr   ecx, ecx
    shr   edx, 1
    shr   edx, cl            ; strip the part already scored by the LUT
    bsf   ecx, edx           ; next zero-run length
    shr   edx, 1
    shr   edx, cl
    add    al, byte [table + rcx]
    add    al, byte [mask_table + rdx]
.ret:
    REP_RET
.ret9:
    mov   eax, 9
    RET
%endmacro

%ifndef ARCH_X86_64
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE4x4 15, mmxext
DECIMATE4x4 16, mmxext
%endif
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE4x4 15, sse2
DECIMATE4x4 15, ssse3
DECIMATE4x4 16, sse2
DECIMATE4x4 16, ssse3

; 64-coefficient decimate score. Builds a 64-bit nonzero mask from four
; 16-coefficient DECIMATE_MASK calls, then walks zero-runs with bsf.
%macro DECIMATE8x8 1

%ifdef ARCH_X86_64
cglobal x264_decimate_score64_%1, 1,4
%ifdef PIC
    lea r10, [x264_decimate_table8 GLOBAL]
    %define table r10
%else
    %define table x264_decimate_table8
%endif
    mova  m5, [pb_1 GLOBAL]
    DECIMATE_MASK r1d, eax, r0, m5, %1, null
    test  eax, eax
    jne  .ret9
    DECIMATE_MASK r2d, eax, r0+32, m5, %1, null
    shl   r2d, 16
    or    r1d, r2d
    DECIMATE_MASK r2d, r3d, r0+64, m5, %1, null
    shl   r2, 32
    or    eax, r3d
    or    r1, r2
    DECIMATE_MASK r2d, r3d, r0+96, m5, %1, null
    shl   r2, 48
    or    r1, r2
    xor   r1, -1             ; r1: bit set where coeff == 0 -> invert
    je   .ret
    or    eax, r3d
    jne  .ret9
.loop:
    bsf   rcx, r1            ; length of next zero-run
    shr   r1, cl
    add   al, byte [table + rcx]
    shr   r1, 1
    jne  .loop
.ret:
    REP_RET
.ret9:
    mov   eax, 9
    RET

%else ; ARCH
; 32-bit: the 64-bit mask lives in the r3:r4 register pair.
%ifidn %1, mmxext
cglobal x264_decimate_score64_%1, 1,6
%else
cglobal x264_decimate_score64_%1, 1,5
%endif
    mova  m7, [pb_1 GLOBAL]
    DECIMATE_MASK r3, r2, r0, m7, %1, r5
    test  r2, r2
    jne  .ret9
    DECIMATE_MASK r4, r2, r0+32, m7, %1, r5
    shl   r4, 16
    or    r3, r4
    DECIMATE_MASK r4, r1, r0+64, m7, %1, r5
    or    r2, r1
    DECIMATE_MASK r1, r0, r0+96, m7, %1, r5
    shl   r1, 16
    or    r4, r1
    xor   r3, -1
    je   .tryret
    xor   r4, -1
.cont:
    or    r0, r2
    jne  .ret9               ;r0 is zero at this point, so we don't need to zero it
.loop:
    bsf   ecx, r3
    test  r3, r3
    je   .largerun
    shrd  r3, r4, cl         ; 64-bit shift across the r3:r4 pair
    shr   r4, cl
    add   r0b, byte [x264_decimate_table8 + ecx]
    shrd  r3, r4, 1
    shr   r4, 1
    cmp   r0, 6              ;score64's threshold is never higher than 6
    jge  .ret9               ;this early termination is only useful on 32-bit because it can be done in the latency after shrd
    test  r3, r3
    jne  .loop
    test  r4, r4
    jne  .loop
.ret:
    REP_RET
.tryret:
    xor   r4, -1             ; low half all-zero coeffs; check the high half
    jne  .cont
    REP_RET
.ret9:
    mov   eax, 9
    RET
.largerun:
    mov   r3, r4             ; zero-run >= 32: shift the high word down
    xor   r4, r4
    bsf   ecx, r3
    shr   r3, cl
    shr   r3, 1
    jne  .loop
    REP_RET
%endif ; ARCH

%endmacro

%ifndef ARCH_X86_64
INIT_MMX
%define DECIMATE_MASK DECIMATE_MASK_MMX
DECIMATE8x8 mmxext
%endif
INIT_XMM
%define DECIMATE_MASK DECIMATE_MASK_SSE2
DECIMATE8x8 sse2
DECIMATE8x8 ssse3

;-----------------------------------------------------------------------------
; int x264_coeff_last( int16_t *dct );
;-----------------------------------------------------------------------------

; %1 = output mask (bit set where coeff == 0), %2 = src, %3 = tmp (MMX only).
; Assumes m2/mm2 is already zeroed by the caller.
%macro LAST_MASK_SSE2 2-3
    movdqa   xmm0, [%2+ 0]
    packsswb xmm0, [%2+16]
    pcmpeqb  xmm0, xmm2
    pmovmskb %1, xmm0
%endmacro

%macro LAST_MASK_MMX 3
    movq     mm0, [%2+ 0]
    movq     mm1, [%2+16]
    packsswb mm0, [%2+ 8]
    packsswb mm1, [%2+24]
    pcmpeqb  mm0, mm2
    pcmpeqb  mm1, mm2
    pmovmskb %1, mm0
    pmovmskb %3, mm1
    shl      %3, 8
    or       %1, %3
%endmacro

; LAST %1=dst, %2=src, %3=bit-width-1: index of the highest set bit.
; The lzcnt variant computes (width-1) - lzcnt via xor, matching bsr.
%macro LAST_X86 3
    bsr %1, %2
%endmacro

%macro LAST_SSE4A 3
    lzcnt %1, %2
    xor %1, %3
%endmacro

%macro COEFF_LAST4 1
%ifdef ARCH_X86_64
cglobal x264_coeff_last4_%1, 1,1
    LAST  rax, [r0], 0x3f    ; bsr over all 4 packed 16-bit coeffs at once
    shr   eax, 4             ; bit index -> coefficient index
    RET
%else
cglobal x264_coeff_last4_%1, 0,3
    mov   edx, r0mp
    mov   eax, [edx+4]       ; coeffs 2..3
    xor   ecx, ecx
    test  eax, eax
    cmovz eax, [edx]         ; fall back to coeffs 0..1
    setnz cl                 ; cl = 1 if the answer is in the upper pair
    LAST  eax, eax, 0x1f
    shr   eax, 4
    lea   eax, [eax+ecx*2]
    RET
%endif
%endmacro

%define LAST LAST_X86
COEFF_LAST4 mmxext
%define LAST LAST_SSE4A
COEFF_LAST4 mmxext_lzcnt

%macro COEFF_LAST 1
cglobal x264_coeff_last15_%1, 1,3
    pxor m2, m2
    LAST_MASK r1d, r0-2, r2d ; read from r0-2 so bit i maps to coeff i+1
    xor r1d, 0xffff
    LAST eax, r1d, 0x1f
    dec eax                  ; compensate for the -2 byte offset
    RET

cglobal x264_coeff_last16_%1, 1,3
    pxor m2, m2
    LAST_MASK r1d, r0, r2d
    xor r1d, 0xffff
    LAST eax, r1d, 0x1f
    RET

%ifndef ARCH_X86_64
cglobal x264_coeff_last64_%1, 1, 5-mmsize/16
    pxor m2, m2
    LAST_MASK r2d, r0+64, r4d
    LAST_MASK r3d, r0+96, r4d
    shl r3d, 16
    or  r2d, r3d
    xor r2d, -1
    jne .secondhalf          ; any nonzero in coeffs 32..63?
    LAST_MASK r1d, r0, r4d
    LAST_MASK r3d, r0+32, r4d
    shl r3d, 16
    or  r1d, r3d
    not r1d
    LAST eax, r1d, 0x1f
    RET
.secondhalf:
    LAST eax, r2d, 0x1f
    add eax, 32
    RET
%else
cglobal x264_coeff_last64_%1, 1,4
    pxor m2, m2
    LAST_MASK_SSE2 r1d, r0
    LAST_MASK_SSE2 r2d, r0+32
    LAST_MASK_SSE2 r3d, r0+64
    LAST_MASK_SSE2 r0d, r0+96
    shl r2d, 16
    shl r0d, 16
    or  r1d, r2d
    or  r3d, r0d
    shl r3, 32
    or  r1, r3               ; full 64-bit zero-mask in r1
    not r1
    LAST rax, r1, 0x3f
    RET
%endif
%endmacro

%define LAST LAST_X86
%ifndef ARCH_X86_64
INIT_MMX
%define LAST_MASK LAST_MASK_MMX
COEFF_LAST mmxext
%endif
INIT_XMM
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2
%define LAST LAST_SSE4A
COEFF_LAST sse2_lzcnt

;-----------------------------------------------------------------------------
; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel );
;-----------------------------------------------------------------------------

%macro LAST_MASK4_MMX 2-3
    movq     mm0, [%2]
    packsswb mm0, mm0
    pcmpeqb  mm0, mm2
    pmovmskb %1, mm0
%endmacro

; LZCOUNT %1=dst, %2=src, %3=bit-width-1: count of leading zero bits.
; The bsr form emulates lzcnt via xor with (width-1).
%macro LZCOUNT_X86 3
    bsr %1, %2
    xor %1, %3
%endmacro

%macro LZCOUNT_SSE4A 3
    lzcnt %1, %2
%endmacro

; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%ifdef WIN64
    DECLARE_REG_TMP 3,1,2,0,4,5,6
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,6
%else
    DECLARE_REG_TMP 6,3,2,1,4,5,0
%endif

; %1 = cpu suffix, %2 = number of coeffs (4, 15 or 16).
; Scans dct from the last nonzero coeff down, writing levels and run lengths
; into the runlevel struct; returns the index of the last nonzero coeff.
; NOTE(review): the +4 / +36 stores match x264_run_level_t's level[] and
; run_before[] offsets — verify against the struct definition if it changes.
%macro COEFF_LEVELRUN 2
cglobal x264_coeff_level_run%2_%1,0,7
    movifnidn t0, r0mp
    movifnidn t1, r1mp
    pxor    m2, m2
    LAST_MASK t5d, t0-(%2&1)*2, t4d ; -2 byte bias for the 15-coeff variant
    not    t5d
    shl    t5d, 32-((%2+1)&~1)      ; left-justify the nonzero mask
    mov    t4d, %2-1
    LZCOUNT t3d, t5d, 0x1f
    xor    t6d, t6d
    add    t5d, t5d
    sub    t4d, t3d                 ; t4 = index of last nonzero coeff
    shl    t5d, t3b
    mov   [t1], t4d                 ; runlevel->last
.loop:
    LZCOUNT t3d, t5d, 0x1f          ; zero-run length before this coeff
    mov    t2w, [t0+t4*2]
    mov   [t1+t6  +36], t3b         ; run_before[t6]
    mov   [t1+t6*2+ 4], t2w         ; level[t6]
    inc    t3d
    shl    t5d, t3b
    inc    t6d
    sub    t4d, t3d
    jge   .loop
    REP_RET
%endmacro

INIT_MMX
%define LZCOUNT LZCOUNT_X86
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LEVELRUN mmxext, 15
COEFF_LEVELRUN mmxext, 16
%endif
%define LAST_MASK LAST_MASK4_MMX
COEFF_LEVELRUN mmxext, 4
INIT_XMM
%define LAST_MASK LAST_MASK_SSE2
COEFF_LEVELRUN sse2, 15
COEFF_LEVELRUN sse2, 16
%define LZCOUNT LZCOUNT_SSE4A
COEFF_LEVELRUN sse2_lzcnt, 15
COEFF_LEVELRUN sse2_lzcnt, 16
INIT_MMX
%define LAST_MASK LAST_MASK4_MMX
COEFF_LEVELRUN mmxext_lzcnt, 4
; (end of fragment — non-source text removed: code-viewer keyboard-shortcut
;  help that was accidentally captured during extraction)