📄 quant-a.asm
字号:
;*****************************************************************************
;* quant-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Christian Heine <sennindemokrit@gmx.net>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; broadcast constants used by the quant/dequant kernels below
pb_1:  times 16 db 1
pw_1:  times 8  dw 1
pd_1:  times 4  dd 1
pb_01: times 8  db 0, 1

; Expand the 3 distinct dequant scale values of a 4x4 matrix into one
; 8-word row of the pattern they occupy in raster order.
%macro DQM4 3
    dw %1, %2, %1, %2, %2, %3, %2, %3
%endmacro

; Same idea for 8x8: 6 distinct values expanded into 3 rows of 8 words.
%macro DQM8 6
    dw %1, %4, %5, %4, %1, %4, %5, %4
    dw %4, %2, %6, %2, %4, %2, %6, %2
    dw %5, %6, %3, %6, %5, %6, %3, %6
    ; last line not used, just padding for power-of-2 stride
    times 8 dw 0
%endmacro

; per-QP%6 dequant scale tables consumed by the *_flat16_* functions below
dequant4_scale:
    DQM4 10, 13, 16
    DQM4 11, 14, 18
    DQM4 13, 16, 20
    DQM4 14, 18, 23
    DQM4 16, 20, 25
    DQM4 18, 23, 29

dequant8_scale:
    DQM8 20, 18, 32, 19, 25, 24
    DQM8 22, 19, 35, 21, 28, 26
    DQM8 26, 23, 42, 24, 33, 31
    DQM8 28, 25, 45, 26, 35, 33
    DQM8 32, 28, 51, 30, 40, 38
    DQM8 36, 32, 58, 34, 46, 43

; 256-entry byte lookup table; not referenced within this file excerpt —
; presumably consumed by the decimate routines elsewhere in x264 (TODO confirm)
decimate_mask_table4:
    db  0,3,2,6,2,5,5,9,1,5,4,8,5,8,8,12,1,4,4,8,4,7,7,11,4,8,7,11,8,11,11,15,1,4
    db  3,7,4,7,7,11,3,7,6,10,7,10,10,14,4,7,7,11,7,10,10,14,7,11,10,14,11,14,14
    db 18,0,4,3,7,3,6,6,10,3,7,6,10,7,10,10,14,3,6,6,10,6,9,9,13,6,10,9,13,10,13
    db 13,17,4,7,6,10,7,10,10,14,6,10,9,13,10,13,13,17,7,10,10,14,10,13,13,17,10
    db 14,13,17,14,17,17,21,0,3,3,7,3,6,6,10,2,6,5,9,6,9,9,13,3,6,6,10,6,9,9,13
    db  6,10,9,13,10,13,13,17,3,6,5,9,6,9,9,13,5,9,8,12,9,12,12,16,6,9,9,13,9,12
    db 12,16,9,13,12,16,13,16,16,20,3,7,6,10,6,9,9,13,6,10,9,13,10,13,13,17,6,9
    db  9,13,9,12,12,16,9,13,12,16,13,16,16,20,7,10,9,13,10,13,13,17,9,13,12,16
    db 13,16,16,20,10,13,13,17,13,16,16,20,13,17,16,20,17,20,20,24

SECTION .text

; Broadcast the scalar mf/bias args into all words of m6/m7.
; MMX has no pshuflw, hence the pshufw path when m0 maps to mm0.
%macro QUANT_DC_START_MMX 0
    movd       m6, r1m     ; mf
    movd       m7, r2m     ; bias
%ifidn m0, mm0
    pshufw     m6, m6, 0
    pshufw     m7, m7, 0
%else
    pshuflw    m6, m6, 0
    pshuflw    m7, m7, 0
    punpcklqdq m6, m6
    punpcklqdq m7, m7
%endif
%endmacro

; SSSE3 variant: pshufb against the 0,1 byte pattern replicates the low word.
%macro QUANT_DC_START_SSSE3 0
    movdqa     m5, [pb_01 GLOBAL]
    movd       m6, r1m     ; mf
    movd       m7, r2m     ; bias
    pshufb     m6, m5
    pshufb     m7, m5
%endmacro

; packed abs of signed words: %1 = |%2| (pre-SSSE3 emulation)
%macro PABSW_MMX 2
    pxor       %1, %1
    pcmpgtw    %1, %2
    pxor       %2, %1
    psubw      %2, %1
    SWAP       %1, %2
%endmacro

; apply the sign of %2 to %1 (pre-SSSE3 emulation of psignw)
%macro PSIGNW_MMX 2
    pxor       %1, %2
    psubw      %1, %2
%endmacro

%macro PABSW_SSSE3 2
    pabsw      %1, %2
%endmacro

%macro PSIGNW_SSSE3 2
    psignw     %1, %2
%endmacro

; Quantize one register's worth of coefficients:
;   dct[i] = sign(dct[i]) * (((abs(dct[i]) + bias) * mf) >> 16)
; (pmulhuw keeps the high 16 bits of the product, i.e. the >>16)
; m5 accumulates nonzero-ness for QUANT_END.
%macro QUANT_ONE 4
;;; %1  (m64)       dct[y][x]
;;; %2  (m64/mmx)   mf[y][x] or mf[0][0] (as uint16_t)
;;; %3  (m64/mmx)   bias[y][x] or bias[0][0] (as uint16_t)
    mova       m1, %1   ; load dct coeffs
    PABSW      m0, m1
    paddusw    m0, %3   ; round
    pmulhuw    m0, %2   ; divide
    PSIGNW     m0, m1   ; restore sign
    mova       %1, m0   ; store
%if %4
    por        m5, m0   ; accumulate nonzero flags
%else
    SWAP       m5, m0   ; first iteration: initialize the accumulator
%endif
%endmacro

; Same as QUANT_ONE but two registers at a time (%7 nonzero => accumulate
; into an already-initialized m5).
%macro QUANT_TWO 7
    mova       m1, %1
    mova       m3, %2
    PABSW      m0, m1
    PABSW      m2, m3
    paddusw    m0, %5
    paddusw    m2, %6
    pmulhuw    m0, %3
    pmulhuw    m2, %4
    PSIGNW     m0, m1
    PSIGNW     m2, m3
    mova       %1, m0
    mova       %2, m2
%if %7
    por        m5, m0
    por        m5, m2
%else
    SWAP       m5, m0
    por        m5, m2
%endif
%endmacro

; Set eax = 1 iff any quantized coefficient was nonzero (m5 != 0).
%macro QUANT_END_MMX 0
    xor      eax, eax
%ifndef ARCH_X86_64
%if mmsize==8
    packsswb m5, m5
    movd     ecx, m5
    test     ecx, ecx
%else
    pxor     m4, m4
    pcmpeqb  m5, m4
    pmovmskb ecx, m5
    cmp      ecx, (1<<mmsize)-1
%endif
%else
%if mmsize==16
    packsswb m5, m5
%endif
    movq     rcx, m5
    test     rcx, rcx
%endif
    setne    al
%endmacro

%macro QUANT_END_SSE4 0
    xor      eax, eax
    ptest    m5, m5
    setne    al
%endmacro

;-----------------------------------------------------------------------------
; void x264_quant_4x4_dc_mmxext( int16_t dct[16], int mf, int bias )
;-----------------------------------------------------------------------------
; %1 = function name, %2 = number of mmsize chunks, %3 = xmm reg count (win64)
%macro QUANT_DC 2-3 0
cglobal %1, 1,1,%3
    QUANT_DC_START
%if %2==1
    QUANT_ONE [r0], m6, m7, 0
%else
%assign x 0
%rep %2/2
    QUANT_TWO [r0+x], [r0+x+mmsize], m6, m6, m7, m7, x
%assign x x+mmsize*2
%endrep
%endif
    QUANT_END
    RET
%endmacro

;-----------------------------------------------------------------------------
; int x264_quant_4x4_mmx( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
;-----------------------------------------------------------------------------
; AC variant: per-coefficient mf/bias tables instead of a broadcast scalar.
%macro QUANT_AC 2
cglobal %1, 3,3
%assign x 0
%rep %2/2
    QUANT_TWO [r0+x], [r0+x+mmsize], [r1+x], [r1+x+mmsize], [r2+x], [r2+x+mmsize], x
%assign x x+mmsize*2
%endrep
    QUANT_END
    RET
%endmacro

INIT_MMX
%define QUANT_END QUANT_END_MMX
%define PABSW PABSW_MMX
%define PSIGNW PSIGNW_MMX
%define QUANT_DC_START QUANT_DC_START_MMX
QUANT_DC x264_quant_2x2_dc_mmxext, 1
%ifndef ARCH_X86_64 ; not needed because sse2 is faster
QUANT_DC x264_quant_4x4_dc_mmxext, 4
QUANT_AC x264_quant_4x4_mmx, 4
QUANT_AC x264_quant_8x8_mmx, 16
%endif

INIT_XMM
QUANT_DC x264_quant_4x4_dc_sse2, 2, 8
QUANT_AC x264_quant_4x4_sse2, 2
QUANT_AC x264_quant_8x8_sse2, 8

%define PABSW PABSW_SSSE3
%define PSIGNW PSIGNW_SSSE3
QUANT_DC x264_quant_4x4_dc_ssse3, 2, 8
QUANT_AC x264_quant_4x4_ssse3, 2
QUANT_AC x264_quant_8x8_ssse3, 8

INIT_MMX
QUANT_DC x264_quant_2x2_dc_ssse3, 1
%define QUANT_END QUANT_END_SSE4
;Not faster on Conroe, so only used in SSE4 versions
%define QUANT_DC_START QUANT_DC_START_SSSE3
INIT_XMM
QUANT_DC x264_quant_4x4_dc_sse4, 2, 8
QUANT_AC x264_quant_4x4_sse4, 2
QUANT_AC x264_quant_8x8_sse4, 8



;=============================================================================
; dequant
;=============================================================================

; Left-shift path (i_qbits >= 0): dct[i] = (dct[i] * mf[i]) << i_qbits
%macro DEQUANT16_L 3
;;; %1      dct[y][x]
;;; %2,%3   dequant_mf[i_mf][y][x]
;;; m2      i_qbits
    mova     m0, %2
    packssdw m0, %3     ; narrow two dwords regs of mf to one words reg
    pmullw   m0, %1
    psllw    m0, m2
    mova     %1, m0
%endmacro

; Right-shift path (i_qbits < 0): dct[i] = (dct[i]*mf[i] + f) >> (-i_qbits)
; done in 32-bit precision via word->dword unpack + pmaddwd.
%macro DEQUANT32_R 3
;;; %1      dct[y][x]
;;; %2,%3   dequant_mf[i_mf][y][x]
;;; m2      -i_qbits
;;; m3      f
;;; m4      0
    mova      m0, %1
    mova      m1, m0
    punpcklwd m0, m4
    punpckhwd m1, m4
    pmaddwd   m0, %2
    pmaddwd   m1, %3
    paddd     m0, m3
    paddd     m1, m3
    psrad     m0, m2
    psrad     m1, m2
    packssdw  m0, m1
    mova      %1, m0
%endmacro

; Apply %1 (one of the two macros above) over %2/4 coefficient groups,
; looping via t0 when the block doesn't fit in two applications.
%macro DEQUANT_LOOP 3
%if 8*(%2-2*%3)
    mov t0d, 8*(%2-2*%3)
%%loop:
    %1 [r0+t0+8*%3], [r1+t0*2+16*%3], [r1+t0*2+24*%3]
    %1 [r0+t0     ], [r1+t0*2      ], [r1+t0*2+ 8*%3]
    sub t0d, 16*%3
    jge %%loop
    REP_RET
%else
    %1 [r0+8*%3], [r1+16*%3], [r1+24*%3]
    %1 [r0     ], [r1      ], [r1+ 8*%3]
    RET
%endif
%endmacro

; Flat-table dequant: m0 = word scale row (%1), then for each listed offset
; dct[off] = (dct[off] * scale) << m4.  Extra args reuse the same scale row.
%macro DEQUANT16_FLAT 2-5
    mova   m0, %1
%assign i %0-2
%rep %0-1
%if i
    mova   m %+ i, [r0+%2]
    pmullw m %+ i, m0
%else
    pmullw m0, [r0+%2]
%endif
    psllw  m %+ i, m4
    mova   [r0+%2], m %+ i
%assign i i-1
%rotate 1
%endrep
%endmacro

%ifdef WIN64
    DECLARE_REG_TMP 6,3,2
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 4,3,2
%else
    DECLARE_REG_TMP 2,0,1
%endif

; Split i_qp into i_qbits = i_qp/6 (t0) and i_mf = i_qp%6 (t2, pre-scaled
; by <<%1 to index dequant_mf), then branch to .rshift32 when qbits go
; negative.  i_qp/6 is computed as (i_qp*43)>>8, exact for the valid qp range.
%macro DEQUANT_START 2
    movifnidn t2d, r2m
    imul t0d, t2d, 0x2b
    shr  t0d, 8     ; i_qbits = i_qp / 6
    lea  t1, [t0*3]
    sub  t2d, t1d
    sub  t2d, t1d   ; i_mf = i_qp % 6
    shl  t2d, %1
%ifdef ARCH_X86_64
    add  r1, t2     ; dequant_mf[i_mf]
%else
    add  r1, r1mp   ; dequant_mf[i_mf]
    mov  r0, r0mp   ; dct
%endif
    sub  t0d, %2
    jl   .rshift32  ; negative qbits => rightshift
%endmacro

;-----------------------------------------------------------------------------
; void x264_dequant_4x4_mmx( int16_t dct[4][4], int dequant_mf[6][4][4], int i_qp )
;-----------------------------------------------------------------------------
; %1 = isa suffix, %2 = block dim (4 or 8), %3 = log2 mf stride, %4 = loop unit
%macro DEQUANT 4
cglobal x264_dequant_%2x%2_%1, 0,3
.skip_prologue:
    DEQUANT_START %3+2, %3
.lshift:
    movd m2, t0d
    DEQUANT_LOOP DEQUANT16_L, %2*%2/4, %4

.rshift32:
    neg   t0d
    movd  m2, t0d
    mova  m3, [pd_1 GLOBAL]
    pxor  m4, m4
    pslld m3, m2
    psrld m3, 1         ; m3 = f = 1 << (-i_qbits-1)
    DEQUANT_LOOP DEQUANT32_R, %2*%2/4, %4

; 16-bit flat-matrix variant using the dequant%2_scale tables; for 8x8,
; qp >= 12 guarantees qbits >= 0, otherwise fall back to the full version.
cglobal x264_dequant_%2x%2_flat16_%1, 0,3
    movifnidn t2d, r2m
%if %2 == 8
    cmp  t2d, 12
    jl x264_dequant_%2x%2_%1.skip_prologue
    sub  t2d, 12
%endif
    imul t0d, t2d, 0x2b
    shr  t0d, 8     ; i_qbits = i_qp / 6
    lea  t1, [t0*3]
    sub  t2d, t1d
    sub  t2d, t1d   ; i_mf = i_qp % 6
    shl  t2d, %3
%ifdef PIC
    lea  r1, [dequant%2_scale GLOBAL]
    add  r1, t2
%else
    lea  r1, [dequant%2_scale + t2 GLOBAL]
%endif
    movifnidn r0, r0mp
    movd m4, t0d
%if %2 == 4
%ifidn %1, mmx
    DEQUANT16_FLAT [r1], 0, 16
    DEQUANT16_FLAT [r1+8], 8, 24
%else
    DEQUANT16_FLAT [r1], 0, 16
%endif
%elifidn %1, mmx
    DEQUANT16_FLAT [r1], 0, 8, 64, 72
    DEQUANT16_FLAT [r1+16], 16, 24, 48, 56
    DEQUANT16_FLAT [r1+16], 80, 88, 112, 120
    DEQUANT16_FLAT [r1+32], 32, 40, 96, 104
%else
    DEQUANT16_FLAT [r1], 0, 64
    DEQUANT16_FLAT [r1+16], 16, 48, 80, 112
    DEQUANT16_FLAT [r1+32], 32, 96
%endif
    RET
%endmacro ; DEQUANT

%ifndef ARCH_X86_64
INIT_MMX
DEQUANT mmx, 4, 4, 1
DEQUANT mmx, 8, 6, 1
%endif
INIT_XMM
DEQUANT sse2, 4, 4, 2
DEQUANT sse2, 8, 6, 2

; DC-only dequant: a single mf value broadcast over all 16 coefficients.
%macro DEQUANT_DC 1
cglobal x264_dequant_4x4dc_%1, 0,3
    DEQUANT_START 6, 6

.lshift:
    movd   m3, [r1]
    movd   m2, t0d
    pslld  m3, m2       ; m3 = mf << i_qbits
%if mmsize==16
    pshuflw  m3, m3, 0
    punpcklqdq m3, m3
%else
    pshufw m3, m3, 0
%endif
%assign x 0
%rep 16/mmsize
    mova     m0, [r0+mmsize*0+x]
    mova     m1, [r0+mmsize*1+x]
    pmullw   m0, m3
    pmullw   m1, m3
    mova     [r0+mmsize*0+x], m0
    mova     [r0+mmsize*1+x], m1
%assign x x+mmsize*2
%endrep
    RET

.rshift32:
    neg    t0d
    movd   m3, t0d
    mova   m4, [pw_1 GLOBAL]
    mova   m5, m4
    pslld  m4, m3
    psrld  m4, 1        ; m4 = f = 1 << (-i_qbits-1)
    movd   m2, [r1]
%if mmsize==8
    punpcklwd m2, m2
%else
    pshuflw   m2, m2, 0
%endif
    punpcklwd m2, m4    ; interleave mf with f: pmaddwd below => dct*mf + 1*f
%assign x 0
%rep 32/mmsize
    mova      m0, [r0+x]
    mova      m1, m0
    punpcklwd m0, m5    ; pair each dct word with the constant 1
    punpckhwd m1, m5
    pmaddwd   m0, m2
    pmaddwd   m1, m2
    psrad     m0, m3
    psrad     m1, m3
    packssdw  m0, m1
    mova      [r0+x], m0
%assign x x+mmsize
%endrep
    RET
%endmacro
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -