;*****************************************************************************
;* pixel.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <holger@lubitz.org>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Alex Izvorski <aizvorksi@gmail.com>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

pw_1:      times 8 dw 1
pw_00ff:   times 8 dw 0xff
ssim_c1:   times 4 dd 416    ; .01*.01*255*255*64
ssim_c2:   times 4 dd 235963 ; .03*.03*255*255*64*63
mask_ff:   times 16 db 0xff
           times 16 db 0
mask_ac4:  dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8:  dw 0, -1, -1, -1, -1, -1, -1, -1
hsub_mul:  times 8 db 1, -1
hmul_4p:   times 2 db 1, 1, 1, 1, 1, -1, 1, -1
hmul_8p:   times 8 db 1
           times 4 db 1, -1
mask_10:   times 4 dw 0, -1
mask_1100: times 2 dd 0, -1

SECTION .text

%macro HADDD 2 ; sum junk
%if mmsize == 16
    movhlps %2, %1
    paddd   %1, %2
    pshuflw %2, %1, 0xE
    paddd   %1, %2
%else
    pshufw  %2, %1, 0xE
    paddd   %1, %2
%endif
%endmacro

%macro HADDW 2
    pmaddwd %1, [pw_1 GLOBAL]
    HADDD   %1, %2
%endmacro

%macro HADDUW 2
    mova  %2, %1
    pslld %1, 16
    psrld %2, 16
    psrld %1, 16
    paddd %1, %2
    HADDD %1, %2
%endmacro

;=============================================================================
; SSD
;=============================================================================
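;-----------------------------------------------------------------------------
; Reference sketch (not part of x264): the SSD_* macros below vectorize a
; plain sum of squared differences over a WxH block of 8-bit pixels. A
; minimal scalar version in C, assuming the same (pix, stride) argument
; order as the asm entry points; the name ssd_wxh_ref and the explicit
; w/h parameters are illustrative only:
;
;   #include <stdint.h>
;   static int ssd_wxh_ref( uint8_t *pix1, int i_stride1,
;                           uint8_t *pix2, int i_stride2, int w, int h )
;   {
;       int ssd = 0;
;       for( int y = 0; y < h; y++ )
;       {
;           for( int x = 0; x < w; x++ )
;           {
;               int d = pix1[x] - pix2[x];
;               ssd += d * d;
;           }
;           pix1 += i_stride1;
;           pix2 += i_stride2;
;       }
;       return ssd;
;   }
;-----------------------------------------------------------------------------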
%macro SSD_LOAD_FULL 5
    mova  m1, [r0+%1]
    mova  m2, [r2+%2]
    mova  m3, [r0+%3]
    mova  m4, [r2+%4]
%if %5
    lea  r0, [r0+2*r1]
    lea  r2, [r2+2*r3]
%endif
%endmacro

%macro LOAD 5
    movh m%1, %3
    movh m%2, %4
%if %5
    lea  r0, [r0+2*r1]
%endif
%endmacro

%macro JOIN 7
    movh m%3, %5
    movh m%4, %6
%if %7
    lea  r2, [r2+2*r3]
%endif
    punpcklbw m%1, m7
    punpcklbw m%3, m7
    psubw     m%1, m%3
    punpcklbw m%2, m7
    punpcklbw m%4, m7
    psubw     m%2, m%4
%endmacro

%macro JOIN_SSE2 7
    movh m%3, %5
    movh m%4, %6
%if %7
    lea  r2, [r2+2*r3]
%endif
    punpcklqdq m%1, m%2
    punpcklqdq m%3, m%4
    DEINTB %2, %1, %4, %3, 7
    psubw m%2, m%4
    psubw m%1, m%3
%endmacro

%macro JOIN_SSSE3 7
    movh m%3, %5
    movh m%4, %6
%if %7
    lea  r2, [r2+2*r3]
%endif
    punpcklbw m%1, m%3
    punpcklbw m%2, m%4
%endmacro

%macro SSD_LOAD_HALF 5
    LOAD 1, 2, [r0+%1], [r0+%3], 1
    JOIN 1, 2, 3, 4, [r2+%2], [r2+%4], 1
    LOAD 3, 4, [r0+%1], [r0+%3], %5
    JOIN 3, 4, 5, 6, [r2+%2], [r2+%4], %5
%endmacro

%macro SSD_CORE 7-8
%ifidn %8, FULL
    mova      m%6, m%2
    mova      m%7, m%4
    psubusb   m%2, m%1
    psubusb   m%4, m%3
    psubusb   m%1, m%6
    psubusb   m%3, m%7
    por       m%1, m%2
    por       m%3, m%4
    mova      m%2, m%1
    mova      m%4, m%3
    punpckhbw m%1, m%5
    punpckhbw m%3, m%5
    punpcklbw m%2, m%5
    punpcklbw m%4, m%5
%endif
    pmaddwd m%1, m%1
    pmaddwd m%2, m%2
    pmaddwd m%3, m%3
    pmaddwd m%4, m%4
%endmacro

%macro SSD_CORE_SSE2 7-8
%ifidn %8, FULL
    DEINTB %6, %1, %7, %2, %5
    psubw  m%6, m%7
    psubw  m%1, m%2
    SWAP   %2, %6
    DEINTB %6, %3, %7, %4, %5
    psubw  m%6, m%7
    psubw  m%3, m%4
    SWAP   %4, %6
%endif
    pmaddwd m%1, m%1
    pmaddwd m%2, m%2
    pmaddwd m%3, m%3
    pmaddwd m%4, m%4
%endmacro

%macro SSD_CORE_SSSE3 7-8
%ifidn %8, FULL
    mova      m%6, m%1
    mova      m%7, m%3
    punpcklbw m%1, m%2
    punpcklbw m%3, m%4
    punpckhbw m%6, m%2
    punpckhbw m%7, m%4
    SWAP %6, %2
    SWAP %7, %4
%endif
    pmaddubsw m%1, m%5
    pmaddubsw m%2, m%5
    pmaddubsw m%3, m%5
    pmaddubsw m%4, m%5
    pmaddwd   m%1, m%1
    pmaddwd   m%2, m%2
    pmaddwd   m%3, m%3
    pmaddwd   m%4, m%4
%endmacro

%macro SSD_END 1
    paddd m1, m2
    paddd m3, m4
%if %1
    paddd m0, m1
%else
    SWAP 0, 1
%endif
    paddd m0, m3
%endmacro

%macro SSD_ITER 7
    SSD_LOAD_%1 %2,%3,%4,%5,%7
    SSD_CORE 1, 2, 3, 4, 7, 5, 6, %1
    SSD_END %6
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_mmx( uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
%macro SSD 3-4 0
cglobal x264_pixel_ssd_%1x%2_%3, 4,4,%4
%ifidn %3, ssse3
    mova m7, [hsub_mul GLOBAL]
%elifidn %3, sse2
    mova m7, [pw_00ff GLOBAL]
%elif %1 >= mmsize
    pxor m7, m7
%endif
%assign i 0
%rep %2/4
%if %1 > mmsize
    SSD_ITER FULL, 0, 0, mmsize, mmsize, i, 0
    SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, 1
    SSD_ITER FULL, 0, 0, mmsize, mmsize, 1, 0
    SSD_ITER FULL, r1, r3, r1+mmsize, r3+mmsize, 1, i<%2/4-1
%elif %1 == mmsize
    SSD_ITER FULL, 0, 0, r1, r3, i, 1
    SSD_ITER FULL, 0, 0, r1, r3, 1, i<%2/4-1
%else
    SSD_ITER HALF, 0, 0, r1, r3, i, i<%2/4-1
%endif
%assign i i+1
%endrep
    HADDD m0, m1
    movd eax, m0
    RET
%endmacro

INIT_MMX
SSD 16, 16, mmx
SSD 16,  8, mmx
SSD  8, 16, mmx
SSD  8,  8, mmx
SSD  8,  4, mmx
SSD  4,  8, mmx
SSD  4,  4, mmx
INIT_XMM
SSD 16, 16, sse2slow, 8
SSD 16,  8, sse2slow, 8
SSD  8, 16, sse2slow, 8
SSD  8,  8, sse2slow, 8
SSD  8,  4, sse2slow, 8
%define SSD_CORE SSD_CORE_SSE2
%define JOIN JOIN_SSE2
SSD 16, 16, sse2, 8
SSD 16,  8, sse2, 8
SSD  8, 16, sse2, 8
SSD  8,  8, sse2, 8
SSD  8,  4, sse2, 8
%define SSD_CORE SSD_CORE_SSSE3
%define JOIN JOIN_SSSE3
SSD 16, 16, ssse3, 8
SSD 16,  8, ssse3, 8
SSD  8, 16, ssse3, 8
SSD  8,  8, ssse3, 8
SSD  8,  4, ssse3, 8
INIT_MMX
SSD  4,  8, ssse3
SSD  4,  4, ssse3

;=============================================================================
; variance
;=============================================================================
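;-----------------------------------------------------------------------------
; Reference sketch (not part of x264): VAR_START/VAR_CORE accumulate the sum
; and the sum of squares of the pixels, and VAR_END returns
; sqr - (sum*sum >> shift), i.e. the unnormalized variance of the block.
; A scalar C equivalent, assuming 8-bit pixels; the name var_wxh_ref and the
; explicit w/h/shift parameters are illustrative (shift = log2(w*h): 8 for
; 16x16, 6 for 8x8):
;
;   #include <stdint.h>
;   static int var_wxh_ref( uint8_t *pix, int i_stride, int w, int h, int shift )
;   {
;       uint32_t sum = 0, sqr = 0;
;       for( int y = 0; y < h; y++ )
;       {
;           for( int x = 0; x < w; x++ )
;           {
;               sum += pix[x];
;               sqr += pix[x] * pix[x];
;           }
;           pix += i_stride;
;       }
;       return sqr - (sum * sum >> shift);
;   }
;-----------------------------------------------------------------------------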
%macro VAR_START 1
    pxor  m5, m5    ; sum
    pxor  m6, m6    ; sum squared
%if %1
    mova  m7, [pw_00ff GLOBAL]
%else
    pxor  m7, m7    ; zero
%endif
%endmacro

%macro VAR_END 1
    HADDW  m5, m7
    movd  r1d, m5
    imul  r1d, r1d
    HADDD  m6, m1
    shr   r1d, %1
    movd  eax, m6
    sub   eax, r1d ; sqr - (sum * sum >> shift)
    RET
%endmacro

%macro VAR_CORE 0
    paddw   m5, m0
    paddw   m5, m3
    paddw   m5, m1
    paddw   m5, m4
    pmaddwd m0, m0
    pmaddwd m3, m3
    pmaddwd m1, m1
    pmaddwd m4, m4
    paddd   m6, m0
    paddd   m6, m3
    paddd   m6, m1
    paddd   m6, m4
%endmacro

%macro VAR_2ROW 2
    mov      r2d, %2
.loop:
    mova      m0, [r0]
    mova      m1, m0
    mova      m3, [r0+%1]
    mova      m4, m3
    punpcklbw m0, m7
    punpckhbw m1, m7
%ifidn %1, r1
    lea       r0, [r0+%1*2]
%else
    add       r0, r1
%endif
    punpcklbw m3, m7
    punpckhbw m4, m7
    dec r2d
    VAR_CORE
    jg .loop
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_var_wxh_mmxext( uint8_t *, int )
;-----------------------------------------------------------------------------
INIT_MMX
cglobal x264_pixel_var_16x16_mmxext, 2,3
    VAR_START 0
    VAR_2ROW 8, 16
    VAR_END 8

cglobal x264_pixel_var_8x8_mmxext, 2,3
    VAR_START 0
    VAR_2ROW r1, 4
    VAR_END 6

INIT_XMM
cglobal x264_pixel_var_16x16_sse2, 2,3,8
    VAR_START 1
    mov r2d, 8
.loop:
    mova m0, [r0]
    mova m3, [r0+r1]
    DEINTB 1, 0, 4, 3, 7
    lea r0, [r0+r1*2]
    VAR_CORE
    dec r2d
    jg .loop
    VAR_END 8

cglobal x264_pixel_var_8x8_sse2, 2,4,8
    VAR_START 1
    mov r2d, 2
    lea r3, [r1*3]
.loop:
    movh m0, [r0]
    movh m3, [r0+r1]
    movhps m0, [r0+r1*2]
    movhps m3, [r0+r3]
    DEINTB 1, 0, 4, 3, 7
    lea r0, [r0+r1*4]
    VAR_CORE
    dec r2d
    jg .loop
    VAR_END 6

;=============================================================================
; SATD
;=============================================================================
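;-----------------------------------------------------------------------------
; Reference sketch (not part of x264): SATD applies a 4x4 Hadamard transform
; to the block of pixel differences and sums the absolute values of the
; resulting coefficients. A scalar C version of the 4x4 case, assuming 8-bit
; pixels; the name satd_4x4_ref is illustrative, and the final >>1 reflects
; the halving that the SIMD code folds into its reduction:
;
;   #include <stdint.h>
;   #include <stdlib.h>
;   static int satd_4x4_ref( uint8_t *pix1, int i_stride1,
;                            uint8_t *pix2, int i_stride2 )
;   {
;       int d[4][4], sum = 0;
;       for( int y = 0; y < 4; y++ )
;       {   /* horizontal 4-point Hadamard of the differences */
;           int a0 = pix1[0]-pix2[0], a1 = pix1[1]-pix2[1];
;           int a2 = pix1[2]-pix2[2], a3 = pix1[3]-pix2[3];
;           d[y][0] = a0+a1+a2+a3;  d[y][1] = a0+a1-a2-a3;
;           d[y][2] = a0-a1-a2+a3;  d[y][3] = a0-a1+a2-a3;
;           pix1 += i_stride1;
;           pix2 += i_stride2;
;       }
;       for( int x = 0; x < 4; x++ )
;       {   /* vertical Hadamard, then accumulate absolute values */
;           int a0 = d[0][x]+d[1][x]+d[2][x]+d[3][x];
;           int a1 = d[0][x]+d[1][x]-d[2][x]-d[3][x];
;           int a2 = d[0][x]-d[1][x]-d[2][x]+d[3][x];
;           int a3 = d[0][x]-d[1][x]+d[2][x]-d[3][x];
;           sum += abs(a0) + abs(a1) + abs(a2) + abs(a3);
;       }
;       return sum >> 1;
;   }
;-----------------------------------------------------------------------------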
%macro TRANS_SSE2 5-6
; TRANSPOSE2x2
; %1: transpose width (d/q) - use SBUTTERFLY qdq for dq
; %2: ord/unord (for compat with sse4, unused)
; %3/%4: source regs
; %5/%6: tmp regs
%ifidn %1, d
%define mask [mask_10 GLOBAL]
%define shift 16
%elifidn %1, q
%define mask [mask_1100 GLOBAL]
%define shift 32
%endif
%if %0==6 ; less dependency if we have two tmp
    mova   m%5, mask   ; ff00
    mova   m%6, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pand   m%6, m%5    ; x5..
    pandn  m%5, m%3    ; ..x0
    psrl%1 m%3, shift  ; ..x1
    por    m%4, m%5    ; x4x0
    por    m%3, m%6    ; x5x1
%else ; more dependency, one insn less. sometimes faster, sometimes not
    mova   m%5, m%4    ; x5x4
    psll%1 m%4, shift  ; x4..
    pxor   m%4, m%3    ; (x4^x1)x0
    pand   m%4, mask   ; (x4^x1)..
    pxor   m%3, m%4    ; x4x0
    psrl%1 m%4, shift  ; ..(x1^x4)
    pxor   m%5, m%4    ; x5x1
    SWAP   %4, %3, %5
%endif
%endmacro

%define TRANS TRANS_SSE2

%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
    mova   m%5, m%3
%ifidn %2, ord
    psrl%1 m%3, 16
%endif
    pblendw m%3, m%4, 10101010b
    psll%1 m%4, 16
%ifidn %2, ord
    pblendw m%4, m%5, 01010101b
%else
    psrl%1 m%5, 16
    por    m%4, m%5
%endif
%elifidn %1, q
    mova   m%5, m%3
    shufps m%3, m%4, 10001000b
    shufps m%5, m%4, 11011101b
    SWAP   %4, %5
%endif
%endmacro

%macro JDUP_SSE2 2
    punpckldq %1, %2
    ; doesn't need to dup. sse2 does things by zero extending to words and full h_2d
%endmacro

%macro JDUP_CONROE 2
    ; join 2x 32 bit and duplicate them
    ; emulating shufps is faster on conroe
    punpcklqdq %1, %2
    movsldup %1, %1
%endmacro

%macro JDUP_PENRYN 2
    ; just use shufps on anything post conroe
    shufps %1, %2, 0
%endmacro

%macro HSUMSUB 5
    pmaddubsw m%2, m%5
    pmaddubsw m%1, m%5
    pmaddubsw m%4, m%5
    pmaddubsw m%3, m%5
%endmacro

%macro DIFF_UNPACK_SSE2 5
    punpcklbw m%1, m%5
    punpcklbw m%2, m%5
    punpcklbw m%3, m%5
    punpcklbw m%4, m%5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro DIFF_SUMSUB_SSSE3 5
    HSUMSUB %1, %2, %3, %4, %5
    psubw m%1, m%2
    psubw m%3, m%4
%endmacro

%macro LOAD_DUP_2x4P 4 ; dst, tmp, 2* pointer
    movd %1, %3
    movd %2, %4
    JDUP %1, %2
%endmacro

%macro LOAD_DUP_4x8P_CONROE 8 ; 4*dst, 4*pointer
    movddup m%3, %6
    movddup m%4, %8
    movddup m%1, %5
    movddup m%2, %7
%endmacro

%macro LOAD_DUP_4x8P_PENRYN 8
    ; penryn and nehalem run punpcklqdq and movddup in different units
    movh m%3, %6
    movh m%4, %8
    punpcklqdq m%3, m%3
    movddup m%1, %5
    punpcklqdq m%4, m%4
    movddup m%2, %7
%endmacro

%macro LOAD_SUMSUB_8x2P 9
    LOAD_DUP_4x8P %1, %2, %3, %4, %6, %7, %8, %9
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_8x4P_SSSE3 7-10 r0, r2, 0
; 4x dest, 2x tmp, 1x mul, [2* ptr], [increment?]
    LOAD_SUMSUB_8x2P %1, %2, %5, %6, %7, [%8], [%9], [%8+r1], [%9+r3]
    LOAD_SUMSUB_8x2P %3, %4, %5, %6, %7, [%8+2*r1], [%9+2*r3], [%8+r4], [%9+r5]
%if %10
    lea %8, [%8+4*r1]
    lea %9, [%9+4*r3]
%endif
%endmacro

%macro LOAD_SUMSUB_16P_SSSE3 7 ; 2*dst, 2*tmp, mul, 2*ptr
    movddup m%1, [%7]
    movddup m%2, [%7+8]
    mova m%4, [%6]
    movddup m%3, m%4
    punpckhqdq m%4, m%4
    DIFF_SUMSUB_SSSE3 %1, %3, %2, %4, %5
%endmacro

%macro LOAD_SUMSUB_16P_SSE2 7 ; 2*dst, 2*tmp, mask, 2*ptr
    movu  m%4, [%7]
    mova  m%2, [%6]
    DEINTB %1, %2, %3, %4, %5
    psubw m%1, m%3
    psubw m%2, m%4
    SUMSUB_BA m%1, m%2, m%3
%endmacro

%macro LOAD_SUMSUB_16x4P 10-13 r0, r2, none
; 8x dest, 1x tmp, 1x mul, [2* ptr] [2nd tmp]
    LOAD_SUMSUB_16P %1, %5, %2, %3, %10, %11, %12
    LOAD_SUMSUB_16P %2, %6, %3, %4, %10, %11+r1, %12+r3
    LOAD_SUMSUB_16P %3, %7, %4, %9, %10, %11+2*r1, %12+2*r3
    LOAD_SUMSUB_16P %4, %8, %13, %9, %10, %11+r4, %12+r5
%endmacro

; in: r4=3*stride1, r5=3*stride2
; in: %2 = horizontal offset
; in: %3 = whether we need to increment pix1 and pix2
; clobber: m3..m7
; out: %1 = satd
%macro SATD_4x4_MMX 3
    %xdefine %%n n%1
    LOAD_DIFF m4, m3, none, [r0+%2],      [r2+%2]
    LOAD_DIFF m5, m3, none, [r0+r1+%2],   [r2+r3+%2]
    LOAD_DIFF m6, m3, none, [r0+2*r1+%2], [r2+2*r3+%2]
    LOAD_DIFF m7, m3, none, [r0+r4+%2],   [r2+r5+%2]
%if %3
    lea r0, [r0+4*r1]
    lea r2, [r2+4*r3]
%endif
    HADAMARD4_2D 4, 5, 6, 7, 3, %%n
    paddw m4, m6
    SWAP %%n, 4
%endmacro

%macro SATD_8x4_SSE 8-9
%ifidn %1, sse2
    HADAMARD4_2D_SSE %2, %3, %4, %5, %6, amax
%else
    HADAMARD4_V m%2, m%3, m%4, m%5, m%6
    ; doing the abs first is a slight advantage
    ABS4 m%2, m%4, m%3, m%5, m%6, m%7
    HADAMARD 1, max, %2, %4, %6, %7
%endif
%ifnidn %9, swap
    paddw m%8, m%2
%else
    SWAP %8, %2
%endif
%ifidn %1, sse2
    paddw m%8, m%4
%else
    HADAMARD 1, max, %3, %5, %6, %7
    paddw m%8, m%3
%endif
%endmacro

%macro SATD_START_MMX 0
    lea  r4, [3*r1] ; 3*stride1
    lea  r5, [3*r3] ; 3*stride2
%endmacro

%macro SATD_END_MMX 0
    pshufw  m1, m0, 01001110b