;*****************************************************************************
;* mc-a.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Jason Garrett-Glaser <darkshikari@gmail.com>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*          Min Chen <chenm001.163.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*****************************************************************************

%include "x86inc.asm"

SECTION_RODATA

pw_4:  times 8 dw  4
pw_8:  times 8 dw  8
pw_32: times 8 dw 32
pw_64: times 8 dw 64
sw_64: dd 64

SECTION .text

;=============================================================================
; weighted prediction
;=============================================================================
; implicit bipred only:
; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64

%ifdef ARCH_X86_64
    DECLARE_REG_TMP 0,1,2,3,4,5,10,11
    %macro AVG_START 0-1 0
        PROLOGUE 6,7,%1
%ifdef WIN64
        movsxd r5, r5d
%endif
        .height_loop:
    %endmacro
%else
    DECLARE_REG_TMP 1,2,3,4,5,6,1,2
    %macro AVG_START 0-1 0
        PROLOGUE 0,7,%1
        mov t0, r0m
        mov t1, r1m
        mov t2, r2m
        mov t3, r3m
        mov t4, r4m
        mov t5, r5m
        .height_loop:
    %endmacro
%endif

%macro SPLATW 2
%if mmsize==16
    pshuflw    %1, %2, 0
    punpcklqdq %1, %1
%else
    pshufw     %1, %2, 0
%endif
%endmacro

%macro BIWEIGHT_MMX 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m5
    punpcklbw m1, m5
    pmullw    m0, m2
    pmullw    m1, m3
    paddw     m0, m1
    paddw     m0, m4
    psraw     m0, 6
%endmacro

%macro BIWEIGHT_START_MMX 0
    movd    m2, r6m
    SPLATW  m2, m2   ; weight_dst
    mova    m3, [pw_64 GLOBAL]
    psubw   m3, m2   ; weight_src
    mova    m4, [pw_32 GLOBAL] ; rounding
    pxor    m5, m5
%endmacro

%macro BIWEIGHT_SSSE3 2
    movh      m0, %1
    movh      m1, %2
    punpcklbw m0, m1
    pmaddubsw m0, m3
    paddw     m0, m4
    psraw     m0, 6
%endmacro

%macro BIWEIGHT_START_SSSE3 0
    movzx  t6d, byte r6m ; FIXME x86_64
    mov    t7d, 64
    sub    t7d, t6d
    shl    t7d, 8
    add    t6d, t7d
    movd    m3, t6d
    mova    m4, [pw_32 GLOBAL]
    SPLATW  m3, m3   ; weight_dst,src
%endmacro

%macro BIWEIGHT_ROW 4
    BIWEIGHT [%2], [%3]
%if %4==mmsize/2
    packuswb   m0, m0
    movh     [%1], m0
%else
    SWAP 0, 6
    BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
    packuswb m6, m0
    mova   [%1], m6
%endif
%endmacro

;-----------------------------------------------------------------------------
; int x264_pixel_avg_weight_w16_mmxext( uint8_t *dst, int, uint8_t *src1, int,
;                                       uint8_t *src2, int, int i_weight )
;-----------------------------------------------------------------------------
%macro AVG_WEIGHT 2-3 0
cglobal x264_pixel_avg_weight_w%2_%1
    BIWEIGHT_START
    AVG_START %3
%if %2==8 && mmsize==16
    BIWEIGHT [t2], [t4]
    SWAP 0, 6
    BIWEIGHT [t2+t3], [t4+t5]
    packuswb m6, m0
    movlps [t0], m6
    movhps [t0+t1], m6
%else
%assign x 0
%rep 1+%2/(mmsize*2)
    BIWEIGHT_ROW t0+x,    t2+x,    t4+x,    %2
    BIWEIGHT_ROW t0+x+t1, t2+x+t3, t4+x+t5, %2
%assign x x+mmsize
%endrep
%endif
    lea  t0, [t0+t1*2]
    lea  t2, [t2+t3*2]
    lea  t4, [t4+t5*2]
    sub  eax, 2
    jg   .height_loop
    REP_RET
%endmacro
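;-----------------------------------------------------------------------------
; Reference: a minimal C sketch of the per-pixel math the BIWEIGHT macros
; compute (illustrative only; the names below are assumptions, not x264 API):
;
;   #include <stdint.h>
;   /* implicit bipred: log2_denom = 5, offset = 0, weights sum to 64 */
;   static uint8_t biweight_pixel( uint8_t src1, uint8_t src2, int i_weight )
;   {
;       /* for i_weight in [0,64] the result already fits in a byte;
;        * packuswb saturates it in the vector code anyway */
;       return (uint8_t)( ( src1*i_weight + src2*(64-i_weight) + 32 ) >> 6 );
;   }
;
; The SSSE3 path packs both weights into one word, i_weight | (64-i_weight)<<8,
; and interleaves src1/src2 bytes so a single pmaddubsw performs both
; multiplies and the add.
;-----------------------------------------------------------------------------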
%define BIWEIGHT BIWEIGHT_MMX
%define BIWEIGHT_START BIWEIGHT_START_MMX
INIT_MMX
AVG_WEIGHT mmxext, 4
AVG_WEIGHT mmxext, 8
AVG_WEIGHT mmxext, 16
INIT_XMM
%define x264_pixel_avg_weight_w4_sse2 x264_pixel_avg_weight_w4_mmxext
AVG_WEIGHT sse2, 8,  7
AVG_WEIGHT sse2, 16, 7
%define BIWEIGHT BIWEIGHT_SSSE3
%define BIWEIGHT_START BIWEIGHT_START_SSSE3
INIT_MMX
AVG_WEIGHT ssse3, 4
INIT_XMM
AVG_WEIGHT ssse3, 8,  7
AVG_WEIGHT ssse3, 16, 7

;=============================================================================
; pixel avg
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg_4x4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src1_stride,
;                                 uint8_t *src2, int src2_stride, int weight );
;-----------------------------------------------------------------------------
%macro AVGH 3
cglobal x264_pixel_avg_%1x%2_%3
    mov eax, %2
    cmp dword r6m, 32
    jne x264_pixel_avg_weight_w%1_%3
%if mmsize == 16 && %1 == 16
    test dword r4m, 15
    jz x264_pixel_avg_w%1_sse2
%endif
    jmp x264_pixel_avg_w%1_mmxext
%endmacro

;-----------------------------------------------------------------------------
; void x264_pixel_avg_w4_mmxext( uint8_t *dst, int dst_stride,
;                                uint8_t *src1, int src1_stride,
;                                uint8_t *src2, int src2_stride,
;                                int height, int weight );
;-----------------------------------------------------------------------------
%macro AVG_END 0
    sub  eax, 2
    lea  t4, [t4+t5*2]
    lea  t2, [t2+t3*2]
    lea  t0, [t0+t1*2]
    jg .height_loop
    REP_RET
%endmacro

%macro AVG_FUNC 3
cglobal %1
    AVG_START
    %2 m0, [t2]
    %2 m1, [t2+t3]
    pavgb m0, [t4]
    pavgb m1, [t4+t5]
    %3 [t0], m0
    %3 [t0+t1], m1
    AVG_END
%endmacro

INIT_MMX
AVG_FUNC x264_pixel_avg_w4_mmxext, movd, movd
AVGH 4, 8, mmxext
AVGH 4, 4, mmxext
AVGH 4, 2, mmxext

AVG_FUNC x264_pixel_avg_w8_mmxext, movq, movq
AVGH 8, 16, mmxext
AVGH 8, 8,  mmxext
AVGH 8, 4,  mmxext

cglobal x264_pixel_avg_w16_mmxext
    AVG_START
    movq mm0, [t2  ]
    movq mm1, [t2+8]
    movq mm2, [t2+t3  ]
    movq mm3, [t2+t3+8]
    pavgb mm0, [t4  ]
    pavgb mm1, [t4+8]
    pavgb mm2, [t4+t5  ]
    pavgb mm3, [t4+t5+8]
    movq [t0  ], mm0
    movq [t0+8], mm1
    movq [t0+t1  ], mm2
    movq [t0+t1+8], mm3
    AVG_END

AVGH 16, 16, mmxext
AVGH 16, 8,  mmxext

INIT_XMM
AVG_FUNC x264_pixel_avg_w16_sse2, movdqu, movdqa
AVGH 16, 16, sse2
AVGH 16,  8, sse2
AVGH  8, 16, sse2
AVGH  8,  8, sse2
AVGH  8,  4, sse2
AVGH 16, 16, ssse3
AVGH 16,  8, ssse3
AVGH  8, 16, ssse3
AVGH  8,  8, ssse3
AVGH  8,  4, ssse3
INIT_MMX
AVGH  4,  8, ssse3
AVGH  4,  4, ssse3
AVGH  4,  2, ssse3

;=============================================================================
; pixel avg2
;=============================================================================

;-----------------------------------------------------------------------------
; void x264_pixel_avg2_w4_mmxext( uint8_t *dst, int dst_stride,
;                                 uint8_t *src1, int src_stride,
;                                 uint8_t *src2, int height );
;-----------------------------------------------------------------------------
%macro AVG2_W8 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    %2     mm0, [r2]
    %2     mm1, [r2+r3]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r6]
    %2     [r0], mm0
    %2     [r0+r1], mm1
    sub    r5d, 2
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W8 4, movd
AVG2_W8 8, movq

%macro AVG2_W16 2
cglobal x264_pixel_avg2_w%1_mmxext, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    movq   mm0, [r2]
    %2     mm1, [r2+8]
    movq   mm2, [r2+r3]
    %2     mm3, [r2+r3+8]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r4+8]
    pavgb  mm2, [r2+r6]
    pavgb  mm3, [r2+r6+8]
    movq   [r0], mm0
    %2     [r0+8], mm1
    movq   [r0+r1], mm2
    %2     [r0+r1+8], mm3
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro
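;-----------------------------------------------------------------------------
; Reference: pixel_avg2 averages two source blocks with pavgb-style rounding.
; A minimal C sketch (illustrative only; names are assumptions, not x264 API):
;
;   #include <stdint.h>
;   static void pixel_avg2( uint8_t *dst,  int dst_stride,
;                           uint8_t *src1, int src_stride,
;                           uint8_t *src2, int width, int height )
;   {
;       for( int y = 0; y < height; y++ )
;       {
;           for( int x = 0; x < width; x++ )
;               dst[x] = ( src1[x] + src2[x] + 1 ) >> 1; /* == pavgb */
;           dst  += dst_stride;
;           src1 += src_stride;
;           src2 += src_stride; /* both sources share one stride */
;       }
;   }
;
; The asm folds src2 into a displacement from src1 (sub r4, r2), so advancing
; r2 walks both sources with a single pointer increment.
;-----------------------------------------------------------------------------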
AVG2_W16 12, movd
AVG2_W16 16, movq

cglobal x264_pixel_avg2_w20_mmxext, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    movq   mm0, [r2]
    movq   mm1, [r2+8]
    movd   mm2, [r2+16]
    movq   mm3, [r2+r3]
    movq   mm4, [r2+r3+8]
    movd   mm5, [r2+r3+16]
    pavgb  mm0, [r2+r4]
    pavgb  mm1, [r2+r4+8]
    pavgb  mm2, [r2+r4+16]
    pavgb  mm3, [r2+r6]
    pavgb  mm4, [r2+r6+8]
    pavgb  mm5, [r2+r6+16]
    movq   [r0], mm0
    movq   [r0+8], mm1
    movd   [r0+16], mm2
    movq   [r0+r1], mm3
    movq   [r0+r1+8], mm4
    movd   [r0+r1+16], mm5
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET

cglobal x264_pixel_avg2_w16_sse2, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    movdqu xmm0, [r2]
    movdqu xmm2, [r2+r3]
    movdqu xmm1, [r2+r4]
    movdqu xmm3, [r2+r6]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
    movdqa [r0], xmm0
    movdqa [r0+r1], xmm2
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET

%macro AVG2_W20 1
cglobal x264_pixel_avg2_w20_%1, 6,7
    sub    r4, r2
    lea    r6, [r4+r3]
.height_loop:
    movdqu xmm0, [r2]
    movdqu xmm2, [r2+r3]
    movd   mm4,  [r2+16]
    movd   mm5,  [r2+r3+16]
%ifidn %1, sse2_misalign
    pavgb  xmm0, [r2+r4]
    pavgb  xmm2, [r2+r6]
%else
    movdqu xmm1, [r2+r4]
    movdqu xmm3, [r2+r6]
    pavgb  xmm0, xmm1
    pavgb  xmm2, xmm3
%endif
    pavgb  mm4,  [r2+r4+16]
    pavgb  mm5,  [r2+r6+16]
    movdqa [r0], xmm0
    movd   [r0+16], mm4
    movdqa [r0+r1], xmm2
    movd   [r0+r1+16], mm5
    lea    r2, [r2+r3*2]
    lea    r0, [r0+r1*2]
    sub    r5d, 2
    jg     .height_loop
    REP_RET
%endmacro

AVG2_W20 sse2
AVG2_W20 sse2_misalign

; Cacheline split code for processors with high latencies for loads
; split over cache lines.  See sad-a.asm for a more detailed explanation.
; This particular instance is complicated by the fact that src1 and src2
; can have different alignments.  For simplicity and code size, only the
; MMX cacheline workaround is used.  As a result, in the case of SSE2
; pixel_avg, the cacheline check function calls the SSE2 version if there
; is no cacheline split, and the MMX workaround if there is.

%macro INIT_SHIFT 2
    and    eax, 7
    shl    eax, 3
    movd   %1, [sw_64 GLOBAL]
    movd   %2, eax
    psubw  %1, %2
%endmacro

%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
cglobal x264_pixel_avg2_w%1_cache%2_%3
    mov    eax, r2m
    and    eax, 0x1f|(%2>>1)
    cmp    eax, (32-%1)|(%2>>1)
    jle x264_pixel_avg2_w%1_%3
;w12 isn't needed because w16 is just as fast if there's no cacheline split
%if %1 == 12
    jmp x264_pixel_avg2_w16_cache_mmxext
%else
    jmp x264_pixel_avg2_w%1_cache_mmxext
%endif
%endmacro

%macro AVG_CACHELINE_START 0
    %assign stack_offset 0
    INIT_SHIFT mm6, mm7
    mov    eax, r4m
    INIT_SHIFT mm4, mm5
    PROLOGUE 6,6
    and    r2, ~7
    and    r4, ~7
    sub    r4, r2
.height_loop:
%endmacro

%macro AVG_CACHELINE_LOOP 2
    movq   mm0, [r2+8+%1]
    movq   mm1, [r2+%1]
    movq   mm2, [r2+r4+8+%1]
    movq   mm3, [r2+r4+%1]
    psllq  mm0, mm6
    psrlq  mm1, mm7
    psllq  mm2, mm4
    psrlq  mm3, mm5
    por    mm0, mm1
    por    mm2, mm3
    pavgb  mm0, mm2
    %2 [r0+%1], mm0
%endmacro

x264_pixel_avg2_w8_cache_mmxext:
    AVG_CACHELINE_START
    AVG_CACHELINE_LOOP 0, movq
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg .height_loop
    REP_RET

x264_pixel_avg2_w16_cache_mmxext:
    AVG_CACHELINE_START
    AVG_CACHELINE_LOOP 0, movq
    AVG_CACHELINE_LOOP 8, movq
    add    r2, r3
    add    r0, r1
    dec    r5d
    jg .height_loop
    REP_RET
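;-----------------------------------------------------------------------------
; Reference: the INIT_SHIFT/AVG_CACHELINE_LOOP pair emulates one unaligned
; 8-byte load with two aligned loads plus shifts. A minimal C sketch
; (little-endian; illustrative only, not part of x264):
;
;   #include <stdint.h>
;   static uint64_t load8_split( const uint64_t *base, int offset )
;   {
;       /* base = src & ~7, offset = src & 7, offset in [1,7];
;        * the asm also handles offset 0 because psllq by 64 yields 0,
;        * whereas a 64-bit shift by 64 is undefined in C */
;       uint64_t lo = base[0], hi = base[1];
;       return ( lo >> (8*offset) ) | ( hi << (64 - 8*offset) );
;   }
;-----------------------------------------------------------------------------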