;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
;*****************************************************************************

BITS 32

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

%include "i386inc.asm"

;=============================================================================
; Read only data
;=============================================================================

SECTION_RODATA

ALIGN 16
pw_1:  times 4 dw 1
pw_16: times 4 dw 16
pw_32: times 4 dw 32

;=============================================================================
; Macros
;=============================================================================

%macro LOAD_ADD 3
    movd        %1,     %2
    movd        mm7,    %3
    punpcklbw   %1,     mm0
    punpcklbw   mm7,    mm0
    paddw       %1,     mm7
%endmacro

%macro FILT_V 0
    psubw       mm1,    mm2     ; a-b
    psubw       mm4,    mm5
    psubw       mm2,    mm3     ; b-c
    psubw       mm5,    mm6
    psllw       mm2,    2
    psllw       mm5,    2
    psubw       mm1,    mm2     ; a-5*b+4*c
    psubw       mm4,    mm5
    psllw       mm3,    4
    psllw       mm6,    4
    paddw       mm1,    mm3     ; a-5*b+20*c
    paddw       mm4,    mm6
%endmacro

%macro FILT_H 0
    psubw       mm1,    mm2     ; a-b
    psubw       mm4,    mm5
    psraw       mm1,    2       ; (a-b)/4
    psraw       mm4,    2
    psubw       mm1,    mm2     ; (a-b)/4-b
    psubw       mm4,    mm5
    paddw       mm1,    mm3     ; (a-b)/4-b+c
    paddw       mm4,    mm6
    psraw       mm1,    2       ; ((a-b)/4-b+c)/4
    psraw       mm4,    2
    paddw       mm1,    mm3     ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
    paddw       mm4,    mm6
%endmacro

%macro FILT_PACK 1
    paddw       mm1,    mm7
    paddw       mm4,    mm7
    psraw       mm1,    %1
    psraw       mm4,    %1
    packuswb    mm1,    mm4
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text
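
;-----------------------------------------------------------------------------
; Rough C reference for the half-pel filter implemented below (a readability
; sketch, not part of the original x264 source). It shows the 6-tap
; (1,-5,20,20,-5,1) filter that FILT_V/FILT_H compute: the vertical pass keeps
; unshifted 16-bit intermediates (tbuffer) so the center pass can filter them
; a second time, and the three outputs are normalized by (+16)>>5, (+16)>>5
; and (+512)>>10 respectively. The names hpel_filter_ref, tap6 and clip8 are
; illustrative only; frame-border padding is assumed to be handled by the
; caller, and the staged shifts in FILT_H need not round identically.
;
;   #include <stdint.h>
;
;   static int clip8( int x ) { return x < 0 ? 0 : x > 255 ? 255 : x; }
;
;   /* 6-tap filter at position x, sample distance d (d=stride: vertical, d=1: horizontal) */
;   #define tap6( p, x, d ) (    (p)[(x)-2*(d)] - 5*(p)[(x)-(d)]   + 20*(p)[x] \
;                             + 20*(p)[(x)+(d)] - 5*(p)[(x)+2*(d)] +    (p)[(x)+3*(d)] )
;
;   static void hpel_filter_ref( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
;                                uint8_t *src, int stride, int width, int height )
;   {
;       int16_t buf[width + 8];                        /* vertical intermediates */
;       for( int y = 0; y < height; y++ )
;       {
;           for( int x = -2; x < width + 3; x++ )      /* vertical 6-tap, no shift yet */
;               buf[x+2] = tap6( src, x, stride );
;           for( int x = 0; x < width; x++ )
;           {
;               dstv[x] = clip8( (buf[x+2] + 16) >> 5 );                /* half-pel V */
;               dsth[x] = clip8( (tap6( src, x, 1 ) + 16) >> 5 );       /* half-pel H */
;               dstc[x] = clip8( (tap6( buf + 2, x, 1 ) + 512) >> 10 ); /* center */
;           }
;           dsth += stride; dstv += stride; dstc += stride; src += stride;
;       }
;   }
;-----------------------------------------------------------------------------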
;-----------------------------------------------------------------------------
; void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
;                               int i_stride, int i_width, int i_height );
;-----------------------------------------------------------------------------
cglobal x264_hpel_filter_mmxext
    push    ebp
    mov     ebp, esp
    push    ebx
    push    esi
    push    edi
    picgetgot ebx

    %define tdsth   ebp +  8
    %define tdstv   ebp + 12
    %define tdstc   ebp + 16
    %define tsrc    ebp + 20
    %define tstride ebp + 24
    %define twidth  ebp + 28
    %define theight ebp + 32
    %define tpw_1   ebp - 36
    %define tpw_16  ebp - 28
    %define tpw_32  ebp - 20
    %define tbuffer esp +  8
    %define x       eax
    %define dsth    ebx
    %define dstv    ebx
    %define dstc    ebx
    %define src     ecx
    %define src3    edx
    %define stride  esi
    %define width   edi

    mov     stride, [tstride]
    mov     width,  [twidth]
    lea     eax, [stride*2 + 24 + 24]
    sub     esp, eax
    pxor    mm0, mm0

    ; mov globals onto the stack, to free up ebx
    movq    mm1, [pw_1  GOT_ebx]
    movq    mm2, [pw_16 GOT_ebx]
    movq    mm3, [pw_32 GOT_ebx]
    movq    [tpw_1],  mm1
    movq    [tpw_16], mm2
    movq    [tpw_32], mm3

.loopy:

    mov     src,  [tsrc]
    mov     dstv, [tdstv]
    lea     src3, [src + stride]
    sub     src,  stride
    sub     src,  stride
    xor     x, x
ALIGN 16
.vertical_filter:

    prefetcht0 [src3 + stride*2 + 32]

    LOAD_ADD    mm1, [src               ], [src3 + stride*2    ] ; a0
    LOAD_ADD    mm2, [src + stride      ], [src3 + stride      ] ; b0
    LOAD_ADD    mm3, [src + stride*2    ], [src3               ] ; c0
    LOAD_ADD    mm4, [src            + 4], [src3 + stride*2 + 4] ; a1
    LOAD_ADD    mm5, [src + stride   + 4], [src3 + stride   + 4] ; b1
    LOAD_ADD    mm6, [src + stride*2 + 4], [src3            + 4] ; c1

    FILT_V

    movq        mm7, [tpw_16]
    movq        [tbuffer + x*2],     mm1
    movq        [tbuffer + x*2 + 8], mm4
    paddw       mm1, mm7
    paddw       mm4, mm7
    psraw       mm1, 5
    psraw       mm4, 5
    packuswb    mm1, mm4
    movntq      [dstv + x], mm1

    add         x,    8
    add         src,  8
    add         src3, 8
    cmp         x, width
    jle         .vertical_filter

    pshufw      mm2, [tbuffer], 0
    movq        [tbuffer - 8], mm2  ; pad left
    ; no need to pad right, since vertical_filter already did 4 extra pixels

    mov         dstc, [tdstc]
    xor         x, x
    movq        mm7, [tpw_32]
.center_filter:

    movq        mm1, [tbuffer + x*2 - 4 ]
    movq        mm2, [tbuffer + x*2 - 2 ]
    movq        mm3, [tbuffer + x*2     ]
    movq        mm4, [tbuffer + x*2 + 4 ]
    movq        mm5, [tbuffer + x*2 + 6 ]
    paddw       mm3, [tbuffer + x*2 + 2 ] ; c0
    paddw       mm2, mm4                  ; b0
    paddw       mm1, mm5                  ; a0
    movq        mm6, [tbuffer + x*2 + 8 ]
    paddw       mm4, [tbuffer + x*2 + 14] ; a1
    paddw       mm5, [tbuffer + x*2 + 12] ; b1
    paddw       mm6, [tbuffer + x*2 + 10] ; c1

    FILT_H
    FILT_PACK 6
    movntq      [dstc + x], mm1

    add         x, 8
    cmp         x, width
    jl          .center_filter

    mov         dsth, [tdsth]
    mov         src,  [tsrc]
    xor         x, x
.horizontal_filter:

    movd        mm1, [src + x - 2]
    movd        mm2, [src + x - 1]
    movd        mm3, [src + x    ]
    movd        mm6, [src + x + 1]
    movd        mm4, [src + x + 2]
    movd        mm5, [src + x + 3]
    punpcklbw   mm1, mm0
    punpcklbw   mm2, mm0
    punpcklbw   mm3, mm0
    punpcklbw   mm6, mm0
    punpcklbw   mm4, mm0
    punpcklbw   mm5, mm0
    paddw       mm3, mm6 ; c0
    paddw       mm2, mm4 ; b0
    paddw       mm1, mm5 ; a0
    movd        mm7, [src + x + 7]
    movd        mm6, [src + x + 6]
    punpcklbw   mm7, mm0
    punpcklbw   mm6, mm0
    paddw       mm4, mm7 ; c1
    paddw       mm5, mm6 ; b1
    movd        mm7, [src + x + 5]
    movd        mm6, [src + x + 4]
    punpcklbw   mm7, mm0
    punpcklbw   mm6, mm0
    paddw       mm6, mm7 ; a1
    movq        mm7, [tpw_1]

    FILT_H
    FILT_PACK 1
    movntq      [dsth + x], mm1

    add         x, 8
    cmp         x, width
    jl          .horizontal_filter

    add         [tsrc],  stride
    add         [tdsth], stride
    add         [tdstv], stride
    add         [tdstc], stride
    dec dword   [theight]
    jg          .loopy

    lea     esp, [ebp - 12]
    pop     edi
    pop     esi
    pop     ebx
    pop     ebp
    ret

;-----------------------------------------------------------------------------
; void x264_plane_copy_mmxext( uint8_t *dst, int i_dst,
;                              uint8_t *src, int i_src, int w, int h )
;-----------------------------------------------------------------------------
cglobal x264_plane_copy_mmxext
    push    edi
    push    esi
    push    ebx
    mov     edi, [esp+16] ; dst
    mov     ebx, [esp+20] ; i_dst
    mov     esi, [esp+24] ; src
    mov     eax, [esp+28] ; i_src
    mov     edx, [esp+32] ; w
    add     edx, 3
    and     edx, ~3
    sub     ebx, edx
    sub     eax, edx
.loopy:
    mov     ecx, edx
    sub     ecx, 64
    jl      .endx
.loopx:
    prefetchnta [esi+256]
    movq    mm0, [esi   ]
    movq    mm1, [esi+ 8]
    movq    mm2, [esi+16]
    movq    mm3, [esi+24]
    movq    mm4, [esi+32]
    movq    mm5, [esi+40]
    movq    mm6, [esi+48]
    movq    mm7, [esi+56]
    movntq  [edi   ], mm0
    movntq  [edi+ 8], mm1
    movntq  [edi+16], mm2
    movntq  [edi+24], mm3
    movntq  [edi+32], mm4
    movntq  [edi+40], mm5
    movntq  [edi+48], mm6
    movntq  [edi+56], mm7
    add     esi, 64
    add     edi, 64
    sub     ecx, 64
    jge     .loopx
.endx:
    prefetchnta [esi+256]
    add     ecx, 64
    shr     ecx, 2
    rep movsd
    add     edi, ebx
    add     esi, eax
    sub     dword [esp+36], 1
    jg      .loopy
    pop     ebx
    pop     esi
    pop     edi
    emms
    ret
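
;-----------------------------------------------------------------------------
; Rough C equivalent of x264_plane_copy_mmxext above (a readability sketch,
; not part of the original x264 source; plane_copy_ref is an illustrative
; name). The asm rounds the copy width up to a multiple of 4, streams 64-byte
; blocks with movntq and finishes each row with rep movsd; a plain per-row
; memcpy expresses the same result, assuming the planes are padded enough for
; the rounded-up width.
;
;   #include <stdint.h>
;   #include <string.h>
;
;   static void plane_copy_ref( uint8_t *dst, int i_dst,
;                               uint8_t *src, int i_src, int w, int h )
;   {
;       for( int y = 0; y < h; y++ )
;       {
;           memcpy( dst, src, w );
;           dst += i_dst;
;           src += i_src;
;       }
;   }
;-----------------------------------------------------------------------------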