📄 mc-a2.asm
字号:
;*****************************************************************************
;* mc-a2.asm: h264 encoder library
;*****************************************************************************
;* Copyright (C) 2005 x264 project
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
;*****************************************************************************

BITS 64

;=============================================================================
; Macros and other preprocessor constants
;=============================================================================

; Supplies cglobal, parmNq/parmNd, GLOBAL, pushreg/setframe/endprolog
; (all used below; their definitions are not visible in this file).
%include "amd64inc.asm"

;=============================================================================
; Read only data
;=============================================================================

SECTION .rodata

ALIGN 16
mmx_dw_one:
    times 4 dw 16           ; 4 x word 16: rounding term added before ">> 5"
mmx_dd_one:
    times 2 dd 512          ; 2 x dword 512: rounding term added before ">> 10"
mmx_dw_20:
    times 4 dw 20           ; centre-tap weight
mmx_dw_5:
    times 4 dw -5           ; inner-tap weight

%assign tbuffer 0           ; byte offset of the temp row buffer from rsp

;=============================================================================
; Macros
;=============================================================================

; LOAD_4 r1, r2, r3, r4, m1, m2, m3, m4, zero
; Loads 4 bytes from each of the memory operands %5..%8 into %1..%4 and
; widens them to 4 unsigned words by interleaving with %9 (expected to be 0).
%macro LOAD_4 9
    movd        %1, %5
    movd        %2, %6
    movd        %3, %7
    movd        %4, %8
    punpcklbw   %1, %9
    punpcklbw   %2, %9
    punpcklbw   %3, %9
    punpcklbw   %4, %9
%endmacro

; FILT_2 acc, x
; acc = acc - 5 * x.  Side effect: x is left holding 4 * x.
%macro FILT_2 2
    psubw       %1, %2
    psllw       %2, 2
    psubw       %1, %2
%endmacro

; FILT_4 acc, x, y
; acc = acc + 20 * (x + y).  Side effect: x is left holding 16 * (x + y).
%macro FILT_4 3
    paddw       %2, %3
    psllw       %2, 2
    paddw       %1, %2
    psllw       %2, 2
    paddw       %1, %2
%endmacro

; FILT_6 acc, x, y, rnd
; acc = (acc - 5 * x + y + rnd) >> 5 (arithmetic shift).
; Side effect: x is left holding 4 * x.
%macro FILT_6 4
    psubw       %1, %2
    psllw       %2, 2
    psubw       %1, %2
    paddw       %1, %3
    paddw       %1, %4
    psraw       %1, 5
%endmacro

; FILT_ALL base
; Unrounded 6-tap filter (1, -5, 20, 20, -5, 1) down a column, for 4 adjacent
; pixels, over the six rows at base + {0,1,2,3,4,5} * stride.
; In:    rcx = stride, rbx = 3 * stride, rdx = 5 * stride, mm0 = 0
; Out:   mm1 = 4 signed 16-bit sums (no rounding, no shift)
; Clobb: mm2, mm3, mm4, mm5, mm6
%macro FILT_ALL 1
    LOAD_4      mm1, mm2, mm3, mm4, [%1], [%1 + rcx], [%1 + 2 * rcx], [%1 + rbx], mm0
    FILT_2      mm1, mm2
    movd        mm5, [%1 + 4 * rcx]
    movd        mm6, [%1 + rdx]
    FILT_4      mm1, mm3, mm4
    punpcklbw   mm5, mm0
    punpcklbw   mm6, mm0
    psubw       mm1, mm5
    psllw       mm5, 2
    psubw       mm1, mm5
    paddw       mm1, mm6
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal x264_horizontal_filter_mmxext
cglobal x264_center_filter_mmxext

;-----------------------------------------------------------------------------
;
; void x264_center_filter_mmxext( uint8_t *dst1, int i_dst1_stride,
;                                 uint8_t *dst2, int i_dst2_stride,
;                                 uint8_t *src, int i_src_stride,
;                                 int i_width, int i_height );
;
; For every row:
;   1. runs the vertical 6-tap filter (FILT_ALL) over the six source rows
;      starting at src - 2 * src_stride, 4 pixels at a time, keeping the
;      unrounded 16-bit sums in a temporary buffer on the stack;
;   2. writes (sum + 16) >> 5, saturated to 8 bits, to dst1;
;   3. runs the same 6-tap kernel horizontally over the buffered sums and
;      writes (sum + 512) >> 10, saturated to 8 bits, to dst2.
;
; Register roles inside the loops:
;   r8  = dst1 row        r9  = dst1_stride
;   r10 = dst2 row        r11 = dst2_stride
;   r12 = tsrc row        r13 = src_stride
;   r14 = width           r15 = rows remaining
;   rcx / rbx / rdx = 1 / 3 / 5 * src_stride (consumed by FILT_ALL)
;   rax = column counter, rsi = source cursor, rdi = dst1 - 4
;   mm0 = 0
;
; Constraints that follow from the loop structure below:
;   - both column loops step by 4 and terminate on equality (jnz), and
;     .loopcx1 is entered with rax = 8 and tests after its body, so width
;     must be a multiple of 4 and at least 12;
;   - the row loop tests after its body, so height must be at least 1;
;   - the stack buffer is 2 * src_stride + 24 bytes while the code writes
;     2 * width + 16 bytes of it, so width is assumed not to exceed
;     src_stride + 4 (TODO confirm against callers).
; MMX state is left dirty (no emms here); presumably the caller handles it.
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_center_filter_mmxext:

    push        r15
    pushreg     r15
%ifdef WIN64
    push        rdi
    pushreg     rdi
    push        rsi
    pushreg     rsi
%endif
    push        rbp
    pushreg     rbp
    push        rbx
    pushreg     rbx
    push        r12
    pushreg     r12
    push        r13
    pushreg     r13
    push        r14
    pushreg     r14
    lea         rbp,    [rsp]
    setframe    rbp, 0
    endprolog

    ; 64 = the eight registers pushed above in the WIN64 build.
%ifdef WIN64
    movsxd      r13,    dword [rsp+64+48] ; src_stride
    mov         r12,    [rsp+64+40]       ; src
%else
    movsxd      r13,    r9d               ; src_stride
    mov         r12,    r8                ; src
%endif
    sub         r12,    r13
    sub         r12,    r13               ; tsrc = src - 2 * src_stride

    ; use 24 instead of 18 (used in i386/mc-a2.asm) to keep rsp aligned
    lea         rax,    [r13 + r13 + 24 + tbuffer]
    sub         rsp,    rax

    ; Order matters: dst2 / dst2_stride are read out of their argument
    ; registers before r8 / r9 are overwritten with dst1 / dst1_stride.
    mov         r10,    parm3q            ; dst2
    movsxd      r11,    parm4d            ; dst2_stride
    mov         r8,     parm1q            ; dst1
    movsxd      r9,     parm2d            ; dst1_stride
    ; width and height are read from the caller's stack through rbp.
%ifdef WIN64
    movsxd      r14,    dword [rbp + 64 + 56]  ; width
    movsxd      r15,    dword [rbp + 64 + 64]  ; height
%else
    movsxd      r14,    dword [rbp + 56]  ; width
    movsxd      r15,    dword [rbp + 64]  ; height
%endif

    mov         rcx,    r13               ; src_stride
    lea         rbx,    [r13 + r13 * 2]   ; 3 * src_stride
    lea         rdx,    [r13 + r13 * 4]   ; 5 * src_stride

    pxor        mm0,    mm0               ; 0 ---> mm0

.loopcy:

    xor         eax,    eax               ; rax = 0
    mov         rsi,    r12               ; tsrc

    ; First 4 columns.  The first sum is replicated into the 4 words in
    ; front of the row so the horizontal pass can read "pixels" -2 and -1.
    FILT_ALL    rsi

    pshufw      mm2,    mm1, 0            ; mm2 = word 0 of mm1, four times
    movq        [rsp + tbuffer],     mm2
    movq        [rsp + tbuffer + 8], mm1
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5

    packuswb    mm1,    mm1
    movd        [r8],   mm1               ; dst1[0] = mm1

    ; From here on rax runs 4 ahead of the column index: sums for column x
    ; live at tbuffer + 8 + 2 * x, and dst1 is addressed through rdi.
    add         rax,    8
    add         rsi,    4
    lea         rdi,    [r8 - 4]          ; rdi = dst1 - 4

.loopcx1:

    FILT_ALL    rsi

    movq        [rsp + tbuffer + 2 * rax], mm1
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [rdi + rax], mm1          ; dst1[rax - 4] = mm1

    add         rsi,    4
    add         rax,    4
    cmp         rax,    r14               ; cmp rax, width
    jnz         .loopcx1

    ; Last 4 columns, plus 4 padding words after the row.
    FILT_ALL    rsi

    ; NOTE(review): immediate 7 yields (w3, w1, w0, w0), not four copies of
    ; the last sum (that would be 0FFh).  The left edge above replicates with
    ; immediate 0.  Kept as in the original; confirm whether this is intended.
    pshufw      mm2,    mm1, 7
    movq        [rsp + tbuffer + 2 * rax],     mm1
    movq        [rsp + tbuffer + 2 * rax + 8], mm2
    paddw       mm1,    [mmx_dw_one GLOBAL]
    psraw       mm1,    5
    packuswb    mm1,    mm1
    movd        [rdi + rax], mm1          ; dst1[rax - 4] = mm1

    add         r12,    r13               ; tsrc = tsrc + src_stride

    add         r8,     r9                ; dst1 = dst1 + dst1_stride

    xor         eax,    eax               ; rax = 0

.loopcx2:

    ; With x = rax, the six taps a..f are the buffered sums for columns
    ; x-2 .. x+3 (4 columns per register):
    ;   mm1 = a, mm2 = b, mm3 = c, mm4 = d, mm5 = e, mm6 = f
    movq        mm2,    [rsp + 2 * rax + 2  + 4 + tbuffer]
    movq        mm3,    [rsp + 2 * rax + 4  + 4 + tbuffer]
    movq        mm4,    [rsp + 2 * rax + 6  + 4 + tbuffer]
    movq        mm5,    [rsp + 2 * rax + 8  + 4 + tbuffer]
    movq        mm1,    [rsp + 2 * rax      + 4 + tbuffer]
    movq        mm6,    [rsp + 2 * rax + 10 + 4 + tbuffer]
    paddw       mm2,    mm5               ; b + e
    paddw       mm3,    mm4               ; c + d
    paddw       mm1,    mm6               ; a + f

    movq        mm5,    [mmx_dw_20 GLOBAL]
    movq        mm4,    [mmx_dw_5 GLOBAL]
    movq        mm6,    mm1
    pxor        mm7,    mm7

    ; Interleave the sums with the weights so that each pmaddwd lane yields
    ; -5 * (b + e) + 20 * (c + d) as a 32-bit value.
    punpckhwd   mm5,    mm2
    punpcklwd   mm4,    mm3
    punpcklwd   mm2,    [mmx_dw_20 GLOBAL]
    punpckhwd   mm3,    [mmx_dw_5 GLOBAL]

    pcmpgtw     mm7,    mm1               ; sign mask of (a + f)

    pmaddwd     mm2,    mm4               ; columns 0, 1
    pmaddwd     mm3,    mm5               ; columns 2, 3

    punpcklwd   mm1,    mm7               ; sign-extend (a + f) to dwords
    punpckhwd   mm6,    mm7

    paddd       mm2,    mm1
    paddd       mm3,    mm6

    paddd       mm2,    [mmx_dd_one GLOBAL]
    paddd       mm3,    [mmx_dd_one GLOBAL]

    psrad       mm2,    10
    psrad       mm3,    10

    packssdw    mm2,    mm3
    packuswb    mm2,    mm0

    movd        [r10 + rax], mm2          ; dst2[rax] = mm2

    add         rax,    4
    cmp         rax,    r14               ; cmp rax, width
    jnz         .loopcx2

    add         r10,    r11               ; dst2 += dst2_stride

    dec         r15                       ; height; dec sets ZF for the jnz
    jnz         .loopcy

    lea         rsp,    [rbp]             ; drop the temp buffer

    pop         r14
    pop         r13
    pop         r12
    pop         rbx
    pop         rbp
%ifdef WIN64
    pop         rsi
    pop         rdi
%endif
    pop         r15

    ret

;-----------------------------------------------------------------------------
;
; void x264_horizontal_filter_mmxext( uint8_t *dst, int i_dst_stride,
;                                     uint8_t *src, int i_src_stride,
;                                     int i_width, int i_height );
;
; For every row, applies the 6-tap kernel (1, -5, 20, 20, -5, 1) horizontally
; to the bytes src[x-2 .. x+3], adds 16, shifts right by 5, saturates to
; 8 bits and stores 8 output pixels per inner iteration.
;
; Register roles inside the loops:
;   rdx = src row - 2     r11 = src_stride
;   r9  = dst row         r10 = dst_stride
;   r8  = width           rcx = rows remaining
;   rax = column counter
;   mm0 = 0               mm7 = 4 x word 16 (rounding)
;
; Constraints that follow from the loop structure below:
;   - the column loop steps by 8 and terminates on equality (jnz), so width
;     must be a positive multiple of 8;
;   - the row loop tests after its body, so height must be at least 1;
;   - each inner iteration reads source bytes up to offset rax + 9 from
;     src - 2, i.e. a few bytes either side of the row.
; MMX state is left dirty (no emms here); presumably the caller handles it.
;
;-----------------------------------------------------------------------------

ALIGN 16
x264_horizontal_filter_mmxext:
    ; The order of the moves below matters: each argument is read before the
    ; register holding it is reused.
    movsxd      r10,    parm2d            ; dst_stride
    movsxd      r11,    parm4d            ; src_stride

%ifdef WIN64
    mov         rdx,    r8                ; src
    mov         r9,     rcx               ; dst
    movsxd      rcx,    parm6d            ; height
%else
    ; src is used directly from rdx in this build.
    movsxd      rcx,    parm6d            ; height
    mov         r9,     rdi               ; dst
%endif

    movsxd      r8,     parm5d            ; width

    pxor        mm0,    mm0
    movq        mm7,    [mmx_dw_one GLOBAL]

    sub         rdx,    2                 ; taps start 2 bytes left of x

.loophy:

    dec         rcx
    xor         eax,    eax               ; rax = 0

.loophx:

    prefetchnta [rdx + rax + 48]

    ; First 4 output pixels: taps at byte offsets 0 .. 5.
    LOAD_4      mm1,    mm2, mm3, mm4, [rdx + rax], [rdx + rax + 1], [rdx + rax + 2], [rdx + rax + 3], mm0
    FILT_2      mm1,    mm2
    movd        mm5,    [rdx + rax + 4]
    movd        mm6,    [rdx + rax + 5]
    FILT_4      mm1,    mm3, mm4
    movd        mm2,    [rdx + rax + 4]
    movd        mm3,    [rdx + rax + 6]
    punpcklbw   mm5,    mm0
    punpcklbw   mm6,    mm0
    FILT_6      mm1,    mm5, mm6, mm7

    ; Next 4 output pixels: taps at byte offsets 4 .. 9.  mm6 (offset 5) is
    ; reused from above; FILT_6 leaves its third operand unmodified.
    movd        mm4,    [rdx + rax + 7]
    movd        mm5,    [rdx + rax + 8]
    punpcklbw   mm2,    mm0
    punpcklbw   mm3,    mm0               ; mm2(1), mm3(20), mm6(-5) ready
    FILT_2      mm2,    mm6
    movd        mm6,    [rdx + rax + 9]
    punpcklbw   mm4,    mm0
    punpcklbw   mm5,    mm0               ; mm2(1-5), mm3(20), mm4(20), mm5(-5) ready
    FILT_4      mm2,    mm3, mm4
    punpcklbw   mm6,    mm0
    FILT_6      mm2,    mm5, mm6, mm7

    packuswb    mm1,    mm2               ; 8 saturated output bytes
    movq        [r9 + rax], mm1

    add         rax,    8
    cmp         rax,    r8                ; cmp rax, width
    jnz         .loophx

    add         rdx,    r11               ; src_pitch
    add         r9,     r10               ; dst_pitch

    ; rcx was decremented at the top of the row; the adds above clobbered
    ; the flags, so it is re-tested here.
    test        rcx,    rcx
    jnz         .loophy

    ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -