mem_transfer_mmx.asm
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 8<->16 bit transfer functions -
; *
; *  Copyright (C) 2001 Peter Ross <pross@xvid.org>
; *                2001 Michael Militzer <isibaar@xvid.org>
; *                2002 Pascal Massimino <skal@planet-d.net>
; *                2004 Andre Werthmann <wertmann@aei.mpg.de>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: mem_transfer_mmx.asm,v 1.1 2005/01/05 23:02:15 edgomez Exp $
; *
; ***************************************************************************/

BITS 64

%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

cglobal transfer_8to16copy_x86_64
cglobal transfer_16to8copy_x86_64
cglobal transfer_8to16sub_x86_64
cglobal transfer_8to16subro_x86_64
cglobal transfer_8to16sub2_x86_64
cglobal transfer_8to16sub2ro_x86_64
cglobal transfer_16to8add_x86_64
cglobal transfer8x8_copy_x86_64

;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_x86_64(int16_t * const dst,
;                                const uint8_t * const src,
;                                uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_8_TO_16 1
  movq mm0, [rax]
  movq mm1, [rax+rdx]
  movq mm2, mm0
  movq mm3, mm1
  punpcklbw mm0, mm7
  movq [rcx+%1*32], mm0
  punpcklbw mm1, mm7
  movq [rcx+%1*32+16], mm1
  punpckhbw mm2, mm7
  punpckhbw mm3, mm7
  lea rax, [rax+2*rdx]
  movq [rcx+%1*32+8], mm2
  movq [rcx+%1*32+24], mm3
%endmacro

ALIGN 16
transfer_8to16copy_x86_64:
  ; rdx is Stride
  mov rax, rsi    ; Src
  mov rcx, rdi    ; Dst

  pxor mm7, mm7

  COPY_8_TO_16 0
  COPY_8_TO_16 1
  COPY_8_TO_16 2
  COPY_8_TO_16 3
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_x86_64(uint8_t * const dst,
;                                const int16_t * const src,
;                                uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_16_TO_8 1
  movq mm0, [rax+%1*32]
  movq mm1, [rax+%1*32+8]
  packuswb mm0, mm1
  movq [rcx], mm0
  movq mm2, [rax+%1*32+16]
  movq mm3, [rax+%1*32+24]
  packuswb mm2, mm3
  movq [rcx+rdx], mm2
%endmacro

ALIGN 16
transfer_16to8copy_x86_64:
  ; rdx is Stride
  mov rax, rsi    ; Src
  mov rcx, rdi    ; Dst

  COPY_16_TO_8 0
  lea rcx, [rcx+2*rdx]
  COPY_16_TO_8 1
  lea rcx, [rcx+2*rdx]
  COPY_16_TO_8 2
  lea rcx, [rcx+2*rdx]
  COPY_16_TO_8 3
  ret
.endfunc
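;-----------------------------------------------------------------------------
; Illustrative plain-C sketch of what the two copy routines above compute,
; assuming the 8x8 block / 64-word DCT layout implied by the code (each macro
; call handles two rows; destination rows are 8 int16_t apart). The _c names
; are placeholders, not symbols defined in this file.
;
;   void transfer_8to16copy_c(int16_t *dst, const uint8_t *src, uint32_t stride)
;   {
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++)
;               dst[y*8 + x] = src[y*stride + x];             /* widen 8 -> 16 bit */
;   }
;
;   void transfer_16to8copy_c(uint8_t *dst, const int16_t *src, uint32_t stride)
;   {
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++) {
;               int v = src[y*8 + x];
;               dst[y*stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;   /* packuswb */
;           }
;   }
;-----------------------------------------------------------------------------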
;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_x86_64(int16_t * const dct,
;                               uint8_t * const cur,
;                               const uint8_t * const ref,
;                               const uint32_t stride);
;
;-----------------------------------------------------------------------------

; when the second macro argument == 1, the reference (rbx) block is also
; copied back into the current (rax) block
%macro COPY_8_TO_16_SUB 2
  movq mm0, [rax]      ; cur
  movq mm2, [rax+rdx]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [rbx]      ; ref
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [rbx+rdx]  ; ref

  movq mm6, mm4
%if %2 == 1
  movq [rax], mm4
  movq [rax+rdx], mm5
%endif
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea rax, [rax+2*rdx]
  psubsw mm3, mm6
  lea rbx, [rbx+2*rdx]

  movq [rcx+%1*32+ 0], mm0 ; dst
  movq [rcx+%1*32+ 8], mm1
  movq [rcx+%1*32+16], mm2
  movq [rcx+%1*32+24], mm3
%endmacro

ALIGN 16
transfer_8to16sub_x86_64:
  push rbx

  mov rax, rsi    ; Cur
  mov rbx, rdx    ; Ref
  mov rdx, rcx    ; Stride
  mov rcx, rdi    ; Dst

  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 1
  COPY_8_TO_16_SUB 1, 1
  COPY_8_TO_16_SUB 2, 1
  COPY_8_TO_16_SUB 3, 1

  pop rbx
  ret
.endfunc

ALIGN 16
transfer_8to16subro_x86_64:
  push rbx

  mov rax, rsi    ; Cur
  mov rbx, rdx    ; Ref
  mov rdx, rcx    ; Stride
  mov rcx, rdi    ; Dst

  pxor mm7, mm7

  COPY_8_TO_16_SUB 0, 0
  COPY_8_TO_16_SUB 1, 0
  COPY_8_TO_16_SUB 2, 0
  COPY_8_TO_16_SUB 3, 0

  pop rbx
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_x86_64(int16_t * const dct,
;                                uint8_t * const cur,
;                                const uint8_t * ref1,
;                                const uint8_t * ref2,
;                                const uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_8_TO_16_SUB2_SSE 1
  movq mm0, [rax]      ; cur
  movq mm2, [rax+rdx]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [rbx]      ; ref1
  pavgb mm4, [rsi]     ; ref2
  movq [rax], mm4
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [rbx+rdx]  ; ref
  pavgb mm5, [rsi+rdx] ; ref2
  movq [rax+rdx], mm5

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea rsi, [rsi+2*rdx]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea rax, [rax+2*rdx]
  psubsw mm3, mm6
  lea rbx, [rbx+2*rdx]

  movq [rcx+%1*32+ 0], mm0 ; dst
  movq [rcx+%1*32+ 8], mm1
  movq [rcx+%1*32+16], mm2
  movq [rcx+%1*32+24], mm3
%endmacro

ALIGN 16
transfer_8to16sub2_x86_64:
  push rbx

  mov rax, rsi    ; Cur
  mov rbx, rdx    ; Ref1
  mov rdx, r8     ; Stride
  mov rsi, rcx    ; Ref2
  mov rcx, rdi    ; Dst

  pxor mm7, mm7

  COPY_8_TO_16_SUB2_SSE 0
  COPY_8_TO_16_SUB2_SSE 1
  COPY_8_TO_16_SUB2_SSE 2
  COPY_8_TO_16_SUB2_SSE 3

  pop rbx
  ret
.endfunc
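;-----------------------------------------------------------------------------
; Illustrative plain-C sketch of the subtract variants above (8x8 block
; assumed; the _c names are placeholders). pavgb is the rounded byte average
; (a + b + 1) >> 1, and psubsw never actually saturates here because cur - ref
; always fits in 16 bits.
;
;   void transfer_8to16sub_c(int16_t *dct, uint8_t *cur, const uint8_t *ref,
;                            uint32_t stride)
;   {
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++) {
;               int c = cur[y*stride + x], r = ref[y*stride + x];
;               dct[y*8 + x] = c - r;
;               cur[y*stride + x] = r;      /* the "ro" variant skips this store */
;           }
;   }
;
;   void transfer_8to16sub2_c(int16_t *dct, uint8_t *cur, const uint8_t *ref1,
;                             const uint8_t *ref2, uint32_t stride)
;   {
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++) {
;               int avg = (ref1[y*stride + x] + ref2[y*stride + x] + 1) >> 1;
;               dct[y*8 + x] = cur[y*stride + x] - avg;
;               cur[y*stride + x] = avg;    /* the "2ro" variant skips this store */
;           }
;   }
;-----------------------------------------------------------------------------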
;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2ro_x86_64(int16_t * const dct,
;                                  const uint8_t * const cur,
;                                  const uint8_t * ref1,
;                                  const uint8_t * ref2,
;                                  const uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_8_TO_16_SUB2RO_SSE 1
  movq mm0, [rsi]      ; cur
  movq mm2, [rsi+r8]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  movq mm4, [rdx]      ; ref1
  pavgb mm4, [rcx]     ; ref2
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7
  movq mm5, [rdx+r8]   ; ref
  pavgb mm5, [rcx+r8]  ; ref2

  movq mm6, mm4
  punpcklbw mm4, mm7
  punpckhbw mm6, mm7
  psubsw mm0, mm4
  psubsw mm1, mm6
  lea rcx, [rcx+2*r8]
  movq mm6, mm5
  punpcklbw mm5, mm7
  punpckhbw mm6, mm7
  psubsw mm2, mm5
  lea rsi, [rsi+2*r8]
  psubsw mm3, mm6
  lea rdx, [rdx+2*r8]

  movq [rdi+%1*32+ 0], mm0 ; dst
  movq [rdi+%1*32+ 8], mm1
  movq [rdi+%1*32+16], mm2
  movq [rdi+%1*32+24], mm3
%endmacro

ALIGN 16
transfer_8to16sub2ro_x86_64:
  pxor mm7, mm7

  COPY_8_TO_16_SUB2RO_SSE 0
  COPY_8_TO_16_SUB2RO_SSE 1
  COPY_8_TO_16_SUB2RO_SSE 2
  COPY_8_TO_16_SUB2RO_SSE 3

  ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer_16to8add_x86_64(uint8_t * const dst,
;                               const int16_t * const src,
;                               uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_16_TO_8_ADD 1
  movq mm0, [rcx]
  movq mm2, [rcx+rdx]
  movq mm1, mm0
  movq mm3, mm2

  punpcklbw mm0, mm7
  punpcklbw mm2, mm7
  punpckhbw mm1, mm7
  punpckhbw mm3, mm7

  paddsw mm0, [rax+%1*32+ 0]
  paddsw mm1, [rax+%1*32+ 8]
  paddsw mm2, [rax+%1*32+16]
  paddsw mm3, [rax+%1*32+24]

  packuswb mm0, mm1
  movq [rcx], mm0
  packuswb mm2, mm3
  movq [rcx+rdx], mm2
%endmacro

ALIGN 16
transfer_16to8add_x86_64:
  ; rdx is Stride
  mov rax, rsi    ; Src
  mov rcx, rdi    ; Dst

  pxor mm7, mm7

  COPY_16_TO_8_ADD 0
  lea rcx, [rcx+2*rdx]
  COPY_16_TO_8_ADD 1
  lea rcx, [rcx+2*rdx]
  COPY_16_TO_8_ADD 2
  lea rcx, [rcx+2*rdx]
  COPY_16_TO_8_ADD 3
  ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_x86_64(uint8_t * const dst,
;                              const uint8_t * const src,
;                              const uint32_t stride);
;
;-----------------------------------------------------------------------------

%macro COPY_8_TO_8 0
  movq mm0, [rax]
  movq mm1, [rax+rdx]
  movq [rcx], mm0
  lea rax, [rax+2*rdx]
  movq [rcx+rdx], mm1
%endmacro

ALIGN 16
transfer8x8_copy_x86_64:
  ; rdx is Stride
  mov rax, rsi    ; Src
  mov rcx, rdi    ; Dst

  COPY_8_TO_8
  lea rcx, [rcx+2*rdx]
  COPY_8_TO_8
  lea rcx, [rcx+2*rdx]
  COPY_8_TO_8
  lea rcx, [rcx+2*rdx]
  COPY_8_TO_8
  ret
.endfunc
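;-----------------------------------------------------------------------------
; Illustrative plain-C sketch of the transfer_16to8add_x86_64 reconstruction
; step (8x8 block assumed; the _c name is a placeholder). paddsw followed by
; packuswb adds the 16-bit residual to the 8-bit prediction and clamps the
; result to 0..255.
;
;   void transfer_16to8add_c(uint8_t *dst, const int16_t *src, uint32_t stride)
;   {
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++) {
;               int v = dst[y*stride + x] + src[y*8 + x];
;               dst[y*stride + x] = v < 0 ? 0 : v > 255 ? 255 : v;
;           }
;   }
;-----------------------------------------------------------------------------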