; File: mem_transfer_3dne.asm
; (source-viewer residue: "font size" control label)
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 8<->16 bit transfer functions -
; *
; *  Copyright (C) 2002 Jaan Kalda
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: mem_transfer_3dne.asm,v 1.7 2005/09/13 12:12:15 suxen_drol Exp $
; *
; ***************************************************************************/

; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
;
; Syntax : NASM (Intel operand order), 32-bit code.
; ABI    : every exported routine reads its arguments from the stack at
;          [esp+4], [esp+8], ... (first argument nearest the return address).
; Notes  : * Several instructions carry explicit `byte` / `dword` displacement
;            overrides, are emitted via `db`, or are functionally no-ops
;            (`nop`, `mov esp,esp`, `lea ebp,[byte ebp]`, nop4).  They select
;            specific instruction lengths; presumably this is the K7 tuning
;            mentioned above, so do not "simplify" them without measuring.
;          * No routine in this file executes EMMS.
;            NOTE(review): presumably the caller clears the MMX state - confirm.

BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name>.  With PREFIX defined the exported symbol gets a leading
; underscore and <name> is %define'd to the prefixed form, so the label
; definitions further down pick up the prefix automatically.  With MARK_FUNCS
; defined the symbol is additionally tagged ":function" with a size computed
; from the "<name>.endfunc" local label that follows each routine's `ret`.
;
; NOTE(review): when PREFIX and MARK_FUNCS are both defined, <name> is
; redefined to text that still contains the ":function ..." suffix - confirm
; that this combination expands as intended at the label definitions.
;-----------------------------------------------------------------------------

%macro cglobal 1
    %ifdef PREFIX
        %ifdef MARK_FUNCS
            global _%1:function %1.endfunc-%1
            %define %1 _%1:function %1.endfunc-%1
        %else
            global _%1
            %define %1 _%1
        %endif
    %else
        %ifdef MARK_FUNCS
            global %1:function %1.endfunc-%1
        %else
            global %1
        %endif
    %endif
%endmacro

;=============================================================================
; Read only data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 8
mm_zero:                                ; 8 zero bytes (not referenced by the
    dd 0,0                              ; routines visible in this file)

;=============================================================================
; Macros
;=============================================================================

; nop4: 4-byte filler.  The bytes 8D 74 26 00 encode `lea esi,[esi+0]`
; (SIB form with an 8-bit displacement), which leaves esi unchanged.
%macro nop4 0
    db 08Dh, 074h, 026h, 0
%endmacro

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal transfer_8to16copy_3dne
cglobal transfer_16to8copy_3dne
cglobal transfer_8to16sub_3dne
cglobal transfer_8to16subro_3dne
cglobal transfer_8to16sub2_3dne
cglobal transfer_16to8add_3dne
cglobal transfer8x8_copy_3dne
cglobal transfer8x4_copy_3dne

;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_3dne(int16_t * const dst,
;                              const uint8_t * const src,
;                              uint32_t stride);
;
; Widens an 8x8 block of bytes to 16-bit words.  Source rows are `stride`
; bytes apart; the destination is written as 8 consecutive rows of 16 bytes
; (offsets 0..127 from dst).
;
; Registers: eax = src row pointer (advanced two rows at a time)
;            edx = stride
;            ecx = dst
; Clobbers : eax, ecx, edx, mm0-mm7
;
; Widening trick: `punpcklbw mmN, [mem]` places each source byte in the HIGH
; byte of a word (the low byte is stale register content); the following
; `psrlw mmN, 8` shifts it down and zero-fills the high byte.  Row 1 instead
; uses the conventional unpack against a zeroed mm7.
;
;-----------------------------------------------------------------------------

ALIGN 16
transfer_8to16copy_3dne:

    mov     eax, [esp+ 8]               ; Src
    mov     edx, [esp+12]               ; Stride
    mov     ecx, [esp+ 4]               ; Dst
    punpcklbw mm0, [byte eax]           ; row 0, bytes 0-3 (high byte of words)
    punpcklbw mm1, [eax+4]              ; row 0, bytes 4-7
    movq    mm2, [eax+edx]              ; row 1, all 8 bytes
    movq    mm3, [eax+edx]
    pxor    mm7, mm7                    ; mm7 = 0 for the row-1 unpack
    lea     eax, [eax+2*edx]            ; eax -> row 2
    punpcklbw mm2, mm7                  ; row 1, bytes 0-3 zero-extended
    punpckhbw mm3, mm7                  ; row 1, bytes 4-7 zero-extended
    psrlw   mm0, 8
    psrlw   mm1, 8
    punpcklbw mm4, [eax]                ; row 2, bytes 0-3
    punpcklbw mm5, [eax+edx+4]          ; row 3, bytes 4-7
    movq    [byte ecx+0*64], mm0        ; dst row 0, words 0-3
    movq    [ecx+0*64+8], mm1           ; dst row 0, words 4-7
    punpcklbw mm6, [eax+edx]            ; row 3, bytes 0-3
    punpcklbw mm7, [eax+4]              ; row 2, bytes 4-7 (mm7 no longer zero)
    lea     eax, [byte eax+2*edx]       ; eax -> row 4
    psrlw   mm4, 8
    psrlw   mm5, 8
    punpcklbw mm0, [eax]                ; row 4, bytes 0-3
    punpcklbw mm1, [eax+edx+4]          ; row 5, bytes 4-7
    movq    [ecx+0*64+16], mm2          ; dst row 1, words 0-3
    movq    [ecx+0*64+24], mm3          ; dst row 1, words 4-7
    psrlw   mm6, 8
    psrlw   mm7, 8
    punpcklbw mm2, [eax+edx]            ; row 5, bytes 0-3
    punpcklbw mm3, [eax+4]              ; row 4, bytes 4-7
    lea     eax, [byte eax+2*edx]       ; eax -> row 6
    movq    [byte ecx+0*64+32], mm4     ; dst row 2, words 0-3
    movq    [ecx+0*64+56], mm5          ; dst row 3, words 4-7
    psrlw   mm0, 8
    psrlw   mm1, 8
    punpcklbw mm4, [eax]                ; row 6, bytes 0-3
    punpcklbw mm5, [eax+edx+4]          ; row 7, bytes 4-7
    movq    [byte ecx+0*64+48], mm6     ; dst row 3, words 0-3
    movq    [ecx+0*64+40], mm7          ; dst row 2, words 4-7
    psrlw   mm2, 8
    psrlw   mm3, 8
    punpcklbw mm6, [eax+edx]            ; row 7, bytes 0-3
    punpcklbw mm7, [eax+4]              ; row 6, bytes 4-7
    movq    [byte ecx+1*64], mm0        ; dst row 4, words 0-3
    movq    [ecx+1*64+24], mm1          ; dst row 5, words 4-7
    psrlw   mm4, 8
    psrlw   mm5, 8
    movq    [ecx+1*64+16], mm2          ; dst row 5, words 0-3
    movq    [ecx+1*64+8], mm3           ; dst row 4, words 4-7
    psrlw   mm6, 8
    psrlw   mm7, 8
    movq    [byte ecx+1*64+32], mm4     ; dst row 6, words 0-3
    movq    [ecx+1*64+56], mm5          ; dst row 7, words 4-7
    movq    [byte ecx+1*64+48], mm6     ; dst row 7, words 0-3
    movq    [ecx+1*64+40], mm7          ; dst row 6, words 4-7
    ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_3dne(uint8_t * const dst,
;                              const int16_t * const src,
;                              uint32_t stride);
;
; Narrows an 8x8 block of 16-bit words to bytes with unsigned saturation
; (packuswb).  The source is read as 8 consecutive rows of 16 bytes; the
; destination rows are `stride` bytes apart.
;
; Registers: eax = src, later reused as dst + 3*stride
;            ecx = dst, later advanced to dst + 4*stride
;            edx = stride
; Clobbers : eax, ecx, edx, mm0-mm7
;
;-----------------------------------------------------------------------------

ALIGN 16
transfer_16to8copy_3dne:

    mov     eax, [esp+ 8]               ; Src
    mov     ecx, [esp+ 4]               ; Dst
    mov     edx, [esp+12]               ; Stride

    movq    mm0, [byte eax+0*32]        ; row 0, words 0-3
    packuswb mm0, [eax+0*32+8]          ; row 0 packed to 8 bytes
    movq    mm1, [eax+0*32+16]
    packuswb mm1, [eax+0*32+24]         ; row 1
    movq    mm5, [eax+2*32+16]          ; row 5, words 0-3 (packed below)
    movq    mm2, [eax+1*32]
    packuswb mm2, [eax+1*32+8]          ; row 2
    movq    mm3, [eax+1*32+16]
    packuswb mm3, [eax+1*32+24]         ; row 3
    movq    mm6, [eax+3*32]             ; row 6, words 0-3 (packed below)
    movq    mm4, [eax+2*32]
    packuswb mm4, [eax+2*32+8]          ; row 4
    packuswb mm5, [eax+2*32+24]         ; row 5
    movq    mm7, [eax+3*32+16]
    packuswb mm7, [eax+3*32+24]         ; row 7
    packuswb mm6, [eax+3*32+8]          ; row 6
    movq    [ecx], mm0                  ; dst row 0
    lea     eax, [3*edx]
    add     eax, ecx                    ; eax = dst + 3*stride
    movq    [ecx+edx], mm1              ; dst row 1
    movq    [ecx+2*edx], mm2            ; dst row 2
    movq    [byte eax], mm3             ; dst row 3
    movq    [ecx+4*edx], mm4            ; dst row 4
    lea     ecx, [byte ecx+4*edx]       ; ecx = dst + 4*stride
    movq    [eax+2*edx], mm5            ; dst row 5 (3+2)
    movq    [eax+4*edx], mm7            ; dst row 7 (3+4)
    movq    [ecx+2*edx], mm6            ; dst row 6 (4+2)
    ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_3dne(int16_t * const dct,
;                             uint8_t * const cur,
;                             const uint8_t * const ref,
;                             const uint32_t stride);
;
;-----------------------------------------------------------------------------

; COPY_8_TO_16_SUB <pair>, <writeback>
;
; Handles two rows: dct[row] = cur[row] - ref[row], bytes zero-extended to
; words before a signed-saturating subtract (psubsw).
;   %1 = row-pair index 0..3; selects the 32-byte output slot at edi+%1*32
;   %2 = when second argument == 1, the reference (ecx) block is also copied
;        to the current (eax) block
;
; Expects  : eax = cur, ecx = ref, edx = stride, edi = dct, mm7 = 0
; Pairs 0-2: advance eax and ecx by two rows.
; Pair 3   : instead loads the dword at [esp] into ecx and releases it from
;            the stack.  Both users below pushed edi last, so ecx then holds
;            the caller's edi, which they restore with `mov edi, ecx`.
; The embedded ALIGN 8 pads the instruction stream so the next instruction
; starts on an 8-byte boundary.

%macro COPY_8_TO_16_SUB 2
    movq    mm1, [eax]                  ; cur
    movq    mm0, mm1
    movq    mm4, [ecx]                  ; ref
    movq    mm6, mm4
%if %2 == 1
    movq    [eax], mm4                  ; cur row <- ref row
%endif
    punpckhbw mm1, mm7
    punpckhbw mm6, mm7
    punpcklbw mm4, mm7
ALIGN 8
    movq    mm2, [byte eax+edx]         ; cur, second row
    punpcklbw mm0, mm7
    movq    mm3, [byte eax+edx]
    punpcklbw mm2, mm7
    movq    mm5, [byte ecx+edx]         ; ref, second row
    punpckhbw mm3, mm7
%if %2 == 1
    movq    [byte eax+edx], mm5         ; cur second row <- ref second row
%endif
    psubsw  mm1, mm6
    movq    mm6, mm5
    psubsw  mm0, mm4
%if (%1 < 3)
    lea     eax,[eax+2*edx]
    lea     ecx,[ecx+2*edx]
%else
    mov     ecx,[esp]                   ; fetch saved register (see above)
    add     esp,byte 4
%endif
    movq    [edi+%1*32+ 8], mm1
    movq    [byte edi+%1*32+ 0], mm0    ; dst
    punpcklbw mm5, mm7
    punpckhbw mm6, mm7
    psubsw  mm2, mm5
    psubsw  mm3, mm6
    movq    [edi+%1*32+16], mm2
    movq    [edi+%1*32+24], mm3
%endmacro

; transfer_8to16sub_3dne: dct = cur - ref over an 8x8 block, and the ref
; block is copied into cur (writeback argument = 1).
; Clobbers: eax, ecx, edx, mm0-mm7 (edi is saved and restored).

ALIGN 16
transfer_8to16sub_3dne:
    mov     eax, [esp + 8]              ; Cur
    mov     ecx, [esp +12]              ; Ref
    push    edi                         ; saved; popped inside the last macro
    mov     edx, [dword esp+4+16]       ; Stride (+4 accounts for the push)
    mov     edi, [esp+4+ 4]             ; Dst
    pxor    mm7, mm7                    ; zero for byte->word unpacking
    nop
ALIGN 4

    COPY_8_TO_16_SUB 0, 1
    COPY_8_TO_16_SUB 1, 1
    COPY_8_TO_16_SUB 2, 1
    COPY_8_TO_16_SUB 3, 1
    mov     edi, ecx                    ; restore caller's edi
    ret
.endfunc

; transfer_8to16subro_3dne: same arguments and result as
; transfer_8to16sub_3dne, but cur is left unmodified (writeback argument = 0).
; Clobbers: eax, ecx, edx, mm0-mm7 (edi is saved and restored).

ALIGN 16
transfer_8to16subro_3dne:
    mov     eax, [esp + 8]              ; Cur
    mov     ecx, [esp +12]              ; Ref
    push    edi                         ; saved; popped inside the last macro
    mov     edx, [dword esp+4+16]       ; Stride (+4 accounts for the push)
    mov     edi, [esp+4+ 4]             ; Dst
    pxor    mm7, mm7                    ; zero for byte->word unpacking
    nop
ALIGN 4

    COPY_8_TO_16_SUB 0, 0
    COPY_8_TO_16_SUB 1, 0
    COPY_8_TO_16_SUB 2, 0
    COPY_8_TO_16_SUB 3, 0
    mov     edi, ecx                    ; restore caller's edi
    ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_3dne(int16_t * const dct,
;                              uint8_t * const cur,
;                              const uint8_t * ref1,
;                              const uint8_t * ref2,
;                              const uint32_t stride)
;
;-----------------------------------------------------------------------------

; COPY_8_TO_16_SUB2_SSE <pair>
;
; Handles two rows: p = pavgb(ref1, ref2); dct = cur - p; and p is written
; back into cur.
;   %1 = row-pair index 0..3; selects the 32-byte output slot at ecx+%1*32
;
; Expects  : eax = cur, ebx = ref1, esi = ref2, edx = stride, ecx = dct,
;            mm7 = 0
; Pairs 0-2: advance eax, ebx and esi by two rows.
; Pair 3   : instead reloads esi from [esp] and ebx from [esp+4] and releases
;            both slots, undoing the `push ebx` / `push esi` done by
;            transfer_8to16sub2_3dne.
; The two `db` lines hand-encode movq with a SIB byte and an 8-bit zero
; displacement (5 bytes); the intended instruction is given in the comment.

%macro COPY_8_TO_16_SUB2_SSE 1
    db 0Fh, 6Fh, 44h, 20h, 00           ;movq mm0, [byte eax]      ; cur
    punpcklbw mm0, mm7
    movq    mm2, [byte eax+edx]
    punpcklbw mm2, mm7
    db 0Fh, 6Fh, 4ch, 20h, 00           ;movq mm1, [byte eax]
    punpckhbw mm1, mm7
    movq    mm3, [byte eax+edx]
    punpckhbw mm3, mm7

    movq    mm4, [byte ebx]             ; ref1
    pavgb   mm4, [byte esi]             ; ref2
    movq    [eax], mm4                  ; cur row <- averaged prediction
    movq    mm5, [ebx+edx]              ; ref1
    pavgb   mm5, [esi+edx]              ; ref2
    movq    [eax+edx], mm5              ; cur second row <- averaged prediction
    movq    mm6, mm4
    punpcklbw mm4, mm7
    punpckhbw mm6, mm7
%if (%1 < 3)
    lea     esi,[esi+2*edx]
    lea     ebx,[byte ebx+2*edx]
    lea     eax,[eax+2*edx]
%else
    mov     esi,[esp]                   ; restore caller's esi
    mov     ebx,[esp+4]                 ; restore caller's ebx
    add     esp,byte 8
%endif
    psubsw  mm0, mm4
    psubsw  mm1, mm6
    movq    mm6, mm5
    punpcklbw mm5, mm7
    punpckhbw mm6, mm7
    psubsw  mm2, mm5
    psubsw  mm3, mm6
    movq    [byte ecx+%1*32+ 0], mm0    ; dst
    movq    [ecx+%1*32+ 8], mm1
    movq    [ecx+%1*32+16], mm2
    movq    [ecx+%1*32+24], mm3
%endmacro

; Clobbers: eax, ecx, edx, mm0-mm7 (ebx and esi are saved and restored).

ALIGN 16
transfer_8to16sub2_3dne:
    mov     edx, [esp +20]              ; Stride
    mov     ecx, [esp  + 4]             ; Dst
    mov     eax, [esp  + 8]             ; Cur
    push    ebx                         ; saved; restored inside the last macro
    lea     ebp,[byte ebp]              ; ebp = ebp + 0: filler, no effect
    mov     ebx, [esp+4+12]             ; Ref1 (+4: one push so far)
    push    esi                         ; saved; restored inside the last macro
    pxor    mm7, mm7                    ; zero for byte->word unpacking
    mov     esi, [esp+8+16]             ; Ref2 (+8: two pushes)
    nop4
    COPY_8_TO_16_SUB2_SSE 0
    COPY_8_TO_16_SUB2_SSE 1
    COPY_8_TO_16_SUB2_SSE 2
    COPY_8_TO_16_SUB2_SSE 3

    ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer_16to8add_3dne(uint8_t * const dst,
;                             const int16_t * const src,
;                             uint32_t stride);
;
; dst = saturate_u8(dst + src) over an 8x8 block: dst bytes are zero-extended
; to words, added to the src words with signed saturation (paddsw) and packed
; back with unsigned saturation (packuswb).
;
;-----------------------------------------------------------------------------

; COPY_16_TO_8_ADD <pair>
;
; Handles two rows.
;   %1 = row-pair index 0..3; selects the 32-byte input slot at eax+%1*32
;
; Expects: ecx = dst (current row pair), eax = src, edx = stride, mm7 = 0.
; Does not advance ecx; the caller does that between invocations.
; The two `db` lines hand-encode movq with a SIB byte and an 8-bit zero
; displacement (5 bytes); `mov esp,esp` has no effect and acts as filler.

%macro COPY_16_TO_8_ADD 1
    db 0Fh, 6Fh, 44h, 21h, 00           ;movq mm0, [byte ecx]
    punpcklbw mm0, mm7
    movq    mm2, [byte ecx+edx]
    punpcklbw mm2, mm7
    db 0Fh, 6Fh, 4ch, 21h, 00           ;movq mm1, [byte ecx]
    punpckhbw mm1, mm7
    movq    mm3, [byte ecx+edx]
    punpckhbw mm3, mm7
    paddsw  mm0, [byte eax+%1*32+ 0]
    paddsw  mm1, [eax+%1*32+ 8]
    paddsw  mm2, [eax+%1*32+16]
    paddsw  mm3, [eax+%1*32+24]
    packuswb mm0, mm1
    packuswb mm2, mm3
    mov     esp,esp
    movq    [byte ecx], mm0
    movq    [ecx+edx], mm2
%endmacro

; Clobbers: eax, ecx, edx, mm0-mm3, mm7.

ALIGN 16
transfer_16to8add_3dne:
    mov     ecx, [esp+ 4]               ; Dst
    mov     edx, [esp+12]               ; Stride
    mov     eax, [esp+ 8]               ; Src
    pxor    mm7, mm7                    ; zero for byte->word unpacking
    nop

    COPY_16_TO_8_ADD 0
    lea     ecx,[byte ecx+2*edx]        ; dst += 2 rows
    COPY_16_TO_8_ADD 1
    lea     ecx,[byte ecx+2*edx]
    COPY_16_TO_8_ADD 2
    lea     ecx,[byte ecx+2*edx]
    COPY_16_TO_8_ADD 3
    ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_3dne(uint8_t * const dst,
;                            const uint8_t * const src,
;                            const uint32_t stride);
;
; Copies 8 rows of 8 bytes; src and dst use the same stride.
;
;-----------------------------------------------------------------------------

; COPY_8_TO_8
;
; Copies two 8-byte rows from eax (src) to ecx (dst), rows `edx` bytes apart,
; then advances eax by two rows.  ecx is NOT advanced; the caller does that
; between invocations.  Uses mm0 and mm1.

%macro COPY_8_TO_8 0
    movq    mm0, [byte eax]
    movq    mm1, [eax+edx]
    movq    [byte ecx], mm0
    lea     eax,[byte eax+2*edx]
    movq    [ecx+edx], mm1
%endmacro

; Clobbers: eax, ecx, edx, mm0, mm1.

ALIGN 16
transfer8x8_copy_3dne:
    mov     eax, [esp+ 8]               ; Src
    mov     edx, [esp+12]               ; Stride
    mov     ecx, [esp+ 4]               ; Dst

    COPY_8_TO_8
    lea     ecx,[byte ecx+2*edx]        ; dst += 2 rows
    COPY_8_TO_8
    lea     ecx,[byte ecx+2*edx]
    COPY_8_TO_8
    lea     ecx,[byte ecx+2*edx]
    COPY_8_TO_8
    ret
.endfunc

;-----------------------------------------------------------------------------
;
; void transfer8x4_copy_3dne(uint8_t * const dst,
;                            const uint8_t * const src,
;                            const uint32_t stride);
;
; Copies 4 rows of 8 bytes; src and dst use the same stride.
; Clobbers: eax, ecx, edx, mm0, mm1.
;
;-----------------------------------------------------------------------------

ALIGN 16
transfer8x4_copy_3dne:
    mov     eax, [esp+ 8]               ; Src
    mov     edx, [esp+12]               ; Stride
    mov     ecx, [esp+ 4]               ; Dst

    COPY_8_TO_8
    lea     ecx,[byte ecx+2*edx]        ; dst += 2 rows
    COPY_8_TO_8
    ret
.endfunc
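; Source-viewer keyboard shortcuts (viewer residue, not part of the assembly):
;   Copy code             Ctrl + C
;   Search code           Ctrl + F
;   Full-screen mode      F11
;   Switch theme          Ctrl + Shift + D
;   Show shortcuts        ?
;   Increase font size    Ctrl + =
;   Decrease font size    Ctrl + -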