;/****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - 8<->16 bit transfer functions -
; *
; * Copyright (C) 2002 Jaan Kalda
; *
; * This program is free software ; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation ; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program ; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: mem_transfer_3dne.asm,v 1.7 2005/09/13 12:12:15 suxen_drol Exp $
; *
; ***************************************************************************/
; these 3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
BITS 32
%macro cglobal 1
%ifdef PREFIX
%ifdef MARK_FUNCS
global _%1:function %1.endfunc-%1
%define %1 _%1:function %1.endfunc-%1
%else
global _%1
%define %1 _%1
%endif
%else
%ifdef MARK_FUNCS
global %1:function %1.endfunc-%1
%else
global %1
%endif
%endif
%endmacro
;=============================================================================
; Read only data
;=============================================================================
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif
ALIGN 8
mm_zero:
dd 0,0
;=============================================================================
; Macros
;=============================================================================
%macro nop4 0
db 08Dh, 074h, 026h, 0 ; lea esi,[esi+0x0] - 4-byte padding NOP
%endmacro
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal transfer_8to16copy_3dne
cglobal transfer_16to8copy_3dne
cglobal transfer_8to16sub_3dne
cglobal transfer_8to16subro_3dne
cglobal transfer_8to16sub2_3dne
cglobal transfer_16to8add_3dne
cglobal transfer8x8_copy_3dne
cglobal transfer8x4_copy_3dne
;-----------------------------------------------------------------------------
;
; void transfer_8to16copy_3dne(int16_t * const dst,
; const uint8_t * const src,
; uint32_t stride);
;
;-----------------------------------------------------------------------------
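; Rough plain-C sketch of what this routine computes (illustrative only,
; assuming the usual XviD transfer semantics: dst is a contiguous 8x8 block
; of words, src is a strided 8-bit plane):
;
;   for (int j = 0; j < 8; j++) {
;       for (int i = 0; i < 8; i++)
;           dst[i] = (int16_t) src[i];      /* zero-extend 8 -> 16 bit */
;       dst += 8;
;       src += stride;
;   }
;
; Most rows below avoid an explicit zero-extended unpack by merging the source
; bytes into the high byte of each word (punpcklbw with a memory operand) and
; then shifting right by 8 (psrlw).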
ALIGN 16
transfer_8to16copy_3dne:
mov eax, [esp+ 8] ; Src
mov edx, [esp+12] ; Stride
mov ecx, [esp+ 4] ; Dst
punpcklbw mm0, [byte eax]
punpcklbw mm1, [eax+4]
movq mm2, [eax+edx]
movq mm3, [eax+edx]
pxor mm7, mm7
lea eax, [eax+2*edx]
punpcklbw mm2, mm7
punpckhbw mm3, mm7
psrlw mm0, 8
psrlw mm1, 8
punpcklbw mm4, [eax]
punpcklbw mm5, [eax+edx+4]
movq [byte ecx+0*64], mm0
movq [ecx+0*64+8], mm1
punpcklbw mm6, [eax+edx]
punpcklbw mm7, [eax+4]
lea eax, [byte eax+2*edx]
psrlw mm4, 8
psrlw mm5, 8
punpcklbw mm0, [eax]
punpcklbw mm1, [eax+edx+4]
movq [ecx+0*64+16], mm2
movq [ecx+0*64+24], mm3
psrlw mm6, 8
psrlw mm7, 8
punpcklbw mm2, [eax+edx]
punpcklbw mm3, [eax+4]
lea eax, [byte eax+2*edx]
movq [byte ecx+0*64+32], mm4
movq [ecx+0*64+56], mm5
psrlw mm0, 8
psrlw mm1, 8
punpcklbw mm4, [eax]
punpcklbw mm5, [eax+edx+4]
movq [byte ecx+0*64+48], mm6
movq [ecx+0*64+40], mm7
psrlw mm2, 8
psrlw mm3, 8
punpcklbw mm6, [eax+edx]
punpcklbw mm7, [eax+4]
movq [byte ecx+1*64], mm0
movq [ecx+1*64+24], mm1
psrlw mm4, 8
psrlw mm5, 8
movq [ecx+1*64+16], mm2
movq [ecx+1*64+8], mm3
psrlw mm6, 8
psrlw mm7, 8
movq [byte ecx+1*64+32], mm4
movq [ecx+1*64+56], mm5
movq [byte ecx+1*64+48], mm6
movq [ecx+1*64+40], mm7
ret
.endfunc
;-----------------------------------------------------------------------------
;
; void transfer_16to8copy_3dne(uint8_t * const dst,
; const int16_t * const src,
; uint32_t stride);
;
;-----------------------------------------------------------------------------
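; Rough plain-C sketch (illustrative only): each 16-bit coefficient is clamped
; to [0,255] and written to a strided 8-bit plane; packuswb performs the
; unsigned-saturating pack.
;
;   for (int j = 0; j < 8; j++) {
;       for (int i = 0; i < 8; i++) {
;           int16_t v = src[i];
;           dst[i] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
;       }
;       src += 8;
;       dst += stride;
;   }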
ALIGN 16
transfer_16to8copy_3dne:
mov eax, [esp+ 8] ; Src
mov ecx, [esp+ 4] ; Dst
mov edx, [esp+12] ; Stride
movq mm0, [byte eax+0*32]
packuswb mm0, [eax+0*32+8]
movq mm1, [eax+0*32+16]
packuswb mm1, [eax+0*32+24]
movq mm5, [eax+2*32+16]
movq mm2, [eax+1*32]
packuswb mm2, [eax+1*32+8]
movq mm3, [eax+1*32+16]
packuswb mm3, [eax+1*32+24]
movq mm6, [eax+3*32]
movq mm4, [eax+2*32]
packuswb mm4, [eax+2*32+8]
packuswb mm5, [eax+2*32+24]
movq mm7, [eax+3*32+16]
packuswb mm7, [eax+3*32+24]
packuswb mm6, [eax+3*32+8]
movq [ecx], mm0
lea eax, [3*edx]
add eax, ecx
movq [ecx+edx], mm1
movq [ecx+2*edx], mm2
movq [byte eax], mm3
movq [ecx+4*edx], mm4
lea ecx, [byte ecx+4*edx]
movq [eax+2*edx], mm5
movq [eax+4*edx], mm7
movq [ecx+2*edx], mm6
ret
.endfunc
;-----------------------------------------------------------------------------
;
; void transfer_8to16sub_3dne(int16_t * const dct,
; uint8_t * const cur,
; const uint8_t * const ref,
; const uint32_t stride);
;
;-----------------------------------------------------------------------------
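; Both entry points below (transfer_8to16sub_3dne and transfer_8to16subro_3dne)
; share the COPY_8_TO_16_SUB macro. A rough plain-C sketch (illustrative only):
;
;   for (int j = 0; j < 8; j++) {
;       for (int i = 0; i < 8; i++) {
;           uint8_t c = cur[i], r = ref[i];
;           dct[i] = (int16_t) c - (int16_t) r;
;           cur[i] = r;           /* _sub variant only; _subro leaves cur */
;       }
;       dct += 8; cur += stride; ref += stride;
;   }
;
; psubsw saturates, but since the differences fit in [-255, 255] the result
; equals a plain subtraction.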
; when the macro's second argument == 1, the reference block (ecx) is also
; copied into the current block (eax)
%macro COPY_8_TO_16_SUB 2
movq mm1, [eax] ; cur
movq mm0, mm1
movq mm4, [ecx] ; ref
movq mm6, mm4
%if %2 == 1
movq [eax], mm4
%endif
punpckhbw mm1, mm7
punpckhbw mm6, mm7
punpcklbw mm4, mm7
ALIGN 8
movq mm2, [byte eax+edx]
punpcklbw mm0, mm7
movq mm3, [byte eax+edx]
punpcklbw mm2, mm7
movq mm5, [byte ecx+edx] ; ref
punpckhbw mm3, mm7
%if %2 == 1
movq [byte eax+edx], mm5
%endif
psubsw mm1, mm6
movq mm6, mm5
psubsw mm0, mm4
%if (%1 < 3)
lea eax,[eax+2*edx]
lea ecx,[ecx+2*edx]
%else
mov ecx,[esp]
add esp,byte 4
%endif
movq [edi+%1*32+ 8], mm1
movq [byte edi+%1*32+ 0], mm0 ; dst
punpcklbw mm5, mm7
punpckhbw mm6, mm7
psubsw mm2, mm5
psubsw mm3, mm6
movq [edi+%1*32+16], mm2
movq [edi+%1*32+24], mm3
%endmacro
ALIGN 16
transfer_8to16sub_3dne:
mov eax, [esp + 8] ; Cur
mov ecx, [esp +12] ; Ref
push edi
mov edx, [dword esp+4+16] ; Stride
mov edi, [esp+4+ 4] ; Dst
pxor mm7, mm7
nop
ALIGN 4
COPY_8_TO_16_SUB 0, 1
COPY_8_TO_16_SUB 1, 1
COPY_8_TO_16_SUB 2, 1
COPY_8_TO_16_SUB 3, 1
mov edi, ecx
ret
.endfunc
ALIGN 16
transfer_8to16subro_3dne:
mov eax, [esp + 8] ; Cur
mov ecx, [esp +12] ; Ref
push edi
mov edx, [dword esp+4+16] ; Stride
mov edi, [esp+4+ 4] ; Dst
pxor mm7, mm7
nop
ALIGN 4
COPY_8_TO_16_SUB 0, 0
COPY_8_TO_16_SUB 1, 0
COPY_8_TO_16_SUB 2, 0
COPY_8_TO_16_SUB 3, 0
mov edi, ecx
ret
.endfunc
;-----------------------------------------------------------------------------
;
; void transfer_8to16sub2_3dne(int16_t * const dct,
; uint8_t * const cur,
; const uint8_t * ref1,
; const uint8_t * ref2,
; const uint32_t stride)
;
;-----------------------------------------------------------------------------
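; Rough plain-C sketch (illustrative only): the two references are averaged
; with rounding (pavgb), the average is written back into cur, and the DCT
; input is the difference between the original cur pixel and that average.
;
;   for (int j = 0; j < 8; j++) {
;       for (int i = 0; i < 8; i++) {
;           uint8_t avg = (uint8_t) ((ref1[i] + ref2[i] + 1) >> 1);
;           dct[i] = (int16_t) cur[i] - avg;
;           cur[i] = avg;
;       }
;       dct += 8; cur += stride; ref1 += stride; ref2 += stride;
;   }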
%macro COPY_8_TO_16_SUB2_SSE 1
db 0Fh, 6Fh, 44h, 20h, 00 ;movq mm0, [byte eax] ; cur
punpcklbw mm0, mm7
movq mm2, [byte eax+edx]
punpcklbw mm2, mm7
db 0Fh, 6Fh, 4ch, 20h, 00 ;movq mm1, [byte eax]
punpckhbw mm1, mm7
movq mm3, [byte eax+edx]
punpckhbw mm3, mm7
movq mm4, [byte ebx] ; ref1
pavgb mm4, [byte esi] ; ref2
movq [eax], mm4
movq mm5, [ebx+edx] ; ref1
pavgb mm5, [esi+edx] ; ref2
movq [eax+edx], mm5
movq mm6, mm4
punpcklbw mm4, mm7
punpckhbw mm6, mm7
%if (%1 < 3)
lea esi,[esi+2*edx]
lea ebx,[byte ebx+2*edx]
lea eax,[eax+2*edx]
%else
mov esi,[esp]
mov ebx,[esp+4]
add esp,byte 8
%endif
psubsw mm0, mm4
psubsw mm1, mm6
movq mm6, mm5
punpcklbw mm5, mm7
punpckhbw mm6, mm7
psubsw mm2, mm5
psubsw mm3, mm6
movq [byte ecx+%1*32+ 0], mm0 ; dst
movq [ecx+%1*32+ 8], mm1
movq [ecx+%1*32+16], mm2
movq [ecx+%1*32+24], mm3
%endmacro
ALIGN 16
transfer_8to16sub2_3dne:
mov edx, [esp +20] ; Stride
mov ecx, [esp + 4] ; Dst
mov eax, [esp + 8] ; Cur
push ebx
lea ebp,[byte ebp] ; 3-byte no-op (lea ebp,[ebp+0]) used as padding
mov ebx, [esp+4+12] ; Ref1
push esi
pxor mm7, mm7
mov esi, [esp+8+16] ; Ref2
nop4
COPY_8_TO_16_SUB2_SSE 0
COPY_8_TO_16_SUB2_SSE 1
COPY_8_TO_16_SUB2_SSE 2
COPY_8_TO_16_SUB2_SSE 3
ret
.endfunc
;-----------------------------------------------------------------------------
;
; void transfer_16to8add_3dne(uint8_t * const dst,
; const int16_t * const src,
; uint32_t stride);
;
;-----------------------------------------------------------------------------
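; Rough plain-C sketch (illustrative only): the 16-bit residual is added to
; the 8-bit prediction already in dst, and the sum is clamped to [0,255] by
; the paddsw + packuswb pair.
;
;   for (int j = 0; j < 8; j++) {
;       for (int i = 0; i < 8; i++) {
;           int v = dst[i] + src[i];
;           dst[i] = (uint8_t) (v < 0 ? 0 : (v > 255 ? 255 : v));
;       }
;       src += 8;
;       dst += stride;
;   }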
%macro COPY_16_TO_8_ADD 1
db 0Fh, 6Fh, 44h, 21h, 00 ;movq mm0, [byte ecx]
punpcklbw mm0, mm7
movq mm2, [byte ecx+edx]
punpcklbw mm2, mm7
db 0Fh, 6Fh, 4ch, 21h, 00 ;movq mm1, [byte ecx]
punpckhbw mm1, mm7
movq mm3, [byte ecx+edx]
punpckhbw mm3, mm7
paddsw mm0, [byte eax+%1*32+ 0]
paddsw mm1, [eax+%1*32+ 8]
paddsw mm2, [eax+%1*32+16]
paddsw mm3, [eax+%1*32+24]
packuswb mm0, mm1
packuswb mm2, mm3
mov esp,esp ; 2-byte no-op used as padding
movq [byte ecx], mm0
movq [ecx+edx], mm2
%endmacro
ALIGN 16
transfer_16to8add_3dne:
mov ecx, [esp+ 4] ; Dst
mov edx, [esp+12] ; Stride
mov eax, [esp+ 8] ; Src
pxor mm7, mm7
nop
COPY_16_TO_8_ADD 0
lea ecx,[byte ecx+2*edx]
COPY_16_TO_8_ADD 1
lea ecx,[byte ecx+2*edx]
COPY_16_TO_8_ADD 2
lea ecx,[byte ecx+2*edx]
COPY_16_TO_8_ADD 3
ret
.endfunc
;-----------------------------------------------------------------------------
;
; void transfer8x8_copy_3dne(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride);
;
;
;-----------------------------------------------------------------------------
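; Straightforward 8x8 byte-block copy; src and dst share the same stride.
; Rough plain-C sketch (illustrative only):
;
;   for (int j = 0; j < 8; j++)
;       memcpy(dst + j * stride, src + j * stride, 8);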
%macro COPY_8_TO_8 0
movq mm0, [byte eax]
movq mm1, [eax+edx]
movq [byte ecx], mm0
lea eax,[byte eax+2*edx]
movq [ecx+edx], mm1
%endmacro
ALIGN 16
transfer8x8_copy_3dne:
mov eax, [esp+ 8] ; Src
mov edx, [esp+12] ; Stride
mov ecx, [esp+ 4] ; Dst
COPY_8_TO_8
lea ecx,[byte ecx+2*edx]
COPY_8_TO_8
lea ecx,[byte ecx+2*edx]
COPY_8_TO_8
lea ecx,[byte ecx+2*edx]
COPY_8_TO_8
ret
.endfunc
;-----------------------------------------------------------------------------
;
; void transfer8x4_copy_3dne(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride);
;
;
;-----------------------------------------------------------------------------
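; Same operation as transfer8x8_copy_3dne, but only four rows are copied:
; two passes of COPY_8_TO_8 move 8x4 bytes.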
ALIGN 16
transfer8x4_copy_3dne:
mov eax, [esp+ 8] ; Src
mov edx, [esp+12] ; Stride
mov ecx, [esp+ 4] ; Dst
COPY_8_TO_8
lea ecx,[byte ecx+2*edx]
COPY_8_TO_8
ret
.endfunc