; interpolate8x8_mmx.asm
;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - mmx 8x8 block-based halfpel interpolation -
; *
; * Copyright(C) 2001 Peter Ross <pross@xvid.org>
; * 2002 Michael Militzer <isibaar@xvid.org>
; *
; * This program is free software ; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation ; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program ; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; ****************************************************************************/
BITS 32
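; cglobal: declare an exported symbol, adding a leading underscore when
; PREFIX is defined (for object formats that decorate C symbols that way).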
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
;=============================================================================
; Read only data
;=============================================================================
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif
;-----------------------------------------------------------------------------
; (16 - r) rounding table
;-----------------------------------------------------------------------------
ALIGN 16
rounding_lowpass_mmx:
times 4 dw 16
times 4 dw 15
;-----------------------------------------------------------------------------
; (1 - r) rounding table
;-----------------------------------------------------------------------------
rounding1_mmx:
times 4 dw 1
times 4 dw 0
;-----------------------------------------------------------------------------
; (2 - r) rounding table
;-----------------------------------------------------------------------------
rounding2_mmx:
times 4 dw 2
times 4 dw 1
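; Each rounding table above holds two rows of four words (8 bytes per row);
; the code below selects a row with [table + rounding * 8], so rounding = 0
; picks the first value and rounding = 1 the second.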
mmx_one:
times 8 db 1
mmx_two:
times 8 db 2
mmx_three:
times 8 db 3
mmx_five:
times 4 dw 5
mmx_mask:
times 8 db 254
mmx_mask2:
times 8 db 252
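; mmx_mask  (0xFE in every byte) clears each byte's lsb so a 64-bit psrlq
; by 1 behaves like a per-byte shift right by one; mmx_mask2 (0xFC) does
; the same for a shift right by two.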
;=============================================================================
; Code
;=============================================================================
SECTION .text
cglobal interpolate8x8_halfpel_h_mmx
cglobal interpolate8x8_halfpel_v_mmx
cglobal interpolate8x8_halfpel_hv_mmx
cglobal interpolate8x8_avg4_mmx
cglobal interpolate8x8_avg2_mmx
cglobal interpolate8x8_6tap_lowpass_h_mmx
cglobal interpolate8x8_6tap_lowpass_v_mmx
%macro CALC_AVG 6
punpcklbw %3, %6
punpckhbw %4, %6
paddusw %1, %3 ; mm01 += mm23
paddusw %2, %4
paddusw %1, %5 ; mm01 += rounding
paddusw %2, %5
psrlw %1, 1 ; mm01 >>= 1
psrlw %2, 1
%endmacro
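; CALC_AVG %1..%6: %1/%2 hold the first operand already unpacked to words,
; %3/%4 hold the second operand as packed bytes, %5 is the rounding row and
; %6 a zero register. Per pixel it computes, in 16-bit precision, roughly
; (a + b + rounding_row) >> 1.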
;-----------------------------------------------------------------------------
;
; void interpolate8x8_halfpel_h_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;-----------------------------------------------------------------------------
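; A rough C equivalent of the horizontal halfpel filter (a sketch for
; reference, not taken from this file):
;
;   for (j = 0; j < 8; j++, src += stride, dst += stride)
;     for (i = 0; i < 8; i++)
;       dst[i] = (uint8_t)((src[i] + src[i + 1] + 1 - rounding) >> 1);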
%macro COPY_H_MMX 0
movq mm0, [esi]
movq mm2, [esi + 1]
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm6 ; mm01 = [src]
punpckhbw mm1, mm6 ; mm23 = [src + 1]
CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6
packuswb mm0, mm1
movq [edi], mm0 ; [dst] = mm01
add esi, edx ; src += stride
add edi, edx ; dst += stride
%endmacro
ALIGN 16
interpolate8x8_halfpel_h_mmx:
push esi
push edi
mov eax, [esp + 8 + 16] ; rounding
movq mm7, [rounding1_mmx + eax * 8]
mov edi, [esp + 8 + 4] ; dst
mov esi, [esp + 8 + 8] ; src
mov edx, [esp + 8 + 12] ; stride
pxor mm6, mm6 ; zero
COPY_H_MMX
COPY_H_MMX
COPY_H_MMX
COPY_H_MMX
COPY_H_MMX
COPY_H_MMX
COPY_H_MMX
COPY_H_MMX
pop edi
pop esi
ret
;-----------------------------------------------------------------------------
;
; void interpolate8x8_halfpel_v_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;-----------------------------------------------------------------------------
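; Vertical counterpart of the sketch above, averaging with the pixel one
; row below (C sketch):
;
;   dst[i] = (uint8_t)((src[i] + src[i + stride] + 1 - rounding) >> 1);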
%macro COPY_V_MMX 0
movq mm0, [esi]
movq mm2, [esi + edx]
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm6 ; mm01 = [src]
punpckhbw mm1, mm6 ; mm23 = [src + stride]
CALC_AVG mm0, mm1, mm2, mm3, mm7, mm6
packuswb mm0, mm1
movq [edi], mm0 ; [dst] = mm01
add esi, edx ; src += stride
add edi, edx ; dst += stride
%endmacro
ALIGN 16
interpolate8x8_halfpel_v_mmx:
push esi
push edi
mov eax, [esp + 8 + 16] ; rounding
movq mm7, [rounding1_mmx + eax * 8]
mov edi, [esp + 8 + 4] ; dst
mov esi, [esp + 8 + 8] ; src
mov edx, [esp + 8 + 12] ; stride
pxor mm6, mm6 ; zero
COPY_V_MMX
COPY_V_MMX
COPY_V_MMX
COPY_V_MMX
COPY_V_MMX
COPY_V_MMX
COPY_V_MMX
COPY_V_MMX
pop edi
pop esi
ret
;-----------------------------------------------------------------------------
;
; void interpolate8x8_halfpel_hv_mmx(uint8_t * const dst,
; const uint8_t * const src,
; const uint32_t stride,
; const uint32_t rounding);
;
;
;-----------------------------------------------------------------------------
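; C sketch of the intended result: the average of the 2x2 neighbourhood,
; using the (2 - r) rounding row:
;
;   dst[i] = (uint8_t)((src[i] + src[i + 1] +
;                       src[i + stride] + src[i + stride + 1] +
;                       2 - rounding) >> 2);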
%macro COPY_HV_MMX 0
; current row
movq mm0, [esi]
movq mm2, [esi + 1]
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm6 ; mm01 = [src]
punpcklbw mm2, mm6 ; mm23 = [src + 1]
punpckhbw mm1, mm6
punpckhbw mm3, mm6
paddusw mm0, mm2 ; mm01 += mm23
paddusw mm1, mm3
; next row
movq mm4, [esi + edx]
movq mm2, [esi + edx + 1]
movq mm5, mm4
movq mm3, mm2
punpcklbw mm4, mm6 ; mm45 = [src + stride]
punpcklbw mm2, mm6 ; mm23 = [src + stride + 1]
punpckhbw mm5, mm6
punpckhbw mm3, mm6
paddusw mm4, mm2 ; mm45 += mm23
paddusw mm5, mm3
; add current + next row
paddusw mm0, mm4 ; mm01 += mm45
paddusw mm1, mm5
paddusw mm0, mm7 ; mm01 += rounding2
paddusw mm1, mm7
psrlw mm0, 2 ; mm01 >>= 2
psrlw mm1, 2
packuswb mm0, mm1
movq [edi], mm0 ; [dst] = mm01
add esi, edx ; src += stride
add edi, edx ; dst += stride
%endmacro
ALIGN 16
interpolate8x8_halfpel_hv_mmx:
push esi
push edi
mov eax, [esp + 8 + 16] ; rounding
movq mm7, [rounding2_mmx + eax * 8]
mov edi, [esp + 8 + 4] ; dst
mov esi, [esp + 8 + 8] ; src
mov eax, 8
pxor mm6, mm6 ; zero
mov edx, [esp + 8 + 12] ; stride
COPY_HV_MMX
COPY_HV_MMX
COPY_HV_MMX
COPY_HV_MMX
COPY_HV_MMX
COPY_HV_MMX
COPY_HV_MMX
COPY_HV_MMX
pop edi
pop esi
ret
;-----------------------------------------------------------------------------
;
; void interpolate8x8_avg2_mmx(uint8_t * const dst,
; const uint8_t * const src1,
; const uint8_t * const src2,
; const uint32_t stride,
; const uint32_t rounding,
; const uint32_t height);
;
;-----------------------------------------------------------------------------
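; C sketch of the intended result: `height` rows of eight pixels averaged
; from the two sources:
;
;   for (j = 0; j < height; j++, dst += stride, src1 += stride, src2 += stride)
;     for (i = 0; i < 8; i++)
;       dst[i] = (uint8_t)((src1[i] + src2[i] + 1 - rounding) >> 1);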
%macro AVG2_MMX_RND0 0
movq mm0, [eax] ; src1 -> mm0
movq mm1, [ebx] ; src2 -> mm1
movq mm4, [eax+edx]
movq mm5, [ebx+edx]
movq mm2, mm0 ; src1 -> mm2
movq mm3, mm1 ; src2 -> mm3
pand mm2, mm7 ; isolate the lsb
pand mm3, mm7 ; isolate the lsb
por mm2, mm3 ; ODD(src1) OR ODD(src2) -> mm2
movq mm3, mm4
movq mm6, mm5
pand mm3, mm7
pand mm6, mm7
por mm3, mm6
pand mm0, [mmx_mask]
pand mm1, [mmx_mask]
pand mm4, [mmx_mask]
pand mm5, [mmx_mask]
psrlq mm0, 1 ; src1 / 2
psrlq mm1, 1 ; src2 / 2
psrlq mm4, 1
psrlq mm5, 1
paddb mm0, mm1 ; src1/2 + src2/2 -> mm0
paddb mm0, mm2 ; correct rounding error
paddb mm4, mm5
paddb mm4, mm3
lea eax, [eax+2*edx]
lea ebx, [ebx+2*edx]
movq [ecx], mm0 ; (src1 + src2 + 1) / 2 -> dst
movq [ecx+edx], mm4
%endmacro
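; Plain MMX has no unsigned byte average, so AVG2_MMX_RND0 emulates one:
; each byte is masked to even, halved with a 64-bit shift, and the halves
; are summed; OR-ing the discarded lsbs back in rounds the result up,
; i.e. per byte: (a >> 1) + (b >> 1) + ((a | b) & 1) == (a + b + 1) >> 1.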
%macro AVG2_MMX_RND1 0
movq mm0, [eax] ; src1 -> mm0
movq mm1, [ebx] ; src2 -> mm1
movq mm4, [eax+edx]
movq mm5, [ebx+edx]
movq mm2, mm0 ; src1 -> mm2
movq mm3, mm1 ; src2 -> mm3
pand mm2, mm7 ; isolate the lsb
pand mm3, mm7 ; isolate the lsb
pand mm2, mm3 ; ODD(src1) AND ODD(src2) -> mm2
movq mm3, mm4
movq mm6, mm5
pand mm3, mm7
pand mm6, mm7
pand mm3, mm6
pand mm0, [mmx_mask]
pand mm1, [mmx_mask]
pand mm4, [mmx_mask]
pand mm5, [mmx_mask]
psrlq mm0, 1 ; src1 / 2
psrlq mm1, 1 ; src2 / 2
psrlq mm4, 1
psrlq mm5, 1
paddb mm0, mm1 ; src1/2 + src2/2 -> mm0
paddb mm0, mm2 ; correct rounding error
paddb mm4, mm5
paddb mm4, mm3
lea eax, [eax+2*edx]
lea ebx, [ebx+2*edx]
movq [ecx], mm0 ; (src1 + src2) / 2 -> dst
movq [ecx+edx], mm4
%endmacro
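; AVG2_MMX_RND1 is identical except the lsbs are AND-ed, which rounds the
; average down: per byte (a >> 1) + (b >> 1) + ((a & b) & 1) == (a + b) >> 1.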
ALIGN 16
interpolate8x8_avg2_mmx:
push ebx
mov eax, [esp + 4 + 20] ; rounding
test eax, eax
jnz near .rounding1
mov eax, [esp + 4 + 24] ; height -> eax
sub eax, 8
test eax, eax
mov ecx, [esp + 4 + 4] ; dst -> ecx
mov eax, [esp + 4 + 8] ; src1 -> eax
mov ebx, [esp + 4 + 12] ; src2 -> ebx
mov edx, [esp + 4 + 16] ; stride -> edx
movq mm7, [mmx_one]
jz near .start0
AVG2_MMX_RND0
lea ecx, [ecx+2*edx]
.start0
AVG2_MMX_RND0