/* File: recon_mmx.c */
/* (code-viewer residue: font size control) */
/********************************************************************
* *
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
* *
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2003 *
* by the Xiph.Org Foundation http://www.xiph.org/ *
* *
********************************************************************
function: MMX 8x8 block copy and reconstruction routines (inline assembly)
last mod: $Id: reconstruct.c,v 1.6 2003/12/03 08:59:41 arc Exp $
********************************************************************/
#include "codec_internal.h"
/* 0x80 replicated into each of the eight bytes of a quadword.
   recon_intra8x8__mmx XORs this mask into signed-saturated packed bytes,
   flipping the top bit of every byte (the same as adding 128 modulo 256),
   which maps the signed range -128..127 onto the unsigned range 0..255. */
static const unsigned __int64 V128 = 0x8080808080808080LL;
/* Copy an 8x8 block of bytes from src to dest.
 *
 *   src    - address of the first row to read
 *   dest   - address of the first row to write
 *   stride - byte distance between consecutive rows; the same value is
 *            applied to both src and dest
 *
 * Each row is moved as one 8-byte MMX load/store.  The block is handled as
 * two groups of four rows (offsets 0, stride, 2*stride and 3*stride from the
 * current base pointers).
 *
 * Uses eax, ebx, ecx, edi and mm0-mm3.  No emms is executed here.
 * NOTE(review): presumably the MMX state is cleared by the caller - confirm.
 */
static void copy8x8__mmx (unsigned char *src,
unsigned char *dest,
unsigned int stride)
{
//Is this even the fastest way to do this?
__asm {
align 16
/* eax = src, ebx = dest, ecx = stride */
mov eax, src
mov ebx, dest
mov ecx, stride
/* edi = 3 * stride, so rows 0..3 are reachable without moving the base */
lea edi, [ecx + ecx * 2]
/* load rows 0..3 */
movq mm0, [eax]
movq mm1, [eax + ecx]
movq mm2, [eax + ecx * 2]
movq mm3, [eax + edi]
/* advance the source base by four rows */
lea eax, [eax + ecx * 4]
/* store rows 0..3 */
movq [ebx], mm0
movq [ebx + ecx], mm1
movq [ebx + ecx * 2], mm2
movq [ebx + edi], mm3
/* advance the destination base by four rows */
lea ebx, [ebx + ecx * 4]
/* load rows 4..7 */
movq mm0, [eax]
movq mm1, [eax + ecx]
movq mm2, [eax + ecx * 2]
movq mm3, [eax + edi]
/* store rows 4..7 */
movq [ebx], mm0
movq [ebx + ecx], mm1
movq [ebx + ecx * 2], mm2
movq [ebx + edi], mm3
};
}
/* Write an 8x8 block of bytes built only from a block of 16-bit values.
 *
 *   ReconPtr  - address of the first output row (8 bytes written per row)
 *   ChangePtr - 128 bytes of input, consumed 16 bytes (eight signed 16-bit
 *               words) per row
 *   LineStep  - byte distance between consecutive output rows
 *
 * For every word the output byte is the word saturated to -128..127 and then
 * XORed with 0x80, i.e. clamp(value, -128, 127) + 128.
 *
 * The loop runs until the input pointer reaches ChangePtr + 128, which is
 * eight iterations of 16 bytes.
 *
 * Uses eax, ebx, ecx, edi, mm0 and mm2.  No emms is executed here.
 * NOTE(review): presumably the MMX state is cleared by the caller - confirm.
 */
static void recon_intra8x8__mmx (unsigned char *ReconPtr, ogg_int16_t *ChangePtr,
ogg_uint32_t LineStep)
{
__asm {
align 16
/* eax = output row, ebx = input row, ecx = output row pitch */
mov eax, ReconPtr
mov ebx, ChangePtr
mov ecx, LineStep
/* mm0 = 0x80 in every byte */
movq mm0, V128
/* edi = end of input (ChangePtr + 128 bytes) */
lea edi, [128 + ebx]
loop_start:
/* first four words of the row */
movq mm2, [ebx]
/* pack them with the next four words into eight signed-saturated bytes */
packsswb mm2, [8 + ebx]
/* ORing mm0 with itself leaves its value unchanged */
por mm0, mm0
/* flip the top bit of each byte: signed -128..127 becomes unsigned 0..255 */
pxor mm2, mm0
/* next row of input */
lea ebx, [16 + ebx]
/* carry is set while ebx is still below edi (unsigned compare) */
cmp ebx, edi
/* movq and lea do not modify the flags, so jc below still sees the cmp */
movq [eax], mm2
/* next row of output */
lea eax, [eax + ecx]
jc loop_start
};
}
/* Write an 8x8 block of bytes as reference bytes plus 16-bit values.
 *
 *   ReconPtr  - address of the first output row (8 bytes written per row)
 *   RefPtr    - address of the first reference row (8 bytes read per row)
 *   ChangePtr - 128 bytes of input, consumed 16 bytes (eight signed 16-bit
 *               words) per row
 *   LineStep  - byte distance between consecutive rows; applied to both
 *               ReconPtr and RefPtr
 *
 * Per element: the reference byte is zero-extended to a word, the matching
 * input word is added with signed saturation, and the sum is packed back to
 * a byte with unsigned saturation (clamped to 0..255).
 *
 * The loop runs until the input pointer reaches ChangePtr + 128, which is
 * eight iterations of 16 bytes.
 *
 * Uses eax, ebx, ecx, edx, edi, mm0 and mm2-mm5.  No emms is executed here.
 * NOTE(review): presumably the MMX state is cleared by the caller - confirm.
 */
static void recon_inter8x8__mmx (unsigned char *ReconPtr, unsigned char *RefPtr,
ogg_int16_t *ChangePtr, ogg_uint32_t LineStep)
{
__asm {
align 16
/* eax = output row, ebx = input row, ecx = row pitch, edx = reference row */
mov eax, ReconPtr
mov ebx, ChangePtr
mov ecx, LineStep
mov edx, RefPtr
/* mm0 = 0, used as the high byte when widening bytes to words */
pxor mm0, mm0
/* edi = end of input (ChangePtr + 128 bytes) */
lea edi, [128 + ebx]
loop_start:
/* eight reference bytes */
movq mm2, [edx]
/* first four input words */
movq mm4, [ebx]
/* keep a second copy of the reference bytes for the upper half */
movq mm3, mm2
/* last four input words */
movq mm5, [8 + ebx]
/* mm2 = lower four reference bytes as words */
punpcklbw mm2, mm0
/* add the first four input words with signed saturation */
paddsw mm2, mm4
/* mm3 = upper four reference bytes as words */
punpckhbw mm3, mm0
/* add the last four input words with signed saturation */
paddsw mm3, mm5
/* next reference row */
add edx, ecx
/* pack both halves to eight bytes with unsigned saturation */
packuswb mm2, mm3
/* next row of input */
lea ebx, [16 + ebx]
/* carry is set while ebx is still below edi (unsigned compare) */
cmp ebx, edi
/* movq and lea do not modify the flags, so jc below still sees the cmp */
movq [eax], mm2
/* next row of output */
lea eax, [eax + ecx]
jc loop_start
};
}
/* Write an 8x8 block of bytes as the average of two reference blocks plus
 * 16-bit values.
 *
 *   ReconPtr  - address of the first output row (8 bytes written per row)
 *   RefPtr1   - address of the first row of the first reference block
 *   RefPtr2   - address of the first row of the second reference block
 *   ChangePtr - 128 bytes of input, consumed 16 bytes (eight signed 16-bit
 *               words) per row
 *   LineStep  - byte distance between consecutive rows; applied to ReconPtr,
 *               RefPtr1 and RefPtr2
 *
 * Per element: both reference bytes are zero-extended to words, summed and
 * shifted right by one (a truncating average, no rounding term is added),
 * the matching input word is added with a wrapping (non-saturating) add, and
 * the result is packed back to a byte with unsigned saturation.
 *
 * All four general registers eax..edx hold pointers here, so LineStep is
 * read from its memory operand on each pointer advance.
 *
 * The loop runs until the input pointer reaches ChangePtr + 128, which is
 * eight iterations of 16 bytes.
 *
 * Uses eax, ebx, ecx, edx, edi, mm0 and mm2-mm7.  No emms is executed here.
 * NOTE(review): presumably the MMX state is cleared by the caller - confirm.
 */
static void recon_inter8x8_half__mmx (unsigned char *ReconPtr, unsigned char *RefPtr1,
unsigned char *RefPtr2, ogg_int16_t *ChangePtr,
ogg_uint32_t LineStep)
{
__asm {
align 16
/* eax = output row, ebx = input row, ecx/edx = the two reference rows */
mov eax, ReconPtr
mov ebx, ChangePtr
mov ecx, RefPtr1
mov edx, RefPtr2
/* mm0 = 0, used as the high byte when widening bytes to words */
pxor mm0, mm0
/* edi = end of input (ChangePtr + 128 bytes) */
lea edi, [128 + ebx]
loop_start:
/* eight bytes from each reference row */
movq mm2, [ecx]
movq mm4, [edx]
/* mm2/mm3 = lower/upper four bytes of reference 1 as words */
movq mm3, mm2
punpcklbw mm2, mm0
movq mm5, mm4
/* first four input words */
movq mm6, [ebx]
punpckhbw mm3, mm0
/* last four input words */
movq mm7, [8 + ebx]
/* mm4/mm5 = lower/upper four bytes of reference 2 as words */
punpcklbw mm4, mm0
punpckhbw mm5, mm0
/* ref1 + ref2 */
paddw mm2, mm4
paddw mm3, mm5
/* (ref1 + ref2) >> 1 */
psrlw mm2, 1
psrlw mm3, 1
/* add the input words (wrapping add) */
paddw mm2, mm6
paddw mm3, mm7
/* next row of input */
lea ebx, [16 + ebx]
/* pack both halves to eight bytes with unsigned saturation */
packuswb mm2, mm3
/* next row of each reference block */
add ecx, LineStep
add edx, LineStep
movq [eax], mm2
/* next row of output */
add eax, LineStep
/* carry is set while ebx is still below edi (unsigned compare) */
cmp ebx, edi
jc loop_start
};
}
/* Point the block copy and reconstruction entries of a DSP function table
 * at the MMX implementations defined in this file.
 *
 *   funcs - table to update; its copy8x8, recon_intra8x8, recon_inter8x8
 *           and recon_inter8x8_half members are overwritten
 */
void dsp_mmx_recon_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmx recon functions.\n");

  /* reconstruction routines */
  funcs->recon_inter8x8_half = recon_inter8x8_half__mmx;
  funcs->recon_inter8x8      = recon_inter8x8__mmx;
  funcs->recon_intra8x8      = recon_intra8x8__mmx;

  /* plain block copy */
  funcs->copy8x8             = copy8x8__mmx;
}
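/*
 * Code-viewer residue (keyboard shortcut help), not part of the source:
 *   Copy code            Ctrl + C
 *   Search code          Ctrl + F
 *   Full-screen mode     F11
 *   Toggle theme         Ctrl + Shift + D
 *   Show shortcuts       ?
 *   Increase font size   Ctrl + =
 *   Decrease font size   Ctrl + -
 */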