fdct_mmx.c
来自「mediastreamer2是开源的网络传输媒体流的库」· C语言 代码 · 共 335 行
C
335 行
;//==========================================================================
;//
;// THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
;// KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
;// IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
;// PURPOSE.
;//
;// Copyright (c) 1999 - 2001 On2 Technologies Inc. All Rights Reserved.
;//
;//--------------------------------------------------------------------------
#include "theora/theora.h"
#include "codec_internal.h"
#include "dsp.h"
static const ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
static const ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
static const ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
static const ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
static const ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
static const ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
static const ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;
static __inline void Transpose_mmx( ogg_int16_t *InputData1, ogg_int16_t *OutputData1,
ogg_int16_t *InputData2, ogg_int16_t *OutputData2)
{
__asm {
align 16
mov eax, InputData1
mov ebx, InputData2
mov ecx, OutputData1
mov edx, OutputData2
movq mm0, [eax] ; /* mm0 = a0 a1 a2 a3 */
movq mm4, [ebx] ; /* mm4 = e4 e5 e6 e7 */
movq mm1, [16 + eax] ; /* mm1 = b0 b1 b2 b3 */
movq mm5, [16 + ebx] ; /* mm5 = f4 f5 f6 f7 */
movq mm2, [32 + eax] ; /* mm2 = c0 c1 c2 c3 */
movq mm6, [32 + ebx] ; /* mm6 = g4 g5 g6 g7 */
movq mm3, [48 + eax] ; /* mm3 = d0 d1 d2 d3 */
movq [16 + ecx], mm1 ; /* save b0 b1 b2 b3 */
movq mm7, [48 + ebx] ; /* mm7 = h0 h1 h2 h3 */
; /* Transpose 2x8 block */
movq mm1, mm4 ; /* mm1 = e3 e2 e1 e0 */
punpcklwd mm4, mm5 ; /* mm4 = f1 e1 f0 e0 */
movq [ecx], mm0 ; /* save a3 a2 a1 a0 */
punpckhwd mm1, mm5 ; /* mm1 = f3 e3 f2 e2 */
movq mm0, mm6 ; /* mm0 = g3 g2 g1 g0 */
punpcklwd mm6, mm7 ; /* mm6 = h1 g1 h0 g0 */
movq mm5, mm4 ; /* mm5 = f1 e1 f0 e0 */
punpckldq mm4, mm6 ; /* mm4 = h0 g0 f0 e0 = MM4 */
punpckhdq mm5, mm6 ; /* mm5 = h1 g1 f1 e1 = MM5 */
movq mm6, mm1 ; /* mm6 = f3 e3 f2 e2 */
movq [edx], mm4 ;
punpckhwd mm0, mm7 ; /* mm0 = h3 g3 h2 g2 */
movq [16 + edx], mm5 ;
punpckhdq mm6, mm0 ; /* mm6 = h3 g3 f3 e3 = MM7 */
movq mm4, [ecx] ; /* mm4 = a3 a2 a1 a0 */
punpckldq mm1, mm0 ; /* mm1 = h2 g2 f2 e2 = MM6 */
movq mm5, [16 + ecx] ; /* mm5 = b3 b2 b1 b0 */
movq mm0, mm4 ; /* mm0 = a3 a2 a1 a0 */
movq [48 + edx], mm6 ;
punpcklwd mm0, mm5 ; /* mm0 = b1 a1 b0 a0 */
movq [32 + edx], mm1 ;
punpckhwd mm4, mm5 ; /* mm4 = b3 a3 b2 a2 */
movq mm5, mm2 ; /* mm5 = c3 c2 c1 c0 */
punpcklwd mm2, mm3 ; /* mm2 = d1 c1 d0 c0 */
movq mm1, mm0 ; /* mm1 = b1 a1 b0 a0 */
punpckldq mm0, mm2 ; /* mm0 = d0 c0 b0 a0 = MM0 */
punpckhdq mm1, mm2 ; /* mm1 = d1 c1 b1 a1 = MM1 */
movq mm2, mm4 ; /* mm2 = b3 a3 b2 a2 */
movq [ecx], mm0 ;
punpckhwd mm5, mm3 ; /* mm5 = d3 c3 d2 c2 */
movq [16 + ecx], mm1 ;
punpckhdq mm4, mm5 ; /* mm4 = d3 c3 b3 a3 = MM3 */
punpckldq mm2, mm5 ; /* mm2 = d2 c2 b2 a2 = MM2 */
movq [48 + ecx], mm4 ;
movq [32 + ecx], mm2 ;
};
}
static __inline void Fdct_mmx( ogg_int16_t *InputData1, ogg_int16_t *InputData2, ogg_int16_t *temp)
{
__asm {
align 16
mov eax, InputData1
mov ebx, InputData2
mov ecx, temp
movq mm0, [eax] ;
movq mm1, [16 + eax] ;
movq mm2, [48 + eax] ;
movq mm3, [16 + ebx] ;
movq mm4, mm0 ;
movq mm5, mm1 ;
movq mm6, mm2 ;
movq mm7, mm3 ;
;
paddsw mm0, [48 + ebx] ; /* mm0 = ip0 + ip7 = is07 */
paddsw mm1, [32 + eax] ; /* mm1 = ip1 + ip2 = is12 */
paddsw mm2, [ebx] ; /* mm2 = ip3 + ip4 = is34 */
paddsw mm3, [32 + ebx] ; /* mm3 = ip5 + ip6 = is56 */
psubsw mm4, [48 + ebx] ; /* mm4 = ip0 - ip7 = id07 */
psubsw mm5, [32 + eax] ; /* mm5 = ip1 - ip2 = id12 */
;
psubsw mm0, mm2 ; /* mm0 = is07 - is34 */
;
paddsw mm2, mm2 ;
;
psubsw mm6, [ebx] ; /* mm6 = ip3 - ip4 = id34 */
;
paddsw mm2, mm0 ; /* mm2 = is07 + is34 = is0734 */
psubsw mm1, mm3 ; /* mm1 = is12 - is56 */
movq [ecx], mm0 ; /* Save is07 - is34 to free mm0; */
paddsw mm3, mm3 ;
paddsw mm3, mm1 ; /* mm3 = is12 + 1s56 = is1256 */
;
psubsw mm7, [32 + ebx] ; /* mm7 = ip5 - ip6 = id56 */
; /* ------------------------------------------------------------------- */
psubsw mm5, mm7 ; /* mm5 = id12 - id56 */
paddsw mm7, mm7 ;
paddsw mm7, mm5 ; /* mm7 = id12 + id56 */
; /* ------------------------------------------------------------------- */
psubsw mm2, mm3 ; /* mm2 = is0734 - is1256 */
paddsw mm3, mm3 ;
;
movq mm0, mm2 ; /* make a copy */
paddsw mm3, mm2 ; /* mm3 = is0734 + is1256 */
;
pmulhw mm0, xC4S4 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
paddw mm0, mm2 ; /* mm0 = xC4S4 * ( is0734 - is1256 ) */
psrlw mm2, 15 ;
paddw mm0, mm2 ; /* Truncate mm0, now it is op[4] */
;
movq mm2, mm3 ;
movq [ebx], mm0 ; /* save ip4, now mm0,mm2 are free */
;
movq mm0, mm3 ;
pmulhw mm3, xC4S4 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
;
psrlw mm2, 15 ;
paddw mm3, mm0 ; /* mm3 = xC4S4 * ( is0734 +is1256 ) */
paddw mm3, mm2 ; /* Truncate mm3, now it is op[0] */
;
movq [eax], mm3 ;
; /* ------------------------------------------------------------------- */
movq mm3, [ecx] ; /* mm3 = irot_input_y */
pmulhw mm3, xC2S6 ; /* mm3 = xC2S6 * irot_input_y - irot_input_y */
;
movq mm2, [ecx] ;
movq mm0, mm2 ;
;
psrlw mm2, 15 ; /* mm3 = xC2S6 * irot_input_y */
paddw mm3, mm0 ;
;
paddw mm3, mm2 ; /* Truncated */
movq mm0, mm5 ;
;
movq mm2, mm5 ;
pmulhw mm0, xC6S2 ; /* mm0 = xC6S2 * irot_input_x */
;
psrlw mm2, 15 ;
paddw mm0, mm2 ; /* Truncated */
;
paddsw mm3, mm0 ; /* ip[2] */
movq [32 + eax], mm3 ; /* Save ip2 */
;
movq mm0, mm5 ;
movq mm2, mm5 ;
;
pmulhw mm5, xC2S6 ; /* mm5 = xC2S6 * irot_input_x - irot_input_x */
psrlw mm2, 15 ;
;
movq mm3, [ecx] ;
paddw mm5, mm0 ; /* mm5 = xC2S6 * irot_input_x */
;
paddw mm5, mm2 ; /* Truncated */
movq mm2, mm3 ;
;
pmulhw mm3, xC6S2 ; /* mm3 = xC6S2 * irot_input_y */
psrlw mm2, 15 ;
;
paddw mm3, mm2 ; /* Truncated */
psubsw mm3, mm5 ;
;
movq [32 + ebx], mm3 ;
; /* ------------------------------------------------------------------- */
movq mm0, xC4S4 ;
movq mm2, mm1 ;
movq mm3, mm1 ;
;
pmulhw mm1, mm0 ; /* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
psrlw mm2, 15 ;
;
paddw mm1, mm3 ; /* mm0 = xC4S4 * ( is12 - is56 ) */
paddw mm1, mm2 ; /* Truncate mm1, now it is icommon_product1 */
;
movq mm2, mm7 ;
movq mm3, mm7 ;
;
pmulhw mm7, mm0 ; /* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
psrlw mm2, 15 ;
;
paddw mm7, mm3 ; /* mm7 = xC4S4 * ( id12 + id56 ) */
paddw mm7, mm2 ; /* Truncate mm7, now it is icommon_product2 */
; /* ------------------------------------------------------------------- */
pxor mm0, mm0 ; /* Clear mm0 */
psubsw mm0, mm6 ; /* mm0 = - id34 */
;
psubsw mm0, mm7 ; /* mm0 = - ( id34 + idcommon_product2 ) */
paddsw mm6, mm6 ;
paddsw mm6, mm0 ; /* mm6 = id34 - icommon_product2 */
;
psubsw mm4, mm1 ; /* mm4 = id07 - icommon_product1 */
paddsw mm1, mm1 ;
paddsw mm1, mm4 ; /* mm1 = id07 + icommon_product1 */
; /* ------------------------------------------------------------------- */
movq mm7, xC1S7 ;
movq mm2, mm1 ;
;
movq mm3, mm1 ;
pmulhw mm1, mm7 ; /* mm1 = xC1S7 * irot_input_x - irot_input_x */
;
movq mm7, xC7S1 ;
psrlw mm2, 15 ;
;
paddw mm1, mm3 ; /* mm1 = xC1S7 * irot_input_x */
paddw mm1, mm2 ; /* Trucated */
;
pmulhw mm3, mm7 ; /* mm3 = xC7S1 * irot_input_x */
paddw mm3, mm2 ; /* Truncated */
;
movq mm5, mm0 ;
movq mm2, mm0 ;
;
movq mm7, xC1S7 ;
pmulhw mm0, mm7 ; /* mm0 = xC1S7 * irot_input_y - irot_input_y */
;
movq mm7, xC7S1 ;
psrlw mm2, 15 ;
;
paddw mm0, mm5 ; /* mm0 = xC1S7 * irot_input_y */
paddw mm0, mm2 ; /* Truncated */
;
pmulhw mm5, mm7 ; /* mm5 = xC7S1 * irot_input_y */
paddw mm5, mm2 ; /* Truncated */
;
psubsw mm1, mm5 ; /* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */
paddsw mm3, mm0 ; /* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */
;
movq [16 + eax], mm1 ;
movq [48 + ebx], mm3 ;
; /* ------------------------------------------------------------------- */
movq mm0, xC3S5 ;
movq mm1, xC5S3 ;
;
movq mm5, mm6 ;
movq mm7, mm6 ;
;
movq mm2, mm4 ;
movq mm3, mm4 ;
;
pmulhw mm4, mm0 ; /* mm4 = xC3S5 * irot_input_x - irot_input_x */
pmulhw mm6, mm1 ; /* mm6 = xC5S3 * irot_input_y - irot_input_y */
;
psrlw mm2, 15 ;
psrlw mm5, 15 ;
;
paddw mm4, mm3 ; /* mm4 = xC3S5 * irot_input_x */
paddw mm6, mm7 ; /* mm6 = xC5S3 * irot_input_y */
;
paddw mm4, mm2 ; /* Truncated */
paddw mm6, mm5 ; /* Truncated */
;
psubsw mm4, mm6 ; /* ip3 */
movq [48 + eax], mm4 ;
;
movq mm4, mm3 ;
movq mm6, mm7 ;
;
pmulhw mm3, mm1 ; /* mm3 = xC5S3 * irot_input_x - irot_input_x */
pmulhw mm7, mm0 ; /* mm7 = xC3S5 * irot_input_y - irot_input_y */
;
paddw mm4, mm2 ;
paddw mm6, mm5 ;
;
paddw mm3, mm4 ; /* mm3 = xC5S3 * irot_input_x */
paddw mm7, mm6 ; /* mm7 = xC3S5 * irot_input_y */
;
paddw mm3, mm7 ; /* ip5 */
movq [16 + ebx], mm3 ;
};
}
static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
{
static ogg_int16_t tmp[32];
ogg_int16_t* align_tmp = (ogg_int16_t*)((unsigned char*)tmp + (16 - ((int)tmp)&15));
Transpose_mmx(InputData, OutputData, InputData + 4, OutputData + 4);
Fdct_mmx(OutputData, OutputData + 4, align_tmp);
Transpose_mmx(InputData + 32, OutputData + 32, InputData + 36, OutputData + 36);
Fdct_mmx(OutputData+32, OutputData + 36, align_tmp);
Transpose_mmx(OutputData, OutputData, OutputData + 32, OutputData + 32);
Fdct_mmx(OutputData, OutputData + 32, align_tmp);
Transpose_mmx(OutputData + 4, OutputData + 4, OutputData + 36, OutputData + 36);
Fdct_mmx(OutputData + 4, OutputData + 36, align_tmp);
__asm emms
}
void dsp_mmx_fdct_init(DspFunctions *funcs)
{
TH_DEBUG("enabling accelerated x86_32 mmx fdct function.\n");
funcs->fdct_short = fdct_short__mmx;
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?