// fastidctmmx32.c
// this function is written by 鲍金龙, 2000/10
// baojinlong@sohu.com
// fast idct mmx32 version
//!!!!!!!!!!!!!!!!!!!! Disclaimer of 鲍金龙, 2000/11/22
/*
This is a straightforward translation of the Chen-Wang algorithm C code,
using MMX and extended MMX instructions.
One small trick is the new ZIG_ZAG table: the matrix is transposed before
the IDCT stage, so only one transposition is needed.
Compared with MMX implementations of other IDCT algorithms, this function's
accuracy and speed are both acceptable.
*/
// *********** Note: this function uses the Chen-Wang algorithm; its original disclaimer follows.
/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
/*
* Disclaimer of Warranty
*
* These software programs are available to the user without any license fee or
* royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
* any and all warranties, whether express, implied, or statuary, including any
* implied warranties or merchantability or of fitness for a particular
* purpose. In no event shall the copyright-holder be liable for any
* incidental, punitive, or consequential damages of any kind whatsoever
* arising from the use of these programs.
*
* This disclaimer of warranty extends to the user of these programs and user's
* customers, employees, agents, transferees, successors, and assigns.
*
* The MPEG Software Simulation Group does not represent or warrant that the
* programs furnished hereunder are free of infringement of any third-party
* patents.
*
* Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
* are subject to royalty fees to patent holders. Many of these patents are
* general enough such that they are unavoidable regardless of implementation
* design.
*
*/
/**********************************************************/
/* inverse two dimensional DCT, Chen-Wang algorithm */
/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984) */
/* 32-bit integer arithmetic (8 bit coefficients) */
/* 11 mults, 29 adds per DCT */
/* sE, 18.8.91 */
/**********************************************************/
/* coefficients extended to 12 bit for IEEE1180-1990 */
/* compliance sE, 2.1.94 */
/**********************************************************/
// Constants used by FastIDCT2Xmmx32.
//
// Cosine weights: Wn = 2048*sqrt(2)*cos(n*pi/16).
#define W1 2841 /* n = 1 */
#define W2 2676 /* n = 2 */
#define W3 2408 /* n = 3 */
#define W5 1609 /* n = 5 */
#define W6 1108 /* n = 6 */
#define W7 565  /* n = 7 */

// Four-word tables used as 64-bit memory operands of pmaddwd.
// Element [0] is the lowest-addressed word. Each table's name spells its
// words in the opposite order (element [3] first), and an 's' prefix marks
// the entry that is negated.
static const short W7_W1_sW1_W7[4] = {
    W7, -W1, W1, W7
};
static const short W3_W5_sW5_W3[4] = {
    W3, -W5, W5, W3
};
static const short sW2_W6_W6_W2[4] = {
    W2, W6, W6, -W2
};
static const short x2k_s2k_2k_2k[4] = {
    2048, 2048, -2048, 2048
};
static const short x256_s256_256_256[4] = {
    256, 256, -256, 256
};

// Pairs of 32-bit values. x128_128 is added with paddd just before a
// "psrad 8" in the row pass; x8192_8192 is not referenced in the part of
// the function visible here.
static const int x128_128[2]   = { 128, 128 };
static const int x8192_8192[2] = { 8192, 8192 };
// 64-bit constants, written out as full 16-digit values.
static const __int64 x12f0000 = 0xffffffffffff0000; // every bit set except the lowest 16-bit word
static const __int64 x120ffff = 0x000000000000ffff; // only the lowest 16-bit word set
static const __int64 x120s20  = 0x0000000000000020; // the value 0x20 (32)
// Scan tables used in place of the standard Zig-Zag / alternate scan tables.
//
// Each entry is the standard scan position with row and column swapped
// (p -> (p % 8) * 8 + p / 8), so coefficients are stored already transposed.
// Each 64-entry table must therefore be a permutation of 0..63.
//
// Fix: entry [1][56] was 42, which duplicated entry [1][40] and left 52
// missing; the transposed alternate-scan value at that index is 52.
static unsigned char rscan[2][64]=
{
{ /* Zig-Zag scan pattern (transposed) */
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
27, 34, 41, 48, 56, 49, 42, 35,
28, 21, 14, 7, 15, 22, 29, 36,
43, 50, 57, 58, 51, 44, 37, 30,
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63
},
{/* Alternate scan pattern (transposed) */
0, 1, 2, 3, 8, 9, 16, 17,
10, 11, 4, 5, 6, 7, 15, 14,
13, 12, 19, 18, 24, 25, 32, 33,
26, 27, 20, 21, 22, 23, 28, 29,
30, 31, 34, 35, 40, 41, 48, 49,
42, 43, 36, 37, 38, 39, 44, 45,
46, 47, 50, 51, 56, 57, 58, 59,
52, 53, 54, 55, 60, 61, 62, 63
}
};
// function body
static void FastIDCT2Xmmx32(short *blk)
{
// ***********note : This function need mmx extension instruction;
__asm
{
mov esi, blk
mov edx, 8
mov edi,esi
AnalyseRow: pxor mm7,mm7
movq mm2, [esi] // x3 x2 x1 x0
movq mm1, [esi+8] // x7 x6 x5 x4
movq mm0,mm2 // x3 x2 x1 x0
pand mm2,x12f0000 // x3 x2 x1 00
por mm2,mm1
pcmpeqw mm2,mm7
xor ebx,ebx //
pmovmskb ebx,mm2
cmp ebx,0xff
je SkipRow
movq mm2,mm1
punpcklwd mm1,mm0 // b1 b5 b0 b4
punpckhwd mm0,mm2 // b7 b3 b6 b2
movq mm4,mm1
punpckhwd mm1,mm0 // b7 b1 b3 b5 ---mm1
punpckldq mm0,mm0 // b6 b2 b6 b2 ---mm0 x2 x3 x2 x3
punpckldq mm4,mm4 // b0 b4 b0 b4 ---mm4 x0 x1 x0 x1
movq mm5,mm1
punpckhdq mm1,mm1 // b7 b1 b7 b1 ---mm1 x5 x4 x5 x4
punpckldq mm5,mm5 // b3 b5 b3 b5 ---mm5 x7 x6 x7 x6
pmaddwd mm1,W7_W1_sW1_W7 // 1x4 1x5 ---mm1
pmaddwd mm5,W3_W5_sW5_W3 // 1x6 1x7 ---mm5
// stage 2
pmaddwd mm4,x2k_s2k_2k_2k //
paddd mm4,x128_128 // 2x0 2x8 ---mm4
pmaddwd mm0,sW2_W6_W6_W2 // 2x2 2x3 ---mm0 2 3?
movq mm6,mm1 // 1x4 1x5
paddd mm1,mm5 // 2x1 2x6 ---mm1
psubd mm6,mm5 // 2x4 2x5 ---mm6
// stage 3
movq mm5,mm4 // 2x0 2x8
paddd mm4,mm0 // 3x3 3x7 ---mm4
psubd mm5,mm0 // 3x0 3x8 ---mm5 free mm0
movq mm0,mm6 // 2x4 2x5
movq mm7,mm6
psrlq mm0,32 // 00 2x4
psllq mm7,32 // 2x5 00
paddd mm6,mm0 // 2x4 2x4+2x5
psubd mm6,mm7 // 2x4-2x5 2x5+2x4 --mm6 free mm7,mm0
movq mm7,mm1 // x1 x6
movd eax,mm6
psrlq mm6,32
imul eax,181
movd ebx,mm6
imul ebx,181
movd mm6,eax
movd mm0,ebx
psllq mm6,32
por mm6,mm0 // (x4+x5)*181 (x4-x5)*181
paddd mm6,x128_128
psrad mm6,8 // 3x2 3x4 ---mm6
// stage 4
// 37--mm4 24--mm6 08--mm5 16--mm1
movq mm3,mm4 // x3 x7
punpckldq mm1,mm6 // x4 x6 ---mm1
punpckhdq mm7,mm6 // x2 x1 ---mm7
paddd mm4,mm7 // b1 b0 ---mm4
psubd mm3,mm7 // b6 b7 ---mm3
movq mm7,mm5 // x0 x8
paddd mm5,mm1 // b2 b3 ---mm5
psubd mm7,mm1 // b5 b4 ---mm7
psrad mm4,8
psrad mm5,8
psrad mm3,8
psrad mm7,8
//output
packssdw mm5,mm3 // b6 b7 b2 b3
packssdw mm4,mm7 // b5 b4 b1 b0
movq mm7,mm5 // b6 b7 b2 b3
psrld mm5,16 // 00 b6 00 b2
pslld mm7,16 // b7 00 b3 00
por mm7,mm5 // b7 b6 b3 b2
movq mm5,mm4
punpckldq mm4,mm7 // b3 b2 b1 b0
punpckhdq mm5,mm7 // b7 b6 b5 b4
movq [esi],mm4
movq [esi+8],mm5
dec edx
jz Transposition
add esi,16
jmp AnalyseRow
SkipRow: pshufw mm0,mm0,0
psllw mm0,3
dec edx
jz PrevTranse // avoid stallment
movq [esi],mm0
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -