⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fastidctmmx32.c

📁 idct mmx technology implement
💻 C
📖 第 1 页 / 共 2 页
字号:
//   this function is written by 鲍金龙, 2000/10
//   baojinlong@sohu.com


// fast idct mmx32 version

//!!!!!!!!!!!!!!!!!!!! Disclaimer of 鲍金龙, 2000/11/22
/*
       This is a direct translation of the Chen-Wang algorithm C code, using
	   MMX and extended-MMX instructions.

	   One small trick is the new ZIG_ZAG table: the matrix is already
	   transposed before the IDCT stage, so only one transposition is needed.

	   Compared with MMX implementations of other IDCT algorithms, this
	   function's accuracy and speed are both acceptable.
*/


// *********** Note: this function uses the Chen-Wang algorithm; the applicable disclaimer follows.

/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

/**********************************************************/
/* inverse two dimensional DCT, Chen-Wang algorithm       */
/* (cf. IEEE ASSP-32, pp. 803-816, Aug. 1984)             */
/* 32-bit integer arithmetic (8 bit coefficients)         */
/* 11 mults, 29 adds per DCT                              */
/*                                      sE, 18.8.91       */
/**********************************************************/
/* coefficients extended to 12 bit for IEEE1180-1990      */
/* compliance                           sE,  2.1.94       */
/**********************************************************/




// Fixed-point multipliers of the Chen-Wang IDCT.
// Each one is 2048*sqrt(2)*cos(k*pi/16) rounded to an integer.
#define W1 2841 /* k = 1 */
#define W2 2676 /* k = 2 */
#define W3 2408 /* k = 3 */
#define W5 1609 /* k = 5 */
#define W6 1108 /* k = 6 */
#define W7 565  /* k = 7 */

// Four-word tables; FastIDCT2Xmmx32 passes the first four to pmaddwd as
// memory operands.  The element order must stay exactly as written.
static const short W7_W1_sW1_W7[4]      = {   W7,  -W1,    W1,   W7 };
static const short W3_W5_sW5_W3[4]      = {   W3,  -W5,    W5,   W3 };
static const short sW2_W6_W6_W2[4]      = {   W2,   W6,    W6,  -W2 };
static const short x2k_s2k_2k_2k[4]     = { 2048, 2048, -2048, 2048 };
// Not referenced in the visible part of the file.
static const short x256_s256_256_256[4] = {  256,  256,  -256,  256 };

// Two-dword constants.  x128_128 is added with paddd ahead of the
// "psrad ..., 8" shifts in the row pass (presumably the rounding bias).
static const int   x128_128[2]          = {  128,  128 };
// Not referenced in the visible part of the file.
static const int   x8192_8192[2]        = { 8192, 8192 };

// 64-bit constants (MSVC __int64).
// x12f0000: every bit set except the lowest 16-bit word; FastIDCT2Xmmx32
//           uses it with pand to drop word 0 of a row.
static const __int64 x12f0000 = 0xffffffffffff0000;
// x120ffff: only the lowest 16-bit word set.
static const __int64 x120ffff = 0xffff;
// x120s20: the value 32 (0x20); no use is visible in this part of the file.
static const __int64 x120s20  = 0x20;


// Scan-order tables used in place of the standard zig-zag / alternate scan
// tables.  Every entry is the standard scan position with row and column
// swapped (index r*8+c becomes c*8+r), so coefficients land in the block
// already transposed and the IDCT needs only one transposition.
//   rscan[0] : transposed zig-zag scan
//   rscan[1] : transposed alternate scan
// Each sub-table must be a permutation of 0..63.
static unsigned char rscan[2][64]=
{
	{ /* Zig-Zag scan pattern (transposed) */
	0,    8,    1,    2,    9,    16,    24,    17,
	10,   3,    4,    11,   18,   25,    32,    40,
	33,   26,   19,   12,   5,    6,     13,    20,
	27,   34,   41,   48,   56,   49,    42,    35,
	28,   21,   14,   7,    15,   22,    29,    36,
	43,   50,   57,   58,   51,   44,    37,    30,
	23,   31,   38,   45,   52,   59,    60,    53,
	46,   39,   47,   54,   61,   62,    55,    63
	},
	{ /* Alternate scan pattern (transposed) */
	0,    1,    2,    3,    8,    9,     16,    17,
	10,   11,   4,    5,    6,    7,     15,    14,
	13,   12,   19,   18,   24,   25,    32,    33,
	26,   27,   20,   21,   22,   23,    28,    29,
	30,   31,   34,   35,   40,   41,    48,    49,
	42,   43,   36,   37,   38,   39,    44,    45,
	46,   47,   50,   51,   56,   57,    58,    59,
	/* first entry of this row was 42, which duplicated entry 40 and left
	   position 52 unmapped; the transposed alternate scan gives 52 */
	52,   53,   54,   55,   60,   61,    62,    63
	}
};


// function body
static void FastIDCT2Xmmx32(short *blk)
{
// ***********note : This function need mmx extension instruction; 

	__asm
	{
		            mov        esi, blk
					mov        edx, 8
                    mov        edi,esi

AnalyseRow:	    	pxor       mm7,mm7
					movq       mm2, [esi]      // x3 x2 x1 x0
					movq       mm1, [esi+8]    // x7 x6 x5 x4
					movq       mm0,mm2         // x3 x2 x1 x0					
					pand       mm2,x12f0000    // x3 x2 x1 00
					por        mm2,mm1
					pcmpeqw    mm2,mm7
					xor        ebx,ebx //
					pmovmskb   ebx,mm2   
					cmp        ebx,0xff
					je         SkipRow


		  movq        mm2,mm1
		  punpcklwd   mm1,mm0       // b1 b5 b0 b4
          punpckhwd   mm0,mm2       // b7 b3 b6 b2
		  movq        mm4,mm1    
		  punpckhwd   mm1,mm0       // b7 b1 b3 b5 ---mm1    
		  punpckldq   mm0,mm0       // b6 b2 b6 b2 ---mm0   x2 x3 x2 x3
		  punpckldq   mm4,mm4       // b0 b4 b0 b4 ---mm4   x0 x1 x0 x1
          movq        mm5,mm1
		  punpckhdq   mm1,mm1       // b7 b1 b7 b1 ---mm1   x5 x4 x5 x4
		  punpckldq   mm5,mm5       // b3 b5 b3 b5 ---mm5   x7 x6 x7 x6 
		  pmaddwd     mm1,W7_W1_sW1_W7   //  1x4 1x5 ---mm1
		  pmaddwd     mm5,W3_W5_sW5_W3   //  1x6 1x7 ---mm5
//  stage 2
		  pmaddwd     mm4,x2k_s2k_2k_2k   // 
		  paddd       mm4,x128_128         //  2x0 2x8 ---mm4    
		  pmaddwd     mm0,sW2_W6_W6_W2    //  2x2 2x3 ---mm0     2 3?
		  movq        mm6,mm1             //  1x4 1x5
		  paddd       mm1,mm5             //  2x1 2x6 ---mm1
		  psubd       mm6,mm5             //  2x4 2x5 ---mm6
// stage 3
          movq        mm5,mm4   // 2x0 2x8
          paddd       mm4,mm0   // 3x3 3x7  ---mm4
		  psubd       mm5,mm0   // 3x0 3x8  ---mm5  free mm0

          movq        mm0,mm6   // 2x4      2x5
		  movq        mm7,mm6
		  psrlq       mm0,32    // 00       2x4
		  psllq       mm7,32    // 2x5      00
		  paddd       mm6,mm0   // 2x4      2x4+2x5 
		  psubd       mm6,mm7   // 2x4-2x5  2x5+2x4 --mm6 free mm7,mm0    
		  
          movq        mm7,mm1   //  x1 x6
 
          movd        eax,mm6
		  psrlq       mm6,32
		  imul        eax,181
		  movd        ebx,mm6
		  imul        ebx,181
		  movd        mm6,eax
		  movd        mm0,ebx
		  psllq       mm6,32
		  por         mm6,mm0   //  (x4+x5)*181  (x4-x5)*181
		  paddd       mm6,x128_128
		  psrad       mm6,8     //  3x2 3x4 ---mm6
// stage 4
// 37--mm4  24--mm6  08--mm5  16--mm1
		  movq        mm3,mm4   //  x3 x7
          punpckldq   mm1,mm6   //  x4 x6  ---mm1
		  punpckhdq   mm7,mm6   //  x2 x1  ---mm7

		  paddd       mm4,mm7   //  b1 b0  ---mm4
		  psubd       mm3,mm7   //  b6 b7  ---mm3
		  movq        mm7,mm5   //  x0 x8
		  paddd       mm5,mm1   //  b2 b3  ---mm5
		  psubd       mm7,mm1   //  b5 b4  ---mm7
		  psrad       mm4,8
		  psrad       mm5,8
		  psrad       mm3,8
		  psrad       mm7,8
    //output
		  packssdw    mm5,mm3   //  b6 b7 b2 b3 
	      packssdw    mm4,mm7   //  b5 b4 b1 b0 
		  movq        mm7,mm5   //  b6 b7 b2 b3
		  psrld       mm5,16    //  00 b6 00 b2
		  pslld       mm7,16    //  b7 00 b3 00
		  por         mm7,mm5   //  b7 b6 b3 b2
		  movq        mm5,mm4
          punpckldq   mm4,mm7   //  b3 b2 b1 b0
          punpckhdq   mm5,mm7   //  b7 b6 b5 b4
		  movq        [esi],mm4    
		  movq        [esi+8],mm5   


    dec  edx 
    jz   Transposition
	add  esi,16
	jmp  AnalyseRow


SkipRow:   			pshufw     mm0,mm0,0
					psllw      mm0,3
                    dec        edx
					jz         PrevTranse       // avoid stallment
					movq       [esi],mm0

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -