📄 mmxidct.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 4 页
字号:
 *
 *		Error:			None
 *
 ***************************************************************************************
 */

// RowIDCT gets ready to transpose.

template<class Ti,class Tj,class Tc> static __forceinline void RowIDCT(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J, Tc C)
{
	BeginIDCT(r0,r1,r2,r3,r4,r5,r6,r7,I,J,C);

		movq		(r3, I(2)	);/* r3 = D. */
		 psubsw		(r4, r7		);/* r4 = E. = E - G */
		paddsw		(r1, r1		);/* r1 = H. + H. */
		 paddsw		(r7, r7		);/* r7 = G + G */
		paddsw		(r1, r2		);/* r1 = R1 = A.. + H. */
		 paddsw		(r7, r4		);/* r7 = G. = E + G */
		psubsw		(r4, r3		);/* r4 = R4 = E. - D. */
		 paddsw		(r3, r3 );
		psubsw		(r6, r5		);/* r6 = R6 = F. - B.. */
		 paddsw		(r5, r5 );
		paddsw		(r3, r4		);/* r3 = R3 = E. + D. */
		 paddsw		(r5, r6		);/* r5 = R5 = F. + B.. */
		psubsw		(r7, r0		);/* r7 = R7 = G. - C. */
		 paddsw		(r0, r0 );
		movq		(I(1), r1	);/* save R1 */
		 paddsw		(r0, r7		);/* r0 = R0 = G. + C. */
}
// end RowIDCT macro (8 + 38 = 46 cycles)


/**************************************************************************************
 *
 *		Routine:		ColumnIDCT
 *
 *		Description:	The Macro does 1-D IDct on 4 columns
 *
 *		Input:			None
 *
 *		Output:			None
 *
 *		Return:			None
 *
 *		Special Note:	None
 *
 *		Error:			None
 *
 ***************************************************************************************
 */
// Column IDCT normalizes and stores final results.

template<class Ti,class Tj,class Tc> static __forceinline void ColumnIDCT(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J, Tc C,unsigned char *Eight)
{
	BeginIDCT (r0,r1,r2,r3,r4,r5,r6,r7,I,J,C);

		paddsw		(r2, Eight	);/* adjust R2 (and R1) for shift */
		 paddsw		(r1, r1		);/* r1 = H. + H. */
		paddsw		(r1, r2		);/* r1 = R1 = A.. + H. */
		 psraw		(r2, 4		);/* r2 = NR2 */
		psubsw		(r4, r7		);/* r4 = E. = E - G */
		 psraw		(r1, 4		);/* r1 = NR1 */
		movq		(r3, I(2)	);/* r3 = D. */
		 paddsw		(r7, r7		);/* r7 = G + G */
		movq		(I(2), r2	);/* store NR2 at I2 */
		 paddsw		(r7, r4		);/* r7 = G. = E + G */
		movq		(I(1), r1	);/* store NR1 at I1 */
		 psubsw		(r4, r3		);/* r4 = R4 = E. - D. */
		paddsw		(r4, Eight	);/* adjust R4 (and R3) for shift */
		 paddsw		(r3, r3		);/* r3 = D. + D. */
		paddsw		(r3, r4		);/* r3 = R3 = E. + D. */
		 psraw		(r4, 4		);/* r4 = NR4 */
		psubsw		(r6, r5		);/* r6 = R6 = F. - B.. */
		 psraw		(r3, 4		);/* r3 = NR3 */
		paddsw		(r6, Eight	);/* adjust R6 (and R5) for shift */
		 paddsw		(r5, r5		);/* r5 = B.. + B.. */
		paddsw		(r5, r6		);/* r5 = R5 = F. + B.. */
		 psraw		(r6, 4		);/* r6 = NR6 */
		movq		(J(4), r4	);/* store NR4 at J4 */
		 psraw		(r5, 4		);/* r5 = NR5 */
		movq		(I(3), r3	);/* store NR3 at I3 */
		 psubsw		(r7, r0		);/* r7 = R7 = G. - C. */
		paddsw		(r7, Eight	);/* adjust R7 (and R0) for shift */
		 paddsw		(r0, r0 		);/* r0 = C. + C. */
		paddsw		(r0, r7		);/* r0 = R0 = G. + C. */
		 psraw		(r7, 4		);/* r7 = NR7 */
		movq		(J(6), r6	);/* store NR6 at J6 */
		 psraw		(r0, 4		);/* r0 = NR0 */
		movq		(J(5), r5	);/* store NR5 at J5 */

		movq		(J(7), r7	);/* store NR7 at J7 */

		movq		(I(0), r0	);/* store NR0 at I0 */

}
// end ColumnIDCT macro (38 + 19 = 57 cycles)

/**************************************************************************************
 *
 *		Routine:		Transpose
 *
 *		Description:	The Macro does two 4x4 transposes in place.
 *
 *		Input:			None
 *
 *		Output:			None
 *
 *		Return:			None
 *
 *		Special Note:	None
 *
 *		Error:			None
 *
 ***************************************************************************************
 */

/* Following macro does two 4x4 transposes in place.

  At entry (we assume):

	r0 = a3 a2 a1 a0
	I(1) = b3 b2 b1 b0
	r2 = c3 c2 c1 c0
	r3 = d3 d2 d1 d0

	r4 = e3 e2 e1 e0
	r5 = f3 f2 f1 f0
	r6 = g3 g2 g1 g0
	r7 = h3 h2 h1 h0

   At exit, we have:

	I(0) = d0 c0 b0 a0
	I(1) = d1 c1 b1 a1
	I(2) = d2 c2 b2 a2
	I(3) = d3 c3 b3 a3

	J(4) = h0 g0 f0 e0
	J(5) = h1 g1 f1 e1
	J(6) = h2 g2 f2 e2
	J(7) = h3 g3 f3 e3

   I(0) I(1) I(2) I(3)  is the transpose of r0 I(1) r2 r3.
   J(4) J(5) J(6) J(7)  is the transpose of r4 r5 r6 r7.

   Since r1 is free at entry, we calculate the Js first. */


template<class Ti,class Tj> static __forceinline void Transpose(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J)
{
		movq		(r1, r4			);// r1 = e3 e2 e1 e0
		 punpcklwd	(r4, r5			);// r4 = f1 e1 f0 e0
		movq		(I(0), r0		);// save a3 a2 a1 a0
		 punpckhwd	(r1, r5			);// r1 = f3 e3 f2 e2
		movq		(r0, r6			);// r0 = g3 g2 g1 g0
		 punpcklwd	(r6, r7			);// r6 = h1 g1 h0 g0
		movq		(r5, r4			);// r5 = f1 e1 f0 e0
		 punpckldq	(r4, r6			);// r4 = h0 g0 f0 e0 = R4
		punpckhdq	(r5, r6			);// r5 = h1 g1 f1 e1 = R5
		 movq		(r6, r1			);// r6 = f3 e3 f2 e2
		movq		(J(4), r4 );
		 punpckhwd	(r0, r7			);// r0 = h3 g3 h2 g2
		movq		(J(5), r5 );
		 punpckhdq	(r6, r0			);// r6 = h3 g3 f3 e3 = R7
		movq		(r4, I(0)		);// r4 = a3 a2 a1 a0
		 punpckldq	(r1, r0			);// r1 = h2 g2 f2 e2 = R6
		movq		(r5, I(1)		);// r5 = b3 b2 b1 b0
		 movq		(r0, r4			);// r0 = a3 a2 a1 a0
		movq		(J(7), r6 );
		 punpcklwd	(r0, r5			);// r0 = b1 a1 b0 a0
		movq		(J(6), r1 );
		 punpckhwd	(r4, r5			);// r4 = b3 a3 b2 a2
		movq		(r5, r2			);// r5 = c3 c2 c1 c0
		 punpcklwd	(r2, r3			);// r2 = d1 c1 d0 c0
		movq		(r1, r0			);// r1 = b1 a1 b0 a0
		 punpckldq	(r0, r2			);// r0 = d0 c0 b0 a0 = R0
		punpckhdq	(r1, r2			);// r1 = d1 c1 b1 a1 = R1
		 movq		(r2, r4			);// r2 = b3 a3 b2 a2
		movq		(I(0), r0 );
		 punpckhwd	(r5, r3			);// r5 = d3 c3 d2 c2
		movq		(I(1), r1 );
		 punpckhdq	(r4, r5			);// r4 = d3 c3 b3 a3 = R3
		punpckldq	(r2, r5			);// r2 = d2 c2 b2 a2 = R2

		movq		(I(3), r4 );

		movq		(I(2), r2 );
}
// end Transpose macro (19 cycles).



/**************************************************************************************
 *
 *		Routine:		MMX_idct
 *
 *		Description:	Perform IDCT on a 8x8 block
 *
 *		Input:			Pointer to input and output buffer
 *
 *		Output:			None
 *
 *		Return:			None
 *
 *		Special Note:	The input coefficients are in ZigZag order
 *
 *		Error:			None
 *
 ***************************************************************************************
 */

struct I10_1
{
private:
 unsigned char *edx;
public:
 I10_1(unsigned char *Iedx):edx(Iedx) {}
 unsigned char* operator()(int K) const {return edx + (  K      * 16);}
};

struct I_2
{
private:
 unsigned char *edx;
public:
 I_2(unsigned char *Iedx):edx(Iedx) {}
 unsigned char* operator()(int K) const {return edx + (  K      * 16) + 64;}
};

struct I10_2
{
private:
 unsigned char *edx;
public:
 I10_2(unsigned char *Iedx):edx(Iedx) {}
 unsigned char* operator()(int K) const {return edx + (K * 16);}
};

struct I10_3
{
private:
 unsigned char *edx;
public:
 I10_3(unsigned char *Iedx):edx(Iedx) {}
 unsigned char* operator()(int K) const {return edx + (K * 16) + 8;}
};

struct J10_1
{
private:
 unsigned char *edx;
public:
 J10_1(unsigned char *Iedx):edx(Iedx) {}
 unsigned char* operator()(int K) const {return edx + ( (K - 4) * 16) + 8;}
};

struct J_2
{
private:
 unsigned char *edx;
public:
 J_2(unsigned char *Iedx):edx(Iedx) {}
 unsigned char* operator()(int K) const {return edx + ( (K - 4) * 16) + 72;}
};

struct C10
{
private:
 unsigned char *ecx;
public:
 C10(unsigned char *Iecx):ecx(Iecx) {}
 unsigned char* operator()(int I) const {return ecx + CosineOffset + (I-1)*8;}
};

extern "C" void MMX_idct (	ogg_int16_t * input, ogg_int16_t * qtbl, ogg_int16_t * output)
{

//	uint16 *constants = idctconstants;
#	define M(I)		(ecx + MaskOffset + I*8)
//#	define C(I)		[ecx + CosineOffset + (I-1)*8]
//#	define Eight	[ecx + EightOffset]

__m64 r0,r1,r2,r3,r4,r5,r6,r7;
	unsigned char *eax=(unsigned char*)input;	// eax = quantized input
	 unsigned char *edx=(unsigned char*)output;	// edx = destination (= idct buffer)
/*
	mov		ecx, [edx]		// (+1 at least) preload the cache before writing
	 mov	ebx, [edx+28]   // in case proc doesn't cache on writes
	mov		ecx, [edx+56]	// gets all the cache lines
	 mov	ebx, [edx+84]	// regardless of alignment (beyond 32-bit)
	mov		ecx, [edx+112]	// also avoids address contention stalls
	 mov	ebx, [edx+124]
*/
	unsigned char *ebx=(unsigned char*)qtbl;	// ebx = quantization table
	 unsigned char *ecx=(unsigned char*)idctconstants; ////[0]//

	movq	(r0, eax);
	 //
	pmullw	(r0, ebx		);// r0 = 03 02 01 00
	 //
	movq	(r1, eax+16);
	 //
	pmullw	(r1, ebx+16	);// r1 = 13 12 11 10
	 //
	movq	(r2, M(0)		);// r2 = __ __ __ FF
	 movq	(r3, r0			);// r3 = 03 02 01 00
	movq	(r4, eax+8);
	 psrlq	(r0, 16			);// r0 = __ 03 02 01
	pmullw	(r4, ebx+8		);// r4 = 07 06 05 04
	 pand	(r3, r2			);// r3 = __ __ __ 00
	movq	(r5, r0			);// r5 = __ 03 02 01
	 movq	(r6, r1			);// r6 = 13 12 11 10
	pand	(r5, r2			);// r5 = __ __ __ 01
	 psllq	(r6, 32			);// r6 = 11 10 __ __
	movq	(r7, M(3)		);// r7 = FF __ __ __
	 pxor	(r0, r5			);// r0 = __ 03 02 __
	pand	(r7, r6			);// r7 = 11 __ __ __
	 por	(r0, r3			);// r0 = __ 03 02 00
	pxor	(r6, r7			);// r6 = __ 10 __ __
	 por	(r0, r7			);// r0 = 11 03 02 00 = R0
	movq	(r7, M(3)		);// r7 = FF __ __ __
	 movq	(r3, r4			);// r3 = 07 06 05 04
	movq	(edx, r0		);// write R0 = r0
	 pand	(r3, r2			);// r3 = __ __ __ 04
	movq	(r0, eax+32);
	 psllq	(r3, 16			);// r3 = __ __ 04 __
	pmullw	(r0, ebx+32	);// r0 = 23 22 21 20
	 pand	(r7, r1			);// r7 = 13 __ __ __
	por	(	r5, r3			);// r5 = __ __ 04 01
	 por	(r7, r6			);// r7 = 13 10 __ __
	movq	(r3, eax+24);
	 por	(r7, r5			);// r7 = 13 10 04 01 = R1
	pmullw	(r3, ebx+24	);// r3 = 17 16 15 14
	 psrlq	(r4, 16			);// r4 = __ 07 06 05
	movq	(edx+16, r7	);// write R1 = r7
	 movq	(r5, r4			);// r5 = __ 07 06 05
	movq	(r7, r0			);// r7 = 23 22 21 20
	 psrlq	(r4, 16			);// r4 = __ __ 07 06
	psrlq	(r7, 48			);// r7 = __ __ __ 23
	 movq	(r6, r2			);// r6 = __ __ __ FF
	pand	(r5, r2			);// r5 = __ __ __ 05
	 pand	(r6, r4			);// r6 = __ __ __ 06
	movq	(edx+80, r7	);// partial R9 = __ __ __ 23
	 pxor	(r4, r6			);// r4 = __ __ 07 __
	psrlq	(r1, 32			);// r1 = __ __ 13 12
	 por	(r4, r5			);// r4 = __ __ 07 05
	movq	(r7, M(3)		);// r7 = FF __ __ __
	 pand	(r1, r2			);// r1 = __ __ __ 12
	movq	(r5, eax+48);
	 psllq	(r0, 16			);// r0 = 22 21 20 __
	pmullw	(r5, ebx+48	);// r5 = 33 32 31 30
	 pand	(r7, r0			);// r7 = 22 __ __ __
	movq	(edx+64, r1	);// partial R8 = __ __ __ 12
	 por	(r7, r4			);// r7 = 22 __ 07 05
	movq	(r4, r3			);// r4 = 17 16 15 14
	 pand	(r3, r2			);// r3 = __ __ __ 14
	movq	(r1, M(2)		);// r1 = __ FF __ __
	 psllq	(r3, 32			);// r3 = __ 14 __ __
	por	(	r7, r3			);// r7 = 22 14 07 05 = R2
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -