⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mmxidct.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 4 页
字号:
		 psraw		(r5, 4		);/* r5 = NR5 */
		movq		(I(3), r3	);/* store NR3 at I3 */
		 psubsw		(r7, r0		);/* r7 = R7 = G. - C. */
		paddsw		(r7, Eight	);/* adjust R7 (and R0) for shift */
		 paddsw		(r0, r0 		);/* r0 = C. + C. */
		paddsw		(r0, r7		);/* r0 = R0 = G. + C. */
		 psraw		(r7, 4		);/* r7 = NR7 */
		movq		(J(6), r6	);/* store NR6 at J6 */
		 psraw		(r0, 4		);/* r0 = NR0 */
		movq		(J(5), r5	);/* store NR5 at J5 */

		movq		(J(7), r7	);/* store NR7 at J7 */

		movq		(I(0), r0	);/* store NR0 at I0 */

}
// end ColumnIDCT macro (38 + 19 = 57 cycles)
/* --------------------------------------------------------------- */

/* --------------------------------------------------------------- */
/* IDCT 10 */

/*
 * MMX_idct10
 *
 * Multiplies the quantized coefficients in `input` by the matching entries of
 * `qtbl` (pmullw), regroups the products into the sixteen 4-coefficient groups
 * R0..R15 that are written to `output`, and then transforms `output` in place
 * with one RowIDCT_10 + Transpose pass followed by two ColumnIDCT_10 passes.
 *
 * input  : quantized coefficients; read at byte offsets 0, 8, ..., 120
 * qtbl   : quantization table; read at the same byte offsets as input
 * output : destination, also used as the IDCT work buffer; written at byte
 *          offsets 0, 8, ..., 120
 *
 * The movq/pmullw/pand/por/pxor/psllq/psrlq wrappers, RowIDCT_10, Transpose,
 * ColumnIDCT_10, I10_x/J10_1/C10, idctconstants, MaskOffset and EightOffset
 * are all defined outside this block.
 *
 * NOTE(review): the name and the fact that the second RowIDCT/Transpose pass
 * is commented out suggest this variant is meant for blocks with at most 10
 * nonzero coefficients -- confirm against the callers.
 * NOTE(review): no EMMS/_mm_empty() is issued here; presumably the caller
 * clears the MMX state -- confirm.
 *
 * In the per-line comments below, each register is shown as four 16-bit lanes,
 * most significant lane first; "__" is a zero lane, "FF" an all-ones mask lane,
 * and a two-digit value "rc" names the coefficient at row r, column c.
 */
extern "C" void MMX_idct10 (	ogg_int16_t * input, ogg_int16_t * qtbl, ogg_int16_t * output)
{

/* M(I): address of the I-th 8-byte lane mask in the constants table (I = 0..3) */
#	define M(I)		(ecx + MaskOffset + I*8)

__m64 r0,r1,r2,r3,r4,r5,r6,r7;

	/* Pointer variables are named after the x86 registers of the original asm */
	unsigned char *	eax=(unsigned char*)input;// eax = quantized input
	 unsigned char *edx =(unsigned char*)output;// edx = destination (= idct buffer)
/*
	mov		ecx, [edx]		// (+1 at least) preload the cache before writing
	 mov	ebx, [edx+28]   // in case proc doesn't cache on writes
	mov		ecx, [edx+56]	// gets all the cache lines
	 mov	ebx, [edx+84]	// regardless of alignment (beyond 32-bit)
	mov		ecx, [edx+112]	// also avoids address contention stalls
	 mov	ebx, [edx+124]
*/
	unsigned char *ebx=(unsigned char*)qtbl;	// ebx = quantization table
	unsigned char *ecx=(unsigned char*)idctconstants; //// [0]//

	/* Dequantize (input * qtbl) and descramble into R0..R15.
	   r2 holds the low-lane mask M(0) until it is reused for coefficient 40. */
	movq	(r0, eax);
	 //
	pmullw	(r0, ebx);		// r0 = 03 02 01 00
	 //
	movq	(r1, eax+16);
	 //
	pmullw	(r1, ebx+16);	// r1 = 13 12 11 10
	 //
	movq	(r2, M(0));		// r2 = __ __ __ FF
	 movq	(r3, r0			);// r3 = 03 02 01 00
	movq	(r4, eax+8);
	 psrlq	(r0, 16			);// r0 = __ 03 02 01
	pmullw	(r4, ebx+8		);// r4 = 07 06 05 04
	 pand	(r3, r2			);// r3 = __ __ __ 00
	movq	(r5, r0			);// r5 = __ 03 02 01
	 movq	(r6, r1			);// r6 = 13 12 11 10
	pand	(r5, r2			);// r5 = __ __ __ 01
	 psllq	(r6, 32			);// r6 = 11 10 __ __
	movq	(r7, M(3)		);// r7 = FF __ __ __
	 pxor	(r0, r5			);// r0 = __ 03 02 __
	pand	(r7, r6			);// r7 = 11 __ __ __
	 por	(r0, r3			);// r0 = __ 03 02 00
	pxor	(r6, r7			);// r6 = __ 10 __ __
	 por	(r0, r7			);// r0 = 11 03 02 00 = R0
	movq	(r7, M(3)		);// r7 = FF __ __ __
	 movq	(r3, r4			);// r3 = 07 06 05 04
	movq	(edx, r0		);// write R0 = r0
	 pand	(r3, r2			);// r3 = __ __ __ 04
	movq	(r0, eax+32);
	 psllq	(r3, 16			);// r3 = __ __ 04 __
	pmullw	(r0, ebx+32	);// r0 = 23 22 21 20
	 pand	(r7, r1			);// r7 = 13 __ __ __
	por	(	r5, r3			);// r5 = __ __ 04 01
	 por	(r7, r6			);// r7 = 13 10 __ __
	movq	(r3, eax+24);
	 por	(r7, r5			);// r7 = 13 10 04 01 = R1
	pmullw	(r3, ebx+24	);// r3 = 17 16 15 14
	 psrlq	(r4, 16			);// r4 = __ 07 06 05
	movq	(edx+16, r7	);// write R1 = r7
	 movq	(r5, r4			);// r5 = __ 07 06 05
	movq	(r7, r0			);// r7 = 23 22 21 20
	 psrlq	(r4, 16			);// r4 = __ __ 07 06
	psrlq	(r7, 48			);// r7 = __ __ __ 23
	 movq	(r6, r2			);// r6 = __ __ __ FF
	pand	(r5, r2			);// r5 = __ __ __ 05
	 pand	(r6, r4			);// r6 = __ __ __ 06
	movq	(edx+80, r7	);// partial R9 = __ __ __ 23
	 pxor	(r4, r6			);// r4 = __ __ 07 __
	psrlq	(r1, 32			);// r1 = __ __ 13 12
	 por	(r4, r5			);// r4 = __ __ 07 05
	movq	(r7, M(3)		);// r7 = FF __ __ __
	 pand	(r1, r2			);// r1 = __ __ __ 12
	movq	(r5, eax+48);
	 psllq	(r0, 16			);// r0 = 22 21 20 __
	pmullw	(r5, ebx+48	);// r5 = 33 32 31 30
	 pand	(r7, r0			);// r7 = 22 __ __ __
	movq	(edx+64, r1	);// partial R8 = __ __ __ 12
	 por	(r7, r4			);// r7 = 22 __ 07 05
	movq	(r4, r3			);// r4 = 17 16 15 14
	 pand	(r3, r2			);// r3 = __ __ __ 14
	movq	(r1, M(2)		);// r1 = __ FF __ __
	 psllq	(r3, 32			);// r3 = __ 14 __ __
	por	(	r7, r3			);// r7 = 22 14 07 05 = R2
	 movq	(r3, r5			);// r3 = 33 32 31 30
	psllq	(r3, 48			);// r3 = 30 __ __ __
	 pand	(r1, r0			);// r1 = __ 21 __ __
	movq	(edx+32, r7	);// write R2 = r7
	 por	(r6, r3			);// r6 = 30 __ __ 06
	movq	(r7, M(1)		);// r7 = __ __ FF __
	 por	(r6, r1			);// r6 = 30 21 __ 06
	movq	(r1, eax+56);
	 pand	(r7, r4			);// r7 = __ __ 15 __
	pmullw	(r1, ebx+56	);// r1 = 37 36 35 34
	 por	(r7, r6			);// r7 = 30 21 15 06 = R3
	pand	(r0, M(1)		);// r0 = __ __ 20 __
	 psrlq	(r4, 32			);// r4 = __ __ 17 16
	movq	(edx+48, r7	);// write R3 = r7
	 movq	(r6, r4			);// r6 = __ __ 17 16
	movq	(r7, M(3)		);// r7 = FF __ __ __
	 pand	(r4, r2			);// r4 = __ __ __ 16
	movq	(r3, M(1)		);// r3 = __ __ FF __
	 pand	(r7, r1			);// r7 = 37 __ __ __
	pand	(r3, r5			);// r3 = __ __ 31 __
	 por	(r0, r4			);// r0 = __ __ 20 16
	psllq	(r3, 16			);// r3 = __ 31 __ __
	 por	(r7, r0			);// r7 = 37 __ 20 16
	movq	(r4, M(2)		);// r4 = __ FF __ __
	 por	(r7, r3			);// r7 = 37 31 20 16 = R4
	movq	(r0, eax+80);
	 movq	(r3, r4			);// r3 = __ FF __ __ (copy of M(2); shifted down to lane 1 later)
	pmullw	(r0, ebx+80	);// r0 = 53 52 51 50
	 pand	(r4, r5			);// r4 = __ 32 __ __
	movq	(edx+8, r7		);// write R4 = r7
	 por	(r6, r4			);// r6 = __ 32 17 16
	movq	(r4, r3			);// r4 = __ FF __ __
	 psrlq	(r6, 16			);// r6 = __ __ 32 17
	movq	(r7, r0			);// r7 = 53 52 51 50
	 pand	(r4, r1			);// r4 = __ 36 __ __
	psllq	(r7, 48			);// r7 = 50 __ __ __
	 por	(r6, r4			);// r6 = __ 36 32 17
	movq	(r4, eax+88);
	 por	(r7, r6			);// r7 = 50 36 32 17 = R5
	pmullw	(r4, ebx+88	);// r4 = 57 56 55 54
	 psrlq	(r3, 16			);// r3 = __ __ FF __
	movq	(edx+24, r7	);// write R5 = r7
	 pand	(r3, r1			);// r3 = __ __ 35 __
	psrlq	(r5, 48			);// r5 = __ __ __ 33
	 pand	(r1, r2			);// r1 = __ __ __ 34
	movq	(r6, eax+104);
	 por	(r5, r3			);// r5 = __ __ 35 33
	pmullw	(r6, ebx+104	);// r6 = 67 66 65 64
	 psrlq	(r0, 16			);// r0 = __ 53 52 51
	movq	(r7, r4			);// r7 = 57 56 55 54
	 movq	(r3, r2			);// r3 = __ __ __ FF
	psllq	(r7, 48			);// r7 = 54 __ __ __
	 pand	(r3, r0			);// r3 = __ __ __ 51
	pxor	(r0, r3			);// r0 = __ 53 52 __
	 psllq	(r3, 32			);// r3 = __ 51 __ __
	por	(	r7, r5			);// r7 = 54 __ 35 33
	 movq	(r5, r6			);// r5 = 67 66 65 64
	pand	(r6, M(1)		);// r6 = __ __ 65 __
	 por	(r7, r3			);// r7 = 54 51 35 33 = R6
	psllq	(r6, 32			);// r6 = 65 __ __ __
	 por	(r0, r1			);// r0 = __ 53 52 34
	movq	(edx+40, r7	);// write R6 = r7
	 por	(r0, r6			);// r0 = 65 53 52 34 = R7
	movq	(r7, eax+120);
	 movq	(r6, r5			);// r6 = 67 66 65 64
	pmullw	(r7, ebx+120	);// r7 = 77 76 75 74
	 psrlq	(r5, 32			);// r5 = __ __ 67 66
	pand	(r6, r2			);// r6 = __ __ __ 64
	 movq	(r1, r5			);// r1 = __ __ 67 66
	movq	(edx+56, r0	);// write R7 = r0
	 pand	(r1, r2			);// r1 = __ __ __ 66
	movq	(r0, eax+112);
	 movq	(r3, r7			);// r3 = 77 76 75 74
	pmullw	(r0, ebx+112	);// r0 = 73 72 71 70
	 psllq	(r3, 16			);// r3 = 76 75 74 __
	pand	(r7, M(3)		);// r7 = 77 __ __ __
	 pxor	(r5, r1			);// r5 = __ __ 67 __
	por	(	r6, r5			);// r6 = __ __ 67 64
	 movq	(r5, r3			);// r5 = 76 75 74 __
	pand	(r5, M(3)		);// r5 = 76 __ __ __
	 por	(r7, r1			);// r7 = 77 __ __ 66
	movq	(r1, eax+96);
	 pxor	(r3, r5			);// r3 = __ 75 74 __
	pmullw	(r1, ebx+96 	);// r1 = 63 62 61 60
	 por	(r7, r3			);// r7 = 77 75 74 66 = R15
	por	(	r6, r5			);// r6 = 76 __ 67 64
	 movq	(r5, r0			);// r5 = 73 72 71 70
	movq	(edx+120, r7	);// store R15 = r7
	 psrlq	(r5, 16			);// r5 = __ 73 72 71
	pand	(r5, M(2)		);// r5 = __ 73 __ __
	 movq	(r7, r0			);// r7 = 73 72 71 70
	por	(	r6, r5			);// r6 = 76 73 67 64 = R14
	 pand	(r0, r2			);// r0 = __ __ __ 70
	pxor	(r7, r0			);// r7 = 73 72 71 __
	 psllq	(r0, 32			);// r0 = __ 70 __ __
	movq	(edx+104, r6	);// write R14 = r6
	 psrlq	(r4, 16			);// r4 = __ 57 56 55
	movq	(r5, eax+72);
	 psllq	(r7, 16			);// r7 = 72 71 __ __
	pmullw	(r5, ebx+72	);// r5 = 47 46 45 44
	 movq	(r6, r7			);// r6 = 72 71 __ __
	movq	(r3, M(2)		);// r3 = __ FF __ __
	 psllq	(r6, 16			);// r6 = 71 __ __ __
	pand	(r7, M(3)		);// r7 = 72 __ __ __
	 pand	(r3, r1			);// r3 = __ 62 __ __
	por	(	r7, r0			);// r7 = 72 70 __ __
	 movq	(r0, r1			);// r0 = 63 62 61 60
	pand	(r1, M(3)		);// r1 = 63 __ __ __
	 por	(r6, r3			);// r6 = 71 62 __ __
	movq	(r3, r4			);// r3 = __ 57 56 55
	 psrlq	(r1, 32			);// r1 = __ __ 63 __
	pand	(r3, r2			);// r3 = __ __ __ 55
	 por	(r7, r1			);// r7 = 72 70 63 __
	por	(	r7, r3			);// r7 = 72 70 63 55 = R13
	 movq	(r3, r4			);// r3 = __ 57 56 55
	pand	(r3, M(1)		);// r3 = __ __ 56 __
	 movq	(r1, r5			);// r1 = 47 46 45 44
	movq	(edx+88, r7	);// write R13 = r7
	 psrlq	(r5, 48			);// r5 = __ __ __ 47
	movq	(r7, eax+64);
	 por	(r6, r3			);// r6 = 71 62 56 __
	pmullw	(r7, ebx+64	);// r7 = 43 42 41 40
	 por	(r6, r5			);// r6 = 71 62 56 47 = R12
	pand	(r4, M(2)		);// r4 = __ 57 __ __
	 psllq	(r0, 32			);// r0 = 61 60 __ __
	movq	(edx+72, r6	);// write R12 = r6
	 movq	(r6, r0			);// r6 = 61 60 __ __
	pand	(r0, M(3)		);// r0 = 61 __ __ __
	 psllq	(r6, 16			);// r6 = 60 __ __ __
	movq	(r5, eax+40);
	 movq	(r3, r1			);// r3 = 47 46 45 44
	pmullw	(r5, ebx+40	);// r5 = 27 26 25 24
	 psrlq	(r1, 16			);// r1 = __ 47 46 45
	pand	(r1, M(1)		);// r1 = __ __ 46 __
	 por	(r0, r4			);// r0 = 61 57 __ __
	pand	(r2, r7			);// r2 = __ __ __ 40
	 por	(r0, r1			);// r0 = 61 57 46 __
	por	(	r0, r2			);// r0 = 61 57 46 40 = R11
	 psllq	(r3, 16			);// r3 = 46 45 44 __
	movq	(r4, r3			);// r4 = 46 45 44 __
	 movq	(r2, r5			);// r2 = 27 26 25 24
	movq	(edx+112, r0	);// write R11 = r0
	 psrlq	(r2, 48			);// r2 = __ __ __ 27
	pand	(r4, M(2)		);// r4 = __ 45 __ __
	 por	(r6, r2			);// r6 = 60 __ __ 27
	movq	(r2, M(1)		);// r2 = __ __ FF __
	 por	(r6, r4			);// r6 = 60 45 __ 27
	pand	(r2, r7			);// r2 = __ __ 41 __
	 psllq	(r3, 32			);// r3 = 44 __ __ __
	por	(	r3, edx+80	);// r3 = 44 __ __ 23 (merges the partial R9 stored earlier)
	 por	(r6, r2			);// r6 = 60 45 41 27 = R10
	movq	(r2, M(3)		);// r2 = FF __ __ __
	 psllq	(r5, 16			);// r5 = 26 25 24 __
	movq	(edx+96, r6	);// store R10 = r6
	 pand	(r2, r5			);// r2 = 26 __ __ __
	movq	(r6, M(2)		);// r6 = __ FF __ __
	 pxor	(r5, r2			);// r5 = __ 25 24 __
	pand	(r6, r7			);// r6 = __ 42 __ __
	 psrlq	(r2, 32			);// r2 = __ __ 26 __
	pand	(r7, M(3)		);// r7 = 43 __ __ __
	 por	(r3, r2			);// r3 = 44 __ 26 23
	por	(	r7, edx+64	);// r7 = 43 __ __ 12 (merges the partial R8 stored earlier)
	 por	(r6, r3			);// r6 = 44 42 26 23 = R9
	por	(	r7, r5			);// r7 = 43 25 24 12 = R8

	movq	(edx+80, r6	);// store R9 = r6

	movq	(edx+64, r7	);// store R8 = r7
	 //
	// 123c  ( / 64 coeffs  < 2c / coeff)

#	undef M

// Done w/dequant + descramble + partial transpose; now do the idct itself.

//#	define I( K)	[edx + (  K      * 16)]
//#	define J( K)	[edx + ( (K - 4) * 16) + 8]

	RowIDCT_10(r0,r1,r2,r3,r4,r5,r6,r7,I10_1(edx),C10(ecx));		// 33 c
	Transpose(r0,r1,r2,r3,r4,r5,r6,r7,I10_1(edx),J10_1(edx));		// 19 c

//#	define I( K)	[edx + (  K      * 16) + 64]
//#	define J( K)	[edx + ( (K - 4) * 16) + 72]

// Second row pass is intentionally left disabled in this variant:
//	RowIDCT			// 46 c
//	Transpose		// 19 c

//#	define I( K)	[edx + (K * 16)]
//#	define J( K)	I( K)

	ColumnIDCT_10(r0,r1,r2,r3,r4,r5,r6,r7,I10_2(edx),I10_2(edx),C10(ecx),ecx + EightOffset);		// 44 c

//#	define I( K)	[edx + (K * 16) + 8]
//#	define J( K)	I( K)

	ColumnIDCT_10(r0,r1,r2,r3,r4,r5,r6,r7,I10_3(edx),I10_3(edx),C10(ecx),ecx + EightOffset);		// 44 c
}

/**************************************************************************************
 *
 *		Routine:		MMX_idct1
 *
 *		Description:	Perform IDCT on an 8x8 block with at most 1 nonzero coefficient
 *
 *		Input:			Pointers to the input coefficients, the quantization table and the output buffer
 *
 *		Output:			None
 *
 *		Return:			None
 *
 *		Special Note:	None
 *
 *		Error:			None
 *
 ***************************************************************************************
 */

/* --------------------------------------------------------------- */
/* IDCT 1 */
/*
 * MMX_idct1
 *
 * DC-only inverse transform: only input[0] and qtbl[0] are read, and the
 * 128-byte output block is filled with a single 16-bit value.
 *
 *   input  : coefficient block (only element 0 is used)
 *   qtbl   : quantization table (only element 0 is used)
 *   output : 128-byte destination block
 */
extern "C" void MMX_idct1 (ogg_int16_t * input, ogg_int16_t * qtbl, ogg_int16_t * output)
{
    /* Special case: a zero DC coefficient yields an all-zero block. */
    if (!input[0])
    {
        memset(output, 0, 128);
        return;
    }

    /* Dequantize the DC term in 32-bit arithmetic. */
    ogg_int32_t dc = (ogg_int32_t)input[0];
    dc *= qtbl[0];

    /* Add 15 before the shift by 5 -- necessary in order to match tim's. */
    dc = (dc + 15) >> 5;

    /* Keep the low 16 bits and mirror them into the upper half of the word,
       so that the 32-bit broadcast below yields four identical 16-bit lanes. */
    dc &= 0xffff;
    dc += dc << 16;

    const __m64 fill = _mm_set1_pi32(dc);
    __m64 *dst = (__m64 *)output;

    /* 16 stores of 8 bytes each cover the whole 128-byte block. */
    for (int k = 0; k < 16; ++k)
    {
        dst[k] = fill;
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -