⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 mmxidct_asm.s

📁 This code is based on mpeg_play, available from: http://bmrc.berkeley.edu/frame/research/mpeg/
💻 S
📖 第 1 页 / 共 2 页
字号:
#if defined(i386) && defined(USE_MMX)

/*
 * the input data is transposed and each 16 bit element in the 8x8 matrix
 * is left aligned:
 * for example in 11...1110000 format
 * If the iDCT is of I macroblock then 0.5 needs to be added to the DC component
 * (element[0][0] of the matrix)
 */

/* extrn re_matrix */

.data
	/*
	 * preSC: 8 rows x 8 signed 16-bit words = 128 bytes (matches .size).
	 * IDCT_mmx loads its address into %ecx and, for each 8-byte group,
	 * multiplies it with pmulhw against the group at the same offset of
	 * the source matrix before the butterflies run.
	 */
	.align 16
	.type	 preSC,@object
preSC:  .short  16384,22725,21407,19266,16384,12873,8867,4520
        .short  22725,31521,29692,26722,22725,17855,12299,6270
        .short  21407,29692,27969,25172,21407,16819,11585,5906
        .short  19266,26722,25172,22654,19266,15137,10426,5315
        .short  16384,22725,21407,19266,16384,12873,8867,4520
        .short  12873,17855,16819,15137,25746,20228,13933,7103
        .short  17734,24598,23170,20853,17734,13933,9597,4892
        .short  18081,25080,23624,21261,18081,14206,9785,4988
	.size	 preSC,128

	/*
	 * 8-byte packed constants. Each label spells the 64-bit value in hex,
	 * most significant word first; the .long pairs store it little-endian
	 * (low dword first).
	 */

	/* words (low to high) 1, 1, 2, 5: added with paddw as the correction
	 * step of the V15 computation in IDCT_mmx */
	.align 8
	.type	x0005000200010001,@object
	.size	x0005000200010001,8
x0005000200010001:
	.long	0x00010001,0x00050002

	/* only the highest word is non-zero (0x0040); its use is not visible
	 * in this part of the listing -- TODO confirm against the rest of the
	 * file */
	.align 8
	.type	x0040000000000000,@object
	.size	x0040000000000000,8
x0040000000000000:
	.long	0, 0x00400000

	/* four copies of 0x5a82 (23170), pmulhw multiplier */
	.align 8
	.type	x5a825a825a825a82,@object
	.size	x5a825a825a825a82,8
x5a825a825a825a82:
	.long	0x5a825a82, 0x5a825a82

	/* four copies of 0x539f (21407), pmulhw multiplier */
	.align 8
	.type	x539f539f539f539f,@object
	.size	x539f539f539f539f,8
x539f539f539f539f:
	.long	0x539f539f,0x539f539f

	/* four copies of 0x4546 (17734), pmulhw multiplier */
	.align 8
	.type	x4546454645464546,@object
	.size	x4546454645464546,8
x4546454645464546:
	.long	0x45464546,0x45464546

	/* four copies of 0x61f8 (25080), pmulhw multiplier */
	.align 8
	.type	x61f861f861f861f8,@object
	.size	x61f861f861f861f8,8
x61f861f861f861f8:
	.long	0x61f861f8,0x61f861f8

	/*
	 * Zero-initialised 8-byte slots in .data. Their readers and writers
	 * are not visible in this part of the listing; presumably spill
	 * space for MMX registers -- TODO confirm against the rest of the
	 * file.
	 */
	.align 8
	.type	 scratch1,@object
	.size	 scratch1,8
scratch1:
	.long 0,0
	.align 8
	.type	 scratch3,@object
	.size	 scratch3,8
scratch3:
	.long 0,0
	.align 8
	.type	 scratch5,@object
	.size	 scratch5,8
scratch5:
	.long 0,0
	.align 8
	.type	 scratch7,@object
	.size	 scratch7,8
scratch7:
	.long 0,0
	/* x0 has no .align of its own; it directly follows the 8-byte,
	 * 8-aligned scratch7, so it also lands on an 8-byte boundary */
	.type	 x0,@object
	.size	 x0,8
x0:
	.long 0,0
	.align 8

.text
	.align 4
.globl IDCT_mmx
	.type	 IDCT_mmx,@function
/*
 * IDCT_mmx -- entry point; only the beginning of the routine is shown here.
 * Sets up an %ebp frame, so the first stack argument is at 8(%ebp), and
 * saves %ebx, %ecx, %edx, %esi and %edi on the stack. The matching
 * restores are expected at the end of the routine, which is outside this
 * part of the listing.
 */
IDCT_mmx:
	pushl %ebp
	movl %esp,%ebp
	pushl %ebx
	pushl %ecx
	pushl %edx
	pushl %esi
	pushl %edi
movl 8(%ebp),%esi		/* source matrix */	leal preSC, %ecx/* column 0: even part * use V4, V12, V0, V8 to produce V22..V25 */	movq 8*12(%ecx), %mm0	/* maybe the first mul can be done together */				/* with the dequantization in iHuff module */	pmulhw 8*12(%esi), %mm0		/* V12 */	movq 8*4(%ecx), %mm1	pmulhw 8*4(%esi), %mm1		/* V4 */	movq (%ecx), %mm3	psraw $1, %mm0			/* t64=t66 */	pmulhw (%esi), %mm3		/* V0 */	movq 8*8(%ecx), %mm5		/* duplicate V4 */	movq %mm1, %mm2			/* added 11/1/96 */	pmulhw 8*8(%esi),%mm5		/* V8 */	psubsw %mm0, %mm1		/* V16 */	pmulhw x5a825a825a825a82, %mm1	/* 23170 ->V18 */	paddsw %mm0, %mm2		/* V17 */	movq %mm2, %mm0			/* duplicate V17 */	psraw $1, %mm2			/* t75=t82 */	psraw $2, %mm0			/* t72 */	movq %mm3, %mm4			/* duplicate V0 */	paddsw %mm5, %mm3		/* V19 */	psubsw %mm5, %mm4		/* V20 ;mm5 free *//* moved from the block below */	movq 8*10(%ecx), %mm7	psraw $1, %mm3			/* t74=t81 */	movq %mm3, %mm6			/* duplicate t74=t81 */	psraw $2, %mm4			/* t77=t79 */	psubsw %mm0, %mm1		/* V21 ; mm0 free */	paddsw %mm2, %mm3		/* V22 */	movq %mm1, %mm5			/* duplicate V21 */	paddsw %mm4, %mm1		/* V23 */	movq %mm3, 8*4(%esi)		/* V22 */	psubsw %mm5, %mm4		/* V24; mm5 free */	movq %mm1, 8*12(%esi)		/* V23 */	psubsw %mm2, %mm6		/* V25; mm2 free */	movq %mm4, (%esi)		/* V24 *//* keep mm6 alive all along the next block */	/* movq %mm6, 8*8(%esi) 	V25 *//* column 0: odd part * use V2, V6, V10, V14 to produce V31, V39, V40, V41 *//* moved above: movq 8*10(%ecx), %mm7 */	pmulhw 8*10(%esi), %mm7		/* V10 */	movq 8*6(%ecx), %mm0	pmulhw 8*6(%esi), %mm0		/* V6 */	movq 8*2(%ecx), %mm5	movq %mm7, %mm3			/* duplicate V10 */	pmulhw 8*2(%esi), %mm5		/* V2 */	movq 8*14(%ecx), %mm4	psubsw %mm0, %mm7		/* V26 */	pmulhw 8*14(%esi), %mm4		/* V14 */	paddsw %mm0, %mm3		/* V29 ; free mm0 */	movq %mm7, %mm1			/* duplicate V26 */	psraw $1, %mm3			/* t91=t94 */	pmulhw x539f539f539f539f,%mm7	/* V33 */	psraw $1, %mm1			/* t96 */	movq %mm5, %mm0			/* duplicate V2 */	psraw $2, %mm4			/* t85=t87 */	
paddsw %mm4,%mm5		/* V27 */	psubsw %mm4, %mm0		/* V28 ; free mm4 */	movq %mm0, %mm2			/* duplicate V28 */	psraw $1, %mm5			/* t90=t93 */	pmulhw x4546454645464546,%mm0	/* V35 */	psraw $1, %mm2			/* t97 */	movq %mm5, %mm4			/* duplicate t90=t93 */	psubsw %mm2, %mm1		/* V32 ; free mm2 */	pmulhw x61f861f861f861f8,%mm1	/* V36 */	psllw $1, %mm7			/* t107 */	paddsw %mm3, %mm5		/* V31 */	psubsw %mm3, %mm4		/* V30 ; free mm3 */	pmulhw x5a825a825a825a82,%mm4	/* V34 */	nop	psubsw %mm1, %mm0		/* V38 */	psubsw %mm7, %mm1		/* V37 ; free mm7 */	psllw $1, %mm1			/* t114 *//* move from the next block */	movq %mm6, %mm3			/* duplicate V25 *//* move from the next block */	movq 8*4(%esi), %mm7		/* V22 */	psllw $1, %mm0			/* t110 */	psubsw %mm5, %mm0		/* V39 (mm5 needed for next block) */	psllw $2, %mm4			/* t112 *//* moved from the next block */	movq 8*12(%esi), %mm2		/* V23 */	psubsw %mm0, %mm4		/* V40 */	paddsw %mm4, %mm1		/* V41; free mm0 *//* moved from the next block */	psllw $1, %mm2			/* t117=t125 *//* column 0: output butterfly *//* moved above: * movq %mm6, %mm3			duplicate V25 * movq 8*4(%esi), %mm7			V22 * movq 8*12(%esi), %mm2		V23 * psllw $1, %mm2			t117=t125 */	psubsw %mm1, %mm6		/* tm6 */	paddsw %mm1, %mm3		/* tm8; free mm1 */	movq %mm7, %mm1			/* duplicate V22 */	paddsw %mm5, %mm7		/* tm0 */	movq %mm3, 8*8(%esi)		/* tm8; free mm3 */	psubsw %mm5, %mm1		/* tm14; free mm5 */	movq %mm6, 8*6(%esi)		/* tm6; free mm6 */	movq %mm2, %mm3			/* duplicate t117=t125 */	movq (%esi), %mm6		/* V24 */	paddsw %mm0, %mm2		/* tm2 */	movq %mm7, (%esi)		/* tm0; free mm7 */	psubsw %mm0, %mm3		/* tm12; free mm0 */	movq %mm1, 8*14(%esi)		/* tm14; free mm1 */	psllw $1, %mm6			/* t119=t123 */	movq %mm2, 8*2(%esi)		/* tm2; free mm2 */	movq %mm6, %mm0			/* duplicate t119=t123 */	movq %mm3, 8*12(%esi)		/* tm12; free mm3 */	paddsw %mm4, %mm6		/* tm4 *//* moved from next block */	movq 8*5(%ecx), %mm1	psubsw %mm4, %mm0		/* tm10; free mm4 *//* moved from next block */	pmulhw 8*5(%esi), %mm1		/* V5 */	
movq %mm6, 8*4(%esi)		/* tm4; free mm6 */	movq %mm0, 8*10(%esi)		/* tm10; free mm0 *//* column 1: even part * use V5, V13, V1, V9 to produce V56..V59 *//* moved to prev block: *	movq 8*5(%ecx), %mm1 *	pmulhw 8*5(%esi), %mm1		 V5 */	movq 8*13(%ecx), %mm7	psllw $1, %mm1			/* t128=t130 */	pmulhw 8*13(%esi), %mm7		/* V13 */	movq %mm1, %mm2			/* duplicate t128=t130 */	movq 8(%ecx), %mm3	pmulhw 8(%esi), %mm3		/* V1 */	movq 8*9(%ecx), %mm5	psubsw %mm7, %mm1		/* V50 */	pmulhw 8*9(%esi), %mm5		/* V9 */	paddsw %mm7, %mm2		/* V51 */	pmulhw x5a825a825a825a82, %mm1	/* 23170 ->V52 */	movq %mm2, %mm6			/* duplicate V51 */	psraw $1, %mm2			/* t138=t144 */	movq %mm3, %mm4			/* duplicate V1 */	psraw $2, %mm6			/* t136 */	paddsw %mm5, %mm3		/* V53 */	psubsw %mm5, %mm4		/* V54 ;mm5 free */	movq %mm3, %mm7			/* duplicate V53 *//* moved from next block */	movq 8*11(%ecx), %mm0	psraw $1, %mm4			/* t140=t142 */	psubsw %mm6, %mm1		/* V55 ; mm6 free */	paddsw %mm2, %mm3		/* V56 */	movq %mm4, %mm5			/* duplicate t140=t142 */	paddsw %mm1, %mm4		/* V57 */	movq %mm3, 8*5(%esi)		/* V56 */	psubsw %mm1, %mm5		/* V58; mm1 free */	movq %mm4, 8*13(%esi)		/* V57 */	psubsw %mm2, %mm7		/* V59; mm2 free */	movq %mm5, 8*9(%esi)		/* V58 *//* keep mm7 alive all along the next block *	movq %mm7, 8(%esi)		V59 * moved above *	movq 8*11(%ecx), %mm0 */	pmulhw 8*11(%esi), %mm0		/* V11 */	movq 8*7(%ecx), %mm6	pmulhw 8*7(%esi), %mm6		/* V7 */	movq 8*15(%ecx), %mm4	movq %mm0, %mm3			/* duplicate V11 */	pmulhw 8*15(%esi), %mm4		/* V15 */	movq 8*3(%ecx), %mm5	psllw $1, %mm6			/* t146=t152 */	pmulhw 8*3(%esi), %mm5		/* V3 */	paddsw %mm6, %mm0		/* V63 *//* note that V15 computation has a correction step:  * this is a 'magic' constant that rebiases the results to be closer to the * expected result.  
this magic constant can be refined to reduce the error * even more by doing the correction step in a later stage when the number * is actually multiplied by 16 */	paddw x0005000200010001, %mm4	psubsw %mm6, %mm3		/* V60 ; free mm6 */	psraw $1, %mm0			/* t154=t156 */	movq %mm3, %mm1			/* duplicate V60 */	pmulhw x539f539f539f539f, %mm1	/* V67 */	movq %mm5, %mm6			/* duplicate V3 */	psraw $2, %mm4			/* t148=t150 */	paddsw %mm4, %mm5		/* V61 */	psubsw %mm4, %mm6		/* V62 ; free mm4 */	movq %mm5, %mm4			/* duplicate V61 */	psllw $1, %mm1			/* t169 */	paddsw %mm0, %mm5		/* V65 -> result */	psubsw %mm0, %mm4		/* V64 ; free mm0 */	pmulhw x5a825a825a825a82, %mm4	/* V68 */	psraw $1, %mm3			/* t158 */	psubsw %mm6, %mm3		/* V66 */	movq %mm5, %mm2			/* duplicate V65 */	pmulhw x61f861f861f861f8, %mm3	/* V70 */	psllw $1, %mm6			/* t165 */	pmulhw x4546454645464546, %mm6	/* V69 */	psraw $1, %mm2			/* t172 *//* moved from next block */	movq 8*5(%esi), %mm0		/* V56 */	psllw $1, %mm4			/* t174 *//* moved from next block */	psraw $1, %mm0			/* t177=t188 */	nop	psubsw %mm3, %mm6		/* V72 */	psubsw %mm1, %mm3		/* V71 ; free mm1 */	psubsw %mm2, %mm6		/* V73 ; free mm2 *//* moved from next block */	psraw $1, %mm5			/* t178=t189 */	psubsw %mm6, %mm4		/* V74 *//* moved from next block */	movq %mm0, %mm1			/* duplicate t177=t188 */	paddsw %mm4, %mm3		/* V75 *//* moved from next block */	paddsw %mm5, %mm0		/* tm1 *//* location *  5 - V56 * 13 - V57 *  9 - V58 *  X - V59, mm7 *  X - V65, mm5 *  X - V73, mm6 *  X - V74, mm4 *  X - V75, mm3 * free mm0, mm1 & mm2 * moved above *	movq 8*5(%esi), %mm0		V56 *	psllw $1, %mm0			t177=t188 ! new !! *	psllw $1, %mm5			t178=t189 ! new !! *	movq %mm0, %mm1			duplicate t177=t188 *	paddsw %mm5, %mm0		tm1 */	movq 8*13(%esi), %mm2		/* V57 */	psubsw %mm5, %mm1		/* tm15; free mm5 */	movq %mm0, 8(%esi)		/* tm1; free mm0 */	psraw $1, %mm7			/* t182=t184 ! new !! *//* save the store as used directly in the transpose *	movq %mm1, 120(%esi)		tm15; free mm1 */

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -