📄 mmxidct.s

📁 PIXIL is a small footprint operating environment, complete with PDA PIM applications, a browser and
💻 S
📖 第 1 页 / 共 2 页
字号:
12 下一页
/*
 * This code is basically the Intel-DCT code, found in the
 * LiViD source tree, slightly modified to fit into dvdview.
 */

/*
 * The input data is transposed and each 16 bit element in the 8x8 matrix
 * is left aligned:
 * for example in 11...1110000 format
 * If the iDCT is of I macroblock then 0.5 needs to be added to the
 * DC Component (element[0][0] of the matrix)
 */

/* extrn re_matrix */

.data

/*
 * preSC: 8 rows x 8 columns of 16-bit factors (128 bytes).
 * IDCT_mmx loads a quadword of this table at a given byte offset and
 * multiplies it (pmulhw) with the quadword of the source matrix found
 * at the same byte offset.
 */
	.align 16
	.type	 preSC,@object
preSC:  .short  16384,22725,21407,19266,16384,12873,8867,4520
        .short  22725,31521,29692,26722,22725,17855,12299,6270
        .short  21407,29692,27969,25172,21407,16819,11585,5906
        .short  19266,26722,25172,22654,19266,15137,10426,5315
        .short  16384,22725,21407,19266,16384,12873,8867,4520
        .short  12873,17855,16819,15137,25746,20228,13933,7103
        .short  17734,24598,23170,20853,17734,13933,9597,4892
        .short  18081,25080,23624,21261,18081,14206,9785,4988
	.size	 preSC,128

/*
 * 64-bit constants. Each symbol name spells the quadword it holds,
 * most significant 16-bit word first.
 */

/* Words 1,1,2,5 (low to high): the rebias constant added to V15. */
	.align 8
	.type	x0005000200010001,@object
	.size	x0005000200010001,8
x0005000200010001:
	.long	0x00010001,0x00050002

/* NOTE(review): not referenced in the visible part of IDCT_mmx. */
	.align 8
	.type	x0040000000000000,@object
	.size	x0040000000000000,8
x0040000000000000:
	.long	0, 0x00400000

/* Four copies of 0x5a82 (23170); used as a pmulhw multiplier. */
	.align 8
	.type	x5a825a825a825a82,@object
	.size	x5a825a825a825a82,8
x5a825a825a825a82:
	.long	0x5a825a82, 0x5a825a82

/* Four copies of 0x539f (21407); used as a pmulhw multiplier. */
	.align 8
	.type	x539f539f539f539f,@object
	.size	x539f539f539f539f,8
x539f539f539f539f:
	.long	0x539f539f,0x539f539f

/* Four copies of 0x4546 (17734); used as a pmulhw multiplier. */
	.align 8
	.type	x4546454645464546,@object
	.size	x4546454645464546,8
x4546454645464546:
	.long	0x45464546,0x45464546

/* Four copies of 0x61f8 (25080); used as a pmulhw multiplier. */
	.align 8
	.type	x61f861f861f861f8,@object
	.size	x61f861f861f861f8,8
x61f861f861f861f8:
	.long	0x61f861f8,0x61f861f8

/*
 * Zero-initialised 8-byte variables in .data.
 * NOTE(review): none of these is referenced in the visible part of
 * IDCT_mmx; presumably they are used later in the function - confirm.
 */
	.align 8
	.type	 scratch1,@object
	.size	 scratch1,8
scratch1:
	.long 0,0

	.align 8
	.type	 scratch3,@object
	.size	 scratch3,8
scratch3:
	.long 0,0

	.align 8
	.type	 scratch5,@object
	.size	 scratch5,8
scratch5:
	.long 0,0

	.align 8
	.type	 scratch7,@object
	.size	 scratch7,8
scratch7:
	.long 0,0

	.type	 x0,@object
	.size	 x0,8
x0:
	.long 0,0
	.align 8

.text
	.align 4
.globl IDCT_mmx
/* The operands of the .type directive below are on the next line of the file. */
	.type	 
IDCT_mmx,@functionIDCT_mmx:	pushl %ebp	movl %esp,%ebp	pushl %ebx	pushl %ecx	pushl %edx	pushl %esi	pushl %edi        movl 8(%ebp),%edi		/* output XXX */       	movl 12(%ebp),%esi		/* source matrix */#if 0	movq (%esi), %mm0	movq 8(%esi), %mm1	psllw $4, %mm0	movq 16(%esi), %mm2	psllw $4, %mm1	movq 24(%esi), %mm3	psllw $4, %mm2	movq 32(%esi), %mm4	psllw $4, %mm3	movq 40(%esi), %mm5	psllw $4, %mm4	movq 48(%esi), %mm6	psllw $4, %mm5	movq 56(%esi), %mm7	psllw $4, %mm6	psllw $4, %mm7	movq %mm0,  (%esi)	movq %mm1, 8(%esi)	movq %mm2,16(%esi)	movq %mm3,24(%esi)	movq %mm4,32(%esi)	movq %mm5,40(%esi)	movq %mm6,48(%esi)	movq %mm7,56(%esi)	movq 64(%esi), %mm0	movq 72(%esi), %mm1	psllw $4, %mm0	movq 80(%esi), %mm2	psllw $4, %mm1	movq 88(%esi), %mm3	psllw $4, %mm2	movq 96(%esi), %mm4	psllw $4, %mm3	movq 104(%esi), %mm5	psllw $4, %mm4	movq 112(%esi), %mm6	psllw $4, %mm5	movq 120(%esi), %mm7	psllw $4, %mm6	psllw $4, %mm7	movq %mm0,64(%esi)	movq %mm1,72(%esi)	movq %mm2,80(%esi)	movq %mm3,88(%esi)	movq %mm4,96(%esi)	movq %mm5,104(%esi)	movq %mm6,112(%esi)	movq %mm7,120(%esi)#endif	leal preSC, %ecx/* column 0: even part * use V4, V12, V0, V8 to produce V22..V25 */	movq 8*12(%ecx), %mm0	/* maybe the first mul can be done together */				/* with the dequantization in iHuff module */	pmulhw 8*12(%esi), %mm0		/* V12 */	movq 8*4(%ecx), %mm1	pmulhw 8*4(%esi), %mm1		/* V4 */	movq (%ecx), %mm3	psraw $1, %mm0			/* t64=t66 */	pmulhw (%esi), %mm3		/* V0 */	movq 8*8(%ecx), %mm5		/* duplicate V4 */	movq %mm1, %mm2			/* added 11/1/96 */	pmulhw 8*8(%esi),%mm5		/* V8 */	psubsw %mm0, %mm1		/* V16 */	pmulhw x5a825a825a825a82, %mm1	/* 23170 ->V18 */	paddsw %mm0, %mm2		/* V17 */	movq %mm2, %mm0			/* duplicate V17 */	psraw $1, %mm2			/* t75=t82 */	psraw $2, %mm0			/* t72 */	movq %mm3, %mm4			/* duplicate V0 */	paddsw %mm5, %mm3		/* V19 */	psubsw %mm5, %mm4		/* V20 ;mm5 free *//* moved from the block below */	movq 8*10(%ecx), %mm7	psraw $1, %mm3			/* t74=t81 */	movq %mm3, %mm6			/* duplicate t74=t81 */	psraw 
$2, %mm4			/* t77=t79 */	psubsw %mm0, %mm1		/* V21 ; mm0 free */	paddsw %mm2, %mm3		/* V22 */	movq %mm1, %mm5			/* duplicate V21 */	paddsw %mm4, %mm1		/* V23 */	movq %mm3, 8*4(%esi)		/* V22 */	psubsw %mm5, %mm4		/* V24; mm5 free */	movq %mm1, 8*12(%esi)		/* V23 */	psubsw %mm2, %mm6		/* V25; mm2 free */	movq %mm4, (%esi)		/* V24 *//* keep mm6 alive all along the next block */	/* movq %mm6, 8*8(%esi) 	V25 *//* column 0: odd part * use V2, V6, V10, V14 to produce V31, V39, V40, V41 *//* moved above: movq 8*10(%ecx), %mm7 */	pmulhw 8*10(%esi), %mm7		/* V10 */	movq 8*6(%ecx), %mm0	pmulhw 8*6(%esi), %mm0		/* V6 */	movq 8*2(%ecx), %mm5	movq %mm7, %mm3			/* duplicate V10 */	pmulhw 8*2(%esi), %mm5		/* V2 */	movq 8*14(%ecx), %mm4	psubsw %mm0, %mm7		/* V26 */	pmulhw 8*14(%esi), %mm4		/* V14 */	paddsw %mm0, %mm3		/* V29 ; free mm0 */	movq %mm7, %mm1			/* duplicate V26 */	psraw $1, %mm3			/* t91=t94 */	pmulhw x539f539f539f539f,%mm7	/* V33 */	psraw $1, %mm1			/* t96 */	movq %mm5, %mm0			/* duplicate V2 */	psraw $2, %mm4			/* t85=t87 */	paddsw %mm4,%mm5		/* V27 */	psubsw %mm4, %mm0		/* V28 ; free mm4 */	movq %mm0, %mm2			/* duplicate V28 */	psraw $1, %mm5			/* t90=t93 */	pmulhw x4546454645464546,%mm0	/* V35 */	psraw $1, %mm2			/* t97 */	movq %mm5, %mm4			/* duplicate t90=t93 */	psubsw %mm2, %mm1		/* V32 ; free mm2 */	pmulhw x61f861f861f861f8,%mm1	/* V36 */	psllw $1, %mm7			/* t107 */	paddsw %mm3, %mm5		/* V31 */	psubsw %mm3, %mm4		/* V30 ; free mm3 */	pmulhw x5a825a825a825a82,%mm4	/* V34 */	nop	psubsw %mm1, %mm0		/* V38 */	psubsw %mm7, %mm1		/* V37 ; free mm7 */	psllw $1, %mm1			/* t114 *//* move from the next block */	movq %mm6, %mm3			/* duplicate V25 *//* move from the next block */	movq 8*4(%esi), %mm7		/* V22 */	psllw $1, %mm0			/* t110 */	psubsw %mm5, %mm0		/* V39 (mm5 needed for next block) */	psllw $2, %mm4			/* t112 *//* moved from the next block */	movq 8*12(%esi), %mm2		/* V23 */	psubsw %mm0, %mm4		/* V40 */	paddsw %mm4, %mm1		/* V41; free mm0 *//* moved from the next 
block */	psllw $1, %mm2			/* t117=t125 *//* column 0: output butterfly *//* moved above: * movq %mm6, %mm3			duplicate V25 * movq 8*4(%esi), %mm7			V22 * movq 8*12(%esi), %mm2		V23 * psllw $1, %mm2			t117=t125 */	psubsw %mm1, %mm6		/* tm6 */	paddsw %mm1, %mm3		/* tm8; free mm1 */	movq %mm7, %mm1			/* duplicate V22 */	paddsw %mm5, %mm7		/* tm0 */	movq %mm3, 8*8(%esi)		/* tm8; free mm3 */	psubsw %mm5, %mm1		/* tm14; free mm5 */	movq %mm6, 8*6(%esi)		/* tm6; free mm6 */	movq %mm2, %mm3			/* duplicate t117=t125 */	movq (%esi), %mm6		/* V24 */	paddsw %mm0, %mm2		/* tm2 */	movq %mm7, (%esi)		/* tm0; free mm7 */	psubsw %mm0, %mm3		/* tm12; free mm0 */	movq %mm1, 8*14(%esi)		/* tm14; free mm1 */	psllw $1, %mm6			/* t119=t123 */	movq %mm2, 8*2(%esi)		/* tm2; free mm2 */	movq %mm6, %mm0			/* duplicate t119=t123 */	movq %mm3, 8*12(%esi)		/* tm12; free mm3 */	paddsw %mm4, %mm6		/* tm4 *//* moved from next block */	movq 8*5(%ecx), %mm1	psubsw %mm4, %mm0		/* tm10; free mm4 *//* moved from next block */	pmulhw 8*5(%esi), %mm1		/* V5 */	movq %mm6, 8*4(%esi)		/* tm4; free mm6 */	movq %mm0, 8*10(%esi)		/* tm10; free mm0 *//* column 1: even part * use V5, V13, V1, V9 to produce V56..V59 *//* moved to prev block: *	movq 8*5(%ecx), %mm1 *	pmulhw 8*5(%esi), %mm1		 V5 */	movq 8*13(%ecx), %mm7	psllw $1, %mm1			/* t128=t130 */	pmulhw 8*13(%esi), %mm7		/* V13 */	movq %mm1, %mm2			/* duplicate t128=t130 */	movq 8(%ecx), %mm3	pmulhw 8(%esi), %mm3		/* V1 */	movq 8*9(%ecx), %mm5	psubsw %mm7, %mm1		/* V50 */	pmulhw 8*9(%esi), %mm5		/* V9 */	paddsw %mm7, %mm2		/* V51 */	pmulhw x5a825a825a825a82, %mm1	/* 23170 ->V52 */	movq %mm2, %mm6			/* duplicate V51 */	psraw $1, %mm2			/* t138=t144 */	movq %mm3, %mm4			/* duplicate V1 */	psraw $2, %mm6			/* t136 */	paddsw %mm5, %mm3		/* V53 */	psubsw %mm5, %mm4		/* V54 ;mm5 free */	movq %mm3, %mm7			/* duplicate V53 *//* moved from next block */	movq 8*11(%ecx), %mm0	psraw $1, %mm4			/* t140=t142 */	psubsw %mm6, %mm1		/* V55 ; mm6 free */	paddsw %mm2, %mm3		/* 
V56 */	movq %mm4, %mm5			/* duplicate t140=t142 */	paddsw %mm1, %mm4		/* V57 */	movq %mm3, 8*5(%esi)		/* V56 */	psubsw %mm1, %mm5		/* V58; mm1 free */	movq %mm4, 8*13(%esi)		/* V57 */	psubsw %mm2, %mm7		/* V59; mm2 free */	movq %mm5, 8*9(%esi)		/* V58 *//* keep mm7 alive all along the next block *	movq %mm7, 8(%esi)		V59 * moved above *	movq 8*11(%ecx), %mm0 */	pmulhw 8*11(%esi), %mm0		/* V11 */	movq 8*7(%ecx), %mm6	pmulhw 8*7(%esi), %mm6		/* V7 */	movq 8*15(%ecx), %mm4	movq %mm0, %mm3			/* duplicate V11 */	pmulhw 8*15(%esi), %mm4		/* V15 */	movq 8*3(%ecx), %mm5	psllw $1, %mm6			/* t146=t152 */	pmulhw 8*3(%esi), %mm5		/* V3 */	paddsw %mm6, %mm0		/* V63 *//* note that V15 computation has a correction step:  * this is a 'magic' constant that rebiases the results to be closer to the * expected result.  this magic constant can be refined to reduce the error * even more by doing the correction step in a later stage when the number * is actually multiplied by 16 */	paddw x0005000200010001, %mm4	psubsw %mm6, %mm3		/* V60 ; free mm6 */	psraw $1, %mm0			/* t154=t156 */	movq %mm3, %mm1			/* duplicate V60 */	pmulhw x539f539f539f539f, %mm1	/* V67 */	movq %mm5, %mm6			/* duplicate V3 */	psraw $2, %mm4			/* t148=t150 */	paddsw %mm4, %mm5		/* V61 */	psubsw %mm4, %mm6		/* V62 ; free mm4 */	movq %mm5, %mm4			/* duplicate V61 */	psllw $1, %mm1			/* t169 */	paddsw %mm0, %mm5		/* V65 -> result */	psubsw %mm0, %mm4		/* V64 ; free mm0 */	pmulhw x5a825a825a825a82, %mm4	/* V68 */	psraw $1, %mm3			/* t158 */	psubsw %mm6, %mm3		/* V66 */	movq %mm5, %mm2			/* duplicate V65 */	pmulhw x61f861f861f861f8, %mm3	/* V70 */	psllw $1, %mm6			/* t165 */	pmulhw x4546454645464546, %mm6	/* V69 */	psraw $1, %mm2			/* t172 *//* moved from next block */	movq 8*5(%esi), %mm0		/* V56 */	psllw $1, %mm4			/* t174 *//* moved from next block */	psraw $1, %mm0			/* t177=t188 */	nop	psubsw %mm3, %mm6		/* V72 */	psubsw %mm1, %mm3		/* V71 ; free mm1 */	psubsw %mm2, %mm6		/* V73 ; free mm2 *//* moved from next block 
*/	psraw $1, %mm5			/* t178=t189 */	psubsw %mm6, %mm4		/* V74 *//* moved from next block */	movq %mm0, %mm1			/* duplicate t177=t188 */	paddsw %mm4, %mm3		/* V75 *//* moved from next block */	paddsw %mm5, %mm0		/* tm1 *//* location *  5 - V56 * 13 - V57 *  9 - V58 *  X - V59, mm7 *  X - V65, mm5 *  X - V73, mm6 *  X - V74, mm4 *  X - V75, mm3 * free mm0, mm1 & mm2 * moved above *	movq 8*5(%esi), %mm0		V56 *	psllw $1, %mm0			t177=t188 ! new !! *	psllw $1, %mm5			t178=t189 ! new !! *	movq %mm0, %mm1			duplicate t177=t188 *	paddsw %mm5, %mm0		tm1 */	movq 8*13(%esi), %mm2		/* V57 */	psubsw %mm5, %mm1		/* tm15; free mm5 */	movq %mm0, 8(%esi)		/* tm1; free mm0 */	psraw $1, %mm7			/* t182=t184 ! new !! *//* save the store as used directly in the transpose *	movq %mm1, 120(%esi)		tm15; free mm1 */	movq %mm7, %mm5			/* duplicate t182=t184 */	psubsw %mm3, %mm7		/* tm7 */	paddsw %mm3, %mm5		/* tm9; free mm3 */	movq 8*9(%esi), %mm0		/* V58 */	movq %mm2, %mm3			/* duplicate V57 */	movq %mm7, 8*7(%esi)		/* tm7; free mm7 */	psubsw %mm6, %mm3		/* tm13 */	paddsw %mm6, %mm2		/* tm3 ; free mm6 *//* moved up from the transpose */	movq %mm3, %mm7/* moved up from the transpose */	punpcklwd %mm1, %mm3	movq %mm0, %mm6			/* duplicate V58 */	movq %mm2, 8*3(%esi)		/* tm3; free mm2 */	paddsw %mm4, %mm0		/* tm5 */	psubsw %mm4, %mm6		/* tm11; free mm4 *//* moved up from the transpose */	punpckhwd %mm1, %mm7	movq %mm0, 8*5(%esi)		/* tm5; free mm0 *//* moved up from the transpose */	movq %mm5, %mm2/* transpose - M4 part *  ---------       --------- * | M1 | M2 |     | M1'| M3'| *  ---------  -->  --------- * | M3 | M4 |     | M2'| M4'| *  ---------       --------- * Two alternatives: use full mmword approach so the following code can be * scheduled before the transpose is done without stores, or use the faster * half mmword stores (when possible) */	movd %mm3, 8*9+4(%esi)		/* MS part of tmt9 */	punpcklwd %mm6, %mm5	movd %mm7, 8*13+4(%esi)		/* MS part of tmt13 */	punpckhwd %mm6, %mm2	movd %mm5, 
8*9(%esi)		/* LS part of tmt9 */	punpckhdq %mm3, %mm5		/* free mm3 */	movd %mm2, 8*13(%esi)		/* LS part of tmt13 */	punpckhdq %mm7, %mm2		/* free mm7 *//* moved up from the M3 transpose */	movq 8*8(%esi), %mm0/* moved up from the M3 transpose */	movq 8*10(%esi), %mm1/* moved up from the M3 transpose */	movq %mm0, %mm3/* shuffle the rest of the data, and write it with 2 mmword writes */	movq %mm5, 8*11(%esi)		/* tmt11 *//* moved up from the M3 transpose */	punpcklwd %mm1, %mm0	movq %mm2, 8*15(%esi)		/* tmt15 *//* moved up from the M3 transpose */	punpckhwd %mm1, %mm3/* transpose - M3 part * moved up to previous code section *	movq 8*8(%esi), %mm0 *	movq 8*10(%esi), %mm1 *	movq %mm0, %mm3 *	punpcklwd %mm1, %mm0 *	punpckhwd %mm1, %mm3 */	movq 8*12(%esi), %mm6	movq 8*14(%esi), %mm4	movq %mm6, %mm2/* shuffle the data and write the lower parts of the transposed in 4 dwords */	punpcklwd %mm4, %mm6	movq %mm0, %mm1	punpckhdq %mm6, %mm1	movq %mm3, %mm7	punpckhwd %mm4, %mm2		/* free mm4 */	punpckldq %mm6, %mm0		/* free mm6 *//* moved from next block */	movq 8*13(%esi), %mm4		/* tmt13 */	punpckldq %mm2, %mm3	punpckhdq %mm2, %mm7		/* free mm2 *//* moved from next block */	movq %mm3, %mm5			/* duplicate tmt5 *//* column 1: even part (after transpose)
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -