fdct_mmx.c

来自「Motion JPEG编解码器源代码」· C语言 代码 · 共 565 行 · 第 1/2 页

C
565
字号
	movq_m2r(*(fdct_tg_all_16 + 8), mm6);	/* 6 ; tg_3_16			*/	pmulhw_r2r(mm1, mm0);		/* tp465*tg_1_16		*/		movq_r2m(mm7, *y0);		/* 7 ; save y0			*/	pmulhw_r2r(mm4, mm6);		/* tm465*tg_3_16		*/		movq_r2m(mm5, *y6);		/* save y6			*/	movq_r2r(mm3, mm7);		/* t7				*/		movq_m2r(*(fdct_tg_all_16 + 8), mm5);		/* 5 ; tg_3_16			*/	psubsw_r2r(mm2, mm7);		/* tm765 = t7 - tp65		*/		paddsw_r2r(mm2, mm3);		/* 2 ; tp765 = t7 + tp65	*/	pmulhw_r2r(mm7, mm5);		/* tm765*tg_3_16		*/		paddsw_r2r(mm3, mm0);		/* y1 = tp765 + tp465*tg_1_16 	*/	paddsw_r2r(mm4, mm6);		/* tm465*tg_3_16		*/		pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);	/* tp765*tg_1_16		*/		por_m2r(fdct_one_corr, mm0);	/* correction y1 +0.5		*/	paddsw_r2r(mm7, mm5);		/* tm765*tg_3_16		*/		psubsw_r2r(mm6, mm7);		/* 6 ; y3 = tm765 - tm465*tg_3_16 */	inp += 4;			/* increment pointer		*/		movq_r2m(mm0, *y1);		/* 0 ; save y1			*/	paddsw_r2r(mm4, mm5);		/* 4 ; y5 = tm765*tg_3_16 + tm465 */		movq_r2m(mm7, *y3);		/* 7 ; save y3			*/	psubsw_r2r(mm1, mm3);		/* 1 ; y7 = tp765*tg_1_16 - tp465 */		movq_r2m(mm5, *y5);		/* save y5			*/		/* .mmx32_fdct_col47: ; begin processing last four columns */	movq_m2r(*x1, mm0);		/* 0 ; x1 			*/		movq_r2m(mm3, *y7);		/* 3 ; save y7 (columns 0-4)	*/		movq_m2r(*x6, mm1);		/* 1 ; x6			*/	movq_r2r(mm0, mm2);		/* 2 ; x1			*/		movq_m2r(*x2, mm3);		/* 3 ; x2			*/	paddsw_r2r(mm1, mm0);		/* t1 = x[1] + x[6]		*/		movq_m2r(*x5, mm4);		/*  4 ; x5			*/	psllw_i2r(SHIFT_FRW_COL, mm0);	/* t1				*/		movq_m2r(*x0, mm5);		/* 5 ; x0			*/	paddsw_r2r(mm3, mm4);		/* t2 = x[2] + x[5]		*/		paddsw_m2r(*x7, mm5);		/* t0 = x[0] + x[7]		*/	psllw_i2r(SHIFT_FRW_COL, mm4);	/* t2				*/		movq_r2r(mm0, mm6);		/* 6 ; t1			*/	psubsw_r2r(mm1, mm2);		/* 1 ; t6 = x[1] - x[6]		*/		movq_m2r(*(fdct_tg_all_16 + 4), mm1);		/* 1 ; tg_2_16			*/	psubsw_r2r(mm4, mm0);		/* tm12 = t1 - t2 		*/		movq_m2r(*x3, mm7);		/* 7 ; x3			*/	pmulhw_r2r(mm0, mm1);		/* tm12*tg_2_16			*/		paddsw_m2r(*x4, mm7);		/* t3 = x[3] + x[4]		*/	psllw_i2r(SHIFT_FRW_COL, mm5);	/* t0				*/		paddsw_r2r(mm4, mm6);		/* 4 ; tp12 = t1 + t2		*/	psllw_i2r(SHIFT_FRW_COL, mm7);	/* t3				*/		movq_r2r(mm5, mm4);		/* 4 ; t0			*/	psubsw_r2r(mm7, mm5);		/* tm03 = t0 - t3		*/		paddsw_r2r(mm5, mm1);		/* y2 = tm03 + tm12*tg_2_16	*/	paddsw_r2r(mm7, mm4);		/* 7 ; tp03 = t0 + t3		*/		por_m2r(fdct_one_corr, mm1);	/* correction y2 +0.5		*/	psllw_i2r(SHIFT_FRW_COL+1, mm2); /* t6				*/		pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5);	/* tm03*tg_2_16			*/	movq_r2r(mm4, mm7);		/* 7 ; tp03			*/		psubsw_m2r(*x5, mm3);		/* t5 = x[2] - x[5]		*/	psubsw_r2r(mm6, mm4);		/* y4 = tp03 - tp12		*/		movq_r2m(mm1, *(y2+4));		/* save y2			*/	paddsw_r2r(mm6, mm7);		/* y0 = tp03 + tp12		*/		movq_m2r(*x3, mm1);		/* 1 ; x3			*/	psllw_i2r(SHIFT_FRW_COL+1, mm3); /* t5				*/		psubsw_m2r(*x4, mm1);		/* t4 = x[3] - x[4]		*/	movq_r2r(mm2, mm6);		/* 6 ; t6			*/		movq_r2m(mm4, *(y4+4));		/* save y4			*/	paddsw_r2r(mm3, mm2);		/* t6 + t5			*/		pmulhw_m2r(*ocos_4_16, mm2);	/* tp65 = (t6 + t5)*cos_4_16	*/	psubsw_r2r(mm3, mm6);		/* 3 ; t6 - t5			*/		pmulhw_m2r(*ocos_4_16, mm6);	/* tm65 = (t6 - t5)*cos_4_16	*/	psubsw_r2r(mm0, mm5);		/* 0 ; y6 = tm03*tg_2_16 - tm12 */		por_m2r(fdct_one_corr, mm5);	/* correction y6 +0.5		*/	psllw_i2r(SHIFT_FRW_COL, mm1);	/* t4				*/		por_m2r(fdct_one_corr, mm2);	/* correction tp65 +0.5		*/	movq_r2r(mm1, mm4);		/* 4 ; t4			*/		movq_m2r(*x0, mm3);		/* 3 ; x0			*/	paddsw_r2r(mm6, mm1);		/* tp465 = t4 + tm65		*/		psubsw_m2r(*x7, mm3);		/* t7 = x[0] - x[7]		*/	psubsw_r2r(mm6, mm4);		/* 6 ; tm465 = t4 - tm65	*/		movq_m2r(*(fdct_tg_all_16 + 0), mm0);		/* 0 ; tg_1_16			*/	psllw_i2r(SHIFT_FRW_COL, mm3);	/* t7				*/		movq_m2r(*(fdct_tg_all_16 + 8), mm6);		/* 6 ; tg_3_16			*/	pmulhw_r2r(mm1, mm0);		/* tp465*tg_1_16		*/		movq_r2m(mm7, *(y0+4));		/* 7 ; save y0			*/	pmulhw_r2r(mm4, mm6);		/* tm465*tg_3_16		*/		movq_r2m(mm5, *(y6+4));		/* 5 ; save y6			*/	movq_r2r(mm3, mm7);		/* 7 ; t7			*/		movq_m2r(*(fdct_tg_all_16 + 8), mm5);		/* 5 ; tg_3_16			*/	psubsw_r2r(mm2, mm7);		/* tm765 = t7 - tp65		*/		paddsw_r2r(mm2, mm3);		/* 2 ; tp765 = t7 + tp65	*/	pmulhw_r2r(mm7, mm5);		/* tm765*tg_3_16		*/		paddsw_r2r(mm3, mm0);		/* y1 = tp765 + tp465*tg_1_16	*/	paddsw_r2r(mm4, mm6);		/* tm465*tg_3_16		*/		pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3);	/* tp765*tg_1_16		*/		por_m2r(fdct_one_corr, mm0);	/* correction y1 +0.5		*/	paddsw_r2r(mm7, mm5);		/* tm765*tg_3_16		*/		psubsw_r2r(mm6, mm7);		/* 6 ; y3 = tm765 - tm465*tg_3_16 */		movq_r2m(mm0, *(y1+4));		/* 0 ; save y1			*/	paddsw_r2r(mm4, mm5);		/* 4 ; y5 = tm765*tg_3_16 + tm465 */		movq_r2m(mm7, *(y3+4));		/* 7 ; save y3			*/	psubsw_r2r(mm1, mm3);		/* 1 ; y7 = tp765*tg_1_16 - tp465 */		movq_r2m(mm5, *(y5+4));		/* 5 ; save y5			*/		movq_r2m(mm3, *(y7+4));		/* 3 ; save y7			*/		/*    }   ; end of forward_dct_col07() */	/*  done with dct_row transform        */	/* * fdct_mmx32_cols() -- * the following subroutine repeats the row-transform operation,  * except with different shift&round constants.  This version * does NOT transpose the output again.  Thus the final output * is transposed with respect to the source. *  *  The output is stored into blk[], which destroys the original *  input data. */	out = inp = blk;	table = (int16_t *)tab_frw_01234567;						/*		 * for ( x = 8; x > 0; --x )  ; transform one row per iteration 		 *  ---------- loop begin 		*/	for(i=0; i<8; i++)	{		movd_m2r(*(inp+6), mm5);	/* mm5 = 7 6 		*/				punpcklwd_m2r(*(inp+4), mm5);	/* mm5 =  5 7 4 6	*/				movq_r2r(mm5, mm2);		/* mm2 = 5 7 4 6	*/		psrlq_i2r(32, mm5);		/* mm5 = _ _ 5 7	*/				movq_m2r(*inp, mm0);		/* mm0 = 3 2 1 0	*/		punpcklwd_r2r(mm2, mm5);	/* mm5 = 4 5 6 7	*/				movq_r2r(mm0, mm1);		/* mm1 = 3 2 1 0	*/		paddsw_r2r(mm5, mm0);		/* mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0) */				psubsw_r2r(mm5, mm1);		/* mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4) */		movq_r2r(mm0, mm2);		/* mm2 = [ xt3 xt2 xt1 xt0 ] */				/* movq [ xt3xt2xt1xt0 ], mm0; */		/* movq [ xt7xt6xt5xt4 ], mm1; */				punpcklwd_r2r(mm1, mm0);	/* mm0 = [ xt5 xt1 xt4 xt0 ] */				punpckhwd_r2r(mm1, mm2);	/* mm2 = [ xt7 xt3 xt6 xt2 ] */		movq_r2r(mm2, mm1);		/* mm1			*/				/* shuffle bytes around */					/*  movq mm0,  [INP] ; 0 ; x3 x2 x1 x0	 */			/*  movq mm1,  [INP+8] ; 1 ; x7 x6 x5 x4 */		movq_r2r(mm0, mm2);	 	/* 2 ; x3 x2 x1 x0	*/				movq_m2r(*table, mm3); 		/* 3 ; w06 w04 w02 w00 	*/		punpcklwd_r2r(mm1, mm0);	/* x5 x1 x4 x0 		*/			movq_r2r(mm0, mm5);		/* 5 ; x5 x1 x4 x0	*/		punpckldq_r2r(mm0, mm0);	/* x4 x0 x4 x0  [ xt2 xt0 xt2 xt0 ] */				movq_m2r(*(table+4), mm4);	/* 4 ; w07 w05 w03 w01	*/		punpckhwd_r2r(mm1, mm2);	/* 1 ; x7 x3 x6 x2	*/				pmaddwd_r2r(mm0, mm3);		/* x4*w06+x0*w04 x4*w02+x0*w00 */		movq_r2r(mm2, mm6);		/* 6 ; x7 x3 x6 x2	*/				movq_m2r(*(table+16), mm1);	/* 1 ; w22 w20 w18 w16	*/		punpckldq_r2r(mm2, mm2);	/* x6 x2 x6 x2  [ xt3 xt1 xt3 xt1 ] */				pmaddwd_r2r(mm2, mm4);		/* x6*w07+x2*w05 x6*w03+x2*w01 */		punpckhdq_r2r(mm5, mm5);	/* x5 x1 x5 x1  [ xt6 xt4 xt6 xt4 ] */				pmaddwd_m2r(*(table+8), mm0);	/* x4*w14+x0*w12 x4*w10+x0*w08 */		punpckhdq_r2r(mm6, mm6);	/*  x7 x3 x7 x3  [ xt7 xt5 xt7 xt5 ] */				movq_m2r(*(table+20), mm7);	/* 7 ; w23 w21 w19 w17		*/		pmaddwd_r2r(mm5, mm1);		/* x5*w22+x1*w20 x5*w18+x1*w16 	*/					/*			 * mm3 = a1, a0 (y2,y0)			 * mm1 = b1, b0 (y3,y1)			 * mm0 = a3,a2  (y6,y4)			 * mm5 = b3,b2  (y7,y5)			 */		paddd_m2r(*round_frw_row, mm3);	/* +rounder (y2,y0) 	*/		pmaddwd_r2r(mm6, mm7);		/* x7*w23+x3*w21 x7*w19+x3*w17 */				pmaddwd_m2r(*(table+12), mm2);	/* x6*w15+x2*w13 x6*w11+x2*w09 */		paddd_r2r(mm4, mm3);		/* 4 ; a1=sum(even1) a0=sum(even0) ; now ( y2, y0) */				pmaddwd_m2r(*(table+24), mm5);	/* x5*w30+x1*w28 x5*w26+x1*w24	*/				pmaddwd_m2r(*(table+28), mm6);	/* x7*w31+x3*w29 x7*w27+x3*w25 */		paddd_r2r(mm7, mm1);		/* 7 ; b1=sum(odd1) b0=sum(odd0) ; now ( y3, y1) */				paddd_m2r(*round_frw_row, mm0);	/* +rounder (y6,y4)	*/		psrad_i2r(SHIFT_FRW_ROW, mm3);	/* (y2, y0)		*/				paddd_m2r(*round_frw_row, mm1);	/* +rounder (y3,y1) 	*/		paddd_r2r(mm2, mm0);		/* 2 ; a3=sum(even3) a2=sum(even2) ; now (y6, y4) */				paddd_m2r(*round_frw_row, mm5);	/* +rounder (y7,y5)	*/		psrad_i2r(SHIFT_FRW_ROW, mm1);	/* y1=a1+b1 y0=a0+b0	*/				paddd_r2r(mm6, mm5);		/* 6 ; b3=sum(odd3) b2=sum(odd2) ; now ( y7, y5) */		psrad_i2r(SHIFT_FRW_ROW, mm0);	/* y3=a3+b3 y2=a2+b2	*/				out += 8;			/* increment row-output address by 1 row */		psrad_i2r(SHIFT_FRW_ROW, mm5);	/* y4=a3-b3 y5=a2-b2	*/				inp += 8;			/*  increment row-address by 1 row */		packssdw_r2r(mm0, mm3);		/* 0 ; y6 y4 y2 y0	*/				packssdw_r2r(mm5, mm1);		/* 3 ; y7 y5 y3 y1	*/		movq_r2r(mm3, mm6);		/* mm0 = y6 y4 y2 y0	*/				punpcklwd_r2r(mm1, mm3);	/* y3 y2 y1 y0		*/				punpckhwd_r2r(mm1, mm6);	/* y7 y6 y5 y4		*/		table += 32;			/* increment to next table */			movq_r2m(mm3, *(out-8));	/* 1 ; save y3 y2 y1 y0	*/				movq_r2m(mm6, *(out-4));	/* 7 ; save y7 y6 y5 y4 */	}	emms();}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?