fdct_mmx.c
来自「Motion JPEG编解码器源代码」· C语言 代码 · 共 565 行 · 第 1/2 页
C
565 行
movq_m2r(*(fdct_tg_all_16 + 8), mm6); /* 6 ; tg_3_16 */ pmulhw_r2r(mm1, mm0); /* tp465*tg_1_16 */ movq_r2m(mm7, *y0); /* 7 ; save y0 */ pmulhw_r2r(mm4, mm6); /* tm465*tg_3_16 */ movq_r2m(mm5, *y6); /* save y6 */ movq_r2r(mm3, mm7); /* t7 */ movq_m2r(*(fdct_tg_all_16 + 8), mm5); /* 5 ; tg_3_16 */ psubsw_r2r(mm2, mm7); /* tm765 = t7 - tp65 */ paddsw_r2r(mm2, mm3); /* 2 ; tp765 = t7 + tp65 */ pmulhw_r2r(mm7, mm5); /* tm765*tg_3_16 */ paddsw_r2r(mm3, mm0); /* y1 = tp765 + tp465*tg_1_16 */ paddsw_r2r(mm4, mm6); /* tm465*tg_3_16 */ pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3); /* tp765*tg_1_16 */ por_m2r(fdct_one_corr, mm0); /* correction y1 +0.5 */ paddsw_r2r(mm7, mm5); /* tm765*tg_3_16 */ psubsw_r2r(mm6, mm7); /* 6 ; y3 = tm765 - tm465*tg_3_16 */ inp += 4; /* increment pointer */ movq_r2m(mm0, *y1); /* 0 ; save y1 */ paddsw_r2r(mm4, mm5); /* 4 ; y5 = tm765*tg_3_16 + tm465 */ movq_r2m(mm7, *y3); /* 7 ; save y3 */ psubsw_r2r(mm1, mm3); /* 1 ; y7 = tp765*tg_1_16 - tp465 */ movq_r2m(mm5, *y5); /* save y5 */ /* .mmx32_fdct_col47: ; begin processing last four columns */ movq_m2r(*x1, mm0); /* 0 ; x1 */ movq_r2m(mm3, *y7); /* 3 ; save y7 (columns 0-4) */ movq_m2r(*x6, mm1); /* 1 ; x6 */ movq_r2r(mm0, mm2); /* 2 ; x1 */ movq_m2r(*x2, mm3); /* 3 ; x2 */ paddsw_r2r(mm1, mm0); /* t1 = x[1] + x[6] */ movq_m2r(*x5, mm4); /* 4 ; x5 */ psllw_i2r(SHIFT_FRW_COL, mm0); /* t1 */ movq_m2r(*x0, mm5); /* 5 ; x0 */ paddsw_r2r(mm3, mm4); /* t2 = x[2] + x[5] */ paddsw_m2r(*x7, mm5); /* t0 = x[0] + x[7] */ psllw_i2r(SHIFT_FRW_COL, mm4); /* t2 */ movq_r2r(mm0, mm6); /* 6 ; t1 */ psubsw_r2r(mm1, mm2); /* 1 ; t6 = x[1] - x[6] */ movq_m2r(*(fdct_tg_all_16 + 4), mm1); /* 1 ; tg_2_16 */ psubsw_r2r(mm4, mm0); /* tm12 = t1 - t2 */ movq_m2r(*x3, mm7); /* 7 ; x3 */ pmulhw_r2r(mm0, mm1); /* tm12*tg_2_16 */ paddsw_m2r(*x4, mm7); /* t3 = x[3] + x[4] */ psllw_i2r(SHIFT_FRW_COL, mm5); /* t0 */ paddsw_r2r(mm4, mm6); /* 4 ; tp12 = t1 + t2 */ psllw_i2r(SHIFT_FRW_COL, mm7); /* t3 */ movq_r2r(mm5, mm4); /* 4 ; t0 */ psubsw_r2r(mm7, mm5); /* tm03 = t0 - t3 */ paddsw_r2r(mm5, mm1); /* y2 = tm03 + tm12*tg_2_16 */ paddsw_r2r(mm7, mm4); /* 7 ; tp03 = t0 + t3 */ por_m2r(fdct_one_corr, mm1); /* correction y2 +0.5 */ psllw_i2r(SHIFT_FRW_COL+1, mm2); /* t6 */ pmulhw_m2r(*(fdct_tg_all_16 + 4), mm5); /* tm03*tg_2_16 */ movq_r2r(mm4, mm7); /* 7 ; tp03 */ psubsw_m2r(*x5, mm3); /* t5 = x[2] - x[5] */ psubsw_r2r(mm6, mm4); /* y4 = tp03 - tp12 */ movq_r2m(mm1, *(y2+4)); /* save y2 */ paddsw_r2r(mm6, mm7); /* y0 = tp03 + tp12 */ movq_m2r(*x3, mm1); /* 1 ; x3 */ psllw_i2r(SHIFT_FRW_COL+1, mm3); /* t5 */ psubsw_m2r(*x4, mm1); /* t4 = x[3] - x[4] */ movq_r2r(mm2, mm6); /* 6 ; t6 */ movq_r2m(mm4, *(y4+4)); /* save y4 */ paddsw_r2r(mm3, mm2); /* t6 + t5 */ pmulhw_m2r(*ocos_4_16, mm2); /* tp65 = (t6 + t5)*cos_4_16 */ psubsw_r2r(mm3, mm6); /* 3 ; t6 - t5 */ pmulhw_m2r(*ocos_4_16, mm6); /* tm65 = (t6 - t5)*cos_4_16 */ psubsw_r2r(mm0, mm5); /* 0 ; y6 = tm03*tg_2_16 - tm12 */ por_m2r(fdct_one_corr, mm5); /* correction y6 +0.5 */ psllw_i2r(SHIFT_FRW_COL, mm1); /* t4 */ por_m2r(fdct_one_corr, mm2); /* correction tp65 +0.5 */ movq_r2r(mm1, mm4); /* 4 ; t4 */ movq_m2r(*x0, mm3); /* 3 ; x0 */ paddsw_r2r(mm6, mm1); /* tp465 = t4 + tm65 */ psubsw_m2r(*x7, mm3); /* t7 = x[0] - x[7] */ psubsw_r2r(mm6, mm4); /* 6 ; tm465 = t4 - tm65 */ movq_m2r(*(fdct_tg_all_16 + 0), mm0); /* 0 ; tg_1_16 */ psllw_i2r(SHIFT_FRW_COL, mm3); /* t7 */ movq_m2r(*(fdct_tg_all_16 + 8), mm6); /* 6 ; tg_3_16 */ pmulhw_r2r(mm1, mm0); /* tp465*tg_1_16 */ movq_r2m(mm7, *(y0+4)); /* 7 ; save y0 */ pmulhw_r2r(mm4, mm6); /* tm465*tg_3_16 */ movq_r2m(mm5, *(y6+4)); /* 5 ; save y6 */ movq_r2r(mm3, mm7); /* 7 ; t7 */ movq_m2r(*(fdct_tg_all_16 + 8), mm5); /* 5 ; tg_3_16 */ psubsw_r2r(mm2, mm7); /* tm765 = t7 - tp65 */ paddsw_r2r(mm2, mm3); /* 2 ; tp765 = t7 + tp65 */ pmulhw_r2r(mm7, mm5); /* tm765*tg_3_16 */ paddsw_r2r(mm3, mm0); /* y1 = tp765 + tp465*tg_1_16 */ paddsw_r2r(mm4, mm6); /* tm465*tg_3_16 */ pmulhw_m2r(*(fdct_tg_all_16 + 0), mm3); /* tp765*tg_1_16 */ por_m2r(fdct_one_corr, mm0); /* correction y1 +0.5 */ paddsw_r2r(mm7, mm5); /* tm765*tg_3_16 */ psubsw_r2r(mm6, mm7); /* 6 ; y3 = tm765 - tm465*tg_3_16 */ movq_r2m(mm0, *(y1+4)); /* 0 ; save y1 */ paddsw_r2r(mm4, mm5); /* 4 ; y5 = tm765*tg_3_16 + tm465 */ movq_r2m(mm7, *(y3+4)); /* 7 ; save y3 */ psubsw_r2r(mm1, mm3); /* 1 ; y7 = tp765*tg_1_16 - tp465 */ movq_r2m(mm5, *(y5+4)); /* 5 ; save y5 */ movq_r2m(mm3, *(y7+4)); /* 3 ; save y7 */ /* } ; end of forward_dct_col07() */ /* done with dct_row transform */ /* * fdct_mmx32_cols() -- * the following subroutine repeats the row-transform operation, * except with different shift&round constants. This version * does NOT transpose the output again. Thus the final output * is transposed with respect to the source. * * The output is stored into blk[], which destroys the original * input data. */ out = inp = blk; table = (int16_t *)tab_frw_01234567; /* * for ( x = 8; x > 0; --x ) ; transform one row per iteration * ---------- loop begin */ for(i=0; i<8; i++) { movd_m2r(*(inp+6), mm5); /* mm5 = 7 6 */ punpcklwd_m2r(*(inp+4), mm5); /* mm5 = 5 7 4 6 */ movq_r2r(mm5, mm2); /* mm2 = 5 7 4 6 */ psrlq_i2r(32, mm5); /* mm5 = _ _ 5 7 */ movq_m2r(*inp, mm0); /* mm0 = 3 2 1 0 */ punpcklwd_r2r(mm2, mm5); /* mm5 = 4 5 6 7 */ movq_r2r(mm0, mm1); /* mm1 = 3 2 1 0 */ paddsw_r2r(mm5, mm0); /* mm0 = [3+4, 2+5, 1+6, 0+7] (xt3, xt2, xt1, xt0) */ psubsw_r2r(mm5, mm1); /* mm1 = [3-4, 2-5, 1-6, 0-7] (xt7, xt6, xt5, xt4) */ movq_r2r(mm0, mm2); /* mm2 = [ xt3 xt2 xt1 xt0 ] */ /* movq [ xt3xt2xt1xt0 ], mm0; */ /* movq [ xt7xt6xt5xt4 ], mm1; */ punpcklwd_r2r(mm1, mm0); /* mm0 = [ xt5 xt1 xt4 xt0 ] */ punpckhwd_r2r(mm1, mm2); /* mm2 = [ xt7 xt3 xt6 xt2 ] */ movq_r2r(mm2, mm1); /* mm1 */ /* shuffle bytes around */ /* movq mm0, [INP] ; 0 ; x3 x2 x1 x0 */ /* movq mm1, [INP+8] ; 1 ; x7 x6 x5 x4 */ movq_r2r(mm0, mm2); /* 2 ; x3 x2 x1 x0 */ movq_m2r(*table, mm3); /* 3 ; w06 w04 w02 w00 */ punpcklwd_r2r(mm1, mm0); /* x5 x1 x4 x0 */ movq_r2r(mm0, mm5); /* 5 ; x5 x1 x4 x0 */ punpckldq_r2r(mm0, mm0); /* x4 x0 x4 x0 [ xt2 xt0 xt2 xt0 ] */ movq_m2r(*(table+4), mm4); /* 4 ; w07 w05 w03 w01 */ punpckhwd_r2r(mm1, mm2); /* 1 ; x7 x3 x6 x2 */ pmaddwd_r2r(mm0, mm3); /* x4*w06+x0*w04 x4*w02+x0*w00 */ movq_r2r(mm2, mm6); /* 6 ; x7 x3 x6 x2 */ movq_m2r(*(table+16), mm1); /* 1 ; w22 w20 w18 w16 */ punpckldq_r2r(mm2, mm2); /* x6 x2 x6 x2 [ xt3 xt1 xt3 xt1 ] */ pmaddwd_r2r(mm2, mm4); /* x6*w07+x2*w05 x6*w03+x2*w01 */ punpckhdq_r2r(mm5, mm5); /* x5 x1 x5 x1 [ xt6 xt4 xt6 xt4 ] */ pmaddwd_m2r(*(table+8), mm0); /* x4*w14+x0*w12 x4*w10+x0*w08 */ punpckhdq_r2r(mm6, mm6); /* x7 x3 x7 x3 [ xt7 xt5 xt7 xt5 ] */ movq_m2r(*(table+20), mm7); /* 7 ; w23 w21 w19 w17 */ pmaddwd_r2r(mm5, mm1); /* x5*w22+x1*w20 x5*w18+x1*w16 */ /* * mm3 = a1, a0 (y2,y0) * mm1 = b1, b0 (y3,y1) * mm0 = a3,a2 (y6,y4) * mm5 = b3,b2 (y7,y5) */ paddd_m2r(*round_frw_row, mm3); /* +rounder (y2,y0) */ pmaddwd_r2r(mm6, mm7); /* x7*w23+x3*w21 x7*w19+x3*w17 */ pmaddwd_m2r(*(table+12), mm2); /* x6*w15+x2*w13 x6*w11+x2*w09 */ paddd_r2r(mm4, mm3); /* 4 ; a1=sum(even1) a0=sum(even0) ; now ( y2, y0) */ pmaddwd_m2r(*(table+24), mm5); /* x5*w30+x1*w28 x5*w26+x1*w24 */ pmaddwd_m2r(*(table+28), mm6); /* x7*w31+x3*w29 x7*w27+x3*w25 */ paddd_r2r(mm7, mm1); /* 7 ; b1=sum(odd1) b0=sum(odd0) ; now ( y3, y1) */ paddd_m2r(*round_frw_row, mm0); /* +rounder (y6,y4) */ psrad_i2r(SHIFT_FRW_ROW, mm3); /* (y2, y0) */ paddd_m2r(*round_frw_row, mm1); /* +rounder (y3,y1) */ paddd_r2r(mm2, mm0); /* 2 ; a3=sum(even3) a2=sum(even2) ; now (y6, y4) */ paddd_m2r(*round_frw_row, mm5); /* +rounder (y7,y5) */ psrad_i2r(SHIFT_FRW_ROW, mm1); /* y1=a1+b1 y0=a0+b0 */ paddd_r2r(mm6, mm5); /* 6 ; b3=sum(odd3) b2=sum(odd2) ; now ( y7, y5) */ psrad_i2r(SHIFT_FRW_ROW, mm0); /* y3=a3+b3 y2=a2+b2 */ out += 8; /* increment row-output address by 1 row */ psrad_i2r(SHIFT_FRW_ROW, mm5); /* y4=a3-b3 y5=a2-b2 */ inp += 8; /* increment row-address by 1 row */ packssdw_r2r(mm0, mm3); /* 0 ; y6 y4 y2 y0 */ packssdw_r2r(mm5, mm1); /* 3 ; y7 y5 y3 y1 */ movq_r2r(mm3, mm6); /* mm0 = y6 y4 y2 y0 */ punpcklwd_r2r(mm1, mm3); /* y3 y2 y1 y0 */ punpckhwd_r2r(mm1, mm6); /* y7 y6 y5 y4 */ table += 32; /* increment to next table */ movq_r2m(mm3, *(out-8)); /* 1 ; save y3 y2 y1 y0 */ movq_r2m(mm6, *(out-4)); /* 7 ; save y7 y6 y5 y4 */ } emms();}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?