📄 idct_mmx.c
字号:
/* seventh stage */ MMXADDDIFF( mm2, mm5 ); MMXADDDIFF( mm0, mm4 ); /* eighth stage */ STOREMM(mm5, dst[1*8], Wrshift); STOREMM(mm4, dst[2*8], Wrshift); STOREMM(mm0, dst[5*8], Wrshift); STOREMM(mm2, dst[6*8], Wrshift); } src=temp; dst=block; for( i=0; i<4; src+=16, dst+=2, i++ ) { /* first stage */ // x0, x1 = W0*x0 + W1*x1 + 128, W1*x0 - W0*x1 + 128; // x2, x3 = -W2*x2 + W6*x3, W6*x2 + W2*x3; // x4, x5 = W1*x4 + W7*x5, W7*x4 - W1*x5; // x6, x7 = W5*x6 + W3*x7, W3*x6 - W5*x7; movq_m2r(src[0], mm0); movq_m2r(src[8], mm1); movq_r2r(mm0, mm2); punpckldq_r2r(mm1, mm0); punpckhdq_r2r(mm1, mm2); movq_m2r(src[4], mm4); movq_m2r(src[12], mm5); movq_r2r(mm4, mm6); punpckldq_r2r(mm5, mm4); punpckhdq_r2r(mm5, mm6); // mm5 = low word set // mm7 = high word set pxor_r2r(mm5, mm5); pcmpeqw_r2r(mm5, mm5); movq_r2r(mm5, mm7); psrld_i2r(16, mm5); pxor_r2r(mm5, mm7); movq_r2r(mm4, mm1); // 0,1 / 2,3 / 4,5 / 6,7 / 1=4,5 movq_r2r(mm0, mm3); pand_r2r(mm5, mm0); pslld_i2r(16, mm1); por_r2r(mm1, mm0); // 0,4 / 2,3 / 4,5 / 6,7 / 3=0,1 movq_r2r(mm6, mm1); pand_r2r(mm7, mm6); psrld_i2r(16, mm3); por_r2r(mm3, mm6); // 0,4 / 2,3 / 4,5 / 1,7 / 1=6,7 movq_r2r(mm2, mm3); pslld_i2r(16, mm2); pand_r2r(mm5, mm1); por_r2r(mm1, mm2); // 0,4 / 6,2 / 4,5 / 1,7 / 3=2,3 psrld_i2r(16, mm4); pand_r2r(mm7, mm3); por_r2r(mm3, mm4); // 0,4 / 6,2 / 5,3 / 1,7 MMXMULTADD(mm0, mm1, idct_mmx_col_table); MMXMULTADD(mm2, mm3, idct_mmx_col_table+8); MMXMULTADD(mm4, mm5, idct_mmx_col_table+16); MMXMULTADD(mm6, mm7, idct_mmx_col_table+24); paddd_m2r(idct_mmx_col_round[0], mm0); paddd_m2r(idct_mmx_col_round[0], mm1); /* second stage */ MMXADDDIFF(mm6, mm4); MMXADDDIFF(mm7, mm5); /* third stage */ MMXADDDIFF( mm1, mm3 ); MMXADDDIFF( mm0, mm2 ); MMXADDDIFF( mm6, mm7 ); /* fourth stage */ MMXADDDIFF( mm3, mm4 ); MMXADDDIFF( mm1, mm5 ); /* fifth stage */ STOREMM(mm4, dst[0*8], Wcshift); STOREMM(mm5, dst[3*8], Wcshift); STOREMM(mm1, dst[4*8], Wcshift); STOREMM(mm3, dst[7*8], Wcshift); /* sixth stage */ // x6 = (181*x6+128)>>8; // x7 = (181*x7+128)>>8; // actually, this computes, roughly: x6 -= (x6>>8)*75 movq_r2r(mm6, mm4); movq_r2r(mm7, mm5); psrad_i2r(2, mm6); psrad_i2r(2, mm7); psubd_r2r(mm6, mm4); psubd_r2r(mm7, mm5); psrad_i2r(3, mm6); psrad_i2r(3, mm7); psubd_r2r(mm6, mm4); psubd_r2r(mm7, mm5); psrad_i2r(2, mm6); psrad_i2r(2, mm7); psubd_r2r(mm6, mm4); psubd_r2r(mm7, mm5); psrad_i2r(1, mm6); psrad_i2r(1, mm7); psubd_r2r(mm6, mm4); psubd_r2r(mm7, mm5); /* seventh stage */ MMXADDDIFF( mm2, mm5 ); MMXADDDIFF( mm0, mm4 ); /* eighth stage */ STOREMM(mm5, dst[1*8], Wcshift); STOREMM(mm4, dst[2*8], Wcshift); STOREMM(mm0, dst[5*8], Wcshift); STOREMM(mm2, dst[6*8], Wcshift); } emms();}void idct_sse(int16_t *block){ float temp[64+3], *dst, *altemp; int i; int16_t *src; altemp=ALIGN_PTR(temp,16); src=block; dst=altemp; for( i=0; i<2; src+=32, dst+=4, i++) {#define MM2XMMl(mm,x0,x1) \ movq_r2r ( mm, mm6); \ movq_r2r ( mm, mm7); \ psraw_i2r ( 16, mm); \ punpcklwd_r2r( mm, mm6); \ punpckhwd_r2r( mm, mm7); \ cvtpi2ps_r2r ( mm6, x0); \ cvtpi2ps_r2r ( mm7, x1);#define MM2XMM(ml,mh,x0,x1) \ MM2XMMl(mh,x0,x1); \ movlhps_r2r(x0, x0); \ movlhps_r2r(x1, x1); \ MM2XMMl(ml,x0,x1);#define LOADROTATEif(src, x0, x1, x2, x3) \ \ movq_m2r((src)[16], mm2); /* 0c 1c 2c 3c */ \ movq_m2r((src)[24], mm3); /* 0d 1d 2d 3d */ \ movq_m2r((src)[ 0], mm0); /* 0a 1a 2a 3a */ \ movq_m2r((src)[ 8], mm1); /* 0b 1b 2b 3b */ \ \ /* mm5 = 0c 0d 1c 1d */ \ /* mm2 = 2c 2d 3c 3d */ \ movq_r2r(mm2, mm5); \ punpcklwd_r2r(mm3, mm5); \ punpckhwd_r2r(mm3, mm2); \ \ /* mm4 = 0a 0b 1a 1b */ \ /* mm0 = 2a 2b 3a 3b */ \ movq_r2r(mm0, mm4); \ punpcklwd_r2r(mm1, mm4); \ punpckhwd_r2r(mm1, mm0); \ \ MM2XMM(mm4, mm5, x0, x1); \ MM2XMM(mm0, mm2, x2, x3); LOADROTATEif(src, xmm0, xmm6, xmm3, xmm5); LOADROTATEif(src+4, xmm1, xmm4, xmm2, xmm7); // first stage SSEADDDIFF( xmm0, xmm1 ); SSEMULTADD( xmm2, xmm3, idct_sse_table+16 ); SSEMULTADD( xmm4, xmm5, idct_sse_table+32 ); SSEMULTADD( xmm6, xmm7, idct_sse_table+48 ); // third stage SSEADDDIFF( xmm1, xmm3 ); // second stage SSEADDDIFF( xmm6, xmm4 ); SSEADDDIFF( xmm7, xmm5 ); // fourth stage SSEADDDIFF( xmm3, xmm4 ); SSEADDDIFF( xmm1, xmm5 ); movaps_r2m( xmm3, dst[7*8]); movaps_r2m( xmm4, dst[0*8]); movaps_r2m( xmm1, dst[4*8]); movaps_r2m( xmm5, dst[3*8]); SSEADDDIFF_t( xmm6, xmm7, xmm1 ); SSEADDDIFF_t( xmm0, xmm2, xmm1 ); // x7 = MUL_BY_ROOT_2_OVER_2(x7); // x6 = MUL_BY_ROOT_2_OVER_2(x6); movaps_m2r( idct_sse_root2_over2[0], xmm1 ); mulps_r2r( xmm1, xmm7 ); mulps_r2r( xmm1, xmm6 ); SSEADDDIFF_t(xmm2, xmm7, xmm1); SSEADDDIFF_t(xmm0, xmm6, xmm1); movaps_r2m( xmm2, dst[6*8] ); movaps_r2m( xmm7, dst[1*8] ); movaps_r2m( xmm0, dst[5*8] ); movaps_r2m( xmm6, dst[2*8] ); } src=block; dst=altemp; for( i=0; i<2; src+=4, dst+=32, i++) {#define LOADROTATEff(src, t, x0, x1, x2, x3) \ movaps_m2r((src)[ 0], x0); /* 0a 1a 2a 3a */ \ movaps_m2r((src)[ 8], t); /* 0b 1b 2b 3b */ \ movaps_m2r((src)[16], x3); /* 0c 1c 2c 3c */ \ movaps_m2r((src)[24], x1); /* 0d 1d 2d 3d */ \ \ /* mm0 = 0a 0b 1a 1b */ \ /* mm2 = 2a 2b 3a 3b */ \ movaps_r2r (x0, x2); \ unpcklps_r2r(t, x0); \ unpckhps_r2r(t, x2); \ \ /* mm3 = 0c 0d 1c 1d */ \ /* mmt = 2c 2d 3c 3d */ \ movaps_r2r (x3, t); \ unpcklps_r2r(x1, x3); \ unpckhps_r2r(x1, t); \ \ /* mm0 = 0a 0b 0c 0d */ \ /* mm1 = 1a 1b 1c 1d */ \ movaps_r2r (x3, x1); \ movhlps_r2r(x0, x1); \ movlhps_r2r(x3, x0); \ \ /* mm2 = 2a 2b 2c 2d */ \ /* mm3 = 3a 3b 3c 3d */ \ movaps_r2r ( t, x3); \ movhlps_r2r(x2, x3); \ movlhps_r2r( t, x2);#define STOREXMM(x0, dst) \ mulps_m2r(*idct_sse_eighth, x0); \ cvtps2pi_r2r(x0, mm0); \ movhlps_r2r (x0, x0); \ cvtps2pi_r2r(x0, mm1); \ packssdw_r2r(mm1, mm0); \ movq_r2m (mm0, dst); LOADROTATEff(dst, xmm1, xmm0, xmm6, xmm3, xmm5); movaps_r2m(xmm0, dst[0]); LOADROTATEff(dst+4, xmm0, xmm1, xmm4, xmm2, xmm7); movaps_m2r(dst[0], xmm0); // first stage SSEADDDIFF( xmm0, xmm1 ); SSEMULTADD( xmm2, xmm3, idct_sse_table+16 ); SSEMULTADD( xmm4, xmm5, idct_sse_table+32 ); SSEMULTADD( xmm6, xmm7, idct_sse_table+48 ); // third stage SSEADDDIFF( xmm1, xmm3 ); // second stage SSEADDDIFF( xmm6, xmm4 ); SSEADDDIFF( xmm7, xmm5 ); // fourth stage SSEADDDIFF( xmm3, xmm4 ); SSEADDDIFF( xmm1, xmm5 ); STOREXMM( xmm3, src[7*8]); STOREXMM( xmm4, src[0*8] ); STOREXMM( xmm1, src[4*8]); STOREXMM( xmm5, src[3*8]); SSEADDDIFF_t( xmm6, xmm7, xmm1 ); SSEADDDIFF_t( xmm0, xmm2, xmm1 ); // x7 = MUL_BY_ROOT_2_OVER_2(x7); // x6 = MUL_BY_ROOT_2_OVER_2(x6); movaps_m2r( idct_sse_root2_over2[0], xmm1 ); mulps_r2r( xmm1, xmm7 ); mulps_r2r( xmm1, xmm6 ); SSEADDDIFF_t(xmm2, xmm7, xmm1); SSEADDDIFF_t(xmm0, xmm6, xmm1); STOREXMM( xmm2, src[6*8] ); STOREXMM( xmm7, src[1*8] ); STOREXMM( xmm0, src[5*8] ); STOREXMM( xmm6, src[2*8] ); } emms();}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -