⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct_mmx.c

📁 Motion JPEG编解码器源代码
💻 C
📖 第 1 页 / 共 2 页
字号:
            /* seventh stage */        MMXADDDIFF( mm2, mm5 );        MMXADDDIFF( mm0, mm4 );                /* eighth stage */        STOREMM(mm5, dst[1*8], Wrshift);        STOREMM(mm4, dst[2*8], Wrshift);        STOREMM(mm0, dst[5*8], Wrshift);        STOREMM(mm2, dst[6*8], Wrshift);    }    src=temp;    dst=block;    for( i=0; i<4; src+=16, dst+=2, i++ ) {        /* first stage */        // x0, x1 =  W0*x0 + W1*x1 + 128, W1*x0 - W0*x1 + 128;        // x2, x3 = -W2*x2 + W6*x3, W6*x2 + W2*x3;        // x4, x5 =  W1*x4 + W7*x5, W7*x4 - W1*x5;            // x6, x7 =  W5*x6 + W3*x7, W3*x6 - W5*x7;            movq_m2r(src[0], mm0);        movq_m2r(src[8], mm1);        movq_r2r(mm0, mm2);        punpckldq_r2r(mm1, mm0);        punpckhdq_r2r(mm1, mm2);        movq_m2r(src[4], mm4);        movq_m2r(src[12], mm5);        movq_r2r(mm4, mm6);        punpckldq_r2r(mm5, mm4);        punpckhdq_r2r(mm5, mm6);        // mm5 = low word set        // mm7 = high word set        pxor_r2r(mm5, mm5);        pcmpeqw_r2r(mm5, mm5);        movq_r2r(mm5, mm7);        psrld_i2r(16, mm5);        pxor_r2r(mm5, mm7);        movq_r2r(mm4, mm1);        // 0,1 / 2,3 / 4,5 / 6,7 / 1=4,5        movq_r2r(mm0, mm3);        pand_r2r(mm5, mm0);        pslld_i2r(16, mm1);        por_r2r(mm1, mm0);        // 0,4 / 2,3 / 4,5 / 6,7 / 3=0,1        movq_r2r(mm6, mm1);        pand_r2r(mm7, mm6);        psrld_i2r(16, mm3);        por_r2r(mm3, mm6);        // 0,4 / 2,3 / 4,5 / 1,7 / 1=6,7        movq_r2r(mm2, mm3);        pslld_i2r(16, mm2);        pand_r2r(mm5, mm1);        por_r2r(mm1, mm2);        // 0,4 / 6,2 / 4,5 / 1,7 / 3=2,3        psrld_i2r(16, mm4);        pand_r2r(mm7, mm3);        por_r2r(mm3, mm4);        // 0,4 / 6,2 / 5,3 / 1,7                   MMXMULTADD(mm0, mm1, idct_mmx_col_table);        MMXMULTADD(mm2, mm3, idct_mmx_col_table+8);        MMXMULTADD(mm4, mm5, idct_mmx_col_table+16);        MMXMULTADD(mm6, mm7, idct_mmx_col_table+24);        paddd_m2r(idct_mmx_col_round[0], mm0);        paddd_m2r(idct_mmx_col_round[0], mm1);        /* second stage */        MMXADDDIFF(mm6, mm4);            MMXADDDIFF(mm7, mm5);                /* third stage */                MMXADDDIFF( mm1, mm3 );        MMXADDDIFF( mm0, mm2 );        MMXADDDIFF( mm6, mm7 );            /* fourth stage */        MMXADDDIFF( mm3, mm4 );        MMXADDDIFF( mm1, mm5 );        /* fifth stage */        STOREMM(mm4, dst[0*8], Wcshift);        STOREMM(mm5, dst[3*8], Wcshift);        STOREMM(mm1, dst[4*8], Wcshift);        STOREMM(mm3, dst[7*8], Wcshift);        /* sixth stage */        // x6 = (181*x6+128)>>8;        // x7 = (181*x7+128)>>8;              // actually, this computes, roughly: x6 -= (x6>>8)*75        movq_r2r(mm6, mm4);        movq_r2r(mm7, mm5);        psrad_i2r(2, mm6);        psrad_i2r(2, mm7);        psubd_r2r(mm6, mm4);        psubd_r2r(mm7, mm5);        psrad_i2r(3, mm6);        psrad_i2r(3, mm7);        psubd_r2r(mm6, mm4);        psubd_r2r(mm7, mm5);        psrad_i2r(2, mm6);        psrad_i2r(2, mm7);        psubd_r2r(mm6, mm4);        psubd_r2r(mm7, mm5);        psrad_i2r(1, mm6);        psrad_i2r(1, mm7);        psubd_r2r(mm6, mm4);        psubd_r2r(mm7, mm5);            /* seventh stage */        MMXADDDIFF( mm2, mm5 );        MMXADDDIFF( mm0, mm4 );        /* eighth stage */        STOREMM(mm5, dst[1*8], Wcshift);        STOREMM(mm4, dst[2*8], Wcshift);        STOREMM(mm0, dst[5*8], Wcshift);        STOREMM(mm2, dst[6*8], Wcshift);    }    emms();}void idct_sse(int16_t *block){    float temp[64+3], *dst, *altemp;    int i;    int16_t *src;    altemp=ALIGN_PTR(temp,16);    src=block;    dst=altemp;    for( i=0; i<2; src+=32, dst+=4, i++) {#define MM2XMMl(mm,x0,x1)          \        movq_r2r     ( mm,  mm6);  \        movq_r2r     ( mm,  mm7);  \        psraw_i2r    ( 16,  mm);   \        punpcklwd_r2r( mm,  mm6);  \        punpckhwd_r2r( mm,  mm7);  \        cvtpi2ps_r2r ( mm6, x0);   \        cvtpi2ps_r2r ( mm7, x1);#define MM2XMM(ml,mh,x0,x1)  \        MM2XMMl(mh,x0,x1);   \        movlhps_r2r(x0, x0); \        movlhps_r2r(x1, x1); \        MM2XMMl(ml,x0,x1);#define LOADROTATEif(src, x0, x1, x2, x3)           \                                                    \        movq_m2r((src)[16], mm2); /* 0c 1c 2c 3c */ \        movq_m2r((src)[24], mm3); /* 0d 1d 2d 3d */ \        movq_m2r((src)[ 0], mm0); /* 0a 1a 2a 3a */ \        movq_m2r((src)[ 8], mm1); /* 0b 1b 2b 3b */ \                                                    \        /* mm5 = 0c 0d 1c 1d */                     \        /* mm2 = 2c 2d 3c 3d */                     \        movq_r2r(mm2, mm5);                         \        punpcklwd_r2r(mm3, mm5);                    \        punpckhwd_r2r(mm3, mm2);                    \                                                    \        /* mm4 = 0a 0b 1a 1b */                     \        /* mm0 = 2a 2b 3a 3b */                     \        movq_r2r(mm0, mm4);                         \        punpcklwd_r2r(mm1, mm4);                    \        punpckhwd_r2r(mm1, mm0);                    \                                                    \        MM2XMM(mm4, mm5, x0, x1);                   \        MM2XMM(mm0, mm2, x2, x3);        LOADROTATEif(src,   xmm0, xmm6, xmm3, xmm5);        LOADROTATEif(src+4, xmm1, xmm4, xmm2, xmm7);        // first stage        SSEADDDIFF( xmm0, xmm1 );        SSEMULTADD( xmm2, xmm3, idct_sse_table+16 );        SSEMULTADD( xmm4, xmm5, idct_sse_table+32 );        SSEMULTADD( xmm6, xmm7, idct_sse_table+48 );                // third stage        SSEADDDIFF( xmm1, xmm3 );        // second stage        SSEADDDIFF( xmm6, xmm4 );        SSEADDDIFF( xmm7, xmm5 );          // fourth stage        SSEADDDIFF( xmm3, xmm4 );        SSEADDDIFF( xmm1, xmm5 );        movaps_r2m( xmm3, dst[7*8]);        movaps_r2m( xmm4, dst[0*8]);        movaps_r2m( xmm1, dst[4*8]);        movaps_r2m( xmm5, dst[3*8]);        SSEADDDIFF_t( xmm6, xmm7, xmm1 );        SSEADDDIFF_t( xmm0, xmm2, xmm1 );        // x7 = MUL_BY_ROOT_2_OVER_2(x7);        // x6 = MUL_BY_ROOT_2_OVER_2(x6);        movaps_m2r( idct_sse_root2_over2[0], xmm1 );        mulps_r2r( xmm1, xmm7 );        mulps_r2r( xmm1, xmm6 );          SSEADDDIFF_t(xmm2, xmm7, xmm1);        SSEADDDIFF_t(xmm0, xmm6, xmm1);        movaps_r2m( xmm2, dst[6*8] );        movaps_r2m( xmm7, dst[1*8] );        movaps_r2m( xmm0, dst[5*8] );        movaps_r2m( xmm6, dst[2*8] );            }    src=block;    dst=altemp;    for( i=0; i<2; src+=4, dst+=32, i++) {#define LOADROTATEff(src, t, x0, x1, x2, x3)           \        movaps_m2r((src)[ 0], x0); /* 0a 1a 2a 3a */   \        movaps_m2r((src)[ 8], t);  /* 0b 1b 2b 3b */   \        movaps_m2r((src)[16], x3); /* 0c 1c 2c 3c */   \        movaps_m2r((src)[24], x1); /* 0d 1d 2d 3d */   \                                                       \        /* mm0 = 0a 0b 1a 1b */                        \        /* mm2 = 2a 2b 3a 3b */                        \        movaps_r2r  (x0, x2);                          \        unpcklps_r2r(t,  x0);                          \        unpckhps_r2r(t,  x2);                          \                                                       \        /* mm3 = 0c 0d 1c 1d */                        \        /* mmt = 2c 2d 3c 3d */                        \        movaps_r2r  (x3, t);                           \        unpcklps_r2r(x1, x3);                          \        unpckhps_r2r(x1, t);                           \                                                       \        /* mm0 = 0a 0b 0c 0d */                        \        /* mm1 = 1a 1b 1c 1d */                        \        movaps_r2r (x3, x1);                           \        movhlps_r2r(x0, x1);                           \        movlhps_r2r(x3, x0);                           \                                                       \        /* mm2 = 2a 2b 2c 2d */                        \        /* mm3 = 3a 3b 3c 3d */                        \        movaps_r2r ( t, x3);                           \        movhlps_r2r(x2, x3);                           \        movlhps_r2r( t, x2);#define STOREXMM(x0, dst)                \        mulps_m2r(*idct_sse_eighth, x0); \        cvtps2pi_r2r(x0,  mm0);          \        movhlps_r2r (x0,  x0);           \        cvtps2pi_r2r(x0,  mm1);          \        packssdw_r2r(mm1, mm0);          \        movq_r2m    (mm0, dst);        LOADROTATEff(dst,   xmm1, xmm0, xmm6, xmm3, xmm5);        movaps_r2m(xmm0, dst[0]);        LOADROTATEff(dst+4, xmm0, xmm1, xmm4, xmm2, xmm7);        movaps_m2r(dst[0], xmm0);        // first stage                SSEADDDIFF( xmm0, xmm1 );        SSEMULTADD( xmm2, xmm3, idct_sse_table+16 );        SSEMULTADD( xmm4, xmm5, idct_sse_table+32 );        SSEMULTADD( xmm6, xmm7, idct_sse_table+48 );          // third stage        SSEADDDIFF( xmm1, xmm3 );        // second stage        SSEADDDIFF( xmm6, xmm4 );        SSEADDDIFF( xmm7, xmm5 );          // fourth stage        SSEADDDIFF( xmm3, xmm4 );        SSEADDDIFF( xmm1, xmm5 );        STOREXMM( xmm3, src[7*8]);        STOREXMM( xmm4, src[0*8] );        STOREXMM( xmm1, src[4*8]);        STOREXMM( xmm5, src[3*8]);        SSEADDDIFF_t( xmm6, xmm7, xmm1 );        SSEADDDIFF_t( xmm0, xmm2, xmm1 );        // x7 = MUL_BY_ROOT_2_OVER_2(x7);        // x6 = MUL_BY_ROOT_2_OVER_2(x6);        movaps_m2r( idct_sse_root2_over2[0], xmm1 );        mulps_r2r( xmm1, xmm7 );        mulps_r2r( xmm1, xmm6 );          SSEADDDIFF_t(xmm2, xmm7, xmm1);        SSEADDDIFF_t(xmm0, xmm6, xmm1);        STOREXMM( xmm2, src[6*8] );                STOREXMM( xmm7, src[1*8] );        STOREXMM( xmm0, src[5*8] );        STOREXMM( xmm6, src[2*8] );    }    emms();}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -