fdct_x86.c

来自「Motion JPEG编解码器源代码」· C语言 代码 · 共 471 行

C
471
字号
#include <config.h>#include <math.h>#include "mjpeg_types.h"#include "mmx.h"#define ALIGN_PTR(x,a) ((void *)( (((size_t)(x))+(a)-1)&(-(a))))#define SHUFFLEMAP(A,B,C,D) ((A)*1+(B)*4+(C)*16+(D)*64)#define NC_COS6      0.382683432365089771728459984030399//cos(6*pi/16)#define NC_R_SQRT2   0.707106781186547524400844362104849// 1/sqrt(2)#define NC_COS0SQRT2 1.0                                //cos(0*pi/16)*sqrt(2)#define NC_COS1SQRT2 1.38703984532214746182161919156644 //cos(1*pi/16)*sqrt(2)#define NC_COS2SQRT2 1.30656296487637652785664317342719 //cos(2*pi/16)*sqrt(2)#define NC_COS3SQRT2 1.17587560241935871697446710461126 //cos(3*pi/16)*sqrt(2)#define NC_COS4SQRT2 1.0                                //cos(4*pi/16)*sqrt(2)#define NC_COS5SQRT2 0.785694958387102181277897367657217//cos(5*pi/16)*sqrt(2)#define NC_COS6SQRT2 0.541196100146196984399723205366389//cos(6*pi/16)*sqrt(2)#define NC_COS7SQRT2 0.275899379282943012335957563669373//cos(7*pi/16)*sqrt(2)void init_fdct_sse( void );void fdct_sse(int16_t *block);static float aanscales[64];#define SSECONST(n,x) static float n[4] __attribute__ ((aligned (16))) = { x, x, x, x }SSECONST(fdct_sse_r_sqrt2,  NC_R_SQRT2);SSECONST(fdct_sse_cos6,     NC_COS6);SSECONST(fdct_sse_cos6sqrt2,NC_COS6SQRT2);SSECONST(fdct_sse_cos2sqrt2,NC_COS2SQRT2);void init_fdct_sse( void ){    int i, j;    static const double aansf[8] = {        1.0,           // sqrt(2) factor left out here...        NC_COS1SQRT2,        NC_COS2SQRT2,        NC_COS3SQRT2,        NC_COS4SQRT2,  // cos(4*pi/16) * sqrt(2) = 1.0 exactly        NC_COS5SQRT2,        NC_COS6SQRT2,        NC_COS7SQRT2    };    for (i = 0; i < 8; i++)        for (j = 0; j < 8; j++)            aanscales[(i << 3) + j] = 1.0 / (aansf[i] * aansf[j] * 8.0);}/* * Perform a floating point forward DCT on one block of samples. */// computes A'=A-B//          B'=A+B#define ADDDIFF(A,B) { double x=A; A-=B; B+=x; }#define MMXADDDIFF_Wt(A,B,C) \        movq_r2r (A,C);      \        psubw_r2r(B,A);      \        paddw_r2r(C,B);#define MMXADDDIFF_Dt(A,B,C) \        movq_r2r (A,C);      \        psubd_r2r(B,A);      \        paddd_r2r(C,B);#define SSEADDDIFF_t(A,B,C) \        movaps_r2r(A,C);    \        subps_r2r(B,A);     \        addps_r2r(C,B);#define SSEADDDIFF(x,y)  \        subps_r2r(y, x); \        addps_r2r(y, y); \        addps_r2r(x, y);#define MMXEXTEND_WD(S,DL,DH) \        movq_r2r(S, DL);      \        movq_r2r(S, DH);      \        psraw_i2r(16, S);     \        punpcklwd_r2r(S, DL); \        punpckhwd_r2r(S, DH);#define LOADROTATEff(src, t, x0, x1, x2, x3)           \        movaps_m2r((src)[ 0], x0); /* 0a 1a 2a 3a */   \        movaps_m2r((src)[ 8], t);  /* 0b 1b 2b 3b */   \        movaps_m2r((src)[16], x3); /* 0c 1c 2c 3c */   \        movaps_m2r((src)[24], x1); /* 0d 1d 2d 3d */   \                                                       \        /* mm0 = 0a 0b 1a 1b */                        \        /* mm2 = 2a 2b 3a 3b */                        \        movaps_r2r  (x0, x2);                          \        unpcklps_r2r(t,  x0);                          \        unpckhps_r2r(t,  x2);                          \                                                       \        /* mm3 = 0c 0d 1c 1d */                        \        /* mmt = 2c 2d 3c 3d */                        \        movaps_r2r  (x3, t);                           \        unpcklps_r2r(x1, x3);                          \        unpckhps_r2r(x1, t);                           \                                                       \        /* mm0 = 0a 0b 0c 0d */                        \        /* mm1 = 1a 1b 1c 1d */                        \        movaps_r2r (x3, x1);                           \        movhlps_r2r(x0, x1);                           \        movlhps_r2r(x3, x0);                           \                                                       \        /* mm2 = 2a 2b 2c 2d */                        \        /* mm3 = 3a 3b 3c 3d */                        \        movaps_r2r ( t, x3);                           \        movhlps_r2r(x2, x3);                           \        movlhps_r2r( t, x2);#define STOREXMM(x0, offs)               \        mulps_m2r(aanptr[offs], x0);     \        cvtps2pi_r2r(x0,  mm0);          \        movhlps_r2r (x0,  x0);           \        cvtps2pi_r2r(x0,  mm1);          \        packssdw_r2r(mm1, mm0);          \        movq_r2m    (mm0, blkptr[offs]);#if 0void fdct_basis(int16_t *block){    double tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;    double tmp10, tmp11, tmp12, tmp13;    double z1, z2, z3, z4, z5, z11, z13;    double *dataptr;    double data[64];    int16_t *blkptr;    int i;    // each iteration basically performs the following code:    blkptr = block;    dataptr = data;    for (i = 0; i < 8; i++)    {        tmp4 = blkptr[0] + blkptr[7];        tmp0 = blkptr[0] - blkptr[7];        tmp5 = blkptr[1] + blkptr[6];        tmp1 = blkptr[1] - blkptr[6];        tmp6 = blkptr[2] + blkptr[5];        tmp2 = blkptr[2] - blkptr[5];        tmp7 = blkptr[3] + blkptr[4];        tmp3 = blkptr[3] - blkptr[4];        /* Even part */        ADDDIFF(tmp4,tmp7);        ADDDIFF(tmp5,tmp6);        ADDDIFF(tmp7,tmp6);        dataptr[0] = tmp6; /* phase 3 */        dataptr[4] = tmp7;        tmp5+=tmp4;        tmp5 = tmp5 * ((double) NC_R_SQRT2); /* c4 */        ADDDIFF(tmp4,tmp5);        dataptr[2] = tmp5;	/* phase 5 */        dataptr[6] = tmp4;        /* Odd part */        tmp3 = tmp3 + tmp2;	/* phase 2 */        tmp2 = tmp2 + tmp1;        tmp1 = tmp1 + tmp0;        /* The rotator is modified from fig 4-8 to avoid extra negations. */        z5 = (tmp3 - tmp1) * ((double) NC_COS6); /* c6 */        tmp3 = ((double) NC_COS6SQRT2) * tmp3 + z5; /* c2-c6 */        tmp1 = ((double) NC_COS2SQRT2) * tmp1 + z5; /* c2+c6 */        tmp2 = tmp2 * ((double) NC_R_SQRT2); /* c4 */        ADDDIFF(tmp0,tmp2);        ADDDIFF(tmp0,tmp3);        ADDDIFF(tmp2,tmp1);        dataptr[5] = tmp3;	/* phase 6 */        dataptr[3] = tmp0;        dataptr[1] = tmp1;        dataptr[7] = tmp2;        dataptr += 8;		/* advance pointer to next row */        blkptr += 8;    }}#endifvoid fdct_sse(int16_t *block){    float *dataptr,*aanptr;    float data[67];    int16_t *blkptr;    int i;    /* Pass 1: process rows. */    blkptr = block;    dataptr = ALIGN_PTR(data,16);    for (i = 0; i < 2; i++)    {        // mm0=0a,1a,2a,3a        // mm1=4a,5a,6a,7a        // mm2=0b,1b,2b,3b        // mm3=4b,5b,6b,7b        movq_m2r(blkptr[16], mm0);        movq_m2r(blkptr[20], mm1);        movq_m2r(blkptr[24], mm2);        movq_m2r(blkptr[28], mm3);        pshufw_r2ri(mm1, mm1, SHUFFLEMAP(3,2,1,0));        pshufw_r2ri(mm3, mm3, SHUFFLEMAP(3,2,1,0));        MMXADDDIFF_Wt(mm0, mm1, mm4);        MMXADDDIFF_Wt(mm2, mm3, mm5);        // mm1=4a,4b,5a,5b        // mm4=7a,7b,6a,6b        movq_r2r(mm1, mm4);        punpcklwd_r2r(mm3, mm1);        punpckhwd_r2r(mm3, mm4);        pshufw_r2ri(mm4, mm4, SHUFFLEMAP(2,3,0,1));        MMXADDDIFF_Wt(mm1, mm4, mm3);        // mm6 = 6a,6b        // mm7 = 7a,7b        MMXEXTEND_WD(mm4, mm7, mm6);        MMXADDDIFF_Dt(mm7, mm6, mm4);        cvtpi2ps_r2r( mm6, xmm2 );        cvtpi2ps_r2r( mm7, xmm3 );        movlps_r2m( xmm2, dataptr[0*8+2] );        movlps_r2m( xmm3, dataptr[4*8+2] );        // mm6 = 4a,4b        // mm7 = 5a,5b        MMXEXTEND_WD(mm1, mm6, mm7);        paddd_r2r( mm6, mm7 );        cvtpi2ps_r2r( mm6, xmm0 );        cvtpi2ps_r2r( mm7, xmm1 );        // mm0 = 0a,0b,1a,1b        // mm1 = 2a,2b,3a,3b        movq_r2r(mm0, mm1);        punpcklwd_r2r(mm2, mm0);        punpckhwd_r2r(mm2, mm1);        // mm4 = 0a,0b        // mm5 = 1a,1b        // mm6 = 2a,2b        // mm7 = 3a,3b        MMXEXTEND_WD(mm0, mm4, mm5);        MMXEXTEND_WD(mm1, mm6, mm7);        cvtpi2ps_r2r( mm4, xmm4 );        paddd_r2r( mm6, mm7 );        cvtpi2ps_r2r( mm7, xmm7 );        paddd_r2r( mm5, mm6 );        cvtpi2ps_r2r( mm6, xmm6 );        paddd_r2r( mm4, mm5 );        cvtpi2ps_r2r( mm5, xmm5 );        movlhps_r2r( xmm0, xmm0 );        movlhps_r2r( xmm1, xmm1 );        movlhps_r2r( xmm4, xmm4 );        movlhps_r2r( xmm5, xmm5 );        movlhps_r2r( xmm6, xmm6 );        movlhps_r2r( xmm7, xmm7 );        // mm0=0a,1a,2a,3a        // mm1=4a,5a,6a,7a        // mm2=0b,1b,2b,3b        // mm3=4b,5b,6b,7b        movq_m2r(blkptr[ 0], mm0);        movq_m2r(blkptr[ 4], mm1);        movq_m2r(blkptr[ 8], mm2);        movq_m2r(blkptr[12], mm3);        pshufw_r2ri(mm1, mm1, SHUFFLEMAP(3,2,1,0));        pshufw_r2ri(mm3, mm3, SHUFFLEMAP(3,2,1,0));        MMXADDDIFF_Wt(mm0, mm1, mm4);        MMXADDDIFF_Wt(mm2, mm3, mm5);        // mm1=4a,4b,5a,5b        // mm4=7a,7b,6a,6b        movq_r2r(mm1, mm4);        punpcklwd_r2r(mm3, mm1);        punpckhwd_r2r(mm3, mm4);        pshufw_r2ri(mm4, mm4, SHUFFLEMAP(2,3,0,1));        MMXADDDIFF_Wt(mm1, mm4, mm3);        // mm6 = 6a,6b        // mm7 = 7a,7b        MMXEXTEND_WD(mm4, mm7, mm6);        MMXADDDIFF_Dt(mm7, mm6, mm4);        cvtpi2ps_r2r( mm6, xmm2 );        cvtpi2ps_r2r( mm7, xmm3 );        movlps_r2m( xmm2, dataptr[0*8+0] );        movlps_r2m( xmm3, dataptr[4*8+0] );        // mm6 = 4a,4b        // mm7 = 5a,5b        MMXEXTEND_WD(mm1, mm6, mm7);        paddd_r2r( mm6, mm7 );        cvtpi2ps_r2r( mm6, xmm0 );        cvtpi2ps_r2r( mm7, xmm1 );        // mm0 = 0a,0b,1a,1b        // mm1 = 2a,2b,3a,3b        movq_r2r(mm0, mm1);        punpcklwd_r2r(mm2, mm0);        punpckhwd_r2r(mm2, mm1);        // mm4 = 0a,0b        // mm5 = 1a,1b        // mm6 = 2a,2b        // mm7 = 3a,3b        MMXEXTEND_WD(mm0, mm4, mm5);        MMXEXTEND_WD(mm1, mm6, mm7);        cvtpi2ps_r2r( mm4, xmm4 );        paddd_r2r( mm6, mm7 );        cvtpi2ps_r2r( mm7, xmm7 );        paddd_r2r( mm5, mm6 );        cvtpi2ps_r2r( mm6, xmm6 );        paddd_r2r( mm4, mm5 );        cvtpi2ps_r2r( mm5, xmm5 );        // z1 = (tmp11 + tmp10) * ((double) NC_R_SQRT2); /* c4 */        // dataptr[2] = tmp10 + z1;	/* phase 5 */        // dataptr[6] = tmp10 - z1;        mulps_m2r(fdct_sse_r_sqrt2[0],xmm1);        SSEADDDIFF_t(xmm0, xmm1, xmm2);        movaps_r2m( xmm0, dataptr[6*8] );        movaps_r2m( xmm1, dataptr[2*8] );                /* The rotator is modified from fig 4-8 to avoid extra negations. */        // z5 = (tmp7 - tmp5) * ((double) NC_COS6); /* c6 */        movaps_r2r( xmm7, xmm2 );        subps_r2r( xmm5, xmm2 );        mulps_m2r(fdct_sse_cos6[0], xmm2 );        // tmp7 = ((double) NC_COS6SQRT2) * tmp7 + z5; /* c2-c6 */        // tmp5 = ((double) NC_COS2SQRT2) * tmp5 + z5; /* c2+c6 */        // tmp6 = tmp6 * ((double) NC_R_SQRT2); /* c4 */        mulps_m2r(fdct_sse_cos6sqrt2[0], xmm7);        mulps_m2r(fdct_sse_cos2sqrt2[0], xmm5);        mulps_m2r(fdct_sse_r_sqrt2[0], xmm6 );        addps_r2r(xmm2, xmm7);        addps_r2r(xmm2, xmm5);        // z16 = tmp4 + tmp6;		/* phase 5 */        // z14 = tmp4 - tmp6;        SSEADDDIFF_t(xmm4, xmm6, xmm2);        SSEADDDIFF_t(xmm4, xmm7, xmm2);        SSEADDDIFF_t(xmm6, xmm5, xmm2);                // dataptr[5] = z14 + tmp7;	/* phase 6 */        // dataptr[3] = z14 - tmp7;        // dataptr[1] = z16 + tmp5;        // dataptr[7] = z16 - tmp5;        movaps_r2m( xmm7, dataptr[5*8] );        movaps_r2m( xmm4, dataptr[3*8] );        movaps_r2m( xmm5, dataptr[1*8] );        movaps_r2m( xmm6, dataptr[7*8] );        dataptr += 4;		/* advance pointer to next row */        blkptr += 8*4;    }    /* Pass 2: process columns. */    dataptr = ALIGN_PTR(data,16);    aanptr  = aanscales;    blkptr = block;    for (i = 0; i < 2; i++)    {        LOADROTATEff(dataptr,   xmm4, xmm0, xmm1, xmm2, xmm3);        movaps_r2m(xmm0, dataptr[0]);        LOADROTATEff(dataptr+4, xmm0, xmm7, xmm6, xmm5, xmm4);        movaps_m2r(dataptr[0], xmm0);        SSEADDDIFF(xmm0,xmm4);        SSEADDDIFF(xmm1,xmm5);        SSEADDDIFF(xmm2,xmm6);        SSEADDDIFF(xmm3,xmm7);        /* Even part */        SSEADDDIFF(xmm4, xmm7);        SSEADDDIFF(xmm5, xmm6);        SSEADDDIFF(xmm7, xmm6);        STOREXMM(xmm6,0);        STOREXMM(xmm7,32);        addps_r2r(xmm4, xmm5);        mulps_m2r(fdct_sse_r_sqrt2[0],xmm5);                SSEADDDIFF_t(xmm4, xmm5, xmm6);        STOREXMM(xmm5,16);        STOREXMM(xmm4,48);        /* Odd part */        addps_r2r(xmm2, xmm3);        addps_r2r(xmm1, xmm2);        addps_r2r(xmm0, xmm1);        /* The rotator is modified from fig 4-8 to avoid extra negations. */        movaps_r2r(xmm3, xmm4);        subps_r2r(xmm1, xmm4);        mulps_m2r(fdct_sse_cos6[0], xmm4);        mulps_m2r(fdct_sse_cos6sqrt2[0], xmm3);        mulps_m2r(fdct_sse_cos2sqrt2[0], xmm1);        mulps_m2r(fdct_sse_r_sqrt2[0], xmm2);        addps_r2r(xmm4, xmm3);        addps_r2r(xmm4, xmm1);        SSEADDDIFF_t(xmm0, xmm2, xmm5);        SSEADDDIFF_t(xmm0, xmm3, xmm6);        STOREXMM(xmm0,24);        STOREXMM(xmm3,40);        SSEADDDIFF_t(xmm2, xmm1, xmm7);        STOREXMM(xmm2,56);        STOREXMM(xmm1, 8);        dataptr+=4*8;			/* advance pointer to next column */        aanptr+=4;        blkptr+=4;    }    emms();}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?