⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 vp3dsp_sse2.cpp

📁 从FFMPEG转换而来的H264解码程序,VC下编译..
💻 CPP
📖 第 1 页 / 共 3 页
字号:
    paddsw(xmm0, xmm0);     /* xmm0 = C. + C. */

    paddsw(xmm7, xmm0);     /* xmm0 = G. + C. */

    movdqu(xmm6, I(6));    /* Write out op6 */

    movdqu(xmm5, I(5));    /* Write out op5 */
    movdqu(xmm7, I(7));    /* Write out op7 */

    movdqu(xmm0, I(0));    /* Write out op0 */


    //transpose
    movdqu(I(4), xmm4);    /* xmm4=e7e6e5e4e3e2e1e0 */
    movdqu(I(5), xmm0);    /* xmm0=f7f6f5f4f3f2f1f0 */

    movdqu(xmm4, xmm5);     /* make a copy */
    punpcklwd(xmm0, xmm4);  /* xmm4=f3e3f2e2f1e1f0e0 */

    punpckhwd(xmm0, xmm5);  /* xmm5=f7e7f6e6f5e5f4e4 */
    movdqu(I(6), xmm6);    /* xmm6=g7g6g5g4g3g2g1g0 */

    movdqu(I(7), xmm0);    /* xmm0=h7h6h5h4h3h2h1h0 */
    movdqu(xmm6, xmm7);     /* make a copy */

    punpcklwd(xmm0, xmm6);  /* xmm6=h3g3h2g2h1g1h0g0 */
    punpckhwd(xmm0, xmm7);  /* xmm7=h7g7h6g6h5g5h4g4 */

    movdqu(xmm4, xmm3);     /* make a copy */
    punpckldq(xmm6, xmm4);  /* xmm4=h1g1f1e1h0g0f0e0 */

    punpckhdq(xmm6, xmm3);  /* xmm3=h3g3f3e3h2g2f2e2 */
    movdqu(xmm3, I(6));    /* save h3g3f3e3h2g2f2e2 */
    /* Free xmm6 */
    movdqu(xmm5, xmm6);     /* make a copy */
    punpckldq(xmm7, xmm5);  /* xmm5=h5g5f5e5h4g4f4e4 */

    punpckhdq(xmm7, xmm6);  /* xmm6=h7g7f7e7h6g6f6e6 */
    movdqu(I(0), xmm0);    /* xmm0=a7a6a5a4a3a2a1a0 */
    /* Free xmm7 */
    movdqu(I(1), xmm1);    /* xmm1=b7b6b5b4b3b2b1b0 */
    movdqu(xmm0, xmm7);     /* make a copy */

    punpcklwd(xmm1, xmm0);  /* xmm0=b3a3b2a2b1a1b0a0 */
    punpckhwd(xmm1, xmm7);  /* xmm7=b7a7b6a6b5a5b4a4 */
    /* Free xmm1 */
    movdqu(I(2), xmm2);    /* xmm2=c7c6c5c4c3c2c1c0 */
    movdqu(I(3), xmm3);    /* xmm3=d7d6d5d4d3d2d1d0 */

    movdqu(xmm2, xmm1);     /* make a copy */
    punpcklwd(xmm3, xmm2);  /* xmm2=d3c3d2c2d1c1d0c0 */

    punpckhwd(xmm3, xmm1);  /* xmm1=d7c7d6c6d5c5d4c4 */
    movdqu(xmm0, xmm3);     /* make a copy        */

    punpckldq(xmm2, xmm0);  /* xmm0=d1c1b1a1d0c0b0a0 */
    punpckhdq(xmm2, xmm3);  /* xmm3=d3c3b3a3d2c2b2a2 */
    /* Free xmm2 */
    movdqu(xmm7, xmm2);     /* make a copy */
    punpckldq(xmm1, xmm2);  /* xmm2=d5c5b5a5d4c4b4a4 */

    punpckhdq(xmm1, xmm7);  /* xmm7=d7c7b7a7d6c6b6a6 */
    movdqu(xmm0, xmm1);     /* make a copy */

    punpcklqdq(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */
    punpckhqdq(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */

    movdqu(xmm0, I(0));    /* save I(0) */
    movdqu(xmm1, I(1));    /* save I(1) */

    movdqu(I(6), xmm0);    /* load h3g3f3e3h2g2f2e2 */
    movdqu(xmm3, xmm1);     /* make a copy */

    punpcklqdq(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */
    punpckhqdq(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */

    movdqu(xmm2, xmm4);     /* make a copy */
    punpcklqdq(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */

    punpckhqdq(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */
    movdqu(xmm1, I(2));    /* save I(2) */

    movdqu(xmm3, I(3));    /* save I(3) */
    movdqu(xmm4, I(4));    /* save I(4) */

    movdqu(xmm2, I(5));    /* save I(5) */
    movdqu(xmm7, xmm5);     /* make a copy */

    punpcklqdq(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */
    punpckhqdq(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */

    movdqu(xmm5, I(6));    /* save I(6) */
    movdqu(xmm7, I(7));    /* save I(7) */

 /* End of Transpose Macro */

    unsigned char *output_data_bytes = (unsigned char *)output;
#define O(i) (output_data_bytes + 16 * i)

//#define SSE2_Column_IDCT() {        \

    movdqu(I(3), xmm2);     /* xmm2 = i3 */
    movdqu(C(3), xmm6);     /* xmm6 = c3 */

    movdqu(xmm2, xmm4);      /* xmm4 = i3 */
    movdqu(I(5), xmm7);     /* xmm7 = i5 */

    pmulhw(xmm6, xmm4);      /* xmm4 = c3 * i3 - i3 */
    movdqu(C(5), xmm1);     /* xmm1 = c5 */

    pmulhw(xmm7, xmm6);      /* xmm6 = c3 * i5 - i5 */
    movdqu(xmm1, xmm5);      /* xmm5 = c5 */

    pmulhw(xmm2, xmm1);      /* xmm1 = c5 * i3 - i3 */
    movdqu(I(1), xmm3);     /* xmm3 = i1 */

    pmulhw(xmm7, xmm5);      /* xmm5 = c5 * i5 - i5 */
    movdqu(C(1), xmm0);     /* xmm0 = c1 */

    /* all registers are in use */

    paddw(xmm2, xmm4);       /* xmm4 = c3 * i3 */
    paddw(xmm7, xmm6);       /* xmm6 = c3 * i5 */

    paddw(xmm1, xmm2);       /* xmm2 = c5 * i3 */
    movdqu(I(7), xmm1);     /* xmm1 = i7 */

    paddw(xmm5, xmm7);       /* xmm7 = c5 * i5 */
    movdqu(xmm0, xmm5);      /* xmm5 = c1 */

    pmulhw(xmm3, xmm0);      /* xmm0 = c1 * i1 - i1 */
    paddsw(xmm7, xmm4);      /* xmm4 = c3 * i3 + c5 * i5 = C */

    pmulhw(xmm1, xmm5);      /* xmm5 = c1 * i7 - i7 */
    movdqu(C(7), xmm7);     /* xmm7 = c7 */

    psubsw(xmm2, xmm6);      /* xmm6 = c3 * i5 - c5 * i3 = D */
    paddw(xmm3, xmm0);       /* xmm0 = c1 * i1 */

    pmulhw(xmm7, xmm3);      /* xmm3 = c7 * i1 */
    movdqu(I(2), xmm2);     /* xmm2 = i2 */

    pmulhw(xmm1, xmm7);      /* xmm7 = c7 * i7 */
    paddw(xmm1, xmm5);       /* xmm5 = c1 * i7 */

    movdqu(xmm2, xmm1);      /* xmm1 = i2 */
    pmulhw(C(2), xmm2);     /* xmm2 = i2 * c2 -i2 */

    psubsw(xmm5, xmm3);      /* xmm3 = c7 * i1 - c1 * i7 = B */
    movdqu(I(6), xmm5);     /* xmm5 = i6 */

    paddsw(xmm7, xmm0);      /* xmm0 = c1 * i1 + c7 * i7 = A */
    movdqu(xmm5, xmm7);      /* xmm7 = i6 */

    psubsw(xmm4, xmm0);      /* xmm0 = A - C */
    pmulhw(C(2), xmm5);     /* xmm5 = c2 * i6 - i6 */

    paddw(xmm1, xmm2);       /* xmm2 = i2 * c2 */
    pmulhw(C(6), xmm1);     /* xmm1 = c6 * i2 */

    paddsw(xmm4, xmm4);      /* xmm4 = C + C */
    paddsw(xmm0, xmm4);      /* xmm4 = A + C = C. */

    psubsw(xmm6, xmm3);      /* xmm3 = B - D */
    paddw(xmm7, xmm5);       /* xmm5 = c2 * i6 */

    paddsw(xmm6, xmm6);      /* xmm6 = D + D */
    pmulhw(C(6), xmm7);     /* xmm7 = c6 * i6 */

    paddsw(xmm3, xmm6);      /* xmm6 = B + D = D. */
    movdqu(xmm4, I(1));     /* Save C. at I(1) */

    psubsw(xmm5, xmm1);      /* xmm1 = c6 * i2 - c2 * i6 = H */
    movdqu(C(4), xmm4);     /* xmm4 = c4 */

    movdqu(xmm3, xmm5);      /* xmm5 = B - D */
    pmulhw(xmm4, xmm3);      /* xmm3 = ( c4 -1 ) * ( B - D ) */

    paddsw(xmm2, xmm7);      /* xmm7 = c2 * i2 + c6 * i6 = G */
    movdqu(xmm6, I(2));     /* Save D. at I(2) */

    movdqu(xmm0, xmm2);      /* xmm2 = A - C */
    movdqu(I(0), xmm6);     /* xmm6 = i0 */

    pmulhw(xmm4, xmm0);      /* xmm0 = ( c4 - 1 ) * ( A - C ) = A. */
    paddw(xmm3, xmm5);       /* xmm5 = c4 * ( B - D ) = B. */

    movdqu(I(4), xmm3);     /* xmm3 = i4 */
    psubsw(xmm1, xmm5);      /* xmm5 = B. - H = B.. */

    paddw(xmm0, xmm2);       /* xmm2 = c4 * ( A - C) = A. */
    psubsw(xmm3, xmm6);      /* xmm6 = i0 - i4 */

    movdqu(xmm6, xmm0);      /* xmm0 = i0 - i4 */
    pmulhw(xmm4, xmm6);      /* xmm6 = (c4 - 1) * (i0 - i4) = F */

    paddsw(xmm3, xmm3);      /* xmm3 = i4 + i4 */
    paddsw(xmm1, xmm1);      /* xmm1 = H + H */

    paddsw(xmm0, xmm3);      /* xmm3 = i0 + i4 */
    paddsw(xmm5, xmm1);      /* xmm1 = B. + H = H. */

    pmulhw(xmm3, xmm4);      /* xmm4 = ( c4 - 1 ) * ( i0 + i4 )  */
    paddw(xmm0, xmm6);       /* xmm6 = c4 * ( i0 - i4 ) */

    psubsw(xmm2, xmm6);      /* xmm6 = F - A. = F. */
    paddsw(xmm2, xmm2);      /* xmm2 = A. + A. */

    movdqu(I(1), xmm0);     /* Load        C. from I(1) */
    paddsw(xmm6, xmm2);      /* xmm2 = F + A. = A.. */

    paddw(xmm3, xmm4);       /* xmm4 = c4 * ( i0 + i4 ) = E */
    psubsw(xmm1, xmm2);      /* xmm2 = A.. - H. = R2 */

    paddsw(Eight, xmm2);    /* Adjust R2 and R1 before shifting */
    paddsw(xmm1, xmm1);      /* xmm1 = H. + H. */

    paddsw(xmm2, xmm1);      /* xmm1 = A.. + H. = R1 */
    psraw(4, xmm2);          /* xmm2 = op2 */

    psubsw(xmm7, xmm4);      /* xmm4 = E - G = E. */
    psraw(4, xmm1);          /* xmm1 = op1 */

    movdqu(I(2), xmm3);     /* Load D. from I(2) */
    paddsw(xmm7, xmm7);      /* xmm7 = G + G */

    movdqu(xmm2, O(2));     /* Write out op2 */
    paddsw(xmm4, xmm7);      /* xmm7 = E + G = G. */

    movdqu(xmm1, O(1));     /* Write out op1 */
    psubsw(xmm3, xmm4);      /* xmm4 = E. - D. = R4 */

    paddsw(Eight, xmm4);    /* Adjust R4 and R3 before shifting */
    paddsw(xmm3, xmm3);      /* xmm3 = D. + D. */

    paddsw(xmm4, xmm3);      /* xmm3 = E. + D. = R3 */
    psraw(4, xmm4);          /* xmm4 = op4 */

    psubsw(xmm5, xmm6);      /* xmm6 = F. - B..= R6 */
    psraw(4, xmm3);          /* xmm3 = op3 */

    paddsw(Eight, xmm6);    /* Adjust R6 and R5 before shifting */
    paddsw(xmm5, xmm5);      /* xmm5 = B.. + B.. */

    paddsw(xmm6, xmm5);      /* xmm5 = F. + B.. = R5 */
    psraw(4, xmm6);          /* xmm6 = op6 */

    movdqu(xmm4, O(4));     /* Write out op4 */
    psraw(4, xmm5);          /* xmm5 = op5 */

    movdqu(xmm3, O(3));     /* Write out op3 */
    psubsw(xmm0, xmm7);      /* xmm7 = G. - C. = R7 */

    paddsw(Eight, xmm7);    /* Adjust R7 and R0 before shifting */
    paddsw(xmm0, xmm0);      /* xmm0 = C. + C. */

    paddsw(xmm7, xmm0);      /* xmm0 = G. + C. */
    psraw(4, xmm7);          /* xmm7 = op7 */

    movdqu(xmm6, O(6));     /* Write out op6 */
    psraw(4, xmm0);          /* xmm0 = op0 */

    movdqu(xmm5, O(5));     /* Write out op5 */
    movdqu(xmm7, O(7));     /* Write out op7 */

    movdqu(xmm0, O(0));     /* Write out op0 */

 /* End of SSE2_Column_IDCT macro */
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -