📄 vp3dsp_sse2.cpp
字号:
paddsw(xmm0, xmm0); /* xmm0 = C. + C. */
paddsw(xmm7, xmm0); /* xmm0 = G. + C. */
movdqu(xmm6, I(6)); /* Write out op6 */
movdqu(xmm5, I(5)); /* Write out op5 */
movdqu(xmm7, I(7)); /* Write out op7 */
movdqu(xmm0, I(0)); /* Write out op0 */
/*
 * Transpose section.
 *
 * Reads the eight rows I(0)..I(7), interleaves them with the punpck*
 * word/dword/qword unpack steps, and writes the result back to I(0)..I(7).
 * In the comments the rows are labelled a..h (I(0)=a ... I(7)=h) and the
 * digit is the element index within the row, listed from the highest
 * element to the lowest.  On exit I(k) holds hk gk fk ek dk ck bk ak.
 *
 * Operand order of the wrappers is (source, destination), as the per-line
 * comments show.  I(n) and the wrappers are defined outside this view.
 * I(6) is used as a temporary spill slot part-way through (see below).
 */
//transpose
movdqu(I(4), xmm4); /* xmm4=e7e6e5e4e3e2e1e0 */
movdqu(I(5), xmm0); /* xmm0=f7f6f5f4f3f2f1f0 */
movdqu(xmm4, xmm5); /* xmm5 = copy of row e */
punpcklwd(xmm0, xmm4); /* xmm4=f3e3f2e2f1e1f0e0 */
punpckhwd(xmm0, xmm5); /* xmm5=f7e7f6e6f5e5f4e4 */
movdqu(I(6), xmm6); /* xmm6=g7g6g5g4g3g2g1g0 */
movdqu(I(7), xmm0); /* xmm0=h7h6h5h4h3h2h1h0 */
movdqu(xmm6, xmm7); /* xmm7 = copy of row g */
punpcklwd(xmm0, xmm6); /* xmm6=h3g3h2g2h1g1h0g0 */
punpckhwd(xmm0, xmm7); /* xmm7=h7g7h6g6h5g5h4g4 */
movdqu(xmm4, xmm3); /* xmm3 = copy of f3e3f2e2f1e1f0e0 */
punpckldq(xmm6, xmm4); /* xmm4=h1g1f1e1h0g0f0e0 */
punpckhdq(xmm6, xmm3); /* xmm3=h3g3f3e3h2g2f2e2 */
movdqu(xmm3, I(6)); /* spill h3g3f3e3h2g2f2e2 to I(6); row g was already loaded above */
/* Free xmm6 */
movdqu(xmm5, xmm6); /* xmm6 = copy of f7e7f6e6f5e5f4e4 */
punpckldq(xmm7, xmm5); /* xmm5=h5g5f5e5h4g4f4e4 */
punpckhdq(xmm7, xmm6); /* xmm6=h7g7f7e7h6g6f6e6 */
movdqu(I(0), xmm0); /* xmm0=a7a6a5a4a3a2a1a0 */
/* Free xmm7 */
movdqu(I(1), xmm1); /* xmm1=b7b6b5b4b3b2b1b0 */
movdqu(xmm0, xmm7); /* xmm7 = copy of row a */
punpcklwd(xmm1, xmm0); /* xmm0=b3a3b2a2b1a1b0a0 */
punpckhwd(xmm1, xmm7); /* xmm7=b7a7b6a6b5a5b4a4 */
/* Free xmm1 */
movdqu(I(2), xmm2); /* xmm2=c7c6c5c4c3c2c1c0 */
movdqu(I(3), xmm3); /* xmm3=d7d6d5d4d3d2d1d0 */
movdqu(xmm2, xmm1); /* xmm1 = copy of row c */
punpcklwd(xmm3, xmm2); /* xmm2=d3c3d2c2d1c1d0c0 */
punpckhwd(xmm3, xmm1); /* xmm1=d7c7d6c6d5c5d4c4 */
movdqu(xmm0, xmm3); /* xmm3 = copy of b3a3b2a2b1a1b0a0 */
punpckldq(xmm2, xmm0); /* xmm0=d1c1b1a1d0c0b0a0 */
punpckhdq(xmm2, xmm3); /* xmm3=d3c3b3a3d2c2b2a2 */
/* Free xmm2 */
movdqu(xmm7, xmm2); /* xmm2 = copy of b7a7b6a6b5a5b4a4 */
punpckldq(xmm1, xmm2); /* xmm2=d5c5b5a5d4c4b4a4 */
punpckhdq(xmm1, xmm7); /* xmm7=d7c7b7a7d6c6b6a6 */
movdqu(xmm0, xmm1); /* xmm1 = copy of d1c1b1a1d0c0b0a0 */
punpcklqdq(xmm4, xmm0); /* xmm0=h0g0f0e0d0c0b0a0 */
punpckhqdq(xmm4, xmm1); /* xmm1=h1g1f1e1d1c1b1a1 */
movdqu(xmm0, I(0)); /* save I(0) */
movdqu(xmm1, I(1)); /* save I(1) */
movdqu(I(6), xmm0); /* reload the spilled h3g3f3e3h2g2f2e2 */
movdqu(xmm3, xmm1); /* xmm1 = copy of d3c3b3a3d2c2b2a2 */
punpcklqdq(xmm0, xmm1); /* xmm1=h2g2f2e2d2c2b2a2 */
punpckhqdq(xmm0, xmm3); /* xmm3=h3g3f3e3d3c3b3a3 */
movdqu(xmm2, xmm4); /* xmm4 = copy of d5c5b5a5d4c4b4a4 */
punpcklqdq(xmm5, xmm4); /* xmm4=h4g4f4e4d4c4b4a4 */
punpckhqdq(xmm5, xmm2); /* xmm2=h5g5f5e5d5c5b5a5 */
movdqu(xmm1, I(2)); /* save I(2) */
movdqu(xmm3, I(3)); /* save I(3) */
movdqu(xmm4, I(4)); /* save I(4) */
movdqu(xmm2, I(5)); /* save I(5) */
movdqu(xmm7, xmm5); /* xmm5 = copy of d7c7b7a7d6c6b6a6 */
punpcklqdq(xmm6, xmm5); /* xmm5=h6g6f6e6d6c6b6a6 */
punpckhqdq(xmm6, xmm7); /* xmm7=h7g7f7e7d7c7b7a7 */
movdqu(xmm5, I(6)); /* save I(6); the spill slot is no longer needed */
movdqu(xmm7, I(7)); /* save I(7) */
/* End of Transpose Macro */
/*
 * Column IDCT section.
 *
 * Reads the eight rows I(0)..I(7), combines them with the constants
 * C(1)..C(7), adds the constant Eight, arithmetic-shifts right by 4 and
 * writes the eight result rows to O(0)..O(7).
 *
 * Conventions used by the per-line comments:
 *   - operand order of the wrappers is (source, destination);
 *   - iN is row I(N), cN is constant C(N);
 *   - for c1..c5 the pmulhw result is annotated as "c * i - i" (or
 *     "(c - 1) * i"), and a following paddw adds i back to give c * i;
 *     c6 and c7 products are used without that correction;
 *   - a trailing dot (C., D., A.., ...) marks a later stage of the same
 *     intermediate value.
 * I(1) and I(2) are overwritten as scratch storage for C. and D.
 * I(), C(), Eight and the wrappers are defined outside this view.
 */
unsigned char *output_data_bytes = (unsigned char *)output;
/* O(i): byte address of output row i, rows are 16 bytes apart. */
/* NOTE(review): the macro argument is not parenthesised; only plain
 * integer literals are passed to it in this section. */
#define O(i) (output_data_bytes + 16 * i)
/* NOTE(review): the line comment below ends in a backslash.  If nothing
 * follows that backslash in the real file, C/C++ line splicing joins the
 * next line onto the comment, so the load of I(3) into xmm2 would be
 * commented out and xmm2 would still hold the value left by the preceding
 * code.  Confirm against the compiled output and drop the backslash if
 * the load is meant to execute. */
//#define SSE2_Column_IDCT() { \
movdqu(I(3), xmm2); /* xmm2 = i3 */
movdqu(C(3), xmm6); /* xmm6 = c3 */
movdqu(xmm2, xmm4); /* xmm4 = i3 */
movdqu(I(5), xmm7); /* xmm7 = i5 */
pmulhw(xmm6, xmm4); /* xmm4 = c3 * i3 - i3 */
movdqu(C(5), xmm1); /* xmm1 = c5 */
pmulhw(xmm7, xmm6); /* xmm6 = c3 * i5 - i5 */
movdqu(xmm1, xmm5); /* xmm5 = c5 */
pmulhw(xmm2, xmm1); /* xmm1 = c5 * i3 - i3 */
movdqu(I(1), xmm3); /* xmm3 = i1 */
pmulhw(xmm7, xmm5); /* xmm5 = c5 * i5 - i5 */
movdqu(C(1), xmm0); /* xmm0 = c1 */
/* all registers are in use */
paddw(xmm2, xmm4); /* xmm4 = c3 * i3 */
paddw(xmm7, xmm6); /* xmm6 = c3 * i5 */
paddw(xmm1, xmm2); /* xmm2 = c5 * i3 */
movdqu(I(7), xmm1); /* xmm1 = i7 */
paddw(xmm5, xmm7); /* xmm7 = c5 * i5 */
movdqu(xmm0, xmm5); /* xmm5 = c1 */
pmulhw(xmm3, xmm0); /* xmm0 = c1 * i1 - i1 */
paddsw(xmm7, xmm4); /* xmm4 = c3 * i3 + c5 * i5 = C */
pmulhw(xmm1, xmm5); /* xmm5 = c1 * i7 - i7 */
movdqu(C(7), xmm7); /* xmm7 = c7 */
psubsw(xmm2, xmm6); /* xmm6 = c3 * i5 - c5 * i3 = D */
paddw(xmm3, xmm0); /* xmm0 = c1 * i1 */
pmulhw(xmm7, xmm3); /* xmm3 = c7 * i1 */
movdqu(I(2), xmm2); /* xmm2 = i2 */
pmulhw(xmm1, xmm7); /* xmm7 = c7 * i7 */
paddw(xmm1, xmm5); /* xmm5 = c1 * i7 */
movdqu(xmm2, xmm1); /* xmm1 = i2 */
pmulhw(C(2), xmm2); /* xmm2 = i2 * c2 - i2 */
psubsw(xmm5, xmm3); /* xmm3 = c7 * i1 - c1 * i7 = B */
movdqu(I(6), xmm5); /* xmm5 = i6 */
paddsw(xmm7, xmm0); /* xmm0 = c1 * i1 + c7 * i7 = A */
movdqu(xmm5, xmm7); /* xmm7 = i6 */
psubsw(xmm4, xmm0); /* xmm0 = A - C */
pmulhw(C(2), xmm5); /* xmm5 = c2 * i6 - i6 */
paddw(xmm1, xmm2); /* xmm2 = i2 * c2 */
pmulhw(C(6), xmm1); /* xmm1 = c6 * i2 */
paddsw(xmm4, xmm4); /* xmm4 = C + C */
paddsw(xmm0, xmm4); /* xmm4 = A + C = C. */
psubsw(xmm6, xmm3); /* xmm3 = B - D */
paddw(xmm7, xmm5); /* xmm5 = c2 * i6 */
paddsw(xmm6, xmm6); /* xmm6 = D + D */
pmulhw(C(6), xmm7); /* xmm7 = c6 * i6 */
paddsw(xmm3, xmm6); /* xmm6 = B + D = D. */
movdqu(xmm4, I(1)); /* Save C. at I(1) (i1 is no longer needed) */
psubsw(xmm5, xmm1); /* xmm1 = c6 * i2 - c2 * i6 = H */
movdqu(C(4), xmm4); /* xmm4 = c4 */
movdqu(xmm3, xmm5); /* xmm5 = B - D */
pmulhw(xmm4, xmm3); /* xmm3 = ( c4 -1 ) * ( B - D ) */
paddsw(xmm2, xmm7); /* xmm7 = c2 * i2 + c6 * i6 = G */
movdqu(xmm6, I(2)); /* Save D. at I(2) (i2 is no longer needed) */
movdqu(xmm0, xmm2); /* xmm2 = A - C */
movdqu(I(0), xmm6); /* xmm6 = i0 */
pmulhw(xmm4, xmm0); /* xmm0 = ( c4 - 1 ) * ( A - C ) */
paddw(xmm3, xmm5); /* xmm5 = c4 * ( B - D ) = B. */
movdqu(I(4), xmm3); /* xmm3 = i4 */
psubsw(xmm1, xmm5); /* xmm5 = B. - H = B.. */
paddw(xmm0, xmm2); /* xmm2 = c4 * ( A - C) = A. */
psubsw(xmm3, xmm6); /* xmm6 = i0 - i4 */
movdqu(xmm6, xmm0); /* xmm0 = i0 - i4 */
pmulhw(xmm4, xmm6); /* xmm6 = (c4 - 1) * (i0 - i4) */
paddsw(xmm3, xmm3); /* xmm3 = i4 + i4 */
paddsw(xmm1, xmm1); /* xmm1 = H + H */
paddsw(xmm0, xmm3); /* xmm3 = i0 + i4 */
paddsw(xmm5, xmm1); /* xmm1 = B. + H = H. */
pmulhw(xmm3, xmm4); /* xmm4 = ( c4 - 1 ) * ( i0 + i4 ) */
paddw(xmm0, xmm6); /* xmm6 = c4 * ( i0 - i4 ) = F */
psubsw(xmm2, xmm6); /* xmm6 = F - A. = F. */
paddsw(xmm2, xmm2); /* xmm2 = A. + A. */
movdqu(I(1), xmm0); /* Load C. from I(1) */
paddsw(xmm6, xmm2); /* xmm2 = F + A. = A.. */
paddw(xmm3, xmm4); /* xmm4 = c4 * ( i0 + i4 ) = E */
psubsw(xmm1, xmm2); /* xmm2 = A.. - H. = R2 */
paddsw(Eight, xmm2); /* Adjust R2 and R1 before shifting */
paddsw(xmm1, xmm1); /* xmm1 = H. + H. */
paddsw(xmm2, xmm1); /* xmm1 = A.. + H. = R1 (the Eight adjustment carries over from xmm2) */
psraw(4, xmm2); /* xmm2 = op2 */
psubsw(xmm7, xmm4); /* xmm4 = E - G = E. */
psraw(4, xmm1); /* xmm1 = op1 */
movdqu(I(2), xmm3); /* Load D. from I(2) */
paddsw(xmm7, xmm7); /* xmm7 = G + G */
movdqu(xmm2, O(2)); /* Write out op2 */
paddsw(xmm4, xmm7); /* xmm7 = E + G = G. */
movdqu(xmm1, O(1)); /* Write out op1 */
psubsw(xmm3, xmm4); /* xmm4 = E. - D. = R4 */
paddsw(Eight, xmm4); /* Adjust R4 and R3 before shifting */
paddsw(xmm3, xmm3); /* xmm3 = D. + D. */
paddsw(xmm4, xmm3); /* xmm3 = E. + D. = R3 */
psraw(4, xmm4); /* xmm4 = op4 */
psubsw(xmm5, xmm6); /* xmm6 = F. - B..= R6 */
psraw(4, xmm3); /* xmm3 = op3 */
paddsw(Eight, xmm6); /* Adjust R6 and R5 before shifting */
paddsw(xmm5, xmm5); /* xmm5 = B.. + B.. */
paddsw(xmm6, xmm5); /* xmm5 = F. + B.. = R5 */
psraw(4, xmm6); /* xmm6 = op6 */
movdqu(xmm4, O(4)); /* Write out op4 */
psraw(4, xmm5); /* xmm5 = op5 */
movdqu(xmm3, O(3)); /* Write out op3 */
psubsw(xmm0, xmm7); /* xmm7 = G. - C. = R7 */
paddsw(Eight, xmm7); /* Adjust R7 and R0 before shifting */
paddsw(xmm0, xmm0); /* xmm0 = C. + C. */
paddsw(xmm7, xmm0); /* xmm0 = G. + C. = R0 */
psraw(4, xmm7); /* xmm7 = op7 */
movdqu(xmm6, O(6)); /* Write out op6 */
psraw(4, xmm0); /* xmm0 = op0 */
movdqu(xmm5, O(5)); /* Write out op5 */
movdqu(xmm7, O(7)); /* Write out op7 */
movdqu(xmm0, O(0)); /* Write out op0 */
/* End of SSE2_Column_IDCT macro */
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -