📄 mmxidct.cpp
字号:
movq (r3, r5 );// r3 = 33 32 31 30
psllq (r3, 48 );// r3 = 30 __ __ __
pand (r1, r0 );// r1 = __ 21 __ __
movq (edx+32, r7 );// write R2 = r7
por (r6, r3 );// r6 = 30 __ __ 06
movq (r7, M(1) );// r7 = __ __ FF __
por (r6, r1 );// r6 = 30 21 __ 06
movq (r1, eax+56);
pand (r7, r4 );// r7 = __ __ 15 __
pmullw (r1, ebx+56 );// r1 = 37 36 35 34
por (r7, r6 );// r7 = 30 21 15 06 = R3
pand (r0, M(1) );// r0 = __ __ 20 __
psrlq (r4, 32 );// r4 = __ __ 17 16
movq (edx+48, r7 );// write R3 = r7
movq (r6, r4 );// r6 = __ __ 17 16
movq (r7, M(3) );// r7 = FF __ __ __
pand (r4, r2 );// r4 = __ __ __ 16
movq (r3, M(1) );// r3 = __ __ FF __
pand (r7, r1 );// r7 = 37 __ __ __
pand (r3, r5 );// r3 = __ __ 31 __
por (r0, r4 );// r0 = __ __ 20 16
psllq (r3, 16 );// r3 = __ 31 __ __
por (r7, r0 );// r7 = 37 __ 20 16
movq (r4, M(2) );// r4 = __ FF __ __
por (r7, r3 );// r7 = 37 31 20 16 = R4
movq (r0, eax+80);
movq (r3, r4 );// r3 = __ __ FF __
pmullw (r0, ebx+80 );// r0 = 53 52 51 50
pand (r4, r5 );// r4 = __ 32 __ __
movq (edx+8, r7 );// write R4 = r7
por (r6, r4 );// r6 = __ 32 17 16
movq (r4, r3 );// r4 = __ FF __ __
psrlq (r6, 16 );// r6 = __ __ 32 17
movq (r7, r0 );// r7 = 53 52 51 50
pand (r4, r1 );// r4 = __ 36 __ __
psllq (r7, 48 );// r7 = 50 __ __ __
por (r6, r4 );// r6 = __ 36 32 17
movq (r4, eax+88);
por (r7, r6 );// r7 = 50 36 32 17 = R5
pmullw (r4, ebx+88 );// r4 = 57 56 55 54
psrlq (r3, 16 );// r3 = __ __ FF __
movq (edx+24, r7 );// write R5 = r7
pand (r3, r1 );// r3 = __ __ 35 __
psrlq (r5, 48 );// r5 = __ __ __ 33
pand (r1, r2 );// r1 = __ __ __ 34
movq (r6, eax+104);
por (r5, r3 );// r5 = __ __ 35 33
pmullw (r6, ebx+104 );// r6 = 67 66 65 64
psrlq (r0, 16 );// r0 = __ 53 52 51
movq (r7, r4 );// r7 = 57 56 55 54
movq (r3, r2 );// r3 = __ __ __ FF
psllq (r7, 48 );// r7 = 54 __ __ __
pand (r3, r0 );// r3 = __ __ __ 51
pxor (r0, r3 );// r0 = __ 53 52 __
psllq (r3, 32 );// r3 = __ 51 __ __
por ( r7, r5 );// r7 = 54 __ 35 33
movq (r5, r6 );// r5 = 67 66 65 64
pand (r6, M(1) );// r6 = __ __ 65 __
por (r7, r3 );// r7 = 54 51 35 33 = R6
psllq (r6, 32 );// r6 = 65 __ __ __
por (r0, r1 );// r0 = __ 53 52 34
movq (edx+40, r7 );// write R6 = r7
por (r0, r6 );// r0 = 65 53 52 34 = R7
movq (r7, eax+120);
movq (r6, r5 );// r6 = 67 66 65 64
pmullw (r7, ebx+120 );// r7 = 77 76 75 74
psrlq (r5, 32 );// r5 = __ __ 67 66
pand (r6, r2 );// r6 = __ __ __ 64
movq (r1, r5 );// r1 = __ __ 67 66
movq (edx+56, r0 );// write R7 = r0
pand (r1, r2 );// r1 = __ __ __ 66
movq (r0, eax+112);
movq (r3, r7 );// r3 = 77 76 75 74
pmullw (r0, ebx+112 );// r0 = 73 72 71 70
psllq (r3, 16 );// r3 = 76 75 74 __
pand (r7, M(3) );// r7 = 77 __ __ __
pxor (r5, r1 );// r5 = __ __ 67 __
por ( r6, r5 );// r6 = __ __ 67 64
movq (r5, r3 );// r5 = 76 75 74 __
pand (r5, M(3) );// r5 = 76 __ __ __
por (r7, r1 );// r7 = 77 __ __ 66
movq (r1, eax+96);
pxor (r3, r5 );// r3 = __ 75 74 __
pmullw (r1, ebx+96 );// r1 = 63 62 61 60
por (r7, r3 );// r7 = 77 75 74 66 = R15
por ( r6, r5 );// r6 = 76 __ 67 64
movq (r5, r0 );// r5 = 73 72 71 70
movq (edx+120, r7 );// store R15 = r7
psrlq (r5, 16 );// r5 = __ 73 72 71
pand (r5, M(2) );// r5 = __ 73 __ __
movq (r7, r0 );// r7 = 73 72 71 70
por ( r6, r5 );// r6 = 76 73 67 64 = R14
pand (r0, r2 );// r0 = __ __ __ 70
pxor (r7, r0 );// r7 = 73 72 71 __
psllq (r0, 32 );// r0 = __ 70 __ __
movq (edx+104, r6 );// write R14 = r6
psrlq (r4, 16 );// r4 = __ 57 56 55
movq (r5, eax+72);
psllq (r7, 16 );// r7 = 72 71 __ __
pmullw (r5, ebx+72 );// r5 = 47 46 45 44
movq (r6, r7 );// r6 = 72 71 __ __
movq (r3, M(2) );// r3 = __ FF __ __
psllq (r6, 16 );// r6 = 71 __ __ __
pand (r7, M(3) );// r7 = 72 __ __ __
pand (r3, r1 );// r3 = __ 62 __ __
por ( r7, r0 );// r7 = 72 70 __ __
movq (r0, r1 );// r0 = 63 62 61 60
pand (r1, M(3) );// r1 = 63 __ __ __
por (r6, r3 );// r6 = 71 62 __ __
movq (r3, r4 );// r3 = __ 57 56 55
psrlq (r1, 32 );// r1 = __ __ 63 __
pand (r3, r2 );// r3 = __ __ __ 55
por (r7, r1 );// r7 = 72 70 63 __
por ( r7, r3 );// r7 = 72 70 63 55 = R13
movq (r3, r4 );// r3 = __ 57 56 55
pand (r3, M(1) );// r3 = __ __ 56 __
movq (r1, r5 );// r1 = 47 46 45 44
movq (edx+88, r7 );// write R13 = r7
psrlq (r5, 48 );// r5 = __ __ __ 47
movq (r7, eax+64);
por (r6, r3 );// r6 = 71 62 56 __
pmullw (r7, ebx+64 );// r7 = 43 42 41 40
por (r6, r5 );// r6 = 71 62 56 47 = R12
pand (r4, M(2) );// r4 = __ 57 __ __
psllq (r0, 32 );// r0 = 61 60 __ __
movq (edx+72, r6 );// write R12 = r6
movq (r6, r0 );// r6 = 61 60 __ __
pand (r0, M(3) );// r0 = 61 __ __ __
psllq (r6, 16 );// r6 = 60 __ __ __
movq (r5, eax+40);
movq (r3, r1 );// r3 = 47 46 45 44
pmullw (r5, ebx+40 );// r5 = 27 26 25 24
psrlq (r1, 16 );// r1 = __ 47 46 45
pand (r1, M(1) );// r1 = __ __ 46 __
por (r0, r4 );// r0 = 61 57 __ __
pand (r2, r7 );// r2 = __ __ __ 40
por (r0, r1 );// r0 = 61 57 46 __
por ( r0, r2 );// r0 = 61 57 46 40 = R11
psllq (r3, 16 );// r3 = 46 45 44 __
movq (r4, r3 );// r4 = 46 45 44 __
movq (r2, r5 );// r2 = 27 26 25 24
movq (edx+112, r0 );// write R11 = r0
psrlq (r2, 48 );// r2 = __ __ __ 27
pand (r4, M(2) );// r4 = __ 45 __ __
por (r6, r2 );// r6 = 60 __ __ 27
movq (r2, M(1) );// r2 = __ __ FF __
por (r6, r4 );// r6 = 60 45 __ 27
pand (r2, r7 );// r2 = __ __ 41 __
psllq (r3, 32 );// r3 = 44 __ __ __
por ( r3, edx+80 );// r3 = 44 __ __ 23
por (r6, r2 );// r6 = 60 45 41 27 = R10
movq (r2, M(3) );// r2 = FF __ __ __
psllq (r5, 16 );// r5 = 26 25 24 __
movq (edx+96, r6 );// store R10 = r6
pand (r2, r5 );// r2 = 26 __ __ __
movq (r6, M(2) );// r6 = __ FF __ __
pxor (r5, r2 );// r5 = __ 25 24 __
pand (r6, r7 );// r6 = __ 42 __ __
psrlq (r2, 32 );// r2 = __ __ 26 __
pand (r7, M(3) );// r7 = 43 __ __ __
por (r3, r2 );// r3 = 44 __ 26 23
por ( r7, edx+64 );// r7 = 43 __ __ 12
por (r6, r3 );// r6 = 44 42 26 23 = R9
por ( r7, r5 );// r7 = 43 25 24 12 = R8
//
movq (edx+80, r6 );// store R9 = r6
//
movq (edx+64, r7 );// store R8 = r7
// 123c ( / 64 coeffs < 2c / coeff)
# undef M
// Done w/dequant + descramble + partial transpose// now do the idct itself.
//# define I( K) [edx + ( K * 16)]
//# define J( K) [edx + ( (K - 4) * 16) + 8]
RowIDCT(r0,r1,r2,r3,r4,r5,r6,r7,I10_1(edx),J10_1(edx),C10(ecx)); // 46 c
Transpose(r0,r1,r2,r3,r4,r5,r6,r7,I10_1(edx),J10_1(edx)); // 19 c
//# undef I
//# undef J
//# define I( K) [edx + ( K * 16) + 64]
//# define J( K) [edx + ( (K - 4) * 16) + 72]
RowIDCT(r0,r1,r2,r3,r4,r5,r6,r7,I_2(edx),J_2(edx),C10(ecx)); // 46 c
Transpose(r0,r1,r2,r3,r4,r5,r6,r7,I_2(edx),J_2(edx)); // 19 c
//# undef I
//# undef J
//# define I( K) [edx + (K * 16)]
//# define J( K) I( K)
ColumnIDCT(r0,r1,r2,r3,r4,r5,r6,r7,I10_2(edx),I10_2(edx),C10(ecx),ecx + EightOffset); // 57 c
//# undef I
//# undef J
//# define I( K) [edx + (K * 16) + 8]
//# define J( K) I( K)
ColumnIDCT(r0,r1,r2,r3,r4,r5,r6,r7,I10_3(edx),I10_3(edx),C10(ecx),ecx + EightOffset); // 57 c
//# undef I
//# undef J
// 368 cycles ( / 64 coeff < 6 c / coeff)
}
/**************************************************************************************
*
* Routine: MMX_idct10
*
* Description: Perform IDCT on a 8x8 block with at most 10 nonzero coefficients
*
* Input: Pointer to input and output buffer
*
* Output: None
*
* Return: None
*
* Special Note: The input coefficients are in transposed ZigZag order
*
* Error: None
*
***************************************************************************************
*/
/* --------------------------------------------------------------- */
// This macro does four 4-sample one-dimensional idcts in parallel. Inputs
// 4 thru 7 are assumed to be zero.
template<class Ti,class Tc> static __forceinline void BeginIDCT_10(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tc C)
{
movq (r2, I(3) );
movq (r6, C(3) );
movq (r4, r2 );
movq (r1, C(5) );
pmulhw (r4, r6 );/* r4 = c3*i3 - i3 */
movq (r3, I(1) );
pmulhw (r1, r2 );/* r1 = c5*i3 - i3 */
movq (r0, C(1) );/* (all registers are in use) */
paddw (r4, r2 );/* r4 = C = c3*i3 */
pxor (r6,r6 );/* used to get -(c5*i3) */
paddw (r2, r1 );/* r2 = c5*i3 */
movq (r5, I(2) );
pmulhw (r0, r3 );/* r0 = c1*i1 - i1 */
movq (r1, r5 );
paddw (r0, r3 );/* r0 = A = c1*i1 */
pmulhw (r3, C(7) );/* r3 = B = c7*i1 */
psubsw (r6, r2 );/* r6 = D = -c5*i3 */
pmulhw (r5, C(2) );/* r1 = c2*i2 - i2 */
psubsw (r0, r4 );/* r0 = A - C */
movq (r7,I(2) );
paddsw (r4, r4 );/* r4 = C + C */
paddw (r7, r5 );/* r7 = G = c2*i2 */
paddsw (r4, r0 );/* r4 = C. = A + C */
pmulhw (r1, C(6) );/* r1 = H = c6*i2 */
psubsw (r3, r6 );/* r3 = B - D */
movq (I(1), r4 );/* save C. at I(1) */
paddsw (r6, r6 );/* r6 = D + D */
movq (r4, C(4) );
paddsw (r6, r3 );/* r6 = D. = B + D */
movq (r5, r3 );/* r5 = B - D */
pmulhw (r3, r4 );/* r3 = (c4 - 1) * (B - D) */
movq (I(2), r6 );/* save D. at I(2) */
movq (r2, r0 );/* r2 = A - C */
movq (r6, I(0) );
pmulhw (r0, r4 );/* r0 = (c4 - 1) * (A - C) */
paddw (r5, r3 );/* r5 = B. = c4 * (B - D) */
paddw (r2, r0 );/* r0 = A. = c4 * (A - C) */
psubsw (r5, r1 );/* r5 = B.. = B. - H */
pmulhw (r6, r4 );/* r6 = c4*i0 - i0 */
paddw (r6, I(0) );/* r6 = E = c4*i0 */
paddsw (r1, r1 );/* r1 = H + H */
movq (r4, r6 );/* r4 = E */
paddsw (r1, r5 );/* r1 = H. = B + H */
psubsw (r6, r2 );/* r6 = F. = E - A. */
paddsw (r2, r2 );/* r2 = A. + A. */
movq (r0, I(1) );/* r0 = C. */
paddsw (r2, r6 );/* r2 = A.. = E + A. */
psubsw (r2, r1 );/* r2 = R2 = A.. - H. */
}
// end BeginIDCT_10 macro (25 cycles).
template<class Ti,class Tc> static __forceinline void RowIDCT_10(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tc C)
{
BeginIDCT_10 (r0,r1,r2,r3,r4,r5,r6,r7,I,C);
movq (r3, I(2) );/* r3 = D. */
psubsw (r4, r7 );/* r4 = E. = E - G */
paddsw (r1, r1 );/* r1 = H. + H. */
paddsw (r7, r7 );/* r7 = G + G */
paddsw (r1, r2 );/* r1 = R1 = A.. + H. */
paddsw (r7, r4 );/* r7 = G. = E + G */
psubsw (r4, r3 );/* r4 = R4 = E. - D. */
paddsw (r3, r3);
psubsw (r6, r5 );/* r6 = R6 = F. - B.. */
paddsw (r5, r5 );
paddsw (r3, r4 );/* r3 = R3 = E. + D. */
paddsw (r5, r6 );/* r5 = R5 = F. + B.. */
psubsw (r7, r0 );/* r7 = R7 = G. - C. */
paddsw (r0, r0 );
movq (I(1), r1 );/* save R1 */
paddsw (r0, r7 );/* r0 = R0 = G. + C. */
}
// end RowIDCT macro (8 + 38 = 46 cycles)
// Column IDCT normalizes and stores final results.
template<class Ti,class Tj,class Tc> static __forceinline void ColumnIDCT_10(__m64 &r0,__m64 &r1,__m64 &r2,__m64 &r3,__m64 &r4,__m64 &r5,__m64 &r6,__m64 &r7,Ti I,Tj J,Tc C,unsigned char *Eight)
{
BeginIDCT_10 (r0,r1,r2,r3,r4,r5,r6,r7,I,C);
paddsw (r2, Eight );/* adjust R2 (and R1) for shift */
paddsw (r1, r1 );/* r1 = H. + H. */
paddsw (r1, r2 );/* r1 = R1 = A.. + H. */
psraw (r2, 4 );/* r2 = NR2 */
psubsw (r4, r7 );/* r4 = E. = E - G */
psraw (r1, 4 );/* r1 = NR1 */
movq (r3, I(2) );/* r3 = D. */
paddsw (r7, r7 );/* r7 = G + G */
movq (I(2), r2 );/* store NR2 at I2 */
paddsw (r7, r4 );/* r7 = G. = E + G */
movq (I(1), r1 );/* store NR1 at I1 */
psubsw (r4, r3 );/* r4 = R4 = E. - D. */
paddsw (r4, Eight );/* adjust R4 (and R3) for shift */
paddsw (r3, r3 );/* r3 = D. + D. */
paddsw (r3, r4 );/* r3 = R3 = E. + D. */
psraw (r4, 4 );/* r4 = NR4 */
psubsw (r6, r5 );/* r6 = R6 = F. - B.. */
psraw (r3, 4 );/* r3 = NR3 */
paddsw (r6, Eight );/* adjust R6 (and R5) for shift */
paddsw (r5, r5 );/* r5 = B.. + B.. */
paddsw (r5, r6 );/* r5 = R5 = F. + B.. */
psraw (r6, 4 );/* r6 = NR6 */
movq (J(4), r4 );/* store NR4 at J4 */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -