📄 fdct3dn.cpp
字号:
// ---- Row pass, stage 1: integer butterflies, then int -> float ------------
// Forms the four sum/difference pairs of the 8 inputs with packed 32-bit
// integer adds/subtracts, then converts each pair to packed floats (PI2FD).
// The 3DNow! instructions are written as raw bytes via EMIT (macro defined
// outside this excerpt): 0x0f 0x0f, a ModRM byte, then an opcode suffix byte.
// For register-register forms ModRM = 0xc0 | (dst << 3) | src.
// The integer ops and the conversions are interleaved by hand (presumably for
// scheduling); the order is left exactly as written.
psubd t7t6, in7in6; // [tmp7,tmp6] <= [in0,in1] - [in7,in6]
psubd t4t5, in4in5; // [tmp4,tmp5] <= [in3,in2] - [in4,in5]
// PI2FD( t7t6, t7t6); // convert t7t6(int,int) -> t7t6(float,float)
EMIT 0x0f
EMIT 0x0f
EMIT 0xe4 //((0xc4 & 0x3f) << 3) | 0xc4 -> ModRM: dst mm4, src mm4
EMIT 0x0d // opcode suffix: PI2FD
paddd t3t2, in4in5; // [tmp3,tmp2] <= [in3,in2] + [in4,in5]
// PI2FD( t4t5, t4t5); // convert t4t5(int,int) -> t4t5(float,float)
EMIT 0x0f
EMIT 0x0f
EMIT 0xed //((0xc5 & 0x3f) << 3) | 0xc5 -> ModRM: dst mm5, src mm5
EMIT 0x0d // opcode suffix: PI2FD
paddd t0t1, in7in6; // [tmp0,tmp1] <= [in0,in1] + [in7,in6]
movq [tmp7tmp6], t7t6; // spill float [tmp7,tmp6]; reloaded twice by the odd part
// PI2FD( t0t1, t0t1); // convert t0t1(int,int) -> t0t1(float,float)
EMIT 0x0f
EMIT 0x0f
EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1 -> ModRM: dst mm1, src mm1
EMIT 0x0d // opcode suffix: PI2FD
movq [tmp4tmp5], t4t5; // spill float [tmp4,tmp5]; reloaded by the odd part
// PI2FD( t3t2, t3t2); // convert t3t2(int,int) -> t3t2(float,float)
EMIT 0x0f
EMIT 0x0f
EMIT 0xc0 //((0xc0 & 0x3f) << 3) | 0xc0 -> ModRM: dst mm0, src mm0
EMIT 0x0d // opcode suffix: PI2FD
// (disabled) t0t1 and t3t2 are consumed straight from registers below, so
// they are not spilled:
// movq [tmp0tmp1], t0t1;
// movq [tmp3tmp2], t3t2;
/* Even part */
// Produces outputs 0, 4, 2 and 6 of the current row from the float pairs
// t0t1 and t3t2. Pair notation [a,b]: a is the upper dword, b the lower dword
// (movd below stores the lower element; psrlq 32 then exposes the upper one).
// Scalar reference:
// *tmp10 = (*tmp0) + (*tmp3); /* phase 2 */
// *tmp11 = (*tmp1) + (*tmp2);
// *tmp13 = (*tmp0) - (*tmp3);
// *tmp12 = (*tmp1) - (*tmp2);
// 2a) [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
// 2b) [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]
#define t10t11 mm6
#define t13t12 t0t1
movq t10t11, t0t1; // copy t0t1, prepare t10t11 calculation
// PFADD( t10t11, t3t2 ); // [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
EMIT 0x0f
EMIT 0x0f
EMIT 0xf0 // ((0xc6 & 0x3f) << 3) | 0xc0 -> ModRM: dst mm6, src mm0
EMIT 0x9e // opcode suffix: PFADD
// PFSUB( t13t12, t3t2 ); // [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc8 // ((0xc1 & 0x3f) << 3) | 0xc0 -> ModRM: dst mm1, src mm0
EMIT 0x9a // opcode suffix: PFSUB
#define t10mt11 mm2 // mm0, mm4, mm5 not ok
#define t13t13 mm7 // mm1 not ok
// st1_20 <= [tmp10,tmp11]
// st1_21 <= [tmp10,-tmp11] // negated tmp11
// st1_22 <= [tmp13,tmp12]
// st1_23 <= [tmp13,tmp13] // tmp13 duplicated
// movq [tmp10tmp11], t10t11;
movq t10mt11, t10t11;
// movq [tmp13tmp12], t13t12;
// mmMask00001000 is defined outside this excerpt; per the comment below it is
// used to flip the sign of the lower element only (TODO confirm mask value).
pxor t10mt11, [mmMask00001000]; // t10mt11 = [tmp10, -tmp11]
movq t13t13, t13t12; // copy taken before t13t12 is overwritten by the PFACC below
// 3a) dataptr[4] = (*tmp10) - (*tmp11);
// 3a) dataptr[0] = (*tmp10) + (*tmp11); /* phase 3 */
punpckhdq t13t13, t13t13; // t13t13 = [tmp13, tmp13]
// NOTE: from here to the end of the even part, each descriptive
// PFACC/PFMULM/PFADD comment FOLLOWS the EMIT bytes it describes
// (elsewhere in this file the comment precedes the bytes).
EMIT 0x0f
EMIT 0x0f
EMIT 0xf2 //((0xc6 & 0x3f) << 3) | 0xc2 -> ModRM: dst mm6, src mm2
EMIT 0xae // opcode suffix: PFACC
// PFACC( t10t11, t10mt11 ); // produce t10t11 = [dataptr[4], dataptr[0] ]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1 -> ModRM: dst mm1, src mm1
EMIT 0xae // opcode suffix: PFACC
// PFACC( t13t12, t13t12); // t13t12 = [z1a,z1a] <= [ tmp12+tmp13,tmp12+tmp13 ]
#define z1mz1 t13t12 // [z1, -z1]
#define out4out0 t10t11
#define Z1B_CONST _ebx + 0 // [0.7071, -0.7071]
// Stores use a 32-byte stride per output index (the 4-byte-stride originals are
// kept commented out beside them); edx advances by 4 bytes per row at loop end.
// movd dword ptr [ edx + 4*0 ], out4out0; // dataptr[0] <= final result
movd dword ptr [ edx + 32*0 ], out4out0; // dataptr[0] <= final result
psrlq out4out0, 32; // [ __, out4 ]
// z1 = ((*tmp12) + (*tmp13)) * ((float ) NC_R_SQRT2); /* c4 */
// 4b) [z1,-z1] <= [z1a,z1a] * [ 0.7071,-0.7071 ];
EMIT 0x0f
EMIT 0x0f
EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 + 0 | 0x40) -> ModRM: dst mm1, src [ebx+disp8]
EMIT 0 // disp8 = 0
EMIT 0xb4 // opcode suffix: PFMUL
// PFMULM( z1mz1, Z1B_CONST, 0 ); // z1mz1<= [z1, -z1]
// movd dword ptr [ edx + 4*4 ], out4out0; // dataptr[4] <= final result
movd dword ptr [ edx + 32*4 ], out4out0; // dataptr[4] <= final result
// [dataptr2,dataptr6] <= [tmp13,tmp13] + [z1,-z1]; // pfadd
EMIT 0x0f
EMIT 0x0f
EMIT 0xf9 //((0xc7 & 0x3f) << 3) | 0xc1 -> ModRM: dst mm7, src mm1
EMIT 0x9e // opcode suffix: PFADD
// PFADD(t13t13, z1mz1 ); // produce t13t13 = [dataptr2,dataptr6]
// 5a) dataptr[2] = (*tmp13) + z1; /* phase 5 */
// 5a) dataptr[6] = (*tmp13) - z1;
#define out2out6 t13t13
// movd dword ptr [edx + 4*6], out2out6; // dataptr[6] <= final result
movd dword ptr [edx + 32*6], out2out6; // dataptr[6] <= final result
psrlq out2out6, 32; // [ __, out2 ]
// movd dword ptr [edx + 4*2], out2out6; // dataptr[2] <= final result
movd dword ptr [edx + 32*2], out2out6; // dataptr[2] <= final result
// ---- Odd part, phase 2 and rotator ------------------------------------------
// Reloads the float pairs [tmp7,tmp6] and [tmp4,tmp5] spilled earlier, builds
// tmp14/tmp15/tmp16, then forms [z2,z4] and [z3,-z3] for the final phases.
// mm0, mm1 and mm2 are given new alias names here.
#define t14t16 mm0
#define t7t6_2 mm2 // temp copy of t7t6
#define t15t15 mm1
// *tmp14 = (*tmp4) + (*tmp5); /* phase 2 */
// *tmp15 = (*tmp5) + (*tmp6);
// *tmp16 = (*tmp6) + (*tmp7);
// 6a) [tmp14,tmp16] <= [ (tmp4+tmp5), (tmp7+tmp6) ]; //pfacc
// 6b) [ tmp15] <= [tmp7,tmp6] + [tmp4,tmp5];
movq t14t16, [tmp7tmp6]; // prepare [tmp14,tmp16] generation
movq t15t15, [tmp4tmp5];
movq t7t6_2, t14t16; // make copy of [tmp7,tmp6]
// PFACC(t14t16, t15t15); // t14t16 <= [ (tmp4+tmp5), (tmp7+tmp6) ]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc1 //((0xc0 & 0x3f) << 3) | 0xc1 -> ModRM: dst mm0, src mm1
EMIT 0xae // opcode suffix: PFACC
// PFADD(t15t15, t7t6_2); // t15t15 <= [(x),tmp5+tmp6]
EMIT 0x0f
EMIT 0x0f
EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2 -> ModRM: dst mm1, src mm2
EMIT 0x9e // opcode suffix: PFADD
// mm2 is re-aliased: the t7t6_2 copy is no longer needed after the PFADD above.
#define t14mt16 mm2
#define t15t14 mm3
movq t15t14, t14t16; // prepare to generate [tmp15,tmp14]
// movd [tmp17tmp16], t14t16; // produce [tmp17,tmp16] <= [...,t16]
punpckldq t15t15,t15t15; // t15t15 <= [tmp5+tmp6,tmp5+tmp6]
movq t14mt16, t14t16; // prepare to generate [tmp14, -tmp16]
punpckhdq t15t14, t15t15; // mm3 <= [t15,t14]
// movq [tmp15tmp14], t15t14;
/* The rotator is modified from fig 4-8 to avoid extra negations. */
// (*z5) = ((*tmp14) - (*tmp16)) * ((float ) 0.382683433); /* c6 */
// (*z2) = ((float ) 0.541196100) * (*tmp14) + (*z5); /* c2-c6 */
// (*z4) = ((float ) 1.306562965) * (*tmp16) + (*z5); /* c2+c6 */
// (*z3) = (*tmp15) * ((float ) NC_R_SQRT2); /* c4 */
// form [z5a,z3a] <= [tmp14-tmp16, tmp15]
// 7a) [z2a,z4a] <= [tmp14,tmp16] * [0.5411,1.3066]
// 7b) [z5 ,z3 ] <= [tmp14-tmp16,tmp15] * [0.3827,0.7071]
#define ___t15 t15t15
pxor t14mt16, [mmMask00001000]; // t14mt16 = [tmp14,-tmp16]
psllq ___t15, 32; // create ___t15 = [ t15, 000]
// NOTE(review): Z2AZ4A_CONST and Z5AZ3A_CONST are defined as bare _ebx (the
// "+ 8" / "+ 16" variants are commented out). They are not used by the EMIT
// byte sequences below, which carry the displacement (8 and 16) explicitly.
//#define Z2AZ4A_CONST _ebx + 8 // [0.5411, 1.3066]
#define Z2AZ4A_CONST _ebx // [0.5411, 1.3066]
#define z5az3a ___t15 // [z5a,z3a] <= [t14-t16,t15] <= t14mt16
#define z2az4a t14t16 // [z2a,z4a] = [t14t16]*[0.5411,1.30666]
// The two comments below describe, in order, the two EMIT groups that follow.
// PFMULM( z2az4a, Z2AZ4A_CONST, 8 ); // form [z2a,z4a]
// PFACC( z5az3a, t14mt16 ); // [z5a,z3a] = [tmp14-tmp16, tmp15]
EMIT 0x0f
EMIT 0x0f
EMIT 0x43 //(((0xc0 & 0x3f) << 3) | 0x03 | 0x40) -> ModRM: dst mm0, src [ebx+disp8]
EMIT 8 // disp8 = 8
EMIT 0xb4 // opcode suffix: PFMUL
EMIT 0x0f
EMIT 0x0f
EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2 -> ModRM: dst mm1, src mm2
EMIT 0xae // opcode suffix: PFACC
#define Z5AZ3A_CONST _ebx // [0.3827, 0.7071]
//#define Z5AZ3A_CONST _ebx + 16 // [0.3827, 0.7071]
// PFMULM( z5az3a, Z5AZ3A_CONST, 16 ); // z5az3a = [z5,z3]
EMIT 0x0f
EMIT 0x0f
EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 | 0x40) -> ModRM: dst mm1, src [ebx+disp8]
EMIT 16 // disp8 = 16
EMIT 0xb4 // opcode suffix: PFMUL
#define z5z3 z5az3a
#define z5z5 mm7
#define z3z3 mm6
movq z5z5, z5z3; // start to form z5z5
movq z3z3, z5z3; // start to form z3z3
punpckhdq z5z5,z5z5; // z5z5 = [z5,z5]
punpckldq z3z3,z3z3; // z3z3 = [z3,z3]
// movq [z5z4], z5z5; // produce z5
#define z3mz3 z3z3
#define z2z4 z2az4a
pxor z3mz3, [mmMask00001000]; // z3mz3 = [z3,-z3]
// 7c) [z2,z4] <= [z2a,z4a] + [z5,z5];
// PFADD( z2z4, z5z5 ); // z2az4a <= [z2,z4]
EMIT 0x0f
EMIT 0x0f
EMIT 0xc7 //((0xc0 & 0x3f) << 3) | 0xc7 -> ModRM: dst mm0, src mm7
EMIT 0x9e // opcode suffix: PFADD
// ---- Odd part, phases 5 and 6; store outputs 1, 7, 5, 3; loop control -------
#define t7t7 mm5
movq t7t7, [tmp7tmp6]; // begin forming [tmp7,tmp7]
punpckhdq t7t7,t7t7; // t7t7 = [tmp7,tmp7]
// z11 = (*tmp7) + (*z3); /* phase 5 */
// z13 = (*tmp7) - (*z3);
#define z13z11 t7t7
#define out5out1 mm4
#define out3out7 z13z11
// 7d) [z13,z11] <= [tmp7,tmp7] - [z3,-z3]
// PFSUB( z13z11, z3mz3); // z13z11 = [z13,z11]
EMIT 0x0f
EMIT 0x0f
EMIT 0xee //((0xc5 & 0x3f) << 3) | 0xc6 -> ModRM: dst mm5, src mm6
EMIT 0x9a // opcode suffix: PFSUB
movq out5out1, z2z4; // begin forming out5out1
// dataptr[5] = z13 + (*z2); /* phase 6 */
// dataptr[3] = z13 - (*z2);
// dataptr[1] = z11 + (*z4);
// dataptr[7] = z11 - (*z4);
// 8a) [dataptr5,dataptr1] <= [z13,z11] + [z2,z4]
// 8b) [dataptr3,dataptr7] <= [z13,z11] - [z2,z4]
// PFADD( out5out1, z13z11); // produce [out5,out1]
EMIT 0x0f
EMIT 0x0f
EMIT 0xe5 //((0xc4 & 0x3f) << 3) | 0xc5 -> ModRM: dst mm4, src mm5
EMIT 0x9e // opcode suffix: PFADD
// PFSUB( out3out7, z2z4 ); // produce [out3,out7]
// (out3out7 aliases z13z11, so z13z11 is consumed in place by this subtract)
EMIT 0x0f
EMIT 0x0f
EMIT 0xe8 //((0xc5 & 0x3f) << 3) | 0xc0 -> ModRM: dst mm5, src mm0
EMIT 0x9a // opcode suffix: PFSUB
// Each movd stores the lower element of a pair; psrlq 32 then moves the upper
// element down for the following store. Output index k is written at byte
// offset 32*k from edx (the 4*k originals are kept commented out).
// movd [edx + 4*1], out5out1; // store out[1]
movd [edx + 32*1], out5out1; // store out[1]
psrlq out5out1, 32; // out5out1 <= [ 000, out5 ]
// movd [edx + 4*7], out3out7; // store out[7]
movd [edx + 32*7], out3out7; // store out[7]
psrlq out3out7, 32; // out3out7 <= [ 000, out3 ]
// movd [edx + 4*5], out5out1; // store out[5]
movd [edx + 32*5], out5out1; // store out[5]
sub edi, 0x01; // i = i - 1
// movd [edx + 4*3], out3out7; // store out[3]
movd [edx + 32*3], out3out7; // store out[3]
// add edx, 32; // 32=4*8, outptr += 8 (floats)
add edx, 4; // 4=4*1, outptr += 1 (floats)
// The add above overwrites the flags set by "sub edi", hence the explicit cmp.
cmp edi, 0x00; // end for ( i=8; i > 0; i=i-1)
jg tdn_dct_row1; // loop back while (edi > 0), signed compare; label is above this excerpt
// dataptr += 8; /* advance pointer to next row */
// blkptr += 8;
// } // end for (i=0; i<8; i++);
// ----------------------- end of dct_row processing
// undefine the aliases used during dct_row processing, just in case
// we want to change the register allocation for the dct_col processing.
// NOTE(review): t7t7, z13z11, out5out1 and out3out7 are #defined in the odd
// part above but have no matching #undef in this visible list; confirm they
// are undefined further down or add them before reusing the names.
// Input and stage-1 aliases (defined before the row loop body shown here):
#undef in7in6
#undef in4in5
#undef in0in1
#undef in3in2
#undef in0in1_2
#undef in3in2_2
#undef t0t1
#undef t3t2
#undef t4t5
#undef t7t6
// Even-part aliases:
#undef t10t11
#undef t13t12
#undef t10mt11
#undef t13t13
#undef z1mz1
#undef out4out0
#undef Z1B_CONST
#undef out2out6
// Odd-part aliases:
#undef t14t16
#undef t7t6_2
#undef t15t15
#undef t14mt16
#undef t15t14
#undef ___t15
#undef Z2AZ4A_CONST
#undef z5az3a
#undef z2az4a
#undef Z5AZ3A_CONST
#undef z5z3
#undef z5z5
#undef z3z3
#undef z3mz3
#undef z2z4
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -