⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fdct3dn.cpp

📁 这是一组DCT和iDCT的代码
💻 CPP
📖 第 1 页 / 共 4 页
字号:


     // Stage-1 butterflies on packed 32-bit integers, followed by conversion of
     // every pair to packed single-precision floats.
     // Notation: [a,b] is one 64-bit MMX register with a in the high dword and
     // b in the low dword.
     // The 3DNow! instructions are written as raw bytes through EMIT (presumably
     // a wrapper for the assembler's byte-emit directive; its definition is not
     // in this chunk): 0x0f 0x0f, a ModRM byte, then the opcode suffix
     // (0x0d = PI2FD, packed int32 -> packed float).
     // The integer adds are interleaved with the conversions; keep the order.
     psubd t7t6, in7in6; // [tmp7,tmp6] <= [in0,in1] - [in7,in6] 
      psubd t4t5, in4in5; // [tmp4,tmp5] <= [in3,in2] - [in4,in5]

//     PI2FD( t7t6, t7t6);    // convert t7t6(int,int) -> t7t6(float,float)
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xe4 // ModRM = ((0xc4 & 0x3f) << 3) | 0xc4 : reg = mm4, r/m = mm4
     EMIT 0x0d // suffix: PI2FD

      paddd t3t2, in4in5;  // [tmp3,tmp2] <= [in3,in2] + [in4,in5]

//     PI2FD( t4t5, t4t5);   // convert t4t5(int,int) -> t4t5(float,float)
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xed // ModRM = ((0xc5 & 0x3f) << 3) | 0xc5 : reg = mm5, r/m = mm5
     EMIT 0x0d // suffix: PI2FD

      paddd t0t1, in7in6;  // [tmp0,tmp1] <= [in0,in1] + [in7,in6]

     movq [tmp7tmp6], t7t6; // spill float [tmp7,tmp6]; the odd part reloads it from memory
//      PI2FD( t0t1, t0t1);    // convert t0t1(int,int) -> t0t1(float,float)
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0xc9 // ModRM = ((0xc1 & 0x3f) << 3) | 0xc1 : reg = mm1, r/m = mm1
      EMIT 0x0d // suffix: PI2FD

     movq [tmp4tmp5], t4t5; // spill float [tmp4,tmp5]; the odd part reloads it from memory
//      PI2FD( t3t2, t3t2);   // convert t3t2(int,int) -> t3t2(float,float)
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xc0 // ModRM = ((0xc0 & 0x3f) << 3) | 0xc0 : reg = mm0, r/m = mm0
     EMIT 0x0d // suffix: PI2FD


//    movq [tmp0tmp1], t0t1;


//    movq [tmp3tmp2], t3t2;



    /* Even part */

//    *tmp10 = (*tmp0) + (*tmp3);    /* phase 2 */
//    *tmp11 = (*tmp1) + (*tmp2);
//    *tmp13 = (*tmp0) - (*tmp3);
//    *tmp12 = (*tmp1) - (*tmp2);

// 2a) [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
// 2b) [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]

// ---- Even part of the row DCT: produces dataptr[0], [4], [2], [6] ----------
// Notation: [a,b] is one 64-bit MMX register with a in the high dword and
// b in the low dword.  3DNow! instructions are hand-encoded as
//   0x0f 0x0f <ModRM> [disp8] <suffix>
// with suffix 0x9e = PFADD, 0x9a = PFSUB, 0xae = PFACC, 0xb4 = PFMUL.
// On entry t0t1 = [tmp0,tmp1] and t3t2 = [tmp3,tmp2], both already float.
#define t10t11 mm6
#define t13t12 t0t1 // same register as t0t1; the PFSUB below overwrites it in place

    movq t10t11, t0t1;       // copy t0t1, prepare t10t11 calculation

//    PFADD( t10t11, t3t2 );       //  [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xf0 // ModRM = ((0xc6 & 0x3f) << 3) | 0xc0 : reg = mm6, r/m = mm0
    EMIT 0x9e // suffix: PFADD

//    PFSUB( t13t12, t3t2 );      //  [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xc8 // ModRM = ((0xc1 & 0x3f) << 3) | 0xc0 : reg = mm1, r/m = mm0
    EMIT 0x9a // suffix: PFSUB

#define t10mt11 mm2 // mm0, mm4, mm5 not ok
#define t13t13  mm7 // mm1 not ok

//  st1_20 <= [tmp10,tmp11]
//  st1_21 <= [tmp10,-tmp11]   // negated tmp11
//  st1_22 <= [tmp13,tmp12] 
//  st1_23 <= [tmp13,tmp13] // tmp13 duplicated

//    movq [tmp10tmp11], t10t11;
     movq t10mt11, t10t11;     

//    movq [tmp13tmp12], t13t12;
    // NOTE(review): mmMask00001000 is defined outside this chunk; from the
    // result noted here it presumably flips only the sign bit of the low dword.
    pxor t10mt11, [mmMask00001000]; // t10mt11 = [tmp10, -tmp11]
     movq t13t13, t13t12;

// 3a)  dataptr[4] = (*tmp10) - (*tmp11);
// 3a)  dataptr[0] = (*tmp10) + (*tmp11); /* phase 3 */

    punpckhdq t13t13, t13t13;      // t13t13 = [tmp13, tmp13]

//     PFACC( t10t11, t10mt11 ); // produce t10t11 = [dataptr[4], dataptr[0] ]
//     (PFACC: low result = sum of both halves of the destination,
//             high result = sum of both halves of the source)
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xf2 // ModRM = ((0xc6 & 0x3f) << 3) | 0xc2 : reg = mm6, r/m = mm2
     EMIT 0xae // suffix: PFACC

//    PFACC( t13t12, t13t12); // t13t12 = [z1a,z1a] <= [ tmp12+tmp13,tmp12+tmp13 ] 
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xc9 // ModRM = ((0xc1 & 0x3f) << 3) | 0xc1 : reg = mm1, r/m = mm1
    EMIT 0xae // suffix: PFACC
     


#define z1mz1   t13t12      // [z1, -z1]

#define out4out0 t10t11
// Z1B_CONST only appears in the commented-out PFMULM below; the emitted bytes
// hard-code the ebx base and the displacement instead.
#define Z1B_CONST _ebx + 0 // [0.7071, -0.7071]

// Results are stored 32 bytes apart (8 floats, per the "32=4*8" note at the
// loop tail) while edx advances by only 4 bytes per iteration, so this pass
// appears to write its output transposed -- TODO confirm against the caller.
//    movd dword ptr [ edx + 4*0 ], out4out0; // dataptr[0] <= final result
      movd dword ptr [ edx + 32*0 ], out4out0; // dataptr[0] <= final result
     psrlq out4out0, 32;    // [ __, out4 ]

//    z1 = ((*tmp12) + (*tmp13)) * ((float ) NC_R_SQRT2); /* c4 */
// 4b)  [z1,-z1] <=  [z1a,z1a] * [ 0.7071,-0.7071 ];
//    PFMULM( z1mz1, Z1B_CONST, 0 ); // z1mz1<= [z1, -z1]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0x4b // ModRM = (((0xc1 & 0x3f) << 3) | 0x03 + 0 | 0x40) : reg = mm1, r/m = [ebx + disp8]
    EMIT 0    // disp8 = 0
    EMIT 0xb4 // suffix: PFMUL
     

//    movd dword ptr [ edx + 4*4 ], out4out0; // dataptr[4] <= final result
    movd dword ptr [ edx + 32*4 ], out4out0; // dataptr[4] <= final result

//  [dataptr2,dataptr6] <=   [tmp13,tmp13] + [z1,-z1]; // pfadd
//    PFADD(t13t13, z1mz1 );  // produce t13t13 = [dataptr2,dataptr6]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xf9 // ModRM = ((0xc7 & 0x3f) << 3) | 0xc1 : reg = mm7, r/m = mm1
    EMIT 0x9e // suffix: PFADD
      

// 5a)  dataptr[2] = (*tmp13) + z1;    /* phase 5 */
// 5a)  dataptr[6] = (*tmp13) - z1;

#define out2out6 t13t13
//    movd dword ptr [edx + 4*6], out2out6; // dataptr[6] <= final result
    movd dword ptr [edx + 32*6], out2out6; // dataptr[6] <= final result
     psrlq out2out6, 32;    // [ __, out2 ]

//    movd dword ptr [edx + 4*2], out2out6; // dataptr[2] <= final result
    movd dword ptr [edx + 32*2], out2out6; // dataptr[2] <= final result

// ---- Odd part of the row DCT: produces [out5,out1] and [out3,out7] ---------
// Notation: [a,b] = one MMX register, a in the high dword, b in the low dword.
// 3DNow! encoding: 0x0f 0x0f <ModRM> [disp8] <suffix>, with suffix
// 0x9e = PFADD, 0x9a = PFSUB, 0xae = PFACC, 0xb4 = PFMUL.
// mm0, mm1 and mm2 are reused here; their even-part contents are dead once
// the even-part stores have been issued.
#define t14t16   mm0
#define t7t6_2 mm2  // temp copy of t7t6
#define t15t15   mm1

//    *tmp14 = (*tmp4) + (*tmp5);    /* phase 2 */
//    *tmp15 = (*tmp5) + (*tmp6);
//    *tmp16 = (*tmp6) + (*tmp7);

// 6a) [tmp14,tmp16] <= [ (tmp4+tmp5), (tmp7+tmp6) ]; //pfacc
// 6b) [      tmp15] <= [tmp7,tmp6] + [tmp4,tmp5]; 
    movq t14t16, [tmp7tmp6];  // prepare [tmp14,tmp16] generation

    movq t15t15, [tmp4tmp5];
     movq t7t6_2, t14t16;     // make copy of [tmp7,tmp6]

//    PFACC(t14t16, t15t15); // t14t16 <= [ (tmp4+tmp5), (tmp7+tmp6) ]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xc1 // ModRM = ((0xc0 & 0x3f) << 3) | 0xc1 : reg = mm0, r/m = mm1
     EMIT 0xae // suffix: PFACC
   
//    PFADD(t15t15, t7t6_2); // t15t15 <= [(x),tmp5+tmp6]   (high half is a don't-care)
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xca // ModRM = ((0xc1 & 0x3f) << 3) | 0xc2 : reg = mm1, r/m = mm2
    EMIT 0x9e // suffix: PFADD

#define t14mt16 mm2
#define t15t14 mm3
    movq t15t14, t14t16;   // prepare to generate [tmp15,tmp14]
//     movd [tmp17tmp16], t14t16; // produce [tmp17,tmp16] <= [...,t16]

    punpckldq t15t15,t15t15; // t15t15 <= [tmp5+tmp6,tmp5+tmp6]
     movq t14mt16, t14t16;   // prepare to generate [tmp14, -tmp16]

    punpckhdq t15t14, t15t15; // mm3 <= [t15,t14]

//    movq [tmp15tmp14], t15t14; 

    /* The rotator is modified from fig 4-8 to avoid extra negations. */
//    (*z5) = ((*tmp14) - (*tmp16)) * ((float ) 0.382683433); /* c6 */
//    (*z2) = ((float ) 0.541196100) * (*tmp14) + (*z5); /* c2-c6 */
//    (*z4) = ((float ) 1.306562965) * (*tmp16) + (*z5); /* c2+c6 */
//    (*z3) = (*tmp15) * ((float ) NC_R_SQRT2); /* c4 */


//     form [z5a,z3a] <= [tmp14-tmp16, tmp15]
// 7a) [z2a,z4a] <= [tmp14,tmp16] * [0.5411,1.3066]
// 7b) [z5 ,z3 ] <= [tmp14-tmp16,tmp15] * [0.3827,0.7071]

#define ___t15  t15t15
    // NOTE(review): mmMask00001000 is defined outside this chunk; from the
    // result noted here it presumably flips only the sign bit of the low dword.
    pxor t14mt16, [mmMask00001000]; // t14mt16 = [tmp14,-tmp16]
     psllq ___t15, 32;             // create ___t15 = [ t15, 000]

// The two *_CONST names below both expand to plain _ebx and are referenced
// only from commented-out macro calls; the emitted bytes hard-code the ebx
// base with displacements 8 and 16.
//#define Z2AZ4A_CONST _ebx + 8 // [0.5411, 1.3066]
#define Z2AZ4A_CONST _ebx // [0.5411, 1.3066]
#define z5az3a ___t15      // [z5a,z3a] <= [t14-t16,t15] <= t14mt16
#define z2az4a t14t16   // [z2a,z4a] = [t14t16]*[0.5411,1.30666]

//    PFMULM( z2az4a, Z2AZ4A_CONST, 8 ); // form [z2a,z4a]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0x43 // ModRM = (((0xc0 & 0x3f) << 3) | 0x03 | 0x40) : reg = mm0, r/m = [ebx + disp8]
    EMIT 8    // disp8 = 8
    EMIT 0xb4 // suffix: PFMUL

//     PFACC( z5az3a, t14mt16 );  // [z5a,z3a] = [tmp14-tmp16, tmp15]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xca // ModRM = ((0xc1 & 0x3f) << 3) | 0xc2 : reg = mm1, r/m = mm2
    EMIT 0xae // suffix: PFACC

#define Z5AZ3A_CONST _ebx  // [0.3827, 0.7071]
//#define Z5AZ3A_CONST _ebx + 16 // [0.3827, 0.7071]

//    PFMULM( z5az3a, Z5AZ3A_CONST, 16 ); // z5az3a = [z5,z3]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0x4b // ModRM = (((0xc1 & 0x3f) << 3) | 0x03 | 0x40) : reg = mm1, r/m = [ebx + disp8]
    EMIT 16   // disp8 = 16
    EMIT 0xb4 // suffix: PFMUL

#define z5z3 z5az3a
#define z5z5 mm7
#define z3z3 mm6
    movq z5z5, z5z3;        // start to form z5z5
     movq z3z3, z5z3;        // start to form z3z3

    punpckhdq z5z5,z5z5;    // z5z5 = [z5,z5]
     punpckldq z3z3,z3z3;    // z3z3 = [z3,z3]

//    movq [z5z4], z5z5;      // produce z5

#define z3mz3 z3z3
#define z2z4 z2az4a
    pxor z3mz3, [mmMask00001000];  // z3mz3 = [z3,-z3]
// 7c) [z2,z4]  <= [z2a,z4a] + [z5,z5];
//    PFADD( z2z4, z5z5 );        // z2az4a <= [z2,z4]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xc7 // ModRM = ((0xc0 & 0x3f) << 3) | 0xc7 : reg = mm0, r/m = mm7
    EMIT 0x9e // suffix: PFADD

#define t7t7 mm5
    movq t7t7, [tmp7tmp6];    // begin forming [tmp7,tmp7]

    punpckhdq t7t7,t7t7;    // t7t7 = [tmp7,tmp7]

//    z11 = (*tmp7) + (*z3);        /* phase 5 */
//    z13 = (*tmp7) - (*z3);

#define z13z11 t7t7
#define out5out1 mm4
#define out3out7 z13z11
// 7d) [z13,z11] <= [tmp7,tmp7] - [z3,-z3]
//    PFSUB( z13z11, z3mz3);   // z13z11 = [z13,z11]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xee // ModRM = ((0xc5 & 0x3f) << 3) | 0xc6 : reg = mm5, r/m = mm6
    EMIT 0x9a // suffix: PFSUB
       
     movq out5out1, z2z4;    // begin forming out5out1

//    dataptr[5] = z13 + (*z2);  /* phase 6 */
//    dataptr[3] = z13 - (*z2);
//    dataptr[1] = z11 + (*z4);
//   dataptr[7] = z11 - (*z4);

// 8a) [dataptr5,dataptr1] <= [z13,z11] + [z2,z4]
// 8b) [dataptr3,dataptr7] <= [z13,z11] - [z2,z4]

//    PFADD( out5out1, z13z11); // produce [out5,out1]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xe5 // ModRM = ((0xc4 & 0x3f) << 3) | 0xc5 : reg = mm4, r/m = mm5
    EMIT 0x9e // suffix: PFADD

//    PFSUB( out3out7, z2z4 );  // produce [out3,out7]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xe8 // ModRM = ((0xc5 & 0x3f) << 3) | 0xc0 : reg = mm5, r/m = mm0
    EMIT 0x9a // suffix: PFSUB

// Store the four odd outputs and close the row loop.
// movd writes only the low dword, so the low element of each pair is stored
// first and psrlq then shifts the high element down for the second store.
// Outputs are placed 32 bytes apart while edx advances 4 bytes per iteration
// (the older 4*k / "add edx, 32" layout is kept commented out for reference),
// so this pass appears to write its result transposed -- TODO confirm.
//      movd [edx + 4*1], out5out1; // store out[1]
      movd [edx + 32*1], out5out1; // store out[1]
  
     psrlq out5out1, 32;      // out5out1 <= [ 000, out5 ]

//    movd [edx + 4*7], out3out7; // store out[7]
    movd [edx + 32*7], out3out7; // store out[7]
     psrlq out3out7, 32;      // out3out7 <= [ 000, out3 ]

//    movd [edx + 4*5], out5out1; // store out[5]
    movd [edx + 32*5], out5out1; // store out[5]
     sub edi, 0x01;          // i = i - 1

//    movd [edx + 4*3], out3out7; // store out[3]
    movd [edx + 32*3], out3out7; // store out[3]
//     add edx, 32;            // 32=4*8, outptr += 8 (floats)
     add edx, 4;            // 4=4*1, outptr += 1 (floats)

    // The add above overwrote the flags left by "sub edi", hence the explicit
    // compare.  jg is a signed test: the loop repeats while edi > 0.
    cmp edi, 0x00;          // loop test: for ( ...; i > 0; i = i-1 )
     jg tdn_dct_row1;  // next row while edi > 0; fall through once edi reaches 0

//    dataptr += 8;       /* advance pointer to next row */
//    blkptr += 8;
//  } // end for (i=0; i<8; i++);

// ----------------------- end of dct_row processing

// undefine the aliases used during dct_row processing, just in case
// we want to change the register allocation for the dct_col processing.
// NOTE(review): t7t7, z13z11, out5out1 and out3out7 are #defined in the row
// code above but do not appear in the visible part of this list -- confirm
// they are #undef'd further down (this view of the file may be truncated).

#undef in7in6
#undef in4in5
#undef in0in1
#undef in3in2
#undef in0in1_2
#undef in3in2_2
#undef t0t1
#undef t3t2
#undef t4t5
#undef t7t6
#undef t10t11
#undef t13t12
#undef t10mt11
#undef t13t13
#undef z1mz1
#undef out4out0
#undef Z1B_CONST
#undef out2out6
#undef t14t16
#undef t7t6_2
#undef t15t15
#undef t14mt16
#undef t15t14
#undef ___t15
#undef Z2AZ4A_CONST
#undef z5az3a
#undef z2az4a
#undef Z5AZ3A_CONST
#undef z5z3
#undef z5z5
#undef z3z3
#undef z3mz3
#undef z2z4

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -