fdct3dn.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,403 行 · 第 1/4 页
CPP
1,403 行
#undef t7t7
#undef z13z11
#undef out5out1
#undef out3out7

  /*************************************************************
   *
   * Pass 2: DCT process columns
   *
   *************************************************************/

     // The 2nd-pass uses the same base-code as the 1st-pass, the two
     // code loops differ only in the handling of input/output data.
     //
     // 1) Since the 1st-pass produces float-data, the 2nd-pass's
     //    input must accept floating point data.
     // 2) Since the 2nd-pass processes input row-by-row, and the source
     //    is already located in the temp array, the output is also
     //    stored row-by-row.  (We don't want to allocate an additional
     //    float[64] array; that would waste cache RAM.)
     //    Therefore, the final post-processing (descaling) must
     //    additionally transpose the final output.

  //    s
  // 
//    for (i = 0; i < 8; i++)
//    {

//   3_2_1_0 -> x_3_2_1
     // mm0, mm1 = 3_2_1_0,  mm2, mm3 = x_3_2_1
     // mm4 = 7_6_5_4

     // Pass-2 loop setup.  Register roles, as used by the visible part of
     // the tdn_dct_col1 loop that follows:
     //   eax = read pointer; the loop loads [eax + 4*k] and then advances
     //         eax by 4*8 bytes (one row of 8 floats)
     //   edx = write pointer; the loop stores results with movd to
     //         [edx + 4*k]
     //   edi = iteration counter 'i', initialised to 8
     //   ebx = base of the CONSTANTS table; the hand-encoded PFMUL
     //         instructions address it as [ebx + disp8]
     mov eax, dword ptr [dataptr]; // eax <= dataptr (input row pointer)
      pxor mm7, mm7;              // mm7 <= 0x0000_0000_0000_0000
                                  // NOTE(review): the visible loop body writes mm7
                                  // (movq t13t13, ...) before reading it, so this
                                  // zeroing looks unused here -- confirm against
                                  // the rest of the function.

     mov edx, dword ptr [dataptr2];// edx <= dataptr2 (original note: &dataptr[64];
                                   // not verifiable from this fragment)
      mov edi, 0x08;               // edi = 'i' // for ( i = 8; i > 0; i=i-1 )
                                   // NOTE(review): the counter update is outside this
                                   // fragment; presumably it counts down from 8 -- confirm.

     lea ebx, dword ptr [CONSTANTS];// ebx <= &CONSTANTS[0]
     // tdn_dct_col1 computes the fDCT for 1 input-row.
     // tdn_dct_col1 stores the FDCT result back into the same row of the
     //    temp array.

tdn_dct_col1: // 3d_now_dct_col1 loop-point

// 1a) [in7,in6] = [blkptr7, blkptr6]; // dword, dword
// 1b) [in4,in5] = [blkptr4, blkptr5]; //dword, dword
// 1c) [in0,in1] = [blkptr0, blkptr1]; // dword,dword
// 1d) [in3,in2] = [blkptr3, blkptr2]; //dword, dword

#define in7in6 mm2
#define in4in5 mm3
#define in0in1 mm1
#define in3in2 mm0
#define in0in1_2 mm4
#define in3in2_2 mm5

     movd in0in1, dword ptr [eax+4*1];  // in0in1 <= [ 000, in1 ]

     punpckldq in0in1, qword ptr [eax+4*0];  // in0in1 <= [ in0, in1 ]

     movd in4in5, dword ptr [eax+4*5];  // in4in5 <= [ 000, in5 ]
      movq in0in1_2, in0in1;            // copy in0in1 to in0in1_2

     punpckldq in4in5, qword ptr [eax+4*4];  // in4in5 <= [ in4, in5 ]

     movq in3in2, qword ptr [eax+4*2];  // get [in3,in2]

     movq in7in6, qword ptr [eax+4*6];  // get [in7,in6]
      movq in3in2_2, in3in2;            // copy in3in2 to in3in2_2

/*
    *tmp0 = dataptr[0] + dataptr[7];
    *tmp7 = dataptr[0] - dataptr[7];
    *tmp1 = dataptr[1] + dataptr[6];
    *tmp6 = dataptr[1] - dataptr[6];
    *tmp2 = dataptr[2] + dataptr[5];
    *tmp5 = dataptr[2] - dataptr[5];
    *tmp3 = dataptr[3] + dataptr[4];
    *tmp4 = dataptr[3] - dataptr[4];
*/

// tmp0= inptr[0]                         +                         inptr[7];
// tmp1=         inptr[1]                 +                 inptr[6];

// tmp7= inptr[0]                         -                         inptr[7];
// tmp6=         inptr[1]                 -                 inptr[6];

// tmp3=                         inptr[3] + inptr[4];
// tmp2=                 inptr[2]         +         inptr[5];

// tmp4=                         inptr[3] - inptr[4];
// tmp5=                 inptr[2]         -         inptr[5];


#define t0t1 in0in1
#define t3t2 in3in2
#define t4t5 in3in2_2
#define t7t6 in0in1_2
//  st1_0 <= [tmp0,tmp1]  (float, float)
//  st1_1 <= [tmp7,tmp6]  (float, float)
//  st1_2 <= [tmp3,tmp2]  (float, float)
//  st1_3 <= [tmp4,tmp5]  (float, float)


     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xe2 //((0xc4 & 0x3f) << 3) | 0xc2
     EMIT 0x9a

//     PFSUB(t7t6, in7in6); // [tmp7,tmp6] <= [in0,in1] - [in7,in6]
      add eax, 4*8;       // increment inptr+=8 (floats)

     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xeb //((0xc5 & 0x3f) << 3) | 0xc3
     EMIT 0x9a

//     PFSUB(t4t5, in4in5);// [tmp4,tmp5] <= [in3,in2] - [in4,in5]

     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xc3 //((0xc0 & 0x3f) << 3) | 0xc3
     EMIT 0x9e

//     PFADD(t3t2, in4in5);  // [tmp3,tmp2] <= [in3,in2] + [in4,in5]
     movq [tmp7tmp6], t7t6;

     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2
     EMIT 0x9e
     
//     PFADD(t0t1, in7in6); // [tmp0,tmp1] <= [in0,in1] + [in7,in6]
     movq [tmp4tmp5], t4t5;

//     movq [tmp3tmp2], t3t2;

//     movq [tmp0tmp1], t0t1;

    // Even part 

//    *tmp10 = (*tmp0) + (*tmp3);    // phase 2 
//    *tmp11 = (*tmp1) + (*tmp2);
//    *tmp13 = (*tmp0) - (*tmp3);
//    *tmp12 = (*tmp1) - (*tmp2);

// 2a) [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
// 2b) [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]

#define t10t11 mm6
#define t13t12 t0t1

    movq t10t11, t0t1;       // copy t0t1, prepare t10t11 calculation

//    PFADD( t10t11, t3t2 );      //  [tmp10,tmp11] <= [tmp0,tmp1] + [tmp3,tmp2]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xf0 //((0xc6 & 0x3f) << 3) | 0xc0
    EMIT 0x9e      

//    PFSUB( t13t12, t3t2 );      //  [tmp13,tmp12] <= [tmp0,tmp1] - [tmp3,tmp2]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xc8 //((0xc1 & 0x3f) << 3) | 0xc0
    EMIT 0x9a      

#define t10mt11 mm2 // mm0, mm4, mm5 not ok
#define t13t13  mm7 // mm1 not ok

//  st1_20 <= [tmp10,tmp11]
//  st1_21 <= [tmp10,-tmp11]   // negated tmp11
//  st1_22 <= [tmp13,tmp12] 
//  st1_23 <= [tmp13,tmp13] // tmp13 duplicated

    movq [tmp10tmp11], t10t11;
     movq t10mt11, t10t11;     

    movq [tmp13tmp12], t13t12;
    pxor t10mt11, [mmMask00001000]; // t10mt11 = [tmp10, -tmp11]
     movq t13t13, t13t12;

// 3a)   dataptr2[4] = (*tmp10) - (*tmp11);
// 3b)   dataptr2[0] = (*tmp10) + (*tmp11); // phase 3

    punpckhdq t13t13, t13t13;      // t13t13 = [tmp13, tmp13]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xf2 //((0xc6 & 0x3f) << 3) | 0xc2
     EMIT 0xae

//     PFACC( t10t11, t10mt11 ); // produce t10t11 = [dataptr[4], dataptr[0] ]

    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1
    EMIT 0xae

//    PFACC( t13t12, t13t12); // t13t12 = [z1a,z1a] <= [tmp12+tmp13,tmp12+tmp13]


#define z1mz1   t13t12      // [z1, -z1]

#define out4out0 t10t11
#define Z1B_CONST _ebx + 0 // [0.7071, -0.7071]

    movd dword ptr [ edx + 4*0 ], out4out0; // dataptr[0] <= final result
//      movd dword ptr [ edx + 32*0 ], out4out0; // dataptr[0] <= final result
     psrlq out4out0, 32;    // [ __, out4 ]

//    z1 = ((*tmp12) + (*tmp13)) * ((float ) NC_R_SQRT2); // c4 
// 4b)   [z1,-z1] <=  [z1a,z1a] * [ 0.7071,-0.7071 ];
//    PFMULM( z1mz1, Z1B_CONST, 0 ); // z1mz1<= [z1, -z1]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 + 0 | 0x40)
    EMIT 0
    EMIT 0xb4

    movd dword ptr [ edx + 4*4 ], out4out0; // dataptr[4] <= final result
//    movd dword ptr [ edx + 32*4 ], out4out0; // dataptr[4] <= final result

//  [dataptr2,dataptr6] <=   [tmp13,tmp13] + [z1,-z1]; // pfadd
//    PFADD(t13t13, z1mz1 );  // produce t13t13 = [dataptr2,dataptr6]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xf9 //((0xc7 & 0x3f) << 3) | 0xc1
    EMIT 0x9e  

// 5a)  dataptr[2] = (*tmp13) + z1;    // phase 5 
// 5a)  dataptr[6] = (*tmp13) - z1;

#define out2out6 t13t13
    movd dword ptr [edx + 4*6], out2out6; // dataptr[6] <= final result
//    movd dword ptr [edx + 32*6], out2out6; // dataptr[6] <= final result
     psrlq out2out6, 32;    // [ __, out2 ]

    movd dword ptr [edx + 4*2], out2out6; // dataptr[2] <= final result
//    movd dword ptr [edx + 32*2], out2out6; // dataptr[2] <= final result

#define t14t16   mm0
#define t7t6_2 mm2  // temp copy of t7t6
#define t15t15   mm1

//    *tmp14 = (*tmp4) + (*tmp5);    // phase 2 
//    *tmp15 = (*tmp5) + (*tmp6);
//    *tmp16 = (*tmp6) + (*tmp7);

// 6a) [tmp14,tmp16] <= [ (tmp4+tmp5), (tmp7+tmp6) ]; //pfacc
// 6b) [      tmp15] <= [tmp7,tmp6] + [tmp4,tmp5]; 
    movq t14t16, [tmp7tmp6];  // prepare [tmp14,tmp16] generation

    movq t15t15, [tmp4tmp5];
     movq t7t6_2, t14t16;     // make copy of [tmp7,tmp6]

    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xc1 //((0xc0 & 0x3f) << 3) | 0xc1
    EMIT 0xae

//    PFACC(t14t16, t15t15); // t14t16 <= [ (tmp4+tmp5), (tmp7+tmp6) ]

    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2
    EMIT 0x9e

//    PFADD(t15t15, t7t6_2); // t15t15 <= [(x),tmp5+tmp6]

#define t14mt16 mm2
#define t15t14 mm3
    movq t15t14, t14t16;   // prepare to generate [tmp15,tmp14]
//     movd [tmp17tmp16], t14t16; // produce [tmp17,tmp16] <= [...,t16]

    punpckldq t15t15,t15t15; // t15t15 <= [tmp5+tmp6,tmp5+tmp6]
     movq t14mt16, t14t16;   // prepare to generate [tmp14, -tmp16]

    punpckhdq t15t14, t15t15; // mm3 <= [t15,t14]

//    movq [tmp15tmp14], t15t14; 

    // The rotator is modified from fig 4-8 to avoid extra negations. 
//    (*z5) = ((*tmp14) - (*tmp16)) * ((float ) 0.382683433); // c6 
//    (*z2) = ((float ) 0.541196100) * (*tmp14) + (*z5); // c2-c6 
//    (*z4) = ((float ) 1.306562965) * (*tmp16) + (*z5); // c2+c6
//    (*z3) = (*tmp15) * ((float ) NC_R_SQRT2);          // c4


//     form [z5a,z3a] <= [tmp14-tmp16, tmp15]
// 7a) [z2a,z4a] <= [tmp14,tmp16] * [0.5411,1.3066]
// 7b) [z5 ,z3 ] <= [tmp14-tmp16,tmp15] * [0.3827,0.7071]

#define ___t15  t15t15
    pxor t14mt16, [mmMask00001000]; // t14mt16 = [tmp14,-tmp16]
     psllq ___t15, 32;             // create ___t15 = [ t15, 000]

//#define Z2AZ4A_CONST _ebx + 8 // [0.5411, 1.3066]
// NOTE: this macro names only `_ebx`; the +8 displacement to the
//       [0.5411, 1.3066] pair is supplied by the "EMIT 8" byte of the
//       hand-encoded PFMUL that follows.
#define Z2AZ4A_CONST _ebx // [0.5411, 1.3066]
#define z5az3a ___t15      // [z5a,z3a] <= [t14-t16,t15] <= t14mt16
#define z2az4a t14t16   // [z2a,z4a] = [t14t16]*[0.5411,1.3066]

//    PFMULM( z2az4a, Z2AZ4A_CONST, 8 ); // form [z2a,z4a]
//     PFACC( z5az3a, t14mt16 );  // [z5a,z3a] = [tmp14-tmp16, tmp15]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0x43 //(((0xc0 & 0x3f) << 3) | 0x03 | 0x40)
    EMIT 8
    EMIT 0xb4

    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xca //((0xc1 & 0x3f) << 3) | 0xc2
    EMIT 0xae  

// NOTE: this macro names only `_ebx`; the +16 displacement to the
//       [0.3827, 0.7071] pair is supplied by the "EMIT 16" byte of the
//       hand-encoded PFMUL that follows.
#define Z5AZ3A_CONST _ebx  // [0.3827, 0.7071]
//#define Z5AZ3A_CONST _ebx + 16 // [0.3827, 0.7071]

//    PFMULM( z5az3a, Z5AZ3A_CONST, 16 ); // z5az3a = [z5,z3]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 | 0x40)
    EMIT 16
    EMIT 0xb4 

#define z5z3 z5az3a
#define z5z5 mm7
#define z3z3 mm6
    movq z5z5, z5z3;        // start to form z5z5
     movq z3z3, z5z3;        // start to form z3z3

    punpckhdq z5z5,z5z5;    // z5z5 = [z5,z5]
     punpckldq z3z3,z3z3;    // z3z3 = [z3,z3]

//    movq [z5z4], z5z5;      // produce z5

#define z3mz3 z3z3
#define z2z4 z2az4a
    pxor z3mz3, [mmMask00001000];  // z3mz3 = [z3,-z3]
// 7c) [z2,z4]  <= [z2a,z4a] + [z5,z5];
//    PFADD( z2z4, z5z5 );        // z2az4a <= [z2,z4]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xc7 //((0xc0 & 0x3f) << 3) | 0xc7
    EMIT 0x9e        

#define t7t7 mm5
    movq t7t7, [tmp7tmp6];    // begin forming [tmp7,tmp7]

    punpckhdq t7t7,t7t7;    // t7t7 = [tmp7,tmp7]

//    z11 = (*tmp7) + (*z3);        // phase 5
fdct3dn.cpp - 源码说明

本页面展示了「这是一组DCT和iDCT的代码」中的 fdct3dn.cpp 源码文件，采用 C++ 编程语言编写，共 1,403 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与iDCT相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?