fdct3dn.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 1,403 行 · 第 1/4 页
CPP
1,403 行
//    z13 = (*tmp7) - (*z3);

#define z13z11 t7t7
#define out5out1 mm4
#define out3out7 z13z11
// 7d) [z13,z11] <= [tmp7,tmp7] - [z3,-z3]
//    PFSUB( z13z11, z3mz3);   // z13z11 = [z13,z11]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xee //((0xc5 & 0x3f) << 3) | 0xc6
    EMIT 0x9a
       
     movq out5out1, z2z4;    // begin forming out5out1

//    dataptr[5] = z13 + (*z2);  // phase 6 
//    dataptr[3] = z13 - (*z2);
//    dataptr[1] = z11 + (*z4);
//   dataptr[7] = z11 - (*z4);

// 8a) [dataptr5,dataptr1] <= [z13,z11] + [z2,z4]
// 8b) [dataptr3,dataptr7] <= [z13,z11] - [z2,z4]

//    PFADD( out5out1, z13z11); // produce [out5,out1]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xe5 //((0xc4 & 0x3f) << 3) | 0xc5
    EMIT 0x9e 

//    PFSUB( out3out7, z2z4 );  // produce [out3,out7]
    EMIT 0x0f
    EMIT 0x0f
    EMIT 0xe8 //((0xc5 & 0x3f) << 3) | 0xc0
    EMIT 0x9a  

      movd [edx + 4*1], out5out1; // store out[1]
//      movd [edx + 32*1], out5out1; // store out[1]
  
     psrlq out5out1, 32;      // out5out1 <= [ 000, out5 ]

    movd [edx + 4*7], out3out7; // store out[7]
//    movd [edx + 32*7], out3out7; // store out[7]
     psrlq out3out7, 32;      // out3out7 <= [ 000, out3 ]

    movd [edx + 4*5], out5out1; // store out[5]
//    movd [edx + 32*5], out5out1; // store out[5]
     sub edi, 0x01;          // i = i - 1

    movd [edx + 4*3], out3out7; // store out[3]
//    movd [edx + 32*3], out3out7; // store out[3]
     add edx, 32;            // 32=4*8, outptr += 8 (floats)
//     add edx, 4;            // 4=4*1, outptr += 1 (floats)

    cmp edi, 0x00;          // end for ( i=8; i > 0; i=i-1)
     jg tdn_dct_col1;  // branch until (edi == 0)

//    FEMMS;
//    };
//    dataptr2 += 8;       // advance pointer to next row 
//    blkptr += 8;
//  } // end for (i=0; i<8; i++); 

// --------------------------- end of dct_column processing
// output matrix is transposed with respect to input.  Before this result
// can be used, the matrix must be transposed.


  // descale and transpose the output 
//  dataptr2 = &data[0];
  
/*
  // To enable precise rounding uncomment this C-code, and the #defines
  // near the beginning of this file (important!)  
  // of course, this code will run much slower!
  for (i = 0; i < 8; i++)
    for ( j = 0; j < 8; j++)
      // scale, round, and transpose output matrix
      block[i+(j<<3)] = (short int) floor(dataptr2[j+(i<<3)] * local_aanscales[i+(j<<3)] + 0.5);
*/

    // The following loop transposes and descales the 3dn_dct result.
    // There is a slight systematic error in the descaling algorithm.
    // The x86/x87 float->int convert uses a truncation policy :
    // The floating-point value's least significant digit (LSD) is 
    // rounded to integer of lesser MAGNITUDE.  For positive numbers, this
    // policy is the same as standard-truncation (+5.x -> +5.0)
    // But problems arise with negative numbers : -5.x(float) -> -5(int), 
    // For negative numbers, the policy is *not* truncation, but rather
    // unconditional-roundup.  
    //
    // Since negative#'s and positive#'s are rounded differently, the naive
    // "add +0.5" compensation method fails.  This rounding issue ruins an
    // otherwise "correct" f_DCT algorithm.
    //
    // To correct the rounding problem, negative numbers are 'precompensated'
    // with '-0.5'.  Positive numbers continue to be 'precompensated' with
    // '+0.5'.  This policy produces accurate rounding, but is data-dependent :
    // The CPU must examine each input float-value in order to select the
    // proper rounding compensation-value (-0.5 or +0.5.)
    //
    // In practice, data-decision policies are to be avoided. 
    //
    // An alternative "fast-round"-algorithm offers faster performance at the
    // expense of accuracy. The float-value is first multiplied by a
    // factor:"2^N", where N is an integer.  Then, the scaled-float is
    // converted to integer using the standard CPU instruction (yes, the one
    // with the troublesome truncation-policy.)  Now, the scaled-integer is
    // summed with a scaled-precompensation value ("+2^(N-1)".)  During the
    // last step, the compensated-integer is descaled by right shifting
    // N-bits, yielding the final result.
    //
    // The forward(float)-scaling and back(int)-descaling effectively
    // "pushes" the rounding-inaccuracy far to the right of the decimal-point,
    // where its impact is less.  Using this method, negative#s still exhibit
    // a slight-shift (the shift is exactly equal to 2^(-N)), but with a
    // suitably large N, the shift is negligible.

    // For N=15, the shift is 1 part per 32768, or 1/32768 (~0.0000305)

    // As a final note, the -0.5/+0.5 (data-dependent) rounding policy can
    // be implemented using the partition-compare MMX instructions pcmpXXX.
    // Statistical tests for both precise and imprecise rounding policies 
    // were compared.  For N=15, the tests revealed no loss in precision
    // for the "fast-round" approach.


// row  col->  (source)        col->   (destination)
//  #    0 1 2 3 4 5 6 7            0 1 2 3   4 5 6 7
//  -    ---------------            -----------------
//  0    A I ...                    A B C D   E F G H 
//  1    B J ...                    I J K L   M N O P
//  2    C K ...(source)    ------>   (destination)
//  3    D L ...
//
//  4    E M ...
//  5    F N ...
//  6    G O ...
//  7    H P ...

#define mmRoundup mm7

     mov eax, dword ptr [dataptr2];// eax = upper half of source matrix
      mov edi, 0x04;               // edi = 'i' // for ( i = 4; i > 0; i=i-1 )

     lea ebx, dword ptr [local_aanscales];//ebx <= local_aanscales[] (table of constants)
      mov ecx, eax;                 // copy eax -> ecx

     movq mmRoundup, [mmMaskRnd]; // "round up" (+0.5,+0.5) mask
      add ecx, 128;                // ecx = lower half of source matrix

     mov edx, dword ptr [block];   // edx <= block[] (output)
      sub ebx, 64;

tdn_dct_postproc: // 3d_now_dct post-processing jump-point
     // Each loop iteration converts a 2x8 (colxrow) segment.  The loop
     // executes 4 times to produce the final 8x8 output matrix.

     movq mm0, qword ptr [eax+32*0];  // mm0 <= (1,0 : 0) [A,I]
      add ebx, 64;

     movq mm2, qword ptr [eax+32*1];  // mm2 <= (1,0 : 1) [B,J]
      movq mm1, mm0; // mm1 = copy of (1,0 : 0) [A,I]
  
     movq mm3, qword ptr [eax+32*2]; // mm3 <= (1,0 : 2) [C,K]
      punpckldq mm0,mm2; // mm0 = ( 0 : 1,0 ) [B,A]

     movq mm4, qword ptr [eax+32*3]; // mm4 <= (1,0 : 3) [D,L]
      punpckhdq mm1, mm2; // mm1 = ( 1 : 1,0 ) [J,I]

     movq mm2, mm3;  // mm2 = copy of (1,0 : 2) [C,K]
//      PFMULM( mm0, _ebx, 0 );  // mm0 <= scale ( 0 : 1,0 ) [B,A]
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0x43 //(((0xc0 & 0x3f) << 3) | 0x03 | 0x40)
      EMIT 0
      EMIT 0xb4

     punpckldq mm2, mm4; // mm2 = ( 0 : 3,2 ) [D,C]
//      PFMULM( mm1, _ebx, 0+32 ); // mm1 <= scale ( 1 : 1,0 ) [J,I]
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 | 0x40)
      EMIT 0+32
      EMIT 0xb4

     punpckhdq mm3, mm4; // mm3 = ( 1 : 3,2 ) [L,K]
//      PFMULM( mm2, _ebx, 8 ); // mm2 <= scale ( 0 : 3,2 ) [D,C]
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0x53 //(((0xc2 & 0x3f) << 3) | 0x03 | 0x40)
      EMIT 8
      EMIT 0xb4 

//     PFMULM( mm3, _ebx, 8+32 ); // mm3 <= scale ( 1: 3,2 ) [L,K]
//      PF2ID( mm0, mm0 ); // mm0 <= float -> integer [B,A]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0x5b //(((0xc3 & 0x3f) << 3) | 0x03 | 0x40)
     EMIT 8+32
     EMIT 0xb4

     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xc0 //((0xc0 & 0x3f) << 3) | 0xc0
     EMIT 0x1d 

//     PF2ID( mm1, mm1 ); // mm1 <= float -> integer [J,I]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1
     EMIT 0x1d

      add eax, 8;

//     PF2ID( mm2, mm2 ); // mm2 <= float -> integer [D,C]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xd2 //((0xc2 & 0x3f) << 3) | 0xc2
     EMIT 0x1d

      paddd  mm0, mmRoundup; // roundup compensation [B,A]

     paddd  mm1, mmRoundup; // roundup compensation [J,I]
//      PF2ID( mm3, mm3 ); // mm3 <= float -> integer [L,K]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xdb //((0xc3 & 0x3f) << 3) | 0xc3
     EMIT 0x1d

     paddd  mm2, mmRoundup; // roundup compensation [D,C]
      psrad mm0, PSCF_SHIFT; // DESCALE [B,A]

     paddd  mm3, mmRoundup; // roundup compensation [L,K]
      psrad mm2, PSCF_SHIFT; // DESCALE [D,C]

     packssdw mm0, mm2;     // form [D,C,B,A]
      psrad mm1, PSCF_SHIFT; // DESCALE [J,I]

     movq qword ptr [edx], mm0; // store [D,C,B,A]
      psrad mm3, PSCF_SHIFT; // DESCALE [L,K]

     packssdw mm1, mm3;    // form [L,K,J,I]
//      sub edi, 1;      // i = i - 1
//  row->0 1
//  col  ---
//  4    E M ...
//  5    F N ...
//  6    G O ...
//  7    H P ...

      movq mm0, qword ptr [ecx+32*0];  // mm0 <= (1,0 : 0) [E,M]

     movq qword ptr [edx+16], mm1; // store [L,K,J,I]

     movq mm2, qword ptr [ecx+32*1];  // mm2 <= (1,0 : 1) [F,N]
      movq mm1, mm0; // mm1 = copy of (1,0 : 0) [E,M]
  
     movq mm3, qword ptr [ecx+32*2]; // mm3 <= (1,0 : 2) [G,O]
      punpckldq mm0,mm2; // mm0 = ( 0 : 1,0 ) [F,E]

     movq mm4, qword ptr [ecx+32*3]; // mm4 <= (1,0 : 3) [H,P]
      punpckhdq mm1, mm2; // mm1 = ( 1 : 1,0 ) [N,M]

     movq mm2, mm3;  // mm2 = copy of (1,0 : 2) [G,O]
//      PFMULM( mm0, _ebx, 16);  // mm0 <= scale ( 0 : 1,0 ) [F,E]
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0x43 //(((0xc0 & 0x3f) << 3) | 0x03 | 0x40)
      EMIT 16
      EMIT 0xb4  

     punpckldq mm2, mm4; // mm2 = ( 0 : 3,2 ) [H,G]
//      PFMULM( mm1, _ebx, 16+32 ); // mm1 <= scale ( 1 : 1,0 ) [N,M]
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0x4b //(((0xc1 & 0x3f) << 3) | 0x03 | 0x40)
      EMIT 16+32
      EMIT 0xb4 

     punpckhdq mm3, mm4; // mm3 = ( 1 : 3,2 ) [P,O]
//      PFMULM( mm2, _ebx, 16+8); // mm2 <= scale ( 0 : 3,2 ) [H,G]
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0x53 //(((0xc2 & 0x3f) << 3) | 0x03 | 0x40)
      EMIT 16+8
      EMIT 0xb4 

//     PFMULM( mm3, _ebx, 16+8+32 ); // mm3 <= scale ( 1: 3,2 ) [P,O]
//      PF2ID( mm0, mm0 ); // mm0 <= float -> integer [F,E]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0x5b //(((0xc3 & 0x3f) << 3) | 0x03 | 0x40)
     EMIT 16+8+32
     EMIT 0xb4

     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xc0 //((0xc0 & 0x3f) << 3) | 0xc0
     EMIT 0x1d

//     PF2ID( mm1, mm1 ); // mm1 <= float -> integer [N,M]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xc9 //((0xc1 & 0x3f) << 3) | 0xc1
     EMIT 0x1d

      add ecx, 8;

//     PF2ID( mm2, mm2 ); // mm2 <= float -> integer [H,G]
     EMIT 0x0f
     EMIT 0x0f
     EMIT 0xd2 //((0xc2 & 0x3f) << 3) | 0xc2
     EMIT 0x1d

      paddd  mm0, mmRoundup; // roundup compensation [F,E]

     paddd  mm1, mmRoundup; // roundup compensation [N,M]
//      PF2ID( mm3, mm3 ); // mm3 <= float -> integer [P,O]
      EMIT 0x0f
      EMIT 0x0f
      EMIT 0xdb //((0xc3 & 0x3f) << 3) | 0xc3
      EMIT 0x1d 

     paddd  mm2, mmRoundup; // roundup compensation [H,G]
      psrad mm0, PSCF_SHIFT; // DESCALE [F,E]

     paddd  mm3, mmRoundup; // roundup compensation [P,O]
      psrad mm2, PSCF_SHIFT; // DESCALE [H,G]

     packssdw mm0, mm2;     // form [H,G,F,E]
      psrad mm1, PSCF_SHIFT; // DESCALE [N,M]

     movq qword ptr [edx+8], mm0; // store [H,G,F,E]
      psrad mm3, PSCF_SHIFT; // DESCALE [P,O]

     packssdw mm1, mm3;    // form [P,O,N,M]
      sub edi, 1;      // i = i - 1

     movq qword ptr [edx+8+16], mm1; // store [P,O,N,M]
      add edx, 32;

     cmp edi, 0x00;   // (i==0) ?

     jg tdn_dct_postproc;  // end for ( i = 4; i > 0; i = i - 1)
//     FEMMS;
     EMIT 0x0f
     EMIT 0x0e  // this is the FEMMS instruction (0F 0E) the compiler complains about
   };

}
fdct3dn.cpp - 源码说明

本页面展示了「这是一组DCT和iDCT的代码」中的 fdct3dn.cpp 源码文件，采用 C++ 编程语言编写，共 1,403 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与iDCT相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?