idct_ap922hybr.cpp

来自「这是一组DCT和iDCT的代码」· C++ 代码 · 共 986 行 · 第 1/3 页
CPP
986 行
    cmp edi, 8;
     psraw mm7, DESCALE_SHIFT2;    // (6e) clip y[6,7] to {-256,+255}

    movd [ OUTC + 5*16], mm5;    // (6f) store y[5]
     punpckhdq mm5, mm5;            ;// (6f) mm5 = [y0c1 y0c0 y0c1 y0c0] <int16>

    movd [ OUTC + 7*16], mm7;    // (6f) store y[7]
     punpckhdq mm7, mm7;            ;// (6f) mm7 = [y6c1 y6c0 y6c1 y6c0] <int16>

    movd [ OUTC + 0*16], mm5;    // (6f) store y[0]

    movd [ OUTC + 6*16], mm7;    // (6f) store y[6]

    jl acc_idct_colloop1;
  // end for ( x=0; x < 8; x=x+2 )

   FEMMS;
  }
}
*/

////////////////////////////////////////////////////////////////////////
//
//
// idct_ap922hybr_x87(), conventional x87 implementation of hybrid AP922
//
//
////////////////////////////////////////////////////////////////////////

//
// idct_ap922hybr_x87() - in-place 8x8 inverse DCT, plain C/C++ version of
// the hybrid AP922 algorithm.
//
//   data : 64 coefficients; row #i starts at data[ 8*i ].  On return the
//          same 64 entries hold the transform output, clipped to
//          {IDCT_CLIP_LOW, IDCT_CLIP_HIGH}.
//
//   Pass 1 (rows)    : 16-bit x 16-bit integer multiply/accumulate against
//                      tab_i_01234567hybr[]; each 32-bit sum is converted to
//                      float and written to fTempArray[].
//   Pass 2 (columns) : floating-point butterflies on fTempArray[], then
//                      SHIFT_ROUND_COLF() and a (short) cast back into data[].
//   Pass 3           : clip every element of data[].
//
//   NOTE(review): the scratch arrays below are static and fTempArray[] is
//   declared outside this function, so all working storage is shared between
//   calls; the routine is therefore not reentrant.
//
void
idct_ap922hybr_x87( short *data )
{
//  static tdnfloat fTempArray[64]; // intermediate (row-iDCT output) matrix
  static int ar[8], br[8];           // row pass: sums over even (ar) / odd (br)
                                     // inputs; only elements [0..3] are used
  static tdnfloat b[8], e[8], dp[8]; // column pass: intermediate results
  static tdnfloat tmp[2];            // column pass: scratch for the F5/F6 pair

//  static short const _one_corr=0x0001; 
  short *ti;    // pointer to IDCT row coefficient table
 
  short *x;     // pointer to input
  short *y;     // pointer to final output
  tdnfloat *yr, *xc; // yr: intermediate output row, xc: intermediate input column

  int i;

  ////////////////////////////////////////////////////////////////////////
  //
  // AP922 iDCT row transform (int32 version)
  //   Core transform code is very similar to the original AP922mmx iDCT_row
  //   except full 32-bit output is preserved (instead of shift/rnd to 16-bit)
  //    
  //
  // Row IDCT operator :A_T*M_T*P_T
  // Let Y=[output column data, 8 elements] 32-bit IEEE-754 float
  //     X=[input column data, 8 elements] 16-bit short integer
  //
  //     Y= [ A_T*M_T*P_T ] * X
  //
  //   (Y and X are both column vectors)


  for ( i = 0; i < 8; ++i ) // row iDCT
  {
    ti = (short*)&tab_i_01234567hybr[ i * 32 ]; // 32 table entries per row
    x = &data[ 8*i ];
    yr = ((tdnfloat *) fTempArray) + 8*i; // intermediate output, select row#i

    //
    // 1) Apply M_T * P_T operator 
    //    

    // The operation "output(32-bit) = x(16-bit) * y(16-bit)" produces a
    // 32-bit result.  Later, the 32-bit result is converted to float,
    // which means the prior integer-mult effectively scales up the
    // result by 65536 (left shift 16.)

    // ar[3..0], br[3..0] = 32-bit integers
    //
    // The AP922hybrid iDCT_row computes ar[3..0] & br[3..0] without
    // any post-shift/round.  As a result, the pmaddwd/paddd sequence
    // has the side-effect of upscaling by a factor of 2 (left shift by 1.)
    //
    // This effect is reflected in the prescale shift constant, which has
    // been adjusted upward from 16 to 17.
    // NOTE(review): the original comment spelled the constant PSCALE_SHIFT
    // here and PRESCALE_SHIFT in the descale note of the column pass; its
    // definition is not visible in this function - confirm the name.

    // ar[] combines the even-indexed inputs x[0], x[4], x[2], x[6] ...
    ar[0] = x[0] * ti[0]  + x[4] * ti[1]  + x[2] * ti[4]  + x[6] * ti[5];
    ar[1] = x[0] * ti[2]  + x[4] * ti[3]  + x[2] * ti[6]  + x[6] * ti[7];
    ar[2] = x[0] * ti[8]  + x[4] * ti[9]  + x[2] * ti[12] + x[6] * ti[13];
    ar[3] = x[0] * ti[10] + x[4] * ti[11] + x[2] * ti[14] + x[6] * ti[15];

    // ... br[] combines the odd-indexed inputs x[1], x[5], x[3], x[7]
    br[0] = x[1] * ti[16] + x[5] * ti[17] + x[3] * ti[20] + x[7] * ti[21];
    br[1] = x[1] * ti[18] + x[5] * ti[19] + x[3] * ti[22] + x[7] * ti[23];
    br[2] = x[1] * ti[24] + x[5] * ti[25] + x[3] * ti[28] + x[7] * ti[29];
    br[3] = x[1] * ti[26] + x[5] * ti[27] + x[3] * ti[30] + x[7] * ti[31];    


    // 2) Apply A_T operator, store outputs
    //
    //    1  0  0  0   1  0  0  0
    //    0  1  0  0   0  1  0  0
    //    0  0  1  0   0  0  1  0
    //    0  0  0  1   0  0  0  1
    //    0  0  0  1   0  0  0 -1
    //    0  0  1  0   0  0 -1  0
    //    0  1  0  0   0 -1  0  0
    //    1  0  0  0  -1  0  0  0

    //    Each sum/difference is formed as a 32-bit integer, converted to
    //    float, and stored in the intermediate array.

    yr[0] = (float)( ar[0] + br[0] ); // int32 sum -> float
    yr[1] = (float)( ar[1] + br[1] ); // int32 sum -> float
    yr[2] = (float)( ar[2] + br[2] ); // int32 sum -> float
    yr[3] = (float)( ar[3] + br[3] ); // int32 sum -> float

    yr[4] = (float)( ar[3] - br[3] ); // int32 difference -> float
    yr[5] = (float)( ar[2] - br[2] ); // int32 difference -> float
    yr[6] = (float)( ar[1] - br[1] ); // int32 difference -> float
    yr[7] = (float)( ar[0] - br[0] ); // int32 difference -> float
  } // end for( i = 0; i < 8; ++i ) // end of row iDCT

  //     AP922hybr iDCT row transform done 
  //
  ////////////////////////////////////////////////////////////////////////


  //////////////////////////////////////////////////////////////////////
  //
  // Column IDCT operator :A_T*(F_T*E_T*B_T*D_T)*P_T
  //  AP922hybrid and AP922float share *identical* column_idct operators
  //
  //  (minor difference in the final descale_shift : 
  //   17bits instead of 16bits.)
  //
  // Let Y=[output column data, 8 elements], 16-bit short integer
  //     X=[input column data, 8 elements], 32-bit IEEE-754 float
  //
  //     Y= [ A_T*(F_T*E_T*B_T*D_T)*P_T ] * X
  //
  //   (Y and X are both column vectors)

  for ( i = 0; i < 8; ++i ) // column iDCT
  {
    xc = ((tdnfloat *) fTempArray) + i; // select column #i (stride 8)
    y = &data[ i ];                     // output column #i (stride 8)

    // 1) Apply (D_T * P_T) - the cos() coefficients of D_T are implicit
    //    in the idct_row operation.  But we still need to apply the
    //    shuffling operation of D_T.
    //
    //    1  0  0  0   0  0  0  0
    //    0  0  0  0   1  0  0  0
    //    0  0  1  0   0  0  0  0
    //    0  0  0  0   0  0  1  0
    //    0  1  0  0   0  0  0  0
    //    0  0  0  0   0  0  0  1
    //    0  0  0  1   0  0  0  0
    //    0  0  0  0   0  1  0  0

    dp[0] = xc[ 0 *8];
    dp[1] = xc[ 4 *8];
    dp[2] = xc[ 2 *8];
    dp[3] = xc[ 6 *8];

    dp[4] = xc[ 1 *8];
    dp[5] = xc[ 7 *8];
    dp[6] = xc[ 3 *8];
    dp[7] = xc[ 5 *8];
 
    // 2) Apply B_T
    //
    //    1  1  0  0
    //    1 -1  0  0
    //    0  0  1 t2
    //    0  0 t2 -1
    //                1 t1  0  0
    //               t1 -1  0  0
    //                0  0  1 t3
    //                0  0 t3 -1
    //
    //    (t1, t2, t3 = tg_1_16f, tg_2_16f, tg_3_16f)
 
 
    b[0] =   dp[1]            + dp[0];
    b[1] =   dp[0]            - dp[1];

    b[2] = ( dp[3]*tg_2_16f ) + dp[2];
    b[3] = ( dp[2]*tg_2_16f ) - dp[3];

    b[4] = ( dp[5]*tg_1_16f ) + dp[4];
    b[5] = ( dp[4]*tg_1_16f ) - dp[5];

    b[6] = ( dp[7]*tg_3_16f ) + dp[6];
    b[7] = ( dp[6]*tg_3_16f ) - dp[7];
  
    // 3) Apply E_T
    //
    //    1  0  1  0
    //    0  1  0  1
    //    0  1  0 -1
    //    1  0 -1  0
    //                1  0  1  0
    //                1  0 -1  0
    //                0  1  0  1
    //                0  1  0 -1
 
    e[0] = b[0] + b[2];
    e[1] = b[1] + b[3];
    e[2] = b[1] - b[3];
    e[3] = b[0] - b[2];
    e[4] = b[4] + b[6];
    e[5] = b[4] - b[6];
    e[6] = b[5] + b[7];
    e[7] = b[5] - b[7];

    // 4) Apply F_T
    //
    //    F0..F4 and F7 are e[0..4] and e[7] unchanged.  F5 and F6 are
    //    replaced, in place in e[], by a butterfly scaled by c4 = cos_4_16f:
    //
    //      F5 = ( e[5] + e[6] ) * c4
    //      F6 = ( e[5] - e[6] ) * c4
    //
    //    Both results go through tmp[] first so that the old e[5]/e[6]
    //    are still available while the second product is formed.

#define _F0 e[0] 
#define _F1 e[1] 
#define _F2 e[2] 
#define _F3 e[3] 
#define _F4 e[4] 
#define _F5 e[5] 
#define _F6 e[6] 
#define _F7 e[7] 

    tmp[0] = (e[5] + e[6]) * cos_4_16f;
    tmp[1] = (e[5] - e[6]) * cos_4_16f;
    _F5 = tmp[0];
    _F6 = tmp[1];

    // 5) Apply A_T
    //
    //    1  0  0  0   1  0  0  0
    //    0  1  0  0   0  1  0  0
    //    0  0  1  0   0  0  1  0
    //    0  0  0  1   0  0  0  1
    //    0  0  0  1   0  0  0 -1
    //    0  0  1  0   0  0 -1  0
    //    0  1  0  0   0 -1  0  0
    //    1  0  0  0  -1  0  0  0
    //
    //    yfloat[0]= F0 + F4
    //    yfloat[1]= F1 + F5
    //    yfloat[2]= F2 + F6
    //    yfloat[3]= F3 + F7
    //           
    //    yfloat[4]= F3 - F7
    //    yfloat[5]= F2 - F6
    //    yfloat[6]= F1 - F5
    //    yfloat[7]= F0 - F4
    //
    //
    // 6) float -> int, shift/round to final output y[]
    //    The final shift&round operation reverses the row-input prescaling.
    //    It also applies the chosen rounding-mode (accurate or fast.)
    //
    //    Note, the C-code below differs *substantially* from the AMD_3DNOW
    //    implementation.  The 3D_Now code applies this basic sequence:
    //
    //       ;// mm0 = [y1 y0] <float32>
    //       ;// mm1 = [y3 y2] <float32>
    //
    //      PF2ID mm0, mm0;  // mm0 <= [y1 y0] <int32>
    //      PF2ID mm1, mm1;  // mm1 <= [y3 y2] <int32>
    //
    //       ;// "0.5" is a 32-bit integer constant scaled up by some bitshift
    //      paddd mm0, [rnd_compensation]; // mm0 <=[y1+"0.5" y0+"0.5"]
    //      paddd mm1, [rnd_compensation]; // mm1 <=[y3+"0.5" y2+"0.5"]
    //
    //      psrad mm0, DESCALE_SHIFT1;     // stage1 shift
    //      psrad mm1, DESCALE_SHIFT1;     // stage1 shift
    //
    //      packssdw mm0, mm1;  // mm0 <= [y3 y2 y1 y0] <int16>
    //
    //      psraw mm0, DESCALE_SHIFT2;  // clip y[] to the range {-256,+255}
    //
    //       ;// DESCALE_SHIFT1 + DESCALE_SHIFT2 = PRESCALE_SHIFT 
    //       ;//      10         +        7       =      16+1
    //
    //      movq [OUTC+...*8], mm0;
    //
    //    NOTE(review): here the (short) cast is applied before the clip
    //    pass at the end of the function, so that pass only ever sees
    //    values that have already been narrowed to 16 bits.


    y[0*8] = (short)SHIFT_ROUND_COLF( _F0 + _F4 );
    y[1*8] = (short)SHIFT_ROUND_COLF( _F1 + _F5 );
    y[2*8] = (short)SHIFT_ROUND_COLF( _F2 + _F6 );
    y[3*8] = (short)SHIFT_ROUND_COLF( _F3 + _F7 );

    y[4*8] = (short)SHIFT_ROUND_COLF( _F3 - _F7 );
    y[5*8] = (short)SHIFT_ROUND_COLF( _F2 - _F6 );
    y[6*8] = (short)SHIFT_ROUND_COLF( _F1 - _F5 );
    y[7*8] = (short)SHIFT_ROUND_COLF( _F0 - _F4 );

  } // end for ( i = 0; i < 8; ++i ) // end of column iDCT

  //    AP922hybr iDCT column transform done 
  //
  ////////////////////////////////////////////////////////////////////////

  ////////////////////////////////////////////////////////////////////////
  //    Post transform clipping
  //
  //    In standard-C, the output clip code adds significantly to 
  //    execution time.  

#define IDCT_CLIP_LOW -256  // IDCT output range is 9-bits
#define IDCT_CLIP_HIGH 255  // IDCT output range is 9-bits

  for ( i = 0; i < 64; ++i )
  { // clip output to {-256,+255}
    // A value below the lower bound cannot also exceed the upper bound,
    // so the second test is only needed when the first one fails.
    if ( data[ i ] < IDCT_CLIP_LOW )
    {
      data[ i ] = IDCT_CLIP_LOW;
    }
    else if ( data[ i ] > IDCT_CLIP_HIGH )
    {
      data[ i ] = IDCT_CLIP_HIGH;
    }
  }

}
idct_ap922hybr.cpp - 源码说明

本页面展示了「这是一组DCT和iDCT的代码」中的 idct_ap922hybr.cpp 源码文件，采用 C++ 编程语言编写，共 986 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与iDCT相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?