⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fdct3dn.cpp

📁 这是一组DCT和iDCT的代码
💻 CPP
📖 第 1 页 / 共 4 页
字号:
#include "StdAfx.h"
#include "..\Common.h"

/* fdct3dn_extracomments.c, forward discrete cosine transform, 3D-Now! implementation
 *
 * -----------------------------------
 * All 3D-Now instructions converted to "_EMIT " primitives
 * (thanks to Microsoft Visual C++ preprocessor compile option "/P")
 * -----------------------------------
 *
 * This software is a derivative work of the MPEG Software Simulation Group.
 * Use of this software is restricted by the distribution rights granted
 * by the MPEG Software Simulation Group.
 * 
 *
 * AAN float-DCT (single-precision) with 3D-Now acceleration
 * -----------------------------------------------------------------
 * This file contains a Visual C++ 6 source-code listing for a
 * 3D-Now implementation of the AAN-DCT algorithm.  The AAN algorithm
 * is one of several fast-approaches to perform a discrete-cosine 
 * transform on an 8x8 block of pixel-data.  This code was developed
 * and tested under Visual C++ 6.0 Professional Edition.  You will
 * need the "amd3dx.h" header-file from AMD's SDK to compile this
 * code as-is.
 *
 * The 3D-Now implementation was converted from BBMPEG's floating
 * point AAN-DCT.  The 3D-Now implementation is mathematically identical
 * except for 2 differences :
 * 
 *         (1) 3D-Now implementation is single-precision (32-bit)
 *
 *         (2) 3D-Now implementation introduces a slight (1/32768)
 *             negative offset for the DCT output elements which
 *             are negative.  Please see the embedded code comments for
 *             more information.  The offset occurs during the final
 *             post-scaling/rounding step.  It can be eliminated at the
 *             expense of slower execution.  However, the amount of the
 *             offset is insignificant compared to the overall 
 *             systematic error of the AAN.
 *
 * The code-listing is compatible with all 3D-Now capable processors
 * (AMD K6/2 and later, IDT Winchip2, Cyrix M3.)  The code has only been 
 * thoroughly tested on an AMD K6/2-500.
 * 
 * Although the code is reasonably well optimized, not all optimization 
 * possibilities were implemented, as many of them would render
 * the listing virtually unreadable.  
 * Some segments could be shortened by using the DSP-extensions of 
 * the K7-Athlon ("3D-Now enhanced.")  Of particular interest are PFPNACC,
 * PSWAPD, and PSHUFW.
 *
 *
 * Revision history
 * ----------------
 *
 * v1.0  Single-precision floating point AAN-DCT (from BBMPEG)
 *       using the AMD 3D-Now! instruction-set.  Tested on AMD K6/2.
 *
 *
 *  liaor@iname.com v1.0 07/13/2000
 *
 * Other references/links:
 * -----------------------
 *
 *  BBMPEG (Brent Beyeler) : http://members.home.net/beyeler
 *  IEEE-1180 test info    : http://www.mpeg.org
 *  AMD SDK                : http://www.amd.com
 */

/* Original AAN-DCT from <mpeg2codec v1.2> copyright (C) 1996, 
   MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

/* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on Arai, Agui, and Nakajima's algorithm for
 * scaled DCT.  Their original paper (Trans. IEICE E-71(11):1095) is in
 * Japanese, but the algorithm is described in the Pennebaker & Mitchell
 * JPEG textbook (see REFERENCES section in file README).  The following code
 * is based directly on figure 4-8 in P&M.
 * While an 8-point DCT cannot be done in less than 11 multiplies, it is
 * possible to arrange the computation so that many of the multiplies are
 * simple scalings of the final outputs.  These multiplies can then be
 * folded into the multiplications or divisions by the JPEG quantization
 * table entries.  The AA&N method leaves only 5 multiplies and 29 adds
 * to be done in the DCT itself.
 * The primary disadvantage of this method is that with a fixed-point
 * implementation, accuracy is lost due to imprecise representation of the
 * scaled quantization values.  However, that problem does not arise if
 * we use floating point arithmetic.
 */
// EMIT expands to the keyword used to emit raw bytes from inline assembly:
// `db` when __BORLANDC__ is defined, `_emit` for every other compiler.
// (The file header states the 3D-Now instructions were converted to emit
// primitives; the uses of EMIT are presumably further down, outside this view.)
#ifdef __BORLANDC__

#define EMIT db

#else

#define EMIT _emit

#endif

#include<math.h> // for floor() function
//#include "amd3dx.h" // macros-file from AMD 3D-Now SDK, http://www.amd.com

// Post-scaling table, one entry per coefficient of the 8x8 block.
// Filled by init_fdct_3dnow() as
//   local_aanscales[8*i + j] = PSCF_SCALE / (aansf[i] * aansf[j] * 8)
static float local_aanscales[64];

// PSCF_xxx is for the postscaling/rounding operation.
// For 'accurate-rounding' (see C-code), use these alternate definitions
/*
#define PSCF_SCALE 1.0
#define PSCF_SHIFT 0
#define PSCF_MASK  0x0000000000000000 // [+0,+0] dword,dword
*/
// Default (fast) variant: the scale table is pre-multiplied by 2^15.
// PSCF_SHIFT matches that factor (2^15 == 32768).
// Each dword of PSCF_MASK is 0x4000 == 16384 == PSCF_SCALE/2 (an earlier
// comment said +32768, which does not match the literal).
// NOTE(review): presumably the round-to-nearest bias added before the
// PSCF_SHIFT right-shift; the code that uses it is outside this view -- confirm.
#define PSCF_SCALE 32768.0
#define PSCF_SHIFT 15
#define PSCF_MASK  0x0000400000004000 // [+16384,+16384] dword,dword


// Numeric constants, written with more digits than a double can hold.
#define NC_COS6      0.382683432365089771728459984030399// cos(6*pi/16)

#define NC_R_SQRT2   0.707106781186547524400844362104849// 1/sqrt(2)

// cos(k*pi/16)*sqrt(2) for k = 1,2,3,5,6,7 (k = 4 is 1.0 and has no macro).
#define NC_COS1SQRT2 1.38703984532214746182161919156644 // cos(1*pi/16)*sqrt(2)
#define NC_COS2SQRT2 1.30656296487637652785664317342719 // cos(2*pi/16)*sqrt(2)
#define NC_COS3SQRT2 1.17587560241935871697446710461126 // cos(3*pi/16)*sqrt(2)
#define NC_COS5SQRT2 0.785694958387102181277897367657217// cos(5*pi/16)*sqrt(2)
#define NC_COS6SQRT2 0.541196100146196984399723205366389// cos(6*pi/16)*sqrt(2)
#define NC_COS7SQRT2 0.275899379282943012335957563669373// cos(7*pi/16)*sqrt(2)


/*
 * init_fdct_3dnow
 *
 * Fills the 64-entry local_aanscales[] table with the post-scaling factors
 *
 *     local_aanscales[8*u + v] = PSCF_SCALE / (axisScale[u] * axisScale[v] * 8)
 *
 * where axisScale[k] = cos(k*pi/16)*sqrt(2) for k = 1..7 and axisScale[0] = 1.
 * The quotient is evaluated in double precision and then narrowed to float.
 * When PSCF_SCALE is defined as 1.0 each entry is simply
 * 1 / (axisScale[u] * axisScale[v] * 8).
 *
 * Takes no arguments and returns nothing; its only effect is writing the table.
 */
void init_fdct_3dnow()
{
  // Per-axis AAN scale factors, indexed by frequency 0..7.
  static const double axisScale[8] = {
    1.0,
    NC_COS1SQRT2,
    NC_COS2SQRT2,
    NC_COS3SQRT2,
    1.0,           // cos(4*pi/16) * sqrt(2) = 1.0 exactly
    NC_COS5SQRT2,
    NC_COS6SQRT2,
    NC_COS7SQRT2
  };
  int idx;

  // Walk the table linearly: the upper three bits of idx select the first
  // factor, the lower three bits select the second one.
  for (idx = 0; idx < 64; ++idx)
  {
    const double rowFactor = axisScale[idx >> 3];
    const double colFactor = axisScale[idx & 7];

    local_aanscales[idx] =
        (float)((double)(PSCF_SCALE) / (rowFactor * colFactor * 8.0));
  }
}



void fdct_3dnow(short *block)
{
  static float data[64]; // temporary data array
  static __int64  tmp3tmp2, tmp0tmp1, tmp7tmp6, tmp4tmp5;
  static __int64  tmp10tmp11, tmp13tmp12, tmp15tmp14, tmp17tmp16;
//  static __int64  z2z3, z5z4;
  static const __int64 mmMask00001000 = 0x80000000; 
  static const __int64 mmMaskRnd = PSCF_MASK; // post "round-up" mask

  // multiplication constants for DCT processing
  static const float CONSTANTS[] = { 
      - (float)NC_R_SQRT2, (float)NC_R_SQRT2,     //-0.7071,0.7071 --> [ 0.7071,-0.7071 ]
        (float)NC_COS2SQRT2, (float)NC_COS6SQRT2, // 1.3066,0.5412 --> [ 0.5412, 1.3066 ]
        (float)NC_R_SQRT2, (float)NC_COS6         // 0.7071,0.3827 --> [ 0.3827, 0.7071 ]
  };

/*
  debugging variables 
  float *tmp0 = (float *) &(tmp0tmp1);
  float *tmp1 = (float *) &(tmp0tmp1);
  float *tmp2 =(float *) &(tmp3tmp2);
  float *tmp3 =(float *) &(tmp3tmp2);
  float *tmp4 =(float *) &(tmp4tmp5);
  float *tmp5 =(float *) &(tmp4tmp5);
  float *tmp6 =(float *) &(tmp7tmp6);
  float *tmp7 = (float *) &(tmp7tmp6);
  float *tmp10 = (float *) &(tmp10tmp11);
  float *tmp11 = (float *) &(tmp10tmp11);
  float *tmp12 = (float *) &(tmp13tmp12);
  float *tmp13 = (float *) &(tmp13tmp12);
  float *tmp14 = (float *) &(tmp15tmp14);
  float *tmp15 = (float *) &(tmp15tmp14);
  float *tmp16 = (float *) &(tmp17tmp16);
  float *z2    = (float *) &(z2z3);
  float *z3    = (float *) &(z2z3);
  float *z4    = (float *) &(z5z4);
  float *z5    = (float *) &(z5z4);
  float  z1, z11, z13;
*/
  static float  *dataptr, *dataptr2;
//  short *blkptr;
//  int i,j;

/*
  debugging pointer variables, generate proper address for the following
  pointers
  ++tmp0;
  ++tmp3;
  ++tmp4;
  ++tmp7;

  ++tmp10;
  ++tmp13;
  ++tmp15;

  ++z2;
  ++z5;
*/

  // The initialize routine init_fdct() must be called prior to
  // using this function!

  /*************************************************************
   *
   * Pass 1: process rows, transpose intermediate result
   *
   *************************************************************/
  // 
 
//  blkptr = block;
  dataptr = data;  
  dataptr2 = data;

//  for (i = 0; i < 8; i++)
//  {
    /*
    *tmp0 = blkptr[0] + blkptr[7];
    *tmp7 = blkptr[0] - blkptr[7];
    *tmp1 = blkptr[1] + blkptr[6];
    *tmp6 = blkptr[1] - blkptr[6];
    *tmp2 = blkptr[2] + blkptr[5];
    *tmp5 = blkptr[2] - blkptr[5];
    *tmp3 = blkptr[3] + blkptr[4];
    *tmp4 = blkptr[3] - blkptr[4];
*/
     __asm {
     //////////
     // tdn_dct_row1 computes the fDCT for 1 input-row.
     // tdn_dct_row1 transposes the output, so result is stored as column.
     // sio
     // source data is assumed



     mov eax, dword ptr [block]; 
      pxor mm7, mm7;              // mm7 <= 0x0000_0000_0000_0000

     mov edx, dword ptr [dataptr];// edx <= &dataptr[0]
      mov edi, 0x08;               // edi = 'i' // for ( i = 8; i > 0; i=i+1 )

     lea ebx, dword ptr [CONSTANTS];// ebx <= &CONSTANTS[0]

tdn_dct_row1: // 3d_now_dct_row1 loop-point

     movq mm5, qword ptr [eax];   // mm5 <= blkptr[i3_i2_i1_i0]
      pxor mm0, mm0;              // clear mm0

     movq mm6, qword ptr [eax+8]; // mm6 <= blkptr[i7_i6_i5_i4]
      pxor mm2, mm2;              // clear mm2

     movq mm1, mm5;               // mm1 <= blkptr[i3_i2_i1_i0]
      punpckhwd mm0, mm5;        // mm0 <= produce [i3,00,i2,00] (sword)

     movq mm3, mm6;               // mm3 <= blkptr[i7_i6_i5_i4]
      psrlq mm5, 16;              // mm5 <= blkptr[00_i3_i2_i1]
      
     punpckhwd mm2, mm6;         // mm2 <= produce [i7,00,i6,00] (sword)
      psrlq mm6, 16;              // mm6 <= blkptr[00_i7_i6_i5]

     punpcklwd mm5, mm1;          // mm5 <= [i1,i2,i0,i1] (sword)
      pxor mm1, mm1;              // clear mm1

     psrad mm0, 16;
      punpcklwd mm1, mm5;         // mm1 <= produce [i0,00,i1,00] (sword)

     punpcklwd mm6, mm3;          // mm6 <= [i5,i6,i4,i5] (sword)
      pxor mm3, mm3;              // clear mm3

     psrad mm2, 16;
      punpcklwd mm3, mm6;        // mm3 <= produce [i4,00,i5,00] (sword)

     psrad mm3, 16;

     add eax, 16;                // dataptr += 8 (short ints)
     psrad mm1, 16;

// 1a) [in7,in6] = [blkptr7, blkptr6]; // dword, dword
// 1b) [in4,in5] = [blkptr4, blkptr5]; //dword, dword
// 1c) [in0,in1] = [blkptr0, blkptr1]; // dword,dword
// 1d) [in3,in2] = [blkptr3, blkptr2]; //dword, dword

#define in7in6 mm2
#define in4in5 mm3
#define in0in1 mm1
#define in3in2 mm0
#define in0in1_2 mm4
#define in3in2_2 mm5

    movq in3in2_2, in3in2; // 2nd copy of in3in2
     movq in0in1_2, in0in1;  // 2nd copy of in0in1

// tmp0= inptr[0]                         +                         inptr[7];
// tmp1=         inptr[1]                 +                 inptr[6];

// tmp7= inptr[0]                         -                         inptr[7];
// tmp6=         inptr[1]                 -                 inptr[6];

// tmp3=                         inptr[3] + inptr[4];
// tmp2=                 inptr[2]         +         inptr[5];

// tmp4=                         inptr[3] - inptr[4];
// tmp5=                 inptr[2]         -         inptr[5];



#define t0t1 in0in1
#define t3t2 in3in2
#define t4t5 in3in2_2
#define t7t6 in0in1_2
//  st1_0 <= [tmp0,tmp1]  (float, float)
//  st1_1 <= [tmp7,tmp6]  (float, float)
//  st1_2 <= [tmp3,tmp2]  (float, float)
//  st1_3 <= [tmp4,tmp5]  (float, float)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -