📄 fdct3dn.cpp
字号:
#include "StdAfx.h"
#include "..\Common.h"
/* fdct3dn_extracomments.c, forward discrete cosine transform, 3D-Now! implementation
*
* -----------------------------------
* All 3D-Now instructions converted to "_EMIT " primitives
* (thanks to Microsoft Visual C++ preprocessor compile option "/P")
* -----------------------------------
*
* This software is a derivative work of the MPEG Software Simulation Group.
* Use of this software is restricted by the distribution rights granted
* by the MPEG Software Simulation Group.
*
*
* AAN float-DCT (single-precision) with 3D-Now acceleration
* -----------------------------------------------------------------
* This file contains a Visual C++ 6 source-code listing for a
* 3D-Now implementation of the AAN-DCT algorithm. The AAN algorithm
* is one of several fast-approaches to perform a discrete-cosine
* transform on an 8x8 block of pixel-data. This code was developed
* and tested under Visual C++ 6.0 Professional Edition. You will
* need the "amd3dx.h" header-file from AMD's SDK to compile this
* code as-is.
*
* The 3D-Now implementation was converted from BBMPEG's floating
* point AAN-DCT. The 3D-Now implementation is mathematically identical
* except for 2 differences :
*
* (1) 3D-Now implementation is single-precision (32-bit)
*
* (2) 3D-Now implementation introduces a slight (1/32768)
* negative offset for the DCT output elements which
* are negative. Please see the embedded code comments for
* more information. The offset occurs during the final
* post-scaling/rounding step. It can be eliminated at the
* expense of slower execution. However, the amount of the
* offset is insignificant compared to the overall
* systematic error of the AAN.
*
* The code-listing is compatible with all 3D-Now capable processors
* (AMD K6/2 and later, IDT Winchip2, Cyrix M3.) The code has only been
* thoroughly tested on an AMD K6/2-500.
*
* Although the code is reasonably well optimized, not all optimization
* possibilities were implemented, as many of them would render
* the listing virtually unreadable.
* Some segments could be shortened by using the DSP-extensions of
* the K7-Athlon ("3D-Now enhanced.") Of particular interest are PFPNACC,
* PSWAPD, and PSHUFW.
*
*
* Revision history
* ----------------
*
* v1.0 Single-precision floating point AAN-DCT (from BBMPEG)
* using the AMD 3D-Now! instruction-set. Tested on AMD K6/2.
*
*
* liaor@iname.com v1.0 07/13/2000
*
* Other references/links:
* -----------------------
*
* BBMPEG (Brent Beyeler) : http://members.home.net/beyeler
* IEEE-1180 test info : http://www.mpeg.org
* AMD SDK : http://www.amd.com
*/
/* Original AAN-DCT from <mpeg2codec v1.2> copyright (C) 1996,
MPEG Software Simulation Group. All Rights Reserved. */
/*
* Disclaimer of Warranty
*
* These software programs are available to the user without any license fee or
* royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
* any and all warranties, whether express, implied, or statuary, including any
* implied warranties or merchantability or of fitness for a particular
* purpose. In no event shall the copyright-holder be liable for any
* incidental, punitive, or consequential damages of any kind whatsoever
* arising from the use of these programs.
*
* This disclaimer of warranty extends to the user of these programs and user's
* customers, employees, agents, transferees, successors, and assigns.
*
* The MPEG Software Simulation Group does not represent or warrant that the
* programs furnished hereunder are free of infringement of any third-party
* patents.
*
* Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
* are subject to royalty fees to patent holders. Many of these patents are
* general enough such that they are unavoidable regardless of implementation
* design.
*
*/
/* A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
* on each column. Direct algorithms are also available, but they are
* much more complex and seem not to be any faster when reduced to code.
*
* This implementation is based on Arai, Agui, and Nakajima's algorithm for
* scaled DCT. Their original paper (Trans. IEICE E-71(11):1095) is in
* Japanese, but the algorithm is described in the Pennebaker & Mitchell
* JPEG textbook (see REFERENCES section in file README). The following code
* is based directly on figure 4-8 in P&M.
* While an 8-point DCT cannot be done in less than 11 multiplies, it is
* possible to arrange the computation so that many of the multiplies are
* simple scalings of the final outputs. These multiplies can then be
* folded into the multiplications or divisions by the JPEG quantization
* table entries. The AA&N method leaves only 5 multiplies and 29 adds
* to be done in the DCT itself.
* The primary disadvantage of this method is that with a fixed-point
* implementation, accuracy is lost due to imprecise representation of the
* scaled quantization values. However, that problem does not arise if
* we use floating point arithmetic.
*/
// EMIT names the inline-assembler directive that places a raw byte in the
// instruction stream: `db` when building with Borland C++, `_emit` otherwise.
// (Per the file header, the 3D-Now instructions were converted to emit
// primitives, so the AMD macro header below is no longer included.)
#ifdef __BORLANDC__
#define EMIT db
#else
#define EMIT _emit
#endif
#include<math.h> // for floor() function
//#include "amd3dx.h" // macros-file from AMD 3D-Now SDK, http://www.amd.com
// 8x8 table of post-scale factors, filled in by init_fdct_3dnow():
// entry [(i<<3)+j] = PSCF_SCALE / (aansf[i] * aansf[j] * 8.0).
static float local_aanscales[64];
// PSCF_xxx is for postscaling/rounding operation.
//   PSCF_SCALE : extra gain folded into every local_aanscales[] entry.
//   PSCF_SHIFT : log2(PSCF_SCALE), i.e. 2^15 == 32768.
//                NOTE(review): presumably the right-shift count applied after
//                float->int conversion; the use site is not in this section.
//   PSCF_MASK  : 64-bit constant holding two identical dwords; it initializes
//                mmMaskRnd (the post "round-up" mask) in fdct_3dnow().
// For 'accurate-rounding' (see C-code), use these alternate definitions
/*
#define PSCF_SCALE 1.0
#define PSCF_SHIFT 0
#define PSCF_MASK 0x0000000000000000 // [+0,+0] dword,dword
*/
#define PSCF_SCALE 32768.0
#define PSCF_SHIFT 15
#define PSCF_MASK 0x0000400000004000 // [+16384,+16384] dword,dword (0x4000 == PSCF_SCALE/2, i.e. +0.5 at the 2^15 scale)
// Cosine constants. NC_COS1SQRT2..NC_COS7SQRT2 populate the aansf[] table in
// init_fdct_3dnow(); NC_R_SQRT2, NC_COS2SQRT2, NC_COS6SQRT2 and NC_COS6 populate
// the CONSTANTS[] multiplier table in fdct_3dnow().
#define NC_COS6 0.382683432365089771728459984030399//cos(6*pi/16)
#define NC_R_SQRT2 0.707106781186547524400844362104849// 1/sqrt(2)
#define NC_COS1SQRT2 1.38703984532214746182161919156644 //cos(1*pi/16)*sqrt(2)
#define NC_COS2SQRT2 1.30656296487637652785664317342719 //cos(2*pi/16)*sqrt(2)
#define NC_COS3SQRT2 1.17587560241935871697446710461126 //cos(3*pi/16)*sqrt(2)
#define NC_COS5SQRT2 0.785694958387102181277897367657217//cos(5*pi/16)*sqrt(2)
#define NC_COS6SQRT2 0.541196100146196984399723205366389//cos(6*pi/16)*sqrt(2)
#define NC_COS7SQRT2 0.275899379282943012335957563669373//cos(7*pi/16)*sqrt(2)
// Fills local_aanscales[] with the 64 post-scaling factors used by the
// 3D-Now forward DCT. Takes no arguments and returns nothing; its only
// effect is to overwrite the file-level table. The table is written in
// row-major order: entry (row*8 + col) receives
//     PSCF_SCALE / (aansf[row] * aansf[col] * 8.0)
// computed in double precision and then narrowed to float.
void init_fdct_3dnow()
{
    // Per-index scale factors: aansf[0] = 1, aansf[k] = cos(k*pi/16) * sqrt(2).
    static const double aansf[8] = {
        1.0,
        NC_COS1SQRT2,
        NC_COS2SQRT2,
        NC_COS3SQRT2,
        1.0, // cos(4*pi/16) * sqrt(2) = 1.0 exactly
        NC_COS5SQRT2,
        NC_COS6SQRT2,
        NC_COS7SQRT2
    };
    float *dst = local_aanscales; // walks the table front to back
    int row, col;

    for (row = 0; row < 8; ++row)
    {
        const double rowFactor = aansf[row];

        for (col = 0; col < 8; ++col)
        {
            // PSCF_SCALE is folded in here; without it the entry would be
            // 1.0 / (aansf[row] * aansf[col] * 8.0).
            *dst++ = (float)((double)(PSCF_SCALE) / (rowFactor * aansf[col] * 8.0));
        }
    }
}
void fdct_3dnow(short *block)
{
static float data[64]; // temporary data array
static __int64 tmp3tmp2, tmp0tmp1, tmp7tmp6, tmp4tmp5;
static __int64 tmp10tmp11, tmp13tmp12, tmp15tmp14, tmp17tmp16;
// static __int64 z2z3, z5z4;
static const __int64 mmMask00001000 = 0x80000000;
static const __int64 mmMaskRnd = PSCF_MASK; // post "round-up" mask
// multiplication constants for DCT processing
static const float CONSTANTS[] = {
- (float)NC_R_SQRT2, (float)NC_R_SQRT2, //-0.7071,0.7071 --> [ 0.7071,-0.7071 ]
(float)NC_COS2SQRT2, (float)NC_COS6SQRT2, // 1.3066,0.5412 --> [ 0.5412, 1.3066 ]
(float)NC_R_SQRT2, (float)NC_COS6 // 0.7071,0.3827 --> [ 0.3827, 0.7071 ]
};
/*
debugging variables
float *tmp0 = (float *) &(tmp0tmp1);
float *tmp1 = (float *) &(tmp0tmp1);
float *tmp2 =(float *) &(tmp3tmp2);
float *tmp3 =(float *) &(tmp3tmp2);
float *tmp4 =(float *) &(tmp4tmp5);
float *tmp5 =(float *) &(tmp4tmp5);
float *tmp6 =(float *) &(tmp7tmp6);
float *tmp7 = (float *) &(tmp7tmp6);
float *tmp10 = (float *) &(tmp10tmp11);
float *tmp11 = (float *) &(tmp10tmp11);
float *tmp12 = (float *) &(tmp13tmp12);
float *tmp13 = (float *) &(tmp13tmp12);
float *tmp14 = (float *) &(tmp15tmp14);
float *tmp15 = (float *) &(tmp15tmp14);
float *tmp16 = (float *) &(tmp17tmp16);
float *z2 = (float *) &(z2z3);
float *z3 = (float *) &(z2z3);
float *z4 = (float *) &(z5z4);
float *z5 = (float *) &(z5z4);
float z1, z11, z13;
*/
static float *dataptr, *dataptr2;
// short *blkptr;
// int i,j;
/*
debugging pointer variables, generate proper address for the following
pointers
++tmp0;
++tmp3;
++tmp4;
++tmp7;
++tmp10;
++tmp13;
++tmp15;
++z2;
++z5;
*/
// The initialization routine init_fdct_3dnow() must be called prior to
// using this function!
/*************************************************************
*
* Pass 1: process rows, transpose intermediate result
*
*************************************************************/
//
// blkptr = block;
dataptr = data;
dataptr2 = data;
// for (i = 0; i < 8; i++)
// {
/*
*tmp0 = blkptr[0] + blkptr[7];
*tmp7 = blkptr[0] - blkptr[7];
*tmp1 = blkptr[1] + blkptr[6];
*tmp6 = blkptr[1] - blkptr[6];
*tmp2 = blkptr[2] + blkptr[5];
*tmp5 = blkptr[2] - blkptr[5];
*tmp3 = blkptr[3] + blkptr[4];
*tmp4 = blkptr[3] - blkptr[4];
*/
__asm {
//////////
// tdn_dct_row1 computes the fDCT for 1 input-row.
// tdn_dct_row1 transposes the output, so result is stored as column.
// sio
// source data is assumed
mov eax, dword ptr [block];
pxor mm7, mm7; // mm7 <= 0x0000_0000_0000_0000
mov edx, dword ptr [dataptr];// edx <= &dataptr[0]
mov edi, 0x08; // edi = 'i' // for ( i = 8; i > 0; i=i+1 )
lea ebx, dword ptr [CONSTANTS];// ebx <= &CONSTANTS[0]
tdn_dct_row1: // 3d_now_dct_row1 loop-point
movq mm5, qword ptr [eax]; // mm5 <= blkptr[i3_i2_i1_i0]
pxor mm0, mm0; // clear mm0
movq mm6, qword ptr [eax+8]; // mm6 <= blkptr[i7_i6_i5_i4]
pxor mm2, mm2; // clear mm2
movq mm1, mm5; // mm1 <= blkptr[i3_i2_i1_i0]
punpckhwd mm0, mm5; // mm0 <= produce [i3,00,i2,00] (sword)
movq mm3, mm6; // mm3 <= blkptr[i7_i6_i5_i4]
psrlq mm5, 16; // mm5 <= blkptr[00_i3_i2_i1]
punpckhwd mm2, mm6; // mm2 <= produce [i7,00,i6,00] (sword)
psrlq mm6, 16; // mm6 <= blkptr[00_i7_i6_i5]
punpcklwd mm5, mm1; // mm5 <= [i1,i2,i0,i1] (sword)
pxor mm1, mm1; // clear mm1
psrad mm0, 16;
punpcklwd mm1, mm5; // mm1 <= produce [i0,00,i1,00] (sword)
punpcklwd mm6, mm3; // mm6 <= [i5,i6,i4,i5] (sword)
pxor mm3, mm3; // clear mm3
psrad mm2, 16;
punpcklwd mm3, mm6; // mm3 <= produce [i4,00,i5,00] (sword)
psrad mm3, 16;
add eax, 16; // dataptr += 8 (short ints)
psrad mm1, 16;
// 1a) [in7,in6] = [blkptr7, blkptr6]; // dword, dword
// 1b) [in4,in5] = [blkptr4, blkptr5]; //dword, dword
// 1c) [in0,in1] = [blkptr0, blkptr1]; // dword,dword
// 1d) [in3,in2] = [blkptr3, blkptr2]; //dword, dword
#define in7in6 mm2
#define in4in5 mm3
#define in0in1 mm1
#define in3in2 mm0
#define in0in1_2 mm4
#define in3in2_2 mm5
movq in3in2_2, in3in2; // 2nd copy of in3in2
movq in0in1_2, in0in1; // 2nd copy of in0in1
// tmp0= inptr[0] + inptr[7];
// tmp1= inptr[1] + inptr[6];
// tmp7= inptr[0] - inptr[7];
// tmp6= inptr[1] - inptr[6];
// tmp3= inptr[3] + inptr[4];
// tmp2= inptr[2] + inptr[5];
// tmp4= inptr[3] - inptr[4];
// tmp5= inptr[2] - inptr[5];
#define t0t1 in0in1
#define t3t2 in3in2
#define t4t5 in3in2_2
#define t7t6 in0in1_2
// st1_0 <= [tmp0,tmp1] (float, float)
// st1_1 <= [tmp7,tmp6] (float, float)
// st1_2 <= [tmp3,tmp2] (float, float)
// st1_3 <= [tmp4,tmp5] (float, float)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -