#include "StdAfx.h"
#pragma warning(once:4305 4244)
/*
---------------------------------------------------------
idct_ap922tdn.c - AP922float iDCT, 3DNow implementation
---------------------------------------------------------
AP-922float iDCT (with AMD 3D-Now! acceleration)
------------------------------------------------
Intel Application Note AP-922 discusses an MMX implementation of
the MPEG fDCT/iDCT operations. "AP922float" is a floating-point
adaptation of the AP922 algorithm.
The 3DNow implementation requires a 3D-Now capable CPU, such as
the AMD K6-2 or Athlon. The provided assembly code works with Visual C++
6.0 Pro's inline-assembler (with the help of 'amd3dx.h'). The
Microsoft Visual C++ Processor Pack Beta isn't needed to compile this
file.
In addition to the 3D-Now assembly listing, this file includes an
"equivalent C-code listing." The "equivalent C-code" is functionally
identical to ap922float_x87. However, the two versions differ in the
ordering of the iDCT coefficient table w[]. The 3D-Now ordering is more
'natural' for standard x87 execution, and may run slightly faster than
the other X87 listing located in ap922x87.c
For reasons documented below, the C-code and 3D-Now assembly code in
this file potentially generate different output:
Numerical Issue #1 : float->int conversion
------------------------------------------
Currently, the 3D-Now instruction set supports only truncation
for float->int conversion; accurate_rounding is therefore disabled
in the 3D-Now implementation.
To minimize the truncation error, this version of AP922 uses the
same strategy discussed in idct_ap922x87.c; please refer to that file.
(For int->float conversion, AMD's microarchitecture does not have the
same instruction-pairing limitations as Intel's Pentium3. The
standard 3D-Now instruction set lacks "pshufw", so the coefficient
table w[] has been reordered to minimize the amount of input x[]
data-shuffling.)
Numerical Issue #2 : 3D-Now precision vs X87 FPU
------------------------------------------------
On 80x86 CPUs, FPU add/multiply operations are always calculated with
80-bit internal precision. Therefore, all other things being equal, the
X87 DCT will be more accurate than the AMD 3D-Now version, even when the
X87 DCT stores intermediate results in 32-bit float.
------------------------------------------
LEGAL MUMBO-JUMBO : Disclaimer of Warranty
------------------------------------------
This software is available to the user without any license fee or
royalty on an "as is" basis. The author disclaims any and all warranties,
whether express, implied, or statutory, including any implied warranties
of merchantability or of fitness for a particular purpose. In no event shall
the copyright-holder be liable for any incidental, punitive, or
consequential damages of any kind whatsoever arising from the use of these
programs.
This disclaimer of warranty extends to the user of these programs and the
user's customers, employees, agents, transferees, successors, and assigns.
Revision history
----------------
09/05/2000 initial release of 'AP922float' 3D-Now iDCT
Based on AP922, this iDCT uses the original scaled-int (MMX) row
iDCT operator, paired with an all new floating-point column iDCT.
To the best of my knowledge, AP922hybr is a fully IEEE 1180-1990
compliant iDCT, including output-range clipping to {-256,+255}
This file contains inline-assembly code developed under
Visual C++ 6.0 Pro.
09/03/2000 liaor@iname.com http://members.tripod.com/~liaor
*/
#include <math.h> // standard C-library function "floor()"
#include "amd3dx.h" // inline-assembly macros for AMD 3D-Now
typedef float tdnfloat; // 3D-Now float = IEEE-754 32-bit float
#define USE_FAST_FLOAT2INT 1
#define PSCALE_SHIFT 16 // iDCT row input "prescale", 16-bit left shift
#define DESCALE_SHIFT PSCALE_SHIFT // Total descale shift amount (#bits)
#define DESCALE_SHIFT2 7 // 2nd stage descale shift: '7' produces 9-bit output range
#define DESCALE_SHIFT1 (DESCALE_SHIFT-DESCALE_SHIFT2) // 1st stage descale
#define ROW_PRESCALE( x ) (((int)(x))<<PSCALE_SHIFT)
#define DESCALE_ROUND_COMPENSATION (1<<(DESCALE_SHIFT-1))
#if defined(USE_FAST_FLOAT2INT) && !defined(SHIFT_ROUND_COLF)
// approximate rounding
#define SHIFT_ROUND_COLF( x ) ( ( ( (int)(x) ) + DESCALE_ROUND_COMPENSATION) >> DESCALE_SHIFT)
#else
#define SHIFT_ROUND_COLF( x ) ( ( ((int)floor(x) ) + DESCALE_ROUND_COMPENSATION )>>DESCALE_SHIFT )
#endif //
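// ----------------------------------------------------------------------
// Illustrative sketch (not referenced by the decoder): how the descale
// macros above behave.  A column result still carries the 2^PSCALE_SHIFT
// row prescale; SHIFT_ROUND_COLF truncates it to int, adds the
// DESCALE_ROUND_COMPENSATION of 2^15, and shifts right by DESCALE_SHIFT,
// which approximates round-to-nearest.  E.g. a true value of 3.7 arrives
// here as 3.7*65536 = 242483.2; (242483 + 32768) >> 16 = 4.
// ----------------------------------------------------------------------
static int descale_sketch( tdnfloat prescaled_value )
{
    return SHIFT_ROUND_COLF( prescaled_value );
}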
#define NCOS1_16 (0.980785280403230449126182236134239) // cosine( Pi/16 )
#define NCOS2_16 (0.923879532511286756128183189396788) // cosine( 2Pi/16 )
#define NCOS3_16 (0.831469612302545237078788377617906) // cosine( 3Pi/16 )
#define NCOS4_16 (0.707106781186547524400844362104849) // cosine( 4Pi/16 )
#define NCOS5_16 (0.555570233019602224742830813948533) // cosine( 5Pi/16 )
#define NCOS6_16 (0.382683432365089771728459984030399) // cosine( 6Pi/16 )
#define NCOS7_16 (0.195090322016128267848284868477022) // cosine( 7Pi/16 )
#define TANG1_16 ( NCOS7_16 / NCOS1_16) // tangent( Pi/16)
#define TANG2_16 ( NCOS6_16 / NCOS2_16) // tangent(2Pi/16)
#define TANG3_16 ( NCOS5_16 / NCOS3_16) // tangent(3Pi/16)
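// ----------------------------------------------------------------------
// Sanity-check sketch (illustrative only, not called by the decoder):
// the tangents above are built from cosine ratios because
// sin(k*Pi/16) = cos((8-k)*Pi/16), hence tan(k*Pi/16) = NCOS(8-k)_16 / NCOSk_16.
// ----------------------------------------------------------------------
static int tangent_constants_ok( void )
{
    const double pi = 3.14159265358979323846;
    return ( fabs( TANG1_16 - tan( 1.0*pi/16.0 ) ) < 1e-12 )
        && ( fabs( TANG2_16 - tan( 2.0*pi/16.0 ) ) < 1e-12 )
        && ( fabs( TANG3_16 - tan( 3.0*pi/16.0 ) ) < 1e-12 );
}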
// Scaled floating-point coefficient table (ordering for AMD-3DNow ap922float)
// w00 w01 w02 w03
// w04 w05 w06 w07
// w08 w09 w10 w11
// w12 w13 w14 w15 ( all elements in float table are multiplied by 0.5)
// w16 w17 w18 w19
// w20 w21 w22 w23
// w24 w25 w26 w27
// w28 w29 w30 w31
//
// Compared to the integer-based table, the float table is downscaled by a
// multiplication factor of 0.5
// Scaled floating-point coefficient table
// Original iDCT 3DNow-iDCT
// table-ordering -> table-ordering
// ------------------ ------------------
// w00, w02, w04, w06, -> w00, w01, w02, w03,
// w01, w03, w05, w07, -> w04, w05, w06, w07,
// w08, w10, w12, w14, -> w08, w09, w10, w11,
// w09, w11, w13, w15, -> w12, w13, w14, w15,
// w16, w18, w20, w22, -> w16, w17, w18, w19,
// w17, w19, w21, w23, -> w20, w21, w22, w23,
// w24, w26, w28, w30, -> w24, w25, w26, w27,
// w25, w27, w29, w31 -> w28, w29, w30, w31
//
// Compared to the integer-based table, the float table is downscaled by a
// multiplication factor of 0.25 (IDCT_CONSTANT)
#define IDCT_CONSTANT (0.25)
// all table entries multiplied by (0.5*0.5)=0.25
#define RS0 (NCOS4_16*IDCT_CONSTANT) // iDCT row#0 scalar
#define RS1 (NCOS1_16*IDCT_CONSTANT) // iDCT row#1 scalar
#define RS2 (NCOS2_16*IDCT_CONSTANT) // iDCT row#2 scalar
#define RS3 (NCOS3_16*IDCT_CONSTANT) // iDCT row#3 scalar
#define RS4 (NCOS4_16*IDCT_CONSTANT) // iDCT row#4 scalar
#define RS5 (NCOS3_16*IDCT_CONSTANT) // iDCT row#5 scalar
#define RS6 (NCOS2_16*IDCT_CONSTANT) // iDCT row#6 scalar
#define RS7 (NCOS1_16*IDCT_CONSTANT) // iDCT row#7 scalar
const static tdnfloat tab_i_01234567tdn[] =
{
// Row #0
NCOS4_16*RS0, NCOS2_16*RS0, NCOS4_16*RS0, NCOS6_16*RS0,
NCOS4_16*RS0, NCOS6_16*RS0, -NCOS4_16*RS0, -NCOS2_16*RS0,
NCOS4_16*RS0, -NCOS6_16*RS0, -NCOS4_16*RS0, NCOS2_16*RS0,
NCOS4_16*RS0, -NCOS2_16*RS0, NCOS4_16*RS0, -NCOS6_16*RS0,
NCOS1_16*RS0, NCOS3_16*RS0, NCOS5_16*RS0, NCOS7_16*RS0,
NCOS3_16*RS0, -NCOS7_16*RS0, -NCOS1_16*RS0, -NCOS5_16*RS0,
NCOS5_16*RS0, -NCOS1_16*RS0, NCOS7_16*RS0, NCOS3_16*RS0,
NCOS7_16*RS0, -NCOS5_16*RS0, NCOS3_16*RS0, -NCOS1_16*RS0,
// Row #1
NCOS4_16*RS1, NCOS2_16*RS1, NCOS4_16*RS1, NCOS6_16*RS1,
NCOS4_16*RS1, NCOS6_16*RS1, -NCOS4_16*RS1, -NCOS2_16*RS1,
NCOS4_16*RS1, -NCOS6_16*RS1, -NCOS4_16*RS1, NCOS2_16*RS1,
NCOS4_16*RS1, -NCOS2_16*RS1, NCOS4_16*RS1, -NCOS6_16*RS1,
NCOS1_16*RS1, NCOS3_16*RS1, NCOS5_16*RS1, NCOS7_16*RS1,
NCOS3_16*RS1, -NCOS7_16*RS1, -NCOS1_16*RS1, -NCOS5_16*RS1,
NCOS5_16*RS1, -NCOS1_16*RS1, NCOS7_16*RS1, NCOS3_16*RS1,
NCOS7_16*RS1, -NCOS5_16*RS1, NCOS3_16*RS1, -NCOS1_16*RS1,
// Row #2
NCOS4_16*RS2, NCOS2_16*RS2, NCOS4_16*RS2, NCOS6_16*RS2,
NCOS4_16*RS2, NCOS6_16*RS2, -NCOS4_16*RS2, -NCOS2_16*RS2,
NCOS4_16*RS2, -NCOS6_16*RS2, -NCOS4_16*RS2, NCOS2_16*RS2,
NCOS4_16*RS2, -NCOS2_16*RS2, NCOS4_16*RS2, -NCOS6_16*RS2,
NCOS1_16*RS2, NCOS3_16*RS2, NCOS5_16*RS2, NCOS7_16*RS2,
NCOS3_16*RS2, -NCOS7_16*RS2, -NCOS1_16*RS2, -NCOS5_16*RS2,
NCOS5_16*RS2, -NCOS1_16*RS2, NCOS7_16*RS2, NCOS3_16*RS2,
NCOS7_16*RS2, -NCOS5_16*RS2, NCOS3_16*RS2, -NCOS1_16*RS2,
// Row #3
NCOS4_16*RS3, NCOS2_16*RS3, NCOS4_16*RS3, NCOS6_16*RS3,
NCOS4_16*RS3, NCOS6_16*RS3, -NCOS4_16*RS3, -NCOS2_16*RS3,
NCOS4_16*RS3, -NCOS6_16*RS3, -NCOS4_16*RS3, NCOS2_16*RS3,
NCOS4_16*RS3, -NCOS2_16*RS3, NCOS4_16*RS3, -NCOS6_16*RS3,
NCOS1_16*RS3, NCOS3_16*RS3, NCOS5_16*RS3, NCOS7_16*RS3,
NCOS3_16*RS3, -NCOS7_16*RS3, -NCOS1_16*RS3, -NCOS5_16*RS3,
NCOS5_16*RS3, -NCOS1_16*RS3, NCOS7_16*RS3, NCOS3_16*RS3,
NCOS7_16*RS3, -NCOS5_16*RS3, NCOS3_16*RS3, -NCOS1_16*RS3,
// Row #4
NCOS4_16*RS4, NCOS2_16*RS4, NCOS4_16*RS4, NCOS6_16*RS4,
NCOS4_16*RS4, NCOS6_16*RS4, -NCOS4_16*RS4, -NCOS2_16*RS4,
NCOS4_16*RS4, -NCOS6_16*RS4, -NCOS4_16*RS4, NCOS2_16*RS4,
NCOS4_16*RS4, -NCOS2_16*RS4, NCOS4_16*RS4, -NCOS6_16*RS4,
NCOS1_16*RS4, NCOS3_16*RS4, NCOS5_16*RS4, NCOS7_16*RS4,
NCOS3_16*RS4, -NCOS7_16*RS4, -NCOS1_16*RS4, -NCOS5_16*RS4,
NCOS5_16*RS4, -NCOS1_16*RS4, NCOS7_16*RS4, NCOS3_16*RS4,
NCOS7_16*RS4, -NCOS5_16*RS4, NCOS3_16*RS4, -NCOS1_16*RS4,
// Row #5
NCOS4_16*RS5, NCOS2_16*RS5, NCOS4_16*RS5, NCOS6_16*RS5,
NCOS4_16*RS5, NCOS6_16*RS5, -NCOS4_16*RS5, -NCOS2_16*RS5,
NCOS4_16*RS5, -NCOS6_16*RS5, -NCOS4_16*RS5, NCOS2_16*RS5,
NCOS4_16*RS5, -NCOS2_16*RS5, NCOS4_16*RS5, -NCOS6_16*RS5,
NCOS1_16*RS5, NCOS3_16*RS5, NCOS5_16*RS5, NCOS7_16*RS5,
NCOS3_16*RS5, -NCOS7_16*RS5, -NCOS1_16*RS5, -NCOS5_16*RS5,
NCOS5_16*RS5, -NCOS1_16*RS5, NCOS7_16*RS5, NCOS3_16*RS5,
NCOS7_16*RS5, -NCOS5_16*RS5, NCOS3_16*RS5, -NCOS1_16*RS5,
// Row #6
NCOS4_16*RS6, NCOS2_16*RS6, NCOS4_16*RS6, NCOS6_16*RS6,
NCOS4_16*RS6, NCOS6_16*RS6, -NCOS4_16*RS6, -NCOS2_16*RS6,
NCOS4_16*RS6, -NCOS6_16*RS6, -NCOS4_16*RS6, NCOS2_16*RS6,
NCOS4_16*RS6, -NCOS2_16*RS6, NCOS4_16*RS6, -NCOS6_16*RS6,
NCOS1_16*RS6, NCOS3_16*RS6, NCOS5_16*RS6, NCOS7_16*RS6,
NCOS3_16*RS6, -NCOS7_16*RS6, -NCOS1_16*RS6, -NCOS5_16*RS6,
NCOS5_16*RS6, -NCOS1_16*RS6, NCOS7_16*RS6, NCOS3_16*RS6,
NCOS7_16*RS6, -NCOS5_16*RS6, NCOS3_16*RS6, -NCOS1_16*RS6,
// Row #7
NCOS4_16*RS7, NCOS2_16*RS7, NCOS4_16*RS7, NCOS6_16*RS7,
NCOS4_16*RS7, NCOS6_16*RS7, -NCOS4_16*RS7, -NCOS2_16*RS7,
NCOS4_16*RS7, -NCOS6_16*RS7, -NCOS4_16*RS7, NCOS2_16*RS7,
NCOS4_16*RS7, -NCOS2_16*RS7, NCOS4_16*RS7, -NCOS6_16*RS7,
NCOS1_16*RS7, NCOS3_16*RS7, NCOS5_16*RS7, NCOS7_16*RS7,
NCOS3_16*RS7, -NCOS7_16*RS7, -NCOS1_16*RS7, -NCOS5_16*RS7,
NCOS5_16*RS7, -NCOS1_16*RS7, NCOS7_16*RS7, NCOS3_16*RS7,
NCOS7_16*RS7, -NCOS5_16*RS7, NCOS3_16*RS7, -NCOS1_16*RS7,
// iDCT column coefficients
NCOS4_16, NCOS4_16, // cosine (4Pi/16)
TANG1_16, TANG1_16, // tangent ( Pi/16)
TANG2_16, TANG2_16, // tangent (2Pi/16)
TANG3_16, TANG3_16 // tangent (3Pi/16)
};
// externally allocated array of 64 floats, for temporary storage
extern void *fTempArray;
const static tdnfloat tg_1_16f = TANG1_16; // scalar versions of tab_i_col[]
const static tdnfloat tg_2_16f = TANG2_16;
const static tdnfloat tg_3_16f = TANG3_16;
const static tdnfloat cos_4_16f= NCOS4_16;
// Rounding compensation for final output (in 3DNow column_iDCT operator)
const static int rnd_compensation[2] = { DESCALE_ROUND_COMPENSATION,DESCALE_ROUND_COMPENSATION};
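// ----------------------------------------------------------------------
// Illustrative scalar sketch of one row pass (not the decoder's code
// path).  It mirrors the accumulation pattern of the assembly below:
// weights w[0..15] form the even-part sums a0..a3 from x0,x2,x4,x6 and
// weights w[16..31] form the odd-part sums b0..b3 from x1,x3,x5,x7,
// with the inputs prescaled by 2^PSCALE_SHIFT.  The closing a+b / a-b
// butterfly follows the standard AP-922 row operator and is an assumption
// of this sketch.  Usage sketch:
//   idct_row_sketch( &data[row*8], &tab_i_01234567tdn[row*32], &out[row*8] );
// ----------------------------------------------------------------------
static void idct_row_sketch( const short *x, const tdnfloat *w, tdnfloat *y )
{
    tdnfloat a[4], b[4];
    int i;
    for ( i = 0; i < 4; i++ )
    {
        a[i] = ( x[0]*w[i*4+0] + x[2]*w[i*4+1] + x[4]*w[i*4+2] + x[6]*w[i*4+3] )
               * (tdnfloat)(1 << PSCALE_SHIFT);
        b[i] = ( x[1]*w[16+i*4+0] + x[3]*w[16+i*4+1]
               + x[5]*w[16+i*4+2] + x[7]*w[16+i*4+3] )
               * (tdnfloat)(1 << PSCALE_SHIFT);
    }
    for ( i = 0; i < 4; i++ )
    {
        y[i]     = a[i] + b[i];     // y0..y3
        y[7 - i] = a[i] - b[i];     // y7..y4
    }
}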
////////////////////////////////////////////////////////////////////////
//
//
// AP922float_3dn(), AMD 3D-Now implementation of AP922float
//
//
////////////////////////////////////////////////////////////////////////
void
idct_ap922float_3dn( short *data )
{
const static __int64 mmAndMask = 0xFFFF0000FFFF0000;
__asm {
#define INPR eax
#define OUTR edx
#define TABLE ecx
mov OUTR, dword ptr [fTempArray];
mov edi, 0x00; //x = 0
mov INPR, dword ptr [data]; ;// row 0
// mov OUT, INP; // algorithm writes data in-place -> row 0
lea TABLE, dword ptr [tab_i_01234567tdn]; // row 0
sub OUTR, 32; // precompensate output address
// for ( x = 0; x < 8; ++x ) // transform one row per iteration
ALIGN 16
acc_idct_rowloop1:
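// ------------------------------------------------------------------
// (descriptive overview) Each iteration handles one row of eight
// 16-bit samples: the even samples x0,x2,x4,x6 and odd samples
// x1,x3,x5,x7 are separated, prescaled by 2^PSCALE_SHIFT, and
// converted to float with pi2fd; pfmul/pfacc against the 32 per-row
// weights in TABLE then build the even-part sums a0..a3 (weights
// 0..15) and odd-part sums b0..b3 (weights 16..31).
// ------------------------------------------------------------------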
movq mm0, qword ptr [INPR] ; // 0 ; x3 x2 x1 x0
add edi, 0x01; // x=x+1, increment row counter
movq mm3, qword ptr [INPR+8] ; // 1 ; x7 x6 x5 x4
movq mm2, mm0 ; // 2 ; x3 x2 x1 x0
// 1) convert mm0 into [x2 x0] <dword>
pslld mm0, PSCALE_SHIFT // 1a) 0 ; [x2 x0]*PSCALE <dword>
movq mm1, mm3; // 1a) 3 ; x7 x6 x5 x4
PI2FD (mm0, mm0); // 1b) mm0 <=[x2 x0]*PSCALE <float>
pslld mm1, PSCALE_SHIFT; // 1a) 2 ; [x6 x4]*PSCALE <dword>
pand mm2, [mmAndMask]; // 1a) mm2 <=[x3 x1]*PSCALE <dword>
PI2FD (mm1, mm1); // 1b) mm1 <=[x6 x4]*PSCALE <float>
pand mm3, [mmAndMask]; // 1a) mm3 <=[x7 x5]*PSCALE <dword>
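// (note) Both conversion paths leave each sample in the upper 16 bits of
// its dword, i.e. already multiplied by 2^PSCALE_SHIFT: the even samples
// are shifted up with pslld, while the odd samples already occupy the
// upper halves, so a pand with mmAndMask suffices; pi2fd then converts
// the prescaled dwords directly to floats.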
movq mm4, mm0; // 2a) mm4 <=[x2 x0] <float> (a0)
PFMULM( mm4, TABLE, 0); // 2a) mm4 <=[x2*w1 x0*w0] (a0a)
movq mm5, mm1; // 2a) mm5 <=[x6 x4] <float>
PFMULM( mm5, TABLE, 8); // 2a) mm5 <=[x6*w3 x4*w2] (a0b)
PI2FD (mm2, mm2); // 1b) mm2 <=[x3 x1]*PSCALE <float>
PI2FD (mm3, mm3); // 1b) mm3 <=[x7 x5]*PSCALE <float>
movq mm6, mm0; // 2a) mm6 <=[x2 x0] <float> (a1a)
PFMULM( mm6, TABLE, 16); // 2a) mm6 <=[x2*w5 x0*w4] (a1a)
movq mm7, mm1; // 2a) mm7 <=[x6 x4] <float> (a1b)
PFMULM( mm7, TABLE, 24); // 2a) mm7 <=[x6*w7 x4*w6] (a1b)
PFACC( mm4, mm5); // 2b) mm4 <=[a0b a0a] (a0)
// mm5 free
add OUTR, 32; // increment output-pointer (+8 floats)
// mm5 used
movq mm5, mm0; // 2a) mm5 <=[x2 x0] <float> (a2a)
//mm7 free
PFACC( mm6, mm7); // 2b) mm6 <=[a1b a1a] (a1)
PFMULM( mm5, TABLE, 32); // 2a) mm5 <=[x2*w9 x0*w8] (a2a)
//mm7 used
movq mm7, mm1; // 2a) mm7 <=[x6 x4] <float> (a2b)
;//
PFMULM( mm7, TABLE, 40); // 2a) mm7 <=[x6*w11 x4*w10] (a2b)
PFACC( mm4, mm6 ); // 2c) mm4 <= [a1 a0]
// mm6 free, mm4 contains [a1 a0]
PFMULM( mm0, TABLE, 48); // 2a) mm0 <=[x2*w13 x0*w12] (a3a)
// mm6 used
movq mm6, mm2; // 2d) mm6 <= [x3 x1] (b0a)
PFMULM( mm1, TABLE, 56); // 2a) mm1 <=[x6*w15 x4*w14] (a3b)
PFACC( mm5, mm7); // 2b) mm5 <=[a2b a2a] (a2)
//mm7 free
movq [ OUTR], mm4; // store [a1 a0] at start of current output row
// mm4 free
//mm7 used
movq mm7, mm3; // 2d) mm7 <= [x7 x5] (b0b)
PFMULM( mm6, TABLE, 64); // 2d) mm6 <=[x3*w17 x1*w16] (b0a)
PFACC( mm0, mm1); // 2b) mm0 <=[a3b a3a] (a3)
//mm1 free
PFMULM( mm7, TABLE, 72); // 2d) mm7 <=[x7*w19 x5*w18] (b0b)
// mm4 used
movq mm4, mm2; // 2d) mm4 <= [x3 x1] (b1a)
PFMULM( mm4, TABLE, 80); // 2d) mm4 <=[x3*w21 x1*w20] (b1a)
PFACC( mm5, mm0); // 2c) mm5 <=[a3 a2]
//mm0 free, mm5 contains [a3 a2]
//mm1 used
movq mm1, mm3; // 2d) mm1 <= [x7 x5] (b1b)
PFACC( mm6, mm7 ); // 2e) mm6 <=[b0b b0a] (b0)
//mm7 free
PFMULM( mm1, TABLE, 88); // 2d) mm1 <=[x7*w23 x5*w22] (b1b)
movq mm0, mm2; // 2d) mm0 <= [x3 x1] (b2a)
//mm0 used
// movq [INPR - 8 ], mm5; // store [a3 a2] to INPR-8