// File: idct_ap922hybr.cpp
#include "StdAfx.h"
#pragma warning(once:4305 4244)
/*
---------------------------------------------------------
idct_ap922hybr.c - AP922hybrid iDCT, C-code implementation
---------------------------------------------------------
AP-922 "hybrid" float/int32 iDCT (x87 FPU)
----------------------------------------
Intel Application Note AP-922 discusses an MMX implementation of
the MPEG fDCT/iDCT operations. Numerical analysis of the AP922 iDCT
reveals that the column_idct operator accounts for most of the
output error-magnitude. (Disclaimer, I did not perform any formal
numerical analysis...)
"AP922hybrid" is an adaptation of the original AP922 algorithm.
The 'hybrid' modification combines the original AP922 scaled_integer
row_idct operator with a new floating-point column_idct operator.
In terms of both speed and accuracy, the 3D-Now implementation of
AP922hybrid falls between AP922float(3DNow) and the original AP922mmx.
This file contains two implementations of AP922hybrid :
1) 3D-Now accelerated
2) standard C (X87)
The X87 implementation (in this file) should compile and run on any
i486DX or higher CPU. The standard-C version was coded to emulate the
3D-Now version as much as possible.
The 3DNow implementation requires a 3D-Now capable CPU, such as
the AMD K6-2 or Athlon. The provided 3DNow code was developed under
Visual C++ 6.0 Pro. To use the 3DNow code 'as is', you will need
VC++ 5/6 (and 'amd3dx.h', which is included.) The Microsoft
Visual C++ 6.0 Processor Pack Beta is *not* needed.
Since both AP922float and AP922hybrid use floating point math,
AP922hybrid shares the same set of numerical issues:
Numerical Issue #1 : float->int conversion
------------------------------------------
Currently, the 3D-Now instruction set only supports truncation
for float->int conversion. Therefore, accurate_rounding is
disabled for the X87 implementation.
To minimize the truncation error, this version of AP922 utilizes the
same strategy discussed in idct_ap922x87.c Please refer to that file.
Numerical precision of 3DNow versus X87(FPU)
---------------------------------------
On 80x86 CPUs, FPU add/multiply operations are always
calculated with 80-bit internal precision. Therefore, all other things
being equal, the X87 DCT will be more accurate than AMD 3D-Now, even
when the X87 DCT stores intermediate results in 32-bit float.
The statement 'typedef float tdnfloat;' fixes the x87's computational
accuracy to 32-bit float. AP922hybrid uses a combination of
scaled-integer and floating-point math, with the scaled-integer math
responsible for nearly all output error. Therefore, switching
tdnfloat to a more accurate type (like "double") does not appreciably
improve accuracy.
------------------------------------------
LEGAL MUMBO-JUMBO : Disclaimer of Warranty
------------------------------------------
This software is available to the user without any license fee or
royalty on an "as is" basis. The author disclaims any and all warranties,
whether express, implied, or statutory, including any implied warranties of
merchantability or of fitness for a particular purpose. In no event shall
the copyright-holder be liable for any incidental, punitive, or consequential
damages of any kind whatsoever arising from the use of these programs.
This disclaimer of warranty extends to the user of these programs and user's
customers, employees, agents, transferees, successors, and assigns.
Revision history
----------------
09/03/2000 initial release of 'AP922hybrid' iDCT
Based on AP922, this iDCT uses the original scaled-int (MMX) row
iDCT operator, paired with an all new floating-point column iDCT.
To the best of my knowledge, AP922hybr is a fully IEEE 1180/1990
compliant iDCT, including output-range clipping to {-256,+255}
This file contains inline-assembly code developed under
Visual C++ 6.0 Pro.
09/03/2000 liaor@iname.com http://members.tripod.com/~liaor
liaor@iname.com http://members.tripod.com/~liaor
*/
#include "amd3dx.h" // inline-assembly macros for AMD 3D-Now
// Floating-point type used by the column iDCT.
// 3D-Now float = IEEE-754 32-bit float.
typedef float tdnfloat;

// Selects the fast (truncating) float->int conversion.  This is the ONLY
// supported mode: if this line is commented out, the build is stopped by the
// #error in the #else branch below.
#define USE_FAST_FLOAT2INT 1

#define DESCALE_SHIFT 17 // Total prescale-shift : 16+1 bit right-shift
#define DESCALE_SHIFT2 7 // 2nd stage descale-shift : 9-bit output range
#define DESCALE_SHIFT1 (DESCALE_SHIFT-DESCALE_SHIFT2) // 1st stage descale

// Half of (1 << DESCALE_SHIFT); added before the final right-shift so that the
// shift rounds to nearest instead of always rounding down.
#define DESCALE_ROUND_COMPENSATION (1<<(DESCALE_SHIFT-1))

// SHIFT_ROUND_COLF( x ) : convert a column-iDCT float result to int, add the
// rounding compensation and descale by DESCALE_SHIFT bits.
#ifdef USE_FAST_FLOAT2INT
// approximate rounding : the (int) cast truncates toward zero
#define SHIFT_ROUND_COLF( x ) ( ( ( (int)(x) ) + DESCALE_ROUND_COMPENSATION) >> DESCALE_SHIFT)
#else
// Accurate rounding is NOT supported.  Fail with a readable diagnostic rather
// than relying on a stray string literal to provoke a syntax error.
#error "ACCURATE_ROUNDING_IS_NOT_SUPPORTED : USE_FAST_FLOAT2INT must be defined"
// Kept for reference only (would additionally require <math.h> for floor()).
#define SHIFT_ROUND_COLF( x ) ( ( ((int)floor( (x) )) + DESCALE_ROUND_COMPENSATION )>>DESCALE_SHIFT )
#endif // USE_FAST_FLOAT2INT
//
// Concatenated iDCT row-coefficient table for rows 0,1,2,3,4,5,6,7 (in order),
// copied directly from Intel Application Note AP-922.
//
// Layout: 8 rows, 32 signed 16-bit coefficients per row.  Each source line is
// one group of four words; the trailing comment names those words in the order
// they appear in an MMX register after a movq load (most-significant word
// first), so the first value on a line corresponds to the right-most label.
// Rows 4, 5, 6 and 7 repeat the coefficients of rows 0, 3, 2 and 1.
const static short tab_i_01234567hybr[] = {
// row 0 (this row is required)
16384, 16384, 16384, -16384, // movq -> w06 w04 w02 w00
21407, 8867, 8867, -21407, // w07 w05 w03 w01
16384, -16384, 16384, 16384, // w14 w12 w10 w08
-8867, 21407, -21407, -8867, // w15 w13 w11 w09
22725, 12873, 19266, -22725, // w22 w20 w18 w16
19266, 4520, -4520, -12873, // w23 w21 w19 w17
12873, 4520, 4520, 19266, // w30 w28 w26 w24
-22725, 19266, -12873, -22725,// w31 w29 w27 w25
// row 1
22725, 22725, 22725, -22725, // movq -> w06 w04 w02 w00
29692, 12299, 12299, -29692, // w07 w05 w03 w01
22725, -22725, 22725, 22725, // w14 w12 w10 w08
-12299, 29692, -29692, -12299,// w15 w13 w11 w09
31521, 17855, 26722, -31521, // w22 w20 w18 w16
26722, 6270, -6270, -17855, // w23 w21 w19 w17
17855, 6270, 6270, 26722, // w30 w28 w26 w24
-31521, 26722, -17855, -31521,// w31 w29 w27 w25
// row 2
21407, 21407, 21407, -21407, // movq -> w06 w04 w02 w00
27969, 11585, 11585, -27969, // w07 w05 w03 w01
21407, -21407, 21407, 21407, // w14 w12 w10 w08
-11585, 27969, -27969, -11585,// w15 w13 w11 w09
29692, 16819, 25172, -29692, // w22 w20 w18 w16
25172, 5906, -5906, -16819, // w23 w21 w19 w17
16819, 5906, 5906, 25172, // w30 w28 w26 w24
-29692, 25172, -16819, -29692,// w31 w29 w27 w25
// row 3
19266, 19266, 19266, -19266, // movq -> w06 w04 w02 w00
25172, 10426, 10426, -25172, // w07 w05 w03 w01
19266, -19266, 19266, 19266, // w14 w12 w10 w08
-10426, 25172, -25172, -10426,// w15 w13 w11 w09
26722, 15137, 22654, -26722, // w22 w20 w18 w16
22654, 5315, -5315, -15137, // w23 w21 w19 w17
15137, 5315, 5315, 22654, // w30 w28 w26 w24
-26722, 22654, -15137, -26722,// w31 w29 w27 w25
// row 4 (same coefficients as row 0)
16384, 16384, 16384, -16384, // movq -> w06 w04 w02 w00
21407, 8867, 8867, -21407, // w07 w05 w03 w01
16384, -16384, 16384, 16384, // w14 w12 w10 w08
-8867, 21407, -21407, -8867, // w15 w13 w11 w09
22725, 12873, 19266, -22725, // w22 w20 w18 w16
19266, 4520, -4520, -12873, // w23 w21 w19 w17
12873, 4520, 4520, 19266, // w30 w28 w26 w24
-22725, 19266, -12873, -22725,// w31 w29 w27 w25
// row 5 (same coefficients as row 3)
19266, 19266, 19266, -19266, // movq -> w06 w04 w02 w00
25172, 10426, 10426, -25172, // w07 w05 w03 w01
19266, -19266, 19266, 19266, // w14 w12 w10 w08
-10426, 25172, -25172, -10426,// w15 w13 w11 w09
26722, 15137, 22654, -26722, // w22 w20 w18 w16
22654, 5315, -5315, -15137, // w23 w21 w19 w17
15137, 5315, 5315, 22654, // w30 w28 w26 w24
-26722, 22654, -15137, -26722,// w31 w29 w27 w25
// row 6 (same coefficients as row 2)
21407, 21407, 21407, -21407, // movq -> w06 w04 w02 w00
27969, 11585, 11585, -27969, // w07 w05 w03 w01
21407, -21407, 21407, 21407, // w14 w12 w10 w08
-11585, 27969, -27969, -11585,// w15 w13 w11 w09
29692, 16819, 25172, -29692, // w22 w20 w18 w16
25172, 5906, -5906, -16819, // w23 w21 w19 w17
16819, 5906, 5906, 25172, // w30 w28 w26 w24
-29692, 25172, -16819, -29692,// w31 w29 w27 w25
// row 7 (same coefficients as row 1)
22725, 22725, 22725, -22725, // movq -> w06 w04 w02 w00
29692, 12299, 12299, -29692, // w07 w05 w03 w01
22725, -22725, 22725, 22725, // w14 w12 w10 w08
-12299, 29692, -29692, -12299,// w15 w13 w11 w09
31521, 17855, 26722, -31521, // w22 w20 w18 w16
26722, 6270, -6270, -17855, // w23 w21 w19 w17
17855, 6270, 6270, 26722, // w30 w28 w26 w24
-31521, 26722, -17855, -31521 // w31 w29 w27 w25
};
// cos(k*Pi/16) for k = 1..7, written with more digits than a double can hold.
#define NCOS1_16    (0.980785280403230449126182236134239)   // cos(1*Pi/16)
#define NCOS2_16    (0.923879532511286756128183189396788)   // cos(2*Pi/16)
#define NCOS3_16    (0.831469612302545237078788377617906)   // cos(3*Pi/16)
#define NCOS4_16    (0.707106781186547524400844362104849)   // cos(4*Pi/16)
#define NCOS5_16    (0.555570233019602224742830813948533)   // cos(5*Pi/16)
#define NCOS6_16    (0.382683432365089771728459984030399)   // cos(6*Pi/16)
#define NCOS7_16    (0.195090322016128267848284868477022)   // cos(7*Pi/16)

// tan(k*Pi/16) for k = 1..3, formed as cos((8-k)*Pi/16) / cos(k*Pi/16);
// the numerator equals sin(k*Pi/16).
#define TANG1_16    (NCOS7_16 / NCOS1_16)   // tan(1*Pi/16)
#define TANG2_16    (NCOS6_16 / NCOS2_16)   // tan(2*Pi/16)
#define TANG3_16    (NCOS5_16 / NCOS3_16)   // tan(3*Pi/16)
// File-local scratch array of 64 floats, for temporary storage.
// An earlier declaration was an external pointer: extern void *fTempArray;
// NOTE(review): the 3D-Now code loads `dword ptr [fTempArray]`, which matches
// the pointer form; with this array declaration it presumably needs the
// array's address instead -- confirm before re-enabling that code.
static float fTempArray[64];

// Column-iDCT constants.  Each value is stored twice in a row -- presumably so
// a single 64-bit load fills both float lanes of a 3D-Now register (confirm
// against the column code).
static const tdnfloat tab_i_col[] =
{
    static_cast<float>(NCOS4_16), static_cast<float>(NCOS4_16),   // cos(4*Pi/16)
    static_cast<float>(TANG1_16), static_cast<float>(TANG1_16),   // tan(1*Pi/16)
    static_cast<float>(TANG2_16), static_cast<float>(TANG2_16),   // tan(2*Pi/16)
    static_cast<float>(TANG3_16), static_cast<float>(TANG3_16)    // tan(3*Pi/16)
};

// Scalar copies of the tab_i_col[] entries.
static const tdnfloat tg_1_16f  = static_cast<float>(TANG1_16);
static const tdnfloat tg_2_16f  = static_cast<float>(TANG2_16);
static const tdnfloat tg_3_16f  = static_cast<float>(TANG3_16);
static const tdnfloat cos_4_16f = static_cast<float>(NCOS4_16);

// Rounding compensation for the final output of the column iDCT operator,
// stored as a pair of identical 32-bit integers.
static const int rnd_compensation[2] =
{
    DESCALE_ROUND_COMPENSATION,
    DESCALE_ROUND_COMPENSATION
};
/*
////////////////////////////////////////////////////////////////////////
//
//
// AP922hybr_3dn(), AMD 3D-Now implementation of AP922hybrid
//
//
////////////////////////////////////////////////////////////////////////
void idct_ap922hybr_3dn(short *data)
{
__asm {
#define INPR eax
#define OUTR edx
#define TABLE ecx
mov INPR, dword ptr [data]; ;// row 0
mov edi, 0x00; //x = 0
lea TABLE, dword ptr [tab_i_01234567hybr]; // row 0
// mov OUT, INP; // algorithm writes data in-place -> row 0
mov OUTR, dword ptr [fTempArray];
// for ( x = 0; x < 8; ++x ) // transform one row per iteration
ALIGN 16
acc_idct_rowloop1:
movq mm0, qword ptr [INPR] ; // 0 ; x3 x2 x1 x0
movq mm1, qword ptr [INPR+8] ; // 1 ; x7 x6 x5 x4
movq mm2, mm0 ; // 2 ; x3 x2 x1 x0
movq mm3, qword ptr [TABLE] ; // 3 ; w06 w04 w02 w00
punpcklwd mm0, mm1 ; // x5 x1 x4 x0
movq mm5, mm0 ; // 5 ; x5 x1 x4 x0
punpckldq mm0, mm0 ; // x4 x0 x4 x0
movq mm4, qword ptr [TABLE+8] ; // 4 ; w07 w05 w03 w01
punpckhwd mm2, mm1 ; // 1 ; x7 x3 x6 x2
pmaddwd mm3, mm0 ; // x4*w06+x0*w04 x4*w02+x0*w00
movq mm6, mm2 ; // 6 ; x7 x3 x6 x2
movq mm1, qword ptr [TABLE+32] ;// 1 ; w22 w20 w18 w16
punpckldq mm2, mm2 ; // x6 x2 x6 x2
pmaddwd mm4, mm2 ; // x6*w07+x2*w05 x6*w03+x2*w01
punpckhdq mm5, mm5 ; // x5 x1 x5 x1
pmaddwd mm0, qword ptr [TABLE+16] ;// x4*w14+x0*w12 x4*w10+x0*w08
punpckhdq mm6, mm6 ; // x7 x3 x7 x3
movq mm7, qword ptr [TABLE+40] ;// 7 ; w23 w21 w19 w17
pmaddwd mm1, mm5 ; // x5*w22+x1*w20 x5*w18+x1*w16
pmaddwd mm7, mm6 ; // x7*w23+x3*w21 x7*w19+x3*w17
add INPR, 16; // increment INPUT pointer -> row 1
pmaddwd mm2, qword ptr [TABLE+24] ;// x6*w15+x2*w13 x6*w11+x2*w09
paddd mm3, mm4 ; // 4 ; a1=sum(even1) a0=sum(even0)
pmaddwd mm5, qword ptr [TABLE+48] ;// x5*w30+x1*w28 x5*w26+x1*w24
movq mm4, mm3 ; // 4 ; a1 a0
pmaddwd mm6, qword ptr [TABLE+56] ;// x7*w31+x3*w29 x7*w27+x3*w25
paddd mm1, mm7 ; // 7 ; b1=sum(odd1) b0=sum(odd0)
psubd mm3, mm1 ; // a1-b1 a0-b0 [y6 y7] <int32>
paddd mm0, mm2 ; // 2 ; a3=sum(even3) a2=sum(even2)
paddd mm1, mm4 ; // 4 ; a1+b1 a0+b0 [y1 y0] <int32>
punpckldq mm7, mm3; // mm7 <= [y7 XX] <int32>
PI2FD( mm1, mm1); // convert [y1 y0] to <float>
punpckhdq mm3, mm7; // mm3 <=[y7 y6] <int32>
PI2FD( mm3, mm3); // convert [y7 y6] to <float>
paddd mm5, mm6 ; // 6 ; b3=sum(odd3) b2=sum(odd2)
movq mm4, mm0 ; // 4 ; a3 a2
psubd mm0, mm5 ; // 5 ; a3-b3 a2-b2 [y4 y5] <int32>
paddd mm4, mm5 ; // a3+b3 a2+b2 [y3 y2] <int32>
punpckldq mm6, mm0; // mm6 <= [y5 XX] <int32>
PI2FD( mm4, mm4); // convert [y3 y2] to <float>
punpckhdq mm0, mm6; // mm0 <= [y5 y4] <int32>
movq qword ptr [OUTR], mm1 ; // 1 ; save y1 y0
PI2FD( mm0, mm0); // convert [y5 y4] to <float>
movq qword ptr [OUTR+24], mm3 ; // 7 ; save y7 y6
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -