📄 idct_ap528mmx.cpp
字号:
#include "StdAfx.h"
/*
* MPEG2AVI
* --------
* v0.16B34- very minor optimization to reduce overall cycle-count by 1
* v0.16B33- completely reconverted MMX IDCT AAN from Intel's devsheet.
* IEEE1180 test revealed errors in my prior implementation.
* added a DC-element "compensation" factor, to reduce this IDCT's
* apparent non-zero mean-shift.
* v0.16B31- re-ordered the in-place (left-shift<<4)...the left-shift
* is now performed prior to multiplication. This implementation is
* now just as accurate as Intel's original MMX AAN sample-code.
* v0.16B3 - moved tables to global scope, to enforce QWORD alignment.
* Apparently, Visual C++ doesn't necessarily align stack data.
* Local volatile variables are allocated on the stack at run time,
* whereas global variables are allocated once.
* v0.16B22 - MMX IDCT AAN algorithm, idct_mmx_aan()
* This algorithm requires that the input matrix be transposed. (Alternatively,
* the output can be transposed instead of the input.) Unlike
* Intel's original code, this code integrates the data
* [left-shift <<4]. All that remains is the transpose.
http://developer.intel.com/drg/mmx/appnotes/
Application Note AP-528
*
// This implementation differs from Intel's AAN-IDCT in one way :
// The transform automatically left-shifts the source-matrix, so the
// calling function should not left-shift the source-matrix.
//
// (However, the AAN-IDCT still transposes the output matrix.
// I have added mmx-code to transpose the output matrix, so the
// user of this function need not worry about this issue.)
//
// This IDCT is fast, but less accurate than the standard IDCT_INT32()
// routine.
//
*/
#include<windows.h>
// Constant and scratch storage referenced by name from the inline assembly
// in j_rev_dct().
//
// When building with Visual C++, keep these at file scope: per the original
// author's notes, variables declared locally are not necessarily aligned to
// QWORD boundaries. (Declaring them as static locals seems to align them, too.)

// NOTE(review): no use of the next two constants is visible in this part of
// the file -- confirm against the rest of j_rev_dct() before touching them.
static const __int64 x0005000200010001 = 0x0005000200010001;
static const __int64 x0040000000000000 = 0x0040000000000000;

// Packed multipliers used as pmulhw memory operands; each one holds the same
// 16-bit value in all four word lanes.
static const __int64 x5a825a825a825a82 = 0x5a825a825a825a82;    // 4 x 0x5a82 (23170)
static const __int64 x539f539f539f539f = 0x539f539f539f539f;    // 4 x 0x539f (21407)
static const __int64 x4546454645464546 = 0x4546454645464546;    // 4 x 0x4546 (17734)
static const __int64 x61f861f861f861f8 = 0x61f861f861f861f8;    // 4 x 0x61f8 (25080)

// Added with paddsw to the first four (pre-shifted) input coefficients.
// Only the lowest word is non-zero (0x0040 = 64), so only the first
// coefficient is adjusted: the DC "compensation" factor from v0.16B33.
static const __int64 mm0adjust = 0x0000000000000040;

// Writable, zero-initialised scratch qwords.
// NOTE(review): their use is not visible in this part of the file.
static __int64 scratch1 = 0;
static __int64 scratch3 = 0;
static __int64 scratch5 = 0;
static __int64 scratch7 = 0;

// for debug only
static __int64 x0 = 0;

#define PRESHIFT   4    // left-shift (psllw) count applied to each input coefficient
#define ROW_STRIDE 16   // for 8x8 matrix transpose operation
// Pre-scale table for j_rev_dct(): 8 rows of 8 words.
//
// The assembly loads the table address into ecx and multiplies (pmulhw) the
// pre-shifted input qword at [esi+8*k] by the table qword at [ecx+8*k], so
// qword index k covers row k/2, words 4*(k%2) .. 4*(k%2)+3.
//
// NOTE(review): rows 0-4 share one scale (row 4 repeats row 0), but the
// second half of row 5 and all of rows 6 and 7 do not (e.g. 25746 = 2 x 12873,
// 17734 = 2 x 8867). Presumably this is deliberate and matched by the
// differing psraw/psllw shifts in the assembly -- confirm against Intel's
// AP-528 listing before editing any value.
static const WORD preSC[8 * 8] = {
    /* row 0: qwords  0, 1 */  16384, 22725, 21407, 19266,   16384, 12873,  8867,  4520,
    /* row 1: qwords  2, 3 */  22725, 31521, 29692, 26722,   22725, 17855, 12299,  6270,
    /* row 2: qwords  4, 5 */  21407, 29692, 27969, 25172,   21407, 16819, 11585,  5906,
    /* row 3: qwords  6, 7 */  19266, 26722, 25172, 22654,   19266, 15137, 10426,  5315,
    /* row 4: qwords  8, 9 */  16384, 22725, 21407, 19266,   16384, 12873,  8867,  4520,
    /* row 5: qwords 10,11 */  12873, 17855, 16819, 15137,   25746, 20228, 13933,  7103,
    /* row 6: qwords 12,13 */  17734, 24598, 23170, 20853,   17734, 13933,  9597,  4892,
    /* row 7: qwords 14,15 */  18081, 25080, 23624, 21261,   18081, 14206,  9785,  4988,
};
void
j_rev_dct( short *src_result )
{
__asm
{
//COMMENT ^
//void idct8x8aan (
// int16 *src_result);
//^
//public _idct8x8aan
//_idct8x8aan proc near
//push ebp
//mov ebp, esp
//push esi
//mov esi, DWORD PTR [ebp+8] ; source
mov esi, DWORD PTR [src_result] ; source
lea ecx, DWORD PTR [preSC]
// The [left shift << 4] is now *BUILT-IN* to the AAN-IDCT routine...
// It took a lot of time to determine which loads needed to be left-shifted,
// and which didn't (because the algorithm modifies the src_result[]
// array in-place!)
// If you wish to change the preshift amount, just change the source line
// with "#define PRESHIFT 4", then recompile.
//////////
// AAN IDCT algorithm begins here!
//
/////////
; column 0: even part
; use V4, V12, V0, V8 to produce V22..V25
movq mm0, qword ptr [esi+8*12] ; maybe the first mul can be done together
; with the dequantization in iHuff module ?
;slot
movq mm1, qword ptr [esi+8*4]
psllw mm0, PRESHIFT; // esi + 96
movq mm3, qword ptr [esi+8*0]
psllw mm1, PRESHIFT; // esi + 32
pmulhw mm0, qword ptr [ecx+8*12] ; V12
psllw mm3, PRESHIFT; // esi + 0
pmulhw mm1, qword ptr [ecx+8*4] ; V4
paddsw mm3, qword ptr [mm0adjust] // v0.16B33 accuracy adjustment
;//slot
pmulhw mm3, qword ptr [ecx+8*0] ; V0
psraw mm0, 1 ; t64=t66
movq mm5, qword ptr [esi+8*8] ; duplicate V4
movq mm2, mm1 ; added 11/1/96
psllw mm5, PRESHIFT; // esi + 64
psubsw mm1, mm0 ; V16
pmulhw mm5, qword ptr [ecx+8*8] ; V8
paddsw mm2, mm0 ; V17
pmulhw mm1, qword ptr x5a825a825a825a82 ; 23170 ->V18
movq mm0, mm2 ; duplicate V17
psraw mm2, 1 ; t75=t82
movq mm4, mm3 ; duplicate V0
psraw mm0, 2 ; t72
paddsw mm3, mm5 ; V19
;moved from the block below
movq mm7, qword ptr [esi+8*10]
psubsw mm4, mm5 ; V20 ;mm5 free
psubsw mm1, mm0 ; V21 ; mm0 free
psraw mm3, 1 ; t74=t81
movq mm6, mm3 ; duplicate t74=t81
psraw mm4, 2 ; t77=t79
paddsw mm3, mm2 ; V22
movq mm5, mm1 ; duplicate V21
paddsw mm1, mm4 ; V23
psllw mm7, PRESHIFT; // esi + 80
movq qword ptr [esi+8*4], mm3 ; V22
psubsw mm4, mm5 ; V24; mm5 free
movq qword ptr [esi+8*12], mm1 ; V23
psubsw mm6, mm2 ; V25; mm2 free
movq qword ptr [esi+8*0], mm4 ; V24
;//slot
; keep mm6 alive all along the next block
;movq qword ptr [esi+8*8], mm6 ; V25
; column 0: odd part
; use V2, V6, V10, V14 to produce V31, V39, V40, V41
;moved above
;movq mm7, qword ptr [esi+8*10]
movq mm0, qword ptr [esi+8*6]
;slot
movq mm5, qword ptr [esi+8*2]
psllw mm0, PRESHIFT; // esi + 48
pmulhw mm7, qword ptr [ecx+8*10] ; V10
psllw mm5, PRESHIFT; // esi + 16
pmulhw mm0, qword ptr [ecx+8*6] ; V6
movq mm4, qword ptr [esi+8*14]
pmulhw mm5, qword ptr [ecx+8*2] ; V2
psllw mm4, PRESHIFT; // esi + 112
movq mm3, mm7 ; duplicate V10
psubsw mm7, mm0 ; V26
paddsw mm3, mm0 ; V29 ; free mm0
movq mm1, mm7 ; duplicate V26
pmulhw mm4, qword ptr [ecx+8*14] ; V14
psraw mm3, 1 ; t91=t94
pmulhw mm7, qword ptr x539f539f539f539f ; V33
psraw mm1, 1 ; t96
movq mm0, mm5 ; duplicate V2
;//slot
psraw mm4, 2 ; t85=t87
paddsw mm5, mm4 ; V27
psubsw mm0, mm4 ; V28 ; free mm4
movq mm2, mm0 ; duplicate V28
psraw mm5, 1 ; t90=t93
pmulhw mm0, qword ptr x4546454645464546 ; V35
psraw mm2, 1 ; t97
movq mm4, mm5 ; duplicate t90=t93
psubsw mm1, mm2 ; V32 ; free mm2
pmulhw mm1, qword ptr x61f861f861f861f8 ; V36
psllw mm7, 1 ; t107
paddsw mm5, mm3 ; V31
psubsw mm4, mm3 ; V30 ; free mm3
pmulhw mm4, qword ptr x5a825a825a825a82 ; V34
nop ;slot
psubsw mm0, mm1 ; V38
psubsw mm1, mm7 ; V37 ; free mm7
psllw mm1, 1 ; t114
;move from the next block
movq mm3, mm6 ; duplicate V25
;move from the next block
movq mm7, qword ptr [esi+8*4] ; V22
psllw mm0, 1 ; t110
psubsw mm0, mm5 ; V39 (mm5 still needed for next block)
psllw mm4, 2 ; t112
;move from the next block
movq mm2, qword ptr [esi+8*12] ; V23
psubsw mm4, mm0 ; V40
paddsw mm1, mm4 ; V41; free mm0
;move from the next block
psllw mm2, 1 ; t117=t125
; column 0: output butterfly
;move above
;movq mm3, mm6 ; duplicate V25
;movq mm7, qword ptr [esi+8*4] ; V22
;movq mm2, qword ptr [esi+8*12] ; V23
;psllw mm2, 1 ; t117=t125
psubsw mm6, mm1 ; tm6
paddsw mm3, mm1 ; tm8; free mm1
movq mm1, mm7 ; duplicate V22
paddsw mm7, mm5 ; tm0
movq qword ptr [esi+8*8], mm3 ; tm8; free mm3
psubsw mm1, mm5 ; tm14; free mm5
movq qword ptr [esi+8*6], mm6 ; tm6; free mm6
movq mm3, mm2 ; duplicate t117=t125
movq mm6, qword ptr [esi+8*0] ; V24
paddsw mm2, mm0 ; tm2
movq qword ptr [esi+8*0], mm7 ; tm0; free mm7
psubsw mm3, mm0 ; tm12; free mm0
movq qword ptr [esi+8*14], mm1 ; tm14; free mm1
psllw mm6, 1 ; t119=t123
movq qword ptr [esi+8*2], mm2 ; tm2; free mm2
movq mm0, mm6 ; duplicate t119=t123
movq qword ptr [esi+8*12], mm3 ; tm12; free mm3
paddsw mm6, mm4 ; tm4
;moved from next block
movq mm1, qword ptr [esi+8*5]
psubsw mm0, mm4 ; tm10; free mm4
;moved from next block
movq qword ptr [esi+8*4], mm6 ; tm4; free mm6
psllw mm1, PRESHIFT; // esi+40
pmulhw mm1, qword ptr [ecx+8*5] ; V5
;slot
movq qword ptr [esi+8*10], mm0 ; tm10; free mm0
;slot
; column 1: even part
; use V5, V13, V1, V9 to produce V56..V59
;moved to prev block
;movq mm1, qword ptr [ecx+8*5]
;pmulhw mm1, qword ptr [esi+8*5] ; V5
movq mm7, qword ptr [esi+8*13]
;//slot
psllw mm1, 1 ; t128=t130
psllw mm7, PRESHIFT; // esi + 104
movq mm3, qword ptr [esi+8*1]
movq mm2, mm1 ; duplicate t128=t130
pmulhw mm7, qword ptr [ecx+8*13] ; V13
psllw mm3, PRESHIFT; // esi + 8
pmulhw mm3, qword ptr [ecx+8*1] ; V1
;slot
movq mm5, qword ptr [esi+8*9]
;//slot
psubsw mm1, mm7 ; V50
psllw mm5, PRESHIFT; // esi + 72
pmulhw mm5, qword ptr [ecx+8*9] ; V9
paddsw mm2, mm7 ; V51
pmulhw mm1, qword ptr x5a825a825a825a82 ; 23170 ->V52
movq mm6, mm2 ; duplicate V51
psraw mm2, 1 ; t138=t144
movq mm4, mm3 ; duplicate V1
psraw mm6, 2 ; t136
paddsw mm3, mm5 ; V53
psubsw mm4, mm5 ; V54 ;mm5 free
movq mm7, mm3 ; duplicate V53
;moved from next block
movq mm0, qword ptr [esi+8*11]
psraw mm4, 1 ; t140=t142
psubsw mm1, mm6 ; V55 ; mm6 free
paddsw mm3, mm2 ; V56
movq mm5, mm4 ; duplicate t140=t142
paddsw mm4, mm1 ; V57
movq qword ptr [esi+8*5], mm3 ; V56
psubsw mm5, mm1 ; V58; mm1 free
movq qword ptr [esi+8*13], mm4 ; V57
psubsw mm7, mm2 ; V59; mm2 free
movq qword ptr [esi+8*9], mm5 ; V58
psllw mm0, PRESHIFT; // esi + 88
; keep mm7 alive all along the next block
;movq qword ptr [esi+8*1], mm7 ; V59
;moved above
;movq mm0, qword ptr [esi+8*11]
pmulhw mm0, qword ptr [ecx+8*11] ; V11
;slot
movq mm6, qword ptr [esi+8*7]
;slot
movq mm4, qword ptr [esi+8*15]
psllw mm6, PRESHIFT; // esi + 56
pmulhw mm6, qword ptr [ecx+8*7] ; V7
psllw mm4, PRESHIFT; // esi + 120
movq mm5, qword ptr [esi+8*3]
movq mm3, mm0 ; duplicate V11
pmulhw mm4, qword ptr [ecx+8*15] ; V15
psllw mm5, PRESHIFT; // esi + 24
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -