⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 idct_ap528mmx.cpp

📁 这是一组DCT和iDCT的代码
💻 CPP
📖 第 1 页 / 共 3 页
字号:
#include "StdAfx.h"
/*
 * MPEG2AVI
 * --------
 *  v0.16B34- very minor optimization to reduce overall cycle-count by 1
 *  v0.16B33- completely reconverted MMX IDCT AAN from Intel's devsheet.
 *     IEEE1180 test revealed errors in my prior implementation.
 *     added a DC-element "compensation" factor, to reduce this IDCT's
 *     apparent non-zero mean-shift.
 *  v0.16B31- re-ordered the in-place (left-shift<<4)...the left-shift
 *     is now performed prior to multiplication.  This implementation is
 *     now just as accurate as Intel's original MMX AAN sample-code.
 *  v0.16B3 - moved tables to global-scope, to enforce QWORD alignment.
 *     Apparently, Visual C++ doesn't necessarily align stack data.
 *     And local volatile variables are run-time allocated (stack),
 *     whereas the global variables are allocated once.
 *  v0.16B22 - MMX IDCT AAN algorithm , idct_mmx_aan()
 *     This algorithm requires the input-matrix be transposed. (Alternatively,
 *     the output can be transposed instead of the input.)  Unlike
 *     Intel's original code, this code integrates the data
 *     [left-shift <<4].  All that remains is the transpose.
    
		http://developer.intel.com/drg/mmx/appnotes/
		Application Note AP-528
 *

// This implementation differs from Intel's AAN-IDCT in one way :
//  The transform automatically left-shifts the source-matrix, so the
//  calling function should not left-shift the source-matrix.
//
//  (However, the AAN-IDCT still transposes the output matrix.
//   I have added mmx-code to transpose the output matrix, so the
//   user of this function need not worry about this issue.)
// 
// This IDCT is fast, but less accurate than the standard IDCT_INT32()
// routine.  
//
 */
#include<windows.h>

// If compiling with Visual C++, you should leave these variables at
// global scope.  If you declare them locally, VC++ does not necessarily
// align them to QWORD boundaries!
// (declaring them static local variables seems to align them, too.)

// 64-bit constants used as MMX memory operands by the inline assembly in
// this file.  Each constant's name spells out the hexadecimal value it holds.
// NOTE(review): x0005000200010001 and x0040000000000000 are not referenced
// in the part of the file visible here -- presumably used further down; confirm.
const static	__int64 x0005000200010001 = 0x0005000200010001;
const static	__int64 x0040000000000000 = 0x0040000000000000;
// 0x5a82 (= 23170) replicated in all four 16-bit lanes; used as a pmulhw
// multiplier in the even part and for V34.
const static	__int64 x5a825a825a825a82 = 0x5a825a825a825a82;
								 
// Further four-lane pmulhw multipliers used in the odd part:
//   0x539f = 21407, 0x4546 = 17734, 0x61f8 = 25080.
const static	__int64 x539f539f539f539f = 0x539f539f539f539f;
const static	__int64 x4546454645464546 = 0x4546454645464546;
const static	__int64 x61f861f861f861f8 = 0x61f861f861f861f8;

// 0x0040 in the lowest 16-bit lane only, zero in the other three.  The
// assembly adds it (paddsw) to the first quadword of the input block after
// the PRESHIFT left-shift and before the preSC multiply, so only the first
// coefficient is affected (the "v0.16B33 accuracy adjustment").
const static	__int64 mm0adjust = 0x0000000000000040;
// Writable 64-bit scratch slots, zero-initialised.
// NOTE(review): not referenced in the part of the file visible here --
// presumably temporaries for later stages of the transform; confirm.
static	__int64 scratch1          = 0x0000000000000000;
static	__int64 scratch3          = 0x0000000000000000;
static	__int64 scratch5          = 0x0000000000000000;
static	__int64 scratch7          = 0x0000000000000000;
	// for debug only
static	__int64 x0                = 0x0000000000000000;

// Shift count applied with psllw to every quadword of input coefficients
// right after it is loaded and before it is multiplied by the matching
// preSC entry.  Change the value here and recompile to alter the preshift.
#define PRESHIFT 4	// left-shift input coefficient amount

// NOTE(review): not referenced in the part of the file visible here;
// 16 would match the byte size of one row of eight 16-bit values -- confirm
// against the transpose code.
#define ROW_STRIDE 16	// for 8x8 matrix transpose operation

// Per-coefficient prescale table: 64 16-bit entries, written here as
// 8 rows of 8.  The assembly addresses it in quadwords, [ecx+8*k], and
// multiplies quadword k (pmulhw, which keeps the high 16 bits of each signed
// 16x16 product) with the pre-shifted input quadword at the same offset
// [esi+8*k], so entry i scales input coefficient i.
// Every entry is below 32768, so each value reads the same whether the
// 16-bit lane is treated as signed or unsigned.
// Kept at file scope so the data is QWORD-aligned (see the note on
// alignment above the constants in this file).
const static WORD preSC[] = {
		16384, 22725, 21407, 19266,  16384, 12873, 8867,  4520,
        22725, 31521, 29692, 26722,  22725, 17855, 12299, 6270,
        21407, 29692, 27969, 25172,  21407, 16819, 11585, 5906,
        19266, 26722, 25172, 22654,  19266, 15137, 10426, 5315,
        16384, 22725, 21407, 19266,  16384, 12873, 8867,  4520,
        12873, 17855, 16819, 15137,  25746, 20228, 13933, 7103,
        17734, 24598, 23170, 20853,  17734, 13933, 9597,  4892,
        18081, 25080, 23624, 21261,  18081, 14206, 9785,  4988,
	};

void 
j_rev_dct( short *src_result )
{
	__asm 
	{

	//COMMENT ^
	//void idct8x8aan (
	//    int16 *src_result);
	//^
	//public  _idct8x8aan
	//_idct8x8aan proc near
	
	//push    ebp
	
	//mov     ebp, esp
	//push    esi

	//mov     esi, DWORD PTR [ebp+8]          ; source
	mov     esi, DWORD PTR [src_result]          ; source

	lea     ecx, DWORD PTR [preSC]


// The [left shift << 4] is now *BUILT-IN* to the AAN-IDCT routine...
// It took a lot of time to determine which loads needed to be left-shifted,
// and which didn't (because the algorithm modifies the src_result[]
// array in-place!)

// If you wish to change the preshift amount, just change the source line
// with "#define PRESHIFT 4", then recompile.


//////////
//  AAN IDCT algorithm begins here!
//
/////////
; column 0: even part
; use V4, V12, V0, V8 to produce V22..V25
movq mm0, qword ptr [esi+8*12]         ; maybe the first mul can be done together
                                        ; with the dequantization in iHuff module ?
;slot
movq mm1, qword ptr [esi+8*4]
 psllw mm0, PRESHIFT; // esi + 96

movq mm3, qword ptr [esi+8*0]
 psllw mm1, PRESHIFT; // esi + 32

pmulhw mm0, qword ptr [ecx+8*12]       ; V12
 psllw mm3, PRESHIFT; // esi + 0

pmulhw mm1, qword ptr [ecx+8*4]        ; V4

paddsw mm3, qword ptr [mm0adjust]	// v0.16B33 accuracy adjustment
 ;//slot

pmulhw mm3, qword ptr [ecx+8*0]        ; V0
 psraw mm0, 1                            ; t64=t66

movq mm5, qword ptr [esi+8*8]          ; duplicate V4
 movq mm2, mm1                           ; added 11/1/96

psllw mm5, PRESHIFT;	// esi + 64
 psubsw mm1, mm0                         ; V16

pmulhw mm5, qword ptr [ecx+8*8]        ; V8
 paddsw mm2, mm0                          ; V17

pmulhw mm1, qword ptr x5a825a825a825a82 ; 23170 ->V18
 movq mm0, mm2                           ; duplicate V17

psraw mm2, 1                            ; t75=t82
 movq mm4, mm3                           ; duplicate V0

psraw mm0, 2                            ; t72
 paddsw mm3, mm5                         ; V19


;moved from the block below
movq mm7, qword ptr [esi+8*10]
 psubsw mm4, mm5                         ; V20 ;mm5 free

psubsw mm1, mm0                         ; V21 ; mm0 free
 psraw mm3, 1                            ; t74=t81

movq mm6, mm3                           ; duplicate t74=t81
 psraw mm4, 2                            ; t77=t79

paddsw mm3, mm2                         ; V22
 movq mm5, mm1                           ; duplicate V21

paddsw mm1, mm4                         ; V23
 psllw mm7, PRESHIFT; // esi + 80

movq qword ptr [esi+8*4], mm3          ; V22
 psubsw mm4, mm5                         ; V24; mm5 free

movq qword ptr [esi+8*12], mm1         ; V23
 psubsw mm6, mm2                         ; V25; mm2 free

movq qword ptr [esi+8*0], mm4          ; V24
 ;//slot
; keep mm6 alive all along the next block
;movq qword ptr [esi+8*8], mm6         ; V25

; column 0: odd part
; use V2, V6, V10, V14 to produce V31, V39, V40, V41

;moved above
;movq mm7, qword ptr [esi+8*10]

movq mm0, qword ptr [esi+8*6]
;slot

movq mm5, qword ptr [esi+8*2]
 psllw mm0, PRESHIFT; // esi + 48

pmulhw mm7, qword ptr [ecx+8*10]               ; V10
 psllw mm5, PRESHIFT; // esi + 16

pmulhw mm0, qword ptr [ecx+8*6]                ; V6

movq mm4, qword ptr [esi+8*14]

pmulhw mm5, qword ptr [ecx+8*2]                ; V2
 psllw mm4, PRESHIFT; // esi + 112

movq mm3, mm7                                   ; duplicate V10
 psubsw mm7, mm0                                 ; V26

paddsw mm3, mm0                                 ; V29 ; free mm0
 movq mm1, mm7                                   ; duplicate V26

pmulhw mm4, qword ptr [ecx+8*14]               ; V14
 psraw mm3, 1                                    ; t91=t94

pmulhw mm7, qword ptr x539f539f539f539f        ; V33
 psraw mm1, 1                                    ; t96

movq mm0, mm5                                   ; duplicate V2
 ;//slot
psraw mm4, 2                                    ; t85=t87

paddsw mm5, mm4                                 ; V27
psubsw mm0, mm4                                 ; V28 ; free mm4

movq mm2, mm0                                   ; duplicate V28
psraw mm5, 1                                    ; t90=t93

pmulhw mm0, qword ptr x4546454645464546        ; V35
psraw mm2, 1                                    ; t97

movq mm4, mm5                                   ; duplicate t90=t93
psubsw mm1, mm2                                 ; V32 ; free mm2

pmulhw mm1, qword ptr x61f861f861f861f8        ; V36
psllw mm7, 1                                    ; t107

paddsw mm5, mm3                                 ; V31
psubsw mm4, mm3                                 ; V30 ; free mm3

pmulhw mm4, qword ptr x5a825a825a825a82        ; V34
nop ;slot

psubsw mm0, mm1                                 ; V38
psubsw mm1, mm7                                 ; V37 ; free mm7

psllw mm1, 1                                    ; t114
;move from the next block
movq mm3, mm6           ; duplicate V25

;move from the next block
movq mm7, qword ptr [esi+8*4]                  ; V22
psllw mm0, 1                                    ; t110

psubsw mm0, mm5                                 ; V39 (mm5 still needed for next block)
psllw mm4, 2                                    ; t112

;move from the next block
movq mm2, qword ptr [esi+8*12] ; V23
psubsw mm4, mm0                                 ; V40

paddsw mm1, mm4                                 ; V41; free mm0
;move from the next block
psllw mm2, 1                                    ; t117=t125

; column 0: output butterfly
;move above
;movq mm3, mm6          ; duplicate V25
;movq mm7, qword ptr [esi+8*4]                 ; V22
;movq mm2, qword ptr [esi+8*12]                ; V23
;psllw mm2, 1                                   ; t117=t125

psubsw mm6, mm1                                 ; tm6
paddsw mm3, mm1                                 ; tm8; free mm1

movq mm1, mm7                                   ; duplicate V22
paddsw mm7, mm5                                 ; tm0

movq qword ptr [esi+8*8], mm3                  ; tm8; free mm3
psubsw mm1, mm5                                 ; tm14; free mm5

movq qword ptr [esi+8*6], mm6                  ; tm6; free mm6
movq mm3, mm2                                   ; duplicate t117=t125

movq mm6, qword ptr [esi+8*0]                  ; V24
paddsw mm2, mm0                                 ; tm2

movq qword ptr [esi+8*0], mm7                  ; tm0; free mm7
psubsw mm3, mm0                                 ; tm12; free mm0

movq qword ptr [esi+8*14], mm1                 ; tm14; free mm1
psllw mm6, 1                                    ; t119=t123

movq qword ptr [esi+8*2], mm2                  ; tm2; free mm2
movq mm0, mm6                                   ; duplicate t119=t123

movq qword ptr [esi+8*12], mm3                 ; tm12; free mm3
paddsw mm6, mm4                                 ; tm4

;moved from next block
movq mm1, qword ptr [esi+8*5]
psubsw mm0, mm4                                 ; tm10; free mm4

;moved from next block
movq qword ptr [esi+8*4], mm6                  ; tm4; free mm6
 psllw mm1, PRESHIFT; // esi+40

pmulhw mm1, qword ptr [ecx+8*5]                ; V5
;slot

movq qword ptr [esi+8*10], mm0                 ; tm10; free mm0
;slot

; column 1: even part
; use V5, V13, V1, V9 to produce V56..V59
;moved to prev block
;movq mm1, qword ptr [ecx+8*5]
;pmulhw mm1, qword ptr [esi+8*5]               ; V5

movq mm7, qword ptr [esi+8*13]
 ;//slot
psllw mm1, 1                                    ; t128=t130
 psllw mm7, PRESHIFT; // esi + 104
movq mm3, qword ptr [esi+8*1]
 movq mm2, mm1                                   ; duplicate t128=t130
pmulhw mm7, qword ptr [ecx+8*13]               ; V13
 psllw mm3, PRESHIFT; // esi + 8

pmulhw mm3, qword ptr [ecx+8*1]                ; V1
;slot

movq mm5, qword ptr [esi+8*9]
;//slot
psubsw mm1, mm7                                 ; V50
 psllw mm5, PRESHIFT;	// esi + 72

pmulhw mm5, qword ptr [ecx+8*9]                ; V9
paddsw mm2, mm7                                 ; V51

pmulhw mm1, qword ptr x5a825a825a825a82        ; 23170 ->V52
movq mm6, mm2                                   ; duplicate V51

psraw mm2, 1                                    ; t138=t144
movq mm4, mm3                                   ; duplicate V1

psraw mm6, 2                                    ; t136
paddsw mm3, mm5                                 ; V53

psubsw mm4, mm5                                 ; V54 ;mm5 free
movq mm7, mm3                                   ; duplicate V53

;moved from next block
movq mm0, qword ptr [esi+8*11]
psraw mm4, 1                                    ; t140=t142

psubsw mm1, mm6                                 ; V55 ; mm6 free
paddsw mm3, mm2                                 ; V56

movq mm5, mm4                                   ; duplicate t140=t142
paddsw mm4, mm1                                 ; V57

movq qword ptr [esi+8*5], mm3                  ; V56
psubsw mm5, mm1                                 ; V58; mm1 free

movq qword ptr [esi+8*13], mm4                 ; V57
psubsw mm7, mm2                                 ; V59; mm2 free

movq qword ptr [esi+8*9], mm5                  ; V58
 psllw mm0, PRESHIFT; // esi + 88

; keep mm7 alive all along the next block
;movq qword ptr [esi+8*1], mm7                 ; V59

;moved above
;movq mm0, qword ptr [esi+8*11]

pmulhw mm0, qword ptr [ecx+8*11]               ; V11
;slot

movq mm6, qword ptr [esi+8*7]
;slot

movq mm4, qword ptr [esi+8*15]
 psllw mm6, PRESHIFT;	// esi + 56

pmulhw mm6, qword ptr [ecx+8*7]                ; V7
 psllw mm4, PRESHIFT;	// esi + 120

movq mm5, qword ptr [esi+8*3]
 movq mm3, mm0                                   ; duplicate V11

pmulhw mm4, qword ptr [ecx+8*15]               ; V15
 psllw mm5, PRESHIFT;	// esi + 24

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -