dctaan_mmx.asm

来自「Intel AN&N FAST dct MMX and X」· 汇编 代码 · 共 757 行 · 第 1/2 页

ASM
757
字号
; Copyright (C) 1995-99 -  Intel Corporation all rights reserved
; esi - input and output data pointer
; input data: each 16-bit element of the 8x8 matrix is left aligned.
; output data: transposed, with each 16-bit element of the 8x8 matrix left aligned,
; e.g. in 11...1110000 format
; israelh. 11/11/97 removed emms. moved to stub
; MMX implementation. Using MMX transpose 

; Keep the include files and constant definitions below out of the listing
; output; listing is switched back on by the .list directive further down.
.nolist 

; Select the Pentium-class (.586) instruction set for the assembler.
.586

; Assemblers reporting @version >= 612 enable MMX mnemonics with .mmx and
; only need "mmword" defined as an alias for QWORD; otherwise the MMX
; emulator macro package is included instead.
if @version GE 612
.mmx
mmword	TEXTEQU	<QWORD>
else
include iammx.inc			; MMX Emulator Macros
endif

; Same scheme for the Streaming SIMD Extensions: .xmm when @version >= 614,
; otherwise the emulator macro package.
if @version GE 614
.xmm
else
include iaxmm.inc           ; Streaming SIMD Extensions Emulator Macros
endif

; Fixed-point bookkeeping for the multiplier constants.
;   NSHIFT    - operands are shifted left by 16-NSHIFT before pmulhw with
;               WA1/WA2/WA3.
;   PRESHIFT  - left shift applied to every input word as it is loaded in
;               the column passes.
;   WA4_SHIFT - WA4 carries one bit less of scale, so its operand is shifted
;               by 16-WA4_SHIFT instead.
;   WA5_SHIFT - in the visible part of this file it is referenced only from
;               commented-out shifts.
; NOTE(review): the values in angle brackets look like alternative settings
; that were tried - confirm before changing.
NSHIFT  =  15 ; <14>
PRESHIFT = 1  ;	<0>	<2>
WA4_SHIFT  = NSHIFT-1
WA5_SHIFT  = NSHIFT+1

.list

; 32-bit flat memory model.
.model flat

_DATA SEGMENT PARA PUBLIC USE32 'DATA'

; ---------------------------------------------------------------------------
; Multiplier tables for pmulhw.  Every table is one quadword holding the same
; 16-bit value in all four lanes, so a single pmulhw scales four samples.
; WA1, WA2 and WA3 are used on operands that were shifted left by 16-NSHIFT.
; WA4 is used on operands shifted by 16-WA4_SHIFT (one bit more).
; WA5 is multiplied without a preceding shift in the code that uses it.
; ---------------------------------------------------------------------------
WA1  DW  4 DUP (2d41h*2)
WA2  DW  4 DUP (22a3h*2)
WA3  DW  4 DUP (2d41h*2)
WA4  DQ  539f539f539f539fh          ; 539fh in each of the four lanes
WA5  DW  4 DUP (30fah*2)

; ---------------------------------------------------------------------------
; Quadword spill slots used by _dct8x8aan_mmx when it runs out of MMX
; registers (v4, v03, v02 and v05 respectively in the butterfly stages).
; These are static storage, so every call shares the same four slots.
; ---------------------------------------------------------------------------
scratch1	DQ 0
scratch2	DQ 0
scratch3	DQ 0
scratch4	DQ 0

_DATA ENDS


_TEXT SEGMENT PARA PUBLIC USE32 'CODE'

COMMENT ^
void dct8x8aan_mmx (
    int16 *src_result, int16 *dst_result);
^
public  _dct8x8aan_mmx
_dct8x8aan_mmx proc near

push	ebp
mov	ebp, esp
push	esi			  
push	edi			  

mov	esi, DWORD PTR [ebp+8]	  ; source
mov edi, DWORD PTR [ebp+12]   ; destination   
;slot

                                                                                                               

; column 0
movq mm0, mmword ptr [esi]	   ; v0
psllw mm0,PRESHIFT
movq mm1, mmword ptr [esi+16*7]	   ; v7
movq  mm2,mm0                  ; duplicate v0 
psllw mm1,PRESHIFT
paddw mm0,mm1                  ; v00: v0+v7  
psubw mm2,mm1                  ; v07: v0-v7  

movq mm1, mmword ptr [esi+16]	   ; v1
psllw mm1,PRESHIFT
movq mm3, mmword ptr [esi+16*6]	   ; v6
movq  mm4,mm1                  ; duplicate v1 
psllw mm3,PRESHIFT
paddw mm1,mm3                  ; v01: v1+v6  
psubw mm4,mm3                  ; v06: v1-v6  

movq mm3, mmword ptr [esi+16*2] ; v2
psllw mm3,PRESHIFT
movq mm5, mmword ptr [esi+16*5]	   ; v5
movq  mm6,mm3                  ; duplicate v2 
psllw mm5,PRESHIFT
paddw mm3,mm5                  ; v02: v2+v5  
psubw mm6,mm5                  ; v05: v2-v5  

movq mm5, mmword ptr [esi+16*3]	   ; v3
psllw mm5,PRESHIFT
movq mm7, mmword ptr [esi+16*4]	   ; v4
psllw mm7,PRESHIFT
movq  mmword ptr scratch1,mm7  ; scratch1: v4   ; 
movq  mm7,mm5                  ; duplicate v3 
paddw mm5,scratch1             ; v03: v3+v4  
psubw mm7,scratch1             ; v04: v3-v4  
movq  mmword ptr scratch2,mm5 ; scratch2: v03
movq  mm5,mm0                  ; mm5: v00

paddw mm0,scratch2             ; v10: v00+v03   
psubw mm5,scratch2             ; v13: v00-v03   
movq  mmword ptr scratch3,mm3  ; scratc3: v02
movq  mm3,mm1                  ; duplicate v01

paddw mm1,scratch3             ; v11: v01+v02
psubw mm3,scratch3             ; v12: v01-v02
 
movq  mmword ptr scratch4,mm6  ; scratc4: v05
movq  mm6,mm0                  ; duplicate v10

paddw mm0,mm1          ; v10+v11
psubw mm6,mm1          ; v10-v11
 
movq  mmword ptr [esi],mm0   ; out0: v10+v11 
movq  mmword ptr [esi+16*4],mm6   ; out4: v10-v11 

movq  mm0,mm4                ; mm0: v06
paddw mm4,scratch4           ; v15: v05+v06 
paddw  mm0,mm2 		  		; v16: v07+v06

psllw  mm4,16-NSHIFT         ; v35: compensate the coeefient scale
pmulhw mm4,mmword ptr WA3    ; v35': WA3*v15
;psllw  mm4,16-NSHIFT         ; v35: compensate the coeefient scale

movq   mm6,mm4               ; duplicate v35
paddw  mm4,mm2               ; v45: v07+v35
psubw  mm2,mm6               ; v47: v07-v35
 
paddw  mm3,mm5               ; v22: v12+v13

psllw  mm3,16-NSHIFT         ; v32: compensate the coeefient scale
pmulhw mm3,mmword ptr WA1    ; v32': WA3*v15
;psllw  mm3,16-NSHIFT         ; v32: compensate the coeefient scale
movq   mm6,mm5               ; duplicate v13

paddw  mm5,mm3               ; v13+v32
psubw  mm6,mm3               ; v13-v32

movq  mmword ptr [esi+16*2],mm5   ; out2: v13+v32 
movq  mmword ptr [esi+16*6],mm6   ; out6: v13-v32 

paddw  mm7,scratch4			; v14n: v04+v05
movq   mm5,mm0              ; duplicate v16

psubw  mm0,mm7				    ; va1: v16-v14n
psllw  mm5,16-WA4_SHIFT      ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
psllw  mm7,16-NSHIFT         ; v34: compensate the coeefient scale
pmulhw mm0,mmword ptr WA5		; va0':  va1*WA5
pmulhw mm5,mmword ptr WA4 		; v36'': v16*WA4
pmulhw mm7,mmword ptr WA2		; v34'': v14n*WA2

;psllw  mm5,16-WA4_SHIFT      ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
;psllw  mm7,16-NSHIFT         ; v34: compensate the coeefient scale
;psllw  mm0,16-WA5_SHIFT

psubw  mm5,mm0      ; v36': v36''-va0'
psubw  mm7,mm0      ; v34': v34''-va0'

movq   mm0,mm4       ; duplicate v45
paddw  mm4,mm5       ; v45+v36
psubw  mm0,mm5       ; v45-v36

movq  mmword ptr [esi+16*1],mm4   ; out1: v45+v36 
movq  mmword ptr [esi+16*7],mm0   ; out7: v45-v36 

movq   mm5,mm2       ; duplicate v47
paddw  mm2,mm7       ; v47+v34
psubw  mm5,mm7       ; v47-v34

movq  mmword ptr [esi+16*5],mm2   ; out5: v47+v34 
movq  mmword ptr [esi+16*3],mm5   ; out3: v47-v34 


; column 1

add  esi,8  ; point to the next 4 columns. it can be done by adding 8 to immediates but this is nicer

movq mm0, mmword ptr [esi]	   ; v0
psllw mm0,PRESHIFT
movq mm1, mmword ptr [esi+16*7]	   ; v7
movq  mm2,mm0                  ; duplicate v0 
psllw mm1,PRESHIFT
paddw mm0,mm1                  ; v00: v0+v7  
psubw mm2,mm1                  ; v07: v0-v7  

movq mm1, mmword ptr [esi+16]	   ; v1
psllw mm1,PRESHIFT
movq mm3, mmword ptr [esi+16*6]	   ; v6
movq  mm4,mm1                  ; duplicate v1 
psllw mm3,PRESHIFT
paddw mm1,mm3                  ; v01: v1+v6  
psubw mm4,mm3                  ; v06: v1-v6  

movq mm3, mmword ptr [esi+16*2] ; v2
psllw mm3,PRESHIFT
movq mm5, mmword ptr [esi+16*5]	   ; v5
movq  mm6,mm3                  ; duplicate v2 
psllw mm5,PRESHIFT
paddw mm3,mm5                  ; v02: v2+v5  
psubw mm6,mm5                  ; v05: v2-v5  

movq mm5, mmword ptr [esi+16*3]	   ; v3
psllw mm5,PRESHIFT
movq mm7, mmword ptr [esi+16*4]	   ; v4
psllw mm7,PRESHIFT
movq  mmword ptr scratch1,mm7		; scratch1: v4   ; 
movq  mm7,mm5                  ; duplicate v3 
paddw mm5,scratch1             ; v03: v3+v4  
psubw mm7,scratch1             ; v04: v3-v4  
movq  mmword ptr scratch2,mm5 ; scratch2: v03
movq  mm5,mm0                  ; mm5: v00

paddw mm0,scratch2             ; v10: v00+v03   
psubw mm5,scratch2             ; v13: v00-v03   
movq  mmword ptr scratch3,mm3  ; scratc3: v02
movq  mm3,mm1                  ; duplicate v01

paddw mm1,scratch3             ; v11: v01+v02
psubw mm3,scratch3             ; v12: v01-v02
 
movq  mmword ptr scratch4,mm6  ; scratc4: v05
movq  mm6,mm0                  ; duplicate v10

paddw mm0,mm1				    ; v10+v11
psubw mm6,mm1			        ; v10-v11
 
movq  mmword ptr [esi],mm0   ; out0: v10+v11 
movq  mmword ptr [esi+16*4],mm6   ; out4: v10-v11 

movq  mm0,mm4                ; mm0: v06
paddw mm4,scratch4           ; v15: v05+v06 
paddw  mm0,mm2 		  		; v16: v07+v06

psllw  mm4,16-NSHIFT         ; v35: compensate the coeefient scale
pmulhw mm4,mmword ptr WA3    ; v35': WA3*v15
;psllw  mm4,16-NSHIFT         ; v35: compensate the coeefient scale

movq   mm6,mm4               ; duplicate v35
paddw  mm4,mm2               ; v45: v07+v35
psubw  mm2,mm6               ; v47: v07-v35
 
paddw  mm3,mm5               ; v22: v12+v13

psllw  mm3,16-NSHIFT         ; v32: compensate the coeefient scale
pmulhw mm3,mmword ptr WA1    ; v32': WA3*v15
;psllw  mm3,16-NSHIFT         ; v32: compensate the coeefient scale
movq   mm6,mm5               ; duplicate v13

paddw  mm5,mm3               ; v13+v32
psubw  mm6,mm3               ; v13-v32

movq  mmword ptr [esi+16*2],mm5   ; out2: v13+v32 
movq  mmword ptr [esi+16*6],mm6   ; out6: v13-v32 

paddw  mm7,scratch4				; v14n: v04+v05
movq   mm5,mm0					; duplicate v16

psubw  mm0,mm7					; va1: v16-v14n
psllw  mm5,16-WA4_SHIFT ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
pmulhw mm7,mmword ptr WA2		; v34'': v14n*WA2
pmulhw mm0,mmword ptr WA5		; va0':  va1*WA5
pmulhw mm5,mmword ptr WA4 		; v36'': v16*WA4

;psllw  mm5,16-WA4_SHIFT ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
psllw  mm7,16-NSHIFT         ; v34: compensate the coeefient scale
;psllw  mm0,16-WA5_SHIFT

psubw  mm5,mm0      ; v36': v36''-va0'
psubw  mm7,mm0      ; v34': v34''-va0'

movq   mm0,mm4       ; duplicate v45
paddw  mm4,mm5       ; v45+v36
psubw  mm0,mm5       ; v45-v36

movq  mmword ptr [esi+16*1],mm4   ; out1: v45+v36 
movq  mmword ptr [esi+16*7],mm0   ; out7: v45-v36 

movq   mm5,mm2       ; duplicate v47
paddw  mm2,mm7       ; v47+v34
psubw  mm5,mm7       ; v47-v34

movq  mmword ptr [esi+16*5],mm2   ; out5: v47+v34 
movq  mmword ptr [esi+16*3],mm5   ; out3: v47-v34 


sub  esi,8  ; point back to the first 4 columns/rows. 
push esi
push edi

jmp	 transpose

cont:


pop	edi
pop esi

; first 4 rows 
movq mm0, mmword ptr [esi]	   ; v0
;pinsrw mm0,[esi+16],1		   ; (0,1)->1
;pinsrw mm0,[esi+16*2],2		   ; (0,2)->2
;pinsrw mm0,[esi+16*3],3		   ; (0,3)->3
movq mm1, mmword ptr [esi+16*7]	   ; v7
;pinsrw mm1,[esi+7*2],0		   ; (7,0)->0
;pinsrw mm1,[esi+16+7*2],1		   ; (7,1)->1
;pinsrw mm1,[esi+16*2+7*2],2		   ; (7,1)->2
;pinsrw mm1,[esi+16*3+7*2],3		   ; (7,1)->3
movq  mm2,mm0                  ; duplicate v0 
paddw mm0,mm1                  ; v00: v0+v7  
psubw mm2,mm1                  ; v07: v0-v7  

movq mm1, mmword ptr [esi+16]	   ; v1
;pinsrw mm1,[esi+1*2],0		   ; (7,0)->0
;pinsrw mm1,[esi+16+1*2],1		   ; (7,1)->1
;pinsrw mm1,[esi+16*2+1*2],2		   ; (7,1)->2
;pinsrw mm1,[esi+16*3+1*2],3		   ; (7,1)->3
movq mm3, mmword ptr [esi+16*6]	   ; v6
;pinsrw mm3,[esi+6*2],0		   ; (7,0)->0
;pinsrw mm3,[esi+16+6*2],1		   ; (7,1)->1
;pinsrw mm3,[esi+16*2+6*2],2		   ; (7,1)->2
;pinsrw mm3,[esi+16*3+6*2],3		   ; (7,1)->3
movq  mm4,mm1                  ; duplicate v1 
paddw mm1,mm3                  ; v01: v1+v6  
psubw mm4,mm3                  ; v06: v1-v6  

movq mm3, mmword ptr [esi+16*2] ; v2
;pinsrw mm3,[esi+2*2],0		   ; (7,0)->0
;pinsrw mm3,[esi+16+2*2],1		   ; (7,1)->1
;pinsrw mm3,[esi+16*2+2*2],2		   ; (7,1)->2
;pinsrw mm3,[esi+16*3+2*2],3		   ; (7,1)->3
movq mm5, mmword ptr [esi+16*5]	   ; v5
;pinsrw mm5,[esi+5*2],0		   ; (7,0)->0
;pinsrw mm5,[esi+16+5*2],1		   ; (7,1)->1
;pinsrw mm5,[esi+16*2+5*2],2		   ; (7,1)->2
;pinsrw mm5,[esi+16*3+5*2],3		   ; (7,1)->3
movq  mm6,mm3                  ; duplicate v2 
paddw mm3,mm5                  ; v02: v2+v5  
psubw mm6,mm5                  ; v05: v2-v5  

movq mm5, mmword ptr [esi+16*3]	   ; v3
;pinsrw mm5,[esi+3*2],0		   ; (7,0)->0
;pinsrw mm5,[esi+16+3*2],1		   ; (7,1)->1
;pinsrw mm5,[esi+16*2+3*2],2		   ; (7,1)->2
;pinsrw mm5,[esi+16*3+3*2],3		   ; (7,1)->3
movq mm7, mmword ptr [esi+16*4]	   ; v4
;pinsrw mm7,[esi+4*2],0		   ; (7,0)->0
;pinsrw mm7,[esi+16+4*2],1		   ; (7,1)->1
;pinsrw mm7,[esi+16*2+4*2],2		   ; (7,1)->2
;pinsrw mm7,[esi+16*3+4*2],3		   ; (7,1)->3
movq  mmword ptr scratch1,mm7		; scratch1: v4   ; 
movq  mm7,mm5                  ; duplicate v3 
paddw mm5,scratch1             ; v03: v3+v4  
psubw mm7,scratch1             ; v04: v3-v4  
movq  mmword ptr scratch2,mm5 ; scratch2: v03
movq  mm5,mm0                  ; mm5: v00

paddw mm0,scratch2             ; v10: v00+v03   
psubw mm5,scratch2             ; v13: v00-v03   
movq  mmword ptr scratch3,mm3  ; scratc3: v02
movq  mm3,mm1                  ; duplicate v01

paddw mm1,scratch3             ; v11: v01+v02
psubw mm3,scratch3             ; v12: v01-v02
 
movq  mmword ptr scratch4,mm6  ; scratc4: v05
movq  mm6,mm0                  ; duplicate v10

paddw mm0,mm1          ; v10+v11
psubw mm6,mm1          ; v10-v11
 
movq  mmword ptr [edi],mm0   ; out0: v10+v11 
movq  mmword ptr [edi+16*4],mm6   ; out4: v10-v11 

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?