; dctaan_mmx.asm
; From "Intel AN&N FAST dct MMX and X" - assembly source - 757 lines in total - page 1 of 2
; Copyright (C) 1995-99 - Intel Corporation all rights reserved
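; esi - input data pointer (first argument); the column passes also store their results back through it
; edi - output data pointer (second argument); the row pass stores its results through it
; input data: each 16-bit element of the 8x8 matrix is left aligned
; output data: transposed, and each 16-bit element of the 8x8 matrix is left aligned,
; e.g. in 11...1110000 format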
; israelh. 11/11/97 removed emms. moved to stub
; MMX implementation. Using MMX transpose
; ---------------------------------------------------------------------
; Assembler setup: instruction-set selection, MMX/SSE support and the
; fixed-point shift constants used by the DCT code below.
; Listing output is switched off for this part and re-enabled by .list.
; ---------------------------------------------------------------------
.nolist
; Enable the Pentium (.586) instruction set.
.586
; @version 612 or later: use the assembler's own MMX support and make
; "mmword" a text alias for QWORD. Otherwise pull in the emulator macros.
if @version GE 612
.mmx
mmword TEXTEQU <QWORD>
else
include iammx.inc ; MMX Emulator Macros
endif
; @version 614 or later: use the assembler's own .xmm support.
; Otherwise pull in the emulator macros.
if @version GE 614
.xmm
else
include iaxmm.inc ; Streaming SIMD Extensions Emulator Macros
endif
; NSHIFT: the code shifts a value left by (16-NSHIFT) bits before each
; pmulhw with WA1, WA2 or WA3.
; NOTE(review): <14> looks like an alternative value that was tried - confirm.
NSHIFT = 15 ; <14>
; PRESHIFT: left shift applied to every input word loaded in the column passes.
; NOTE(review): <0> and <2> look like alternative values that were tried - confirm.
PRESHIFT = 1 ; <0> <2>
; WA4_SHIFT: the value multiplied by WA4 is shifted left by (16-WA4_SHIFT) bits.
WA4_SHIFT = NSHIFT-1
; WA5_SHIFT: in the visible code it is referenced only by commented-out shifts.
WA5_SHIFT = NSHIFT+1
.list
.model flat
_DATA SEGMENT PARA PUBLIC USE32 'DATA'
; ---------------------------------------------------------------------
; Multiplier tables for pmulhw: each table is one quadword holding the
; same 16-bit value in all four word lanes. The segment is PARA aligned
; and every entry is 8 bytes, so each quadword sits on an 8-byte boundary.
; NOTE(review): the values look like the AAN DCT constants in fixed point
; (5A82h/2^15 ~ 0.7071, 4546h/2^15 ~ 0.5412, 539Fh/2^14 ~ 1.3066,
; 61F4h/2^16 ~ 0.3826) - confirm against the algorithm reference.
; ---------------------------------------------------------------------
WA1 DW 4 DUP (2d41h*2)          ; 4 x 5A82h
WA2 DW 4 DUP (22a3h*2)          ; 4 x 4546h
WA3 DW 4 DUP (2d41h*2)          ; 4 x 5A82h, same value as WA1
WA4 DQ 539f539f539f539fh        ; 4 x 539Fh, written as a single quadword
WA5 DW 4 DUP (30fah*2)          ; 4 x 61F4h
; Quadword spill slots that _dct8x8aan_mmx uses for intermediate values.
; They are static data, so every call of the routine shares the same slots.
scratch1 DQ 0
scratch2 DQ 0
scratch3 DQ 0
scratch4 DQ 0
_DATA ENDS
_TEXT SEGMENT PARA PUBLIC USE32 'CODE'
COMMENT ^
void dct8x8aan_mmx (
int16 *src_result, int16 *dst_result);
^
public _dct8x8aan_mmx
_dct8x8aan_mmx proc near
push ebp
mov ebp, esp
push esi
push edi
mov esi, DWORD PTR [ebp+8] ; source
mov edi, DWORD PTR [ebp+12] ; destination
;slot
; column 0
movq mm0, mmword ptr [esi] ; v0
psllw mm0,PRESHIFT
movq mm1, mmword ptr [esi+16*7] ; v7
movq mm2,mm0 ; duplicate v0
psllw mm1,PRESHIFT
paddw mm0,mm1 ; v00: v0+v7
psubw mm2,mm1 ; v07: v0-v7
movq mm1, mmword ptr [esi+16] ; v1
psllw mm1,PRESHIFT
movq mm3, mmword ptr [esi+16*6] ; v6
movq mm4,mm1 ; duplicate v1
psllw mm3,PRESHIFT
paddw mm1,mm3 ; v01: v1+v6
psubw mm4,mm3 ; v06: v1-v6
movq mm3, mmword ptr [esi+16*2] ; v2
psllw mm3,PRESHIFT
movq mm5, mmword ptr [esi+16*5] ; v5
movq mm6,mm3 ; duplicate v2
psllw mm5,PRESHIFT
paddw mm3,mm5 ; v02: v2+v5
psubw mm6,mm5 ; v05: v2-v5
movq mm5, mmword ptr [esi+16*3] ; v3
psllw mm5,PRESHIFT
movq mm7, mmword ptr [esi+16*4] ; v4
psllw mm7,PRESHIFT
movq mmword ptr scratch1,mm7 ; scratch1: v4 ;
movq mm7,mm5 ; duplicate v3
paddw mm5,scratch1 ; v03: v3+v4
psubw mm7,scratch1 ; v04: v3-v4
movq mmword ptr scratch2,mm5 ; scratch2: v03
movq mm5,mm0 ; mm5: v00
paddw mm0,scratch2 ; v10: v00+v03
psubw mm5,scratch2 ; v13: v00-v03
movq mmword ptr scratch3,mm3 ; scratc3: v02
movq mm3,mm1 ; duplicate v01
paddw mm1,scratch3 ; v11: v01+v02
psubw mm3,scratch3 ; v12: v01-v02
movq mmword ptr scratch4,mm6 ; scratc4: v05
movq mm6,mm0 ; duplicate v10
paddw mm0,mm1 ; v10+v11
psubw mm6,mm1 ; v10-v11
movq mmword ptr [esi],mm0 ; out0: v10+v11
movq mmword ptr [esi+16*4],mm6 ; out4: v10-v11
movq mm0,mm4 ; mm0: v06
paddw mm4,scratch4 ; v15: v05+v06
paddw mm0,mm2 ; v16: v07+v06
psllw mm4,16-NSHIFT ; v35: compensate the coeefient scale
pmulhw mm4,mmword ptr WA3 ; v35': WA3*v15
;psllw mm4,16-NSHIFT ; v35: compensate the coeefient scale
movq mm6,mm4 ; duplicate v35
paddw mm4,mm2 ; v45: v07+v35
psubw mm2,mm6 ; v47: v07-v35
paddw mm3,mm5 ; v22: v12+v13
psllw mm3,16-NSHIFT ; v32: compensate the coeefient scale
pmulhw mm3,mmword ptr WA1 ; v32': WA3*v15
;psllw mm3,16-NSHIFT ; v32: compensate the coeefient scale
movq mm6,mm5 ; duplicate v13
paddw mm5,mm3 ; v13+v32
psubw mm6,mm3 ; v13-v32
movq mmword ptr [esi+16*2],mm5 ; out2: v13+v32
movq mmword ptr [esi+16*6],mm6 ; out6: v13-v32
paddw mm7,scratch4 ; v14n: v04+v05
movq mm5,mm0 ; duplicate v16
psubw mm0,mm7 ; va1: v16-v14n
psllw mm5,16-WA4_SHIFT ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
psllw mm7,16-NSHIFT ; v34: compensate the coeefient scale
pmulhw mm0,mmword ptr WA5 ; va0': va1*WA5
pmulhw mm5,mmword ptr WA4 ; v36'': v16*WA4
pmulhw mm7,mmword ptr WA2 ; v34'': v14n*WA2
;psllw mm5,16-WA4_SHIFT ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
;psllw mm7,16-NSHIFT ; v34: compensate the coeefient scale
;psllw mm0,16-WA5_SHIFT
psubw mm5,mm0 ; v36': v36''-va0'
psubw mm7,mm0 ; v34': v34''-va0'
movq mm0,mm4 ; duplicate v45
paddw mm4,mm5 ; v45+v36
psubw mm0,mm5 ; v45-v36
movq mmword ptr [esi+16*1],mm4 ; out1: v45+v36
movq mmword ptr [esi+16*7],mm0 ; out7: v45-v36
movq mm5,mm2 ; duplicate v47
paddw mm2,mm7 ; v47+v34
psubw mm5,mm7 ; v47-v34
movq mmword ptr [esi+16*5],mm2 ; out5: v47+v34
movq mmword ptr [esi+16*3],mm5 ; out3: v47-v34
; column 1
add esi,8 ; point to the next 4 columns. it can be done by adding 8 to immediates but this is nicer
movq mm0, mmword ptr [esi] ; v0
psllw mm0,PRESHIFT
movq mm1, mmword ptr [esi+16*7] ; v7
movq mm2,mm0 ; duplicate v0
psllw mm1,PRESHIFT
paddw mm0,mm1 ; v00: v0+v7
psubw mm2,mm1 ; v07: v0-v7
movq mm1, mmword ptr [esi+16] ; v1
psllw mm1,PRESHIFT
movq mm3, mmword ptr [esi+16*6] ; v6
movq mm4,mm1 ; duplicate v1
psllw mm3,PRESHIFT
paddw mm1,mm3 ; v01: v1+v6
psubw mm4,mm3 ; v06: v1-v6
movq mm3, mmword ptr [esi+16*2] ; v2
psllw mm3,PRESHIFT
movq mm5, mmword ptr [esi+16*5] ; v5
movq mm6,mm3 ; duplicate v2
psllw mm5,PRESHIFT
paddw mm3,mm5 ; v02: v2+v5
psubw mm6,mm5 ; v05: v2-v5
movq mm5, mmword ptr [esi+16*3] ; v3
psllw mm5,PRESHIFT
movq mm7, mmword ptr [esi+16*4] ; v4
psllw mm7,PRESHIFT
movq mmword ptr scratch1,mm7 ; scratch1: v4 ;
movq mm7,mm5 ; duplicate v3
paddw mm5,scratch1 ; v03: v3+v4
psubw mm7,scratch1 ; v04: v3-v4
movq mmword ptr scratch2,mm5 ; scratch2: v03
movq mm5,mm0 ; mm5: v00
paddw mm0,scratch2 ; v10: v00+v03
psubw mm5,scratch2 ; v13: v00-v03
movq mmword ptr scratch3,mm3 ; scratc3: v02
movq mm3,mm1 ; duplicate v01
paddw mm1,scratch3 ; v11: v01+v02
psubw mm3,scratch3 ; v12: v01-v02
movq mmword ptr scratch4,mm6 ; scratc4: v05
movq mm6,mm0 ; duplicate v10
paddw mm0,mm1 ; v10+v11
psubw mm6,mm1 ; v10-v11
movq mmword ptr [esi],mm0 ; out0: v10+v11
movq mmword ptr [esi+16*4],mm6 ; out4: v10-v11
movq mm0,mm4 ; mm0: v06
paddw mm4,scratch4 ; v15: v05+v06
paddw mm0,mm2 ; v16: v07+v06
psllw mm4,16-NSHIFT ; v35: compensate the coeefient scale
pmulhw mm4,mmword ptr WA3 ; v35': WA3*v15
;psllw mm4,16-NSHIFT ; v35: compensate the coeefient scale
movq mm6,mm4 ; duplicate v35
paddw mm4,mm2 ; v45: v07+v35
psubw mm2,mm6 ; v47: v07-v35
paddw mm3,mm5 ; v22: v12+v13
psllw mm3,16-NSHIFT ; v32: compensate the coeefient scale
pmulhw mm3,mmword ptr WA1 ; v32': WA3*v15
;psllw mm3,16-NSHIFT ; v32: compensate the coeefient scale
movq mm6,mm5 ; duplicate v13
paddw mm5,mm3 ; v13+v32
psubw mm6,mm3 ; v13-v32
movq mmword ptr [esi+16*2],mm5 ; out2: v13+v32
movq mmword ptr [esi+16*6],mm6 ; out6: v13-v32
paddw mm7,scratch4 ; v14n: v04+v05
movq mm5,mm0 ; duplicate v16
psubw mm0,mm7 ; va1: v16-v14n
psllw mm5,16-WA4_SHIFT ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
pmulhw mm7,mmword ptr WA2 ; v34'': v14n*WA2
pmulhw mm0,mmword ptr WA5 ; va0': va1*WA5
pmulhw mm5,mmword ptr WA4 ; v36'': v16*WA4
;psllw mm5,16-WA4_SHIFT ; v36: compensate the coeefient scale note that WA4 is shifted 1 bit less tan the others
psllw mm7,16-NSHIFT ; v34: compensate the coeefient scale
;psllw mm0,16-WA5_SHIFT
psubw mm5,mm0 ; v36': v36''-va0'
psubw mm7,mm0 ; v34': v34''-va0'
movq mm0,mm4 ; duplicate v45
paddw mm4,mm5 ; v45+v36
psubw mm0,mm5 ; v45-v36
movq mmword ptr [esi+16*1],mm4 ; out1: v45+v36
movq mmword ptr [esi+16*7],mm0 ; out7: v45-v36
movq mm5,mm2 ; duplicate v47
paddw mm2,mm7 ; v47+v34
psubw mm5,mm7 ; v47-v34
movq mmword ptr [esi+16*5],mm2 ; out5: v47+v34
movq mmword ptr [esi+16*3],mm5 ; out3: v47-v34
sub esi,8 ; point back to the first 4 columns/rows.
push esi
push edi
jmp transpose
cont:
pop edi
pop esi
; first 4 rows
movq mm0, mmword ptr [esi] ; v0
;pinsrw mm0,[esi+16],1 ; (0,1)->1
;pinsrw mm0,[esi+16*2],2 ; (0,2)->2
;pinsrw mm0,[esi+16*3],3 ; (0,3)->3
movq mm1, mmword ptr [esi+16*7] ; v7
;pinsrw mm1,[esi+7*2],0 ; (7,0)->0
;pinsrw mm1,[esi+16+7*2],1 ; (7,1)->1
;pinsrw mm1,[esi+16*2+7*2],2 ; (7,1)->2
;pinsrw mm1,[esi+16*3+7*2],3 ; (7,1)->3
movq mm2,mm0 ; duplicate v0
paddw mm0,mm1 ; v00: v0+v7
psubw mm2,mm1 ; v07: v0-v7
movq mm1, mmword ptr [esi+16] ; v1
;pinsrw mm1,[esi+1*2],0 ; (7,0)->0
;pinsrw mm1,[esi+16+1*2],1 ; (7,1)->1
;pinsrw mm1,[esi+16*2+1*2],2 ; (7,1)->2
;pinsrw mm1,[esi+16*3+1*2],3 ; (7,1)->3
movq mm3, mmword ptr [esi+16*6] ; v6
;pinsrw mm3,[esi+6*2],0 ; (7,0)->0
;pinsrw mm3,[esi+16+6*2],1 ; (7,1)->1
;pinsrw mm3,[esi+16*2+6*2],2 ; (7,1)->2
;pinsrw mm3,[esi+16*3+6*2],3 ; (7,1)->3
movq mm4,mm1 ; duplicate v1
paddw mm1,mm3 ; v01: v1+v6
psubw mm4,mm3 ; v06: v1-v6
movq mm3, mmword ptr [esi+16*2] ; v2
;pinsrw mm3,[esi+2*2],0 ; (7,0)->0
;pinsrw mm3,[esi+16+2*2],1 ; (7,1)->1
;pinsrw mm3,[esi+16*2+2*2],2 ; (7,1)->2
;pinsrw mm3,[esi+16*3+2*2],3 ; (7,1)->3
movq mm5, mmword ptr [esi+16*5] ; v5
;pinsrw mm5,[esi+5*2],0 ; (7,0)->0
;pinsrw mm5,[esi+16+5*2],1 ; (7,1)->1
;pinsrw mm5,[esi+16*2+5*2],2 ; (7,1)->2
;pinsrw mm5,[esi+16*3+5*2],3 ; (7,1)->3
movq mm6,mm3 ; duplicate v2
paddw mm3,mm5 ; v02: v2+v5
psubw mm6,mm5 ; v05: v2-v5
movq mm5, mmword ptr [esi+16*3] ; v3
;pinsrw mm5,[esi+3*2],0 ; (7,0)->0
;pinsrw mm5,[esi+16+3*2],1 ; (7,1)->1
;pinsrw mm5,[esi+16*2+3*2],2 ; (7,1)->2
;pinsrw mm5,[esi+16*3+3*2],3 ; (7,1)->3
movq mm7, mmword ptr [esi+16*4] ; v4
;pinsrw mm7,[esi+4*2],0 ; (7,0)->0
;pinsrw mm7,[esi+16+4*2],1 ; (7,1)->1
;pinsrw mm7,[esi+16*2+4*2],2 ; (7,1)->2
;pinsrw mm7,[esi+16*3+4*2],3 ; (7,1)->3
movq mmword ptr scratch1,mm7 ; scratch1: v4 ;
movq mm7,mm5 ; duplicate v3
paddw mm5,scratch1 ; v03: v3+v4
psubw mm7,scratch1 ; v04: v3-v4
movq mmword ptr scratch2,mm5 ; scratch2: v03
movq mm5,mm0 ; mm5: v00
paddw mm0,scratch2 ; v10: v00+v03
psubw mm5,scratch2 ; v13: v00-v03
movq mmword ptr scratch3,mm3 ; scratc3: v02
movq mm3,mm1 ; duplicate v01
paddw mm1,scratch3 ; v11: v01+v02
psubw mm3,scratch3 ; v12: v01-v02
movq mmword ptr scratch4,mm6 ; scratc4: v05
movq mm6,mm0 ; duplicate v10
paddw mm0,mm1 ; v10+v11
psubw mm6,mm1 ; v10-v11
movq mmword ptr [edi],mm0 ; out0: v10+v11
movq mmword ptr [edi+16*4],mm6 ; out4: v10-v11
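; --- web viewer keyboard shortcuts (page chrome, not part of the source) ---
; Copy code: Ctrl + C
; Search code: Ctrl + F
; Full-screen mode: F11
; Increase font size: Ctrl + =
; Decrease font size: Ctrl + -
; Show shortcuts: ?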