dctaan_mmx.asm

来自「Intel AN&N FAST dct MMX and X」· 汇编 代码 · 共 757 行 · 第 1/2 页

ASM
757
字号
movq  mm0,mm4                ; mm0: v06
paddw mm4,scratch4           ; v15: v05+v06 
paddw  mm0,mm2 		  		; v16: v07+v06


pmulhw mm4,mmword ptr WA3    ; v35': WA3*v15
psllw  mm4,16-NSHIFT         ; v35: compensate the coeefient scale

movq   mm6,mm4               ; duplicate v35
paddw  mm4,mm2               ; v45: v07+v35
psubw  mm2,mm6               ; v47: v07-v35
 
paddw  mm3,mm5               ; v22: v12+v13

pmulhw mm3,mmword ptr WA1    ; v32': WA3*v15
psllw  mm3,16-NSHIFT         ; v32: compensate the coeefient scale
movq   mm6,mm5               ; duplicate v13

paddw  mm5,mm3               ; v13+v32
psubw  mm6,mm3               ; v13-v32

movq  mmword ptr [edi+16*2],mm5   ; out2: v13+v32 
movq  mmword ptr [edi+16*6],mm6   ; out6: v13-v32 

paddw  mm7,scratch4			; v14n: v04+v05
movq   mm5,mm0              ; duplicate v16

psubw  mm0,mm7				; va1: v16-v14n
pmulhw mm5,mmword ptr WA4 		; v36'': v16*WA4
pmulhw mm7,mmword ptr WA2		; v34'': v14n*WA2
pmulhw mm0,mmword ptr WA5		; va0':  va1*WA5

psllw  mm5,16-WA4_SHIFT      ; v36: compensate the coeefient scale. Note that WA$ is scaled one bit less
psllw  mm7,16-NSHIFT         ; v34: compensate the coeefient scale
;psllw  mm0,16-WA5_SHIFT

psubw  mm5,mm0      ; v36': v36''-va0'
psubw  mm7,mm0      ; v34': v34''-va0'

movq   mm0,mm4       ; duplicate v45
paddw  mm4,mm5       ; v45+v36
psubw  mm0,mm5       ; v45-v36

movq  mmword ptr [edi+16*1],mm4   ; out1: v45+v36 
movq  mmword ptr [edi+16*7],mm0   ; out7: v45-v36 

movq   mm5,mm2       ; duplicate v47
paddw  mm2,mm7       ; v47+v34
psubw  mm5,mm7       ; v47-v34

movq  mmword ptr [edi+16*5],mm2   ; out5: v47+v34 
movq  mmword ptr [edi+16*3],mm5   ; out3: v47-v34 


; ---------------------------------------------------------------------------
; Second pass: run the same butterfly network on the next four 16-bit lanes.
;   Reads  v0..v7   as quadwords at [esi+16*k], k = 0..7, after esi += 8.
;   Writes out0..7  as quadwords at [edi+16*k], k = 0..7, after edi += 8.
; Every paddw/psubw/pmulhw/psllw operates on four words at once, so the four
; lanes held in one quadword are processed in parallel.
; scratch1..scratch4 are used as spill slots; WA1..WA5, NSHIFT and WA4_SHIFT
; are defined outside this excerpt.
; The section ends with the procedure epilogue (pop edi/esi/ebp, ret 0).
; NOTE(review): the original comments call the source "the second 4 rows" and
; the destination "the second 4 columns (transposed)"; in memory both pointers
; simply move 8 bytes along each 16-byte line - confirm the intended
; orientation against the part of the procedure that is not shown here.
; ---------------------------------------------------------------------------
add  esi,8 ;16*4  ; source: advance 8 bytes (four words); a 16*4 stride is left commented out
add  edi,8		  ; destination: advance 8 bytes (four words)

movq mm0, mmword ptr [esi]	   ; v0
; NOTE(review): the commented-out pinsrw lines in this section are a disabled
; word-by-word load. Their "(x,y)->lane" labels were copy-pasted from the v7
; case and do not all match the addresses they sit next to.
;pinsrw mm0,[esi+16],1		   ; (0,1)->1
;pinsrw mm0,[esi+16*2],2		   ; (0,2)->2
;pinsrw mm0,[esi+16*3],3		   ; (0,3)->3
movq mm1, mmword ptr [esi+16*7]	   ; v7
;pinsrw mm1,[esi+7*2],0		   ; (7,0)->0
;pinsrw mm1,[esi+16+7*2],1		   ; (7,1)->1
;pinsrw mm1,[esi+16*2+7*2],2		   ; (7,1)->2
;pinsrw mm1,[esi+16*3+7*2],3		   ; (7,1)->3
movq  mm2,mm0                  ; duplicate v0 
paddw mm0,mm1                  ; v00: v0+v7  
psubw mm2,mm1                  ; v07: v0-v7  

movq mm1, mmword ptr [esi+16]	   ; v1
;pinsrw mm1,[esi+1*2],0		   ; (7,0)->0
;pinsrw mm1,[esi+16+1*2],1		   ; (7,1)->1
;pinsrw mm1,[esi+16*2+1*2],2		   ; (7,1)->2
;pinsrw mm1,[esi+16*3+1*2],3		   ; (7,1)->3
movq mm3, mmword ptr [esi+16*6]	   ; v6
;pinsrw mm3,[esi+6*2],0		   ; (7,0)->0
;pinsrw mm3,[esi+16+6*2],1		   ; (7,1)->1
;pinsrw mm3,[esi+16*2+6*2],2		   ; (7,1)->2
;pinsrw mm3,[esi+16*3+6*2],3		   ; (7,1)->3
movq  mm4,mm1                  ; duplicate v1 
paddw mm1,mm3                  ; v01: v1+v6  
psubw mm4,mm3                  ; v06: v1-v6  

movq mm3, mmword ptr [esi+16*2] ; v2
;pinsrw mm3,[esi+2*2],0		   ; (7,0)->0
;pinsrw mm3,[esi+16+2*2],1		   ; (7,1)->1
;pinsrw mm3,[esi+16*2+2*2],2		   ; (7,1)->2
;pinsrw mm3,[esi+16*3+2*2],3		   ; (7,1)->3
movq mm5, mmword ptr [esi+16*5]	   ; v5
;pinsrw mm5,[esi+5*2],0		   ; (7,0)->0
;pinsrw mm5,[esi+16+5*2],1		   ; (7,1)->1
;pinsrw mm5,[esi+16*2+5*2],2		   ; (7,1)->2
;pinsrw mm5,[esi+16*3+5*2],3		   ; (7,1)->3
movq  mm6,mm3                  ; duplicate v2 
paddw mm3,mm5                  ; v02: v2+v5  
psubw mm6,mm5                  ; v05: v2-v5  

movq mm5, mmword ptr [esi+16*3]	   ; v3
;pinsrw mm5,[esi+3*2],0		   ; (7,0)->0
;pinsrw mm5,[esi+16+3*2],1		   ; (7,1)->1
;pinsrw mm5,[esi+16*2+3*2],2		   ; (7,1)->2
;pinsrw mm5,[esi+16*3+3*2],3		   ; (7,1)->3
movq mm7, mmword ptr [esi+16*4]	   ; v4
;pinsrw mm7,[esi+4*2],0		   ; (7,0)->0
;pinsrw mm7,[esi+16+4*2],1		   ; (7,1)->1
;pinsrw mm7,[esi+16*2+4*2],2		   ; (7,1)->2
;pinsrw mm7,[esi+16*3+4*2],3		   ; (7,1)->3
movq  mmword ptr scratch1,mm7		; scratch1: v4 (spilled; all eight MMX registers are live)
movq  mm7,mm5                  ; duplicate v3 
paddw mm5,scratch1             ; v03: v3+v4  
psubw mm7,scratch1             ; v04: v3-v4  
movq  mmword ptr scratch2,mm5 ; scratch2: v03
movq  mm5,mm0                  ; mm5: v00

paddw mm0,scratch2             ; v10: v00+v03   
psubw mm5,scratch2             ; v13: v00-v03   
movq  mmword ptr scratch3,mm3  ; scratch3: v02
movq  mm3,mm1                  ; duplicate v01

paddw mm1,scratch3             ; v11: v01+v02
psubw mm3,scratch3             ; v12: v01-v02
 
movq  mmword ptr scratch4,mm6  ; scratch4: v05
movq  mm6,mm0                  ; duplicate v10

paddw mm0,mm1          ; v10+v11
psubw mm6,mm1          ; v10-v11
 
movq  mmword ptr [edi],mm0   ; out0: v10+v11 
movq  mmword ptr [edi+16*4],mm6   ; out4: v10-v11 

movq  mm0,mm4                ; mm0: v06
paddw mm4,scratch4           ; v15: v05+v06 
paddw  mm0,mm2 		  		; v16: v07+v06


pmulhw mm4,mmword ptr WA3    ; v35': WA3*v15 (pmulhw keeps the high 16 bits of each signed product)
psllw  mm4,16-NSHIFT         ; v35: compensate the coefficient scale

movq   mm6,mm4               ; duplicate v35
paddw  mm4,mm2               ; v45: v07+v35
psubw  mm2,mm6               ; v47: v07-v35
 
paddw  mm3,mm5               ; v22: v12+v13

pmulhw mm3,mmword ptr WA1    ; v32': WA1*v22
psllw  mm3,16-NSHIFT         ; v32: compensate the coefficient scale
movq   mm6,mm5               ; duplicate v13

paddw  mm5,mm3               ; v13+v32
psubw  mm6,mm3               ; v13-v32

movq  mmword ptr [edi+16*2],mm5   ; out2: v13+v32 
movq  mmword ptr [edi+16*6],mm6   ; out6: v13-v32 

paddw  mm7,scratch4			; v14n: v04+v05
movq   mm5,mm0              ; duplicate v16

psubw  mm0,mm7				; va1: v16-v14n
pmulhw mm5,mmword ptr WA4 		; v36'': v16*WA4
pmulhw mm7,mmword ptr WA2		; v34'': v14n*WA2
pmulhw mm0,mmword ptr WA5		; va0':  va1*WA5

psllw  mm5,16-WA4_SHIFT    ; v36: compensate the coefficient scale. WA4 has its own shift count (original note: it is scaled one bit less)
psllw  mm7,16-NSHIFT         ; v34: compensate the coefficient scale
; The WA5 product (va0') gets no scale compensation: the shift below is disabled.
;psllw  mm0,16-WA5_SHIFT

psubw  mm5,mm0      ; v36 := v36 - va0'
psubw  mm7,mm0      ; v34 := v34 - va0'

movq   mm0,mm4       ; duplicate v45
paddw  mm4,mm5       ; v45+v36
psubw  mm0,mm5       ; v45-v36

movq  mmword ptr [edi+16*1],mm4   ; out1: v45+v36 
movq  mmword ptr [edi+16*7],mm0   ; out7: v45-v36 

movq   mm5,mm2       ; duplicate v47
paddw  mm2,mm7       ; v47+v34
psubw  mm5,mm7       ; v47-v34

movq  mmword ptr [edi+16*5],mm2   ; out5: v47+v34 
movq  mmword ptr [edi+16*3],mm5   ; out3: v47-v34 

; Epilogue: restore the registers saved on entry (the matching pushes are
; presumably in the prologue, which is outside this excerpt) and return.
pop edi
pop	esi
pop ebp

ret	0

; ---------------------------------------------------------------------------
; transpose - in-place transpose of a square matrix of 16-bit words, done one
; 4x4 sub-block at a time with MMX unpack instructions.
;   In:   esi = address of the matrix.
;   Size: ebx is hard-coded to 8 (x_size), so each line is 2*ebx = 16 bytes.
;   Exit: jumps to "cont" (a label outside this excerpt); it does not ret.
;   Saves and restores ebx, ecx, edx.
;   Modifies eax, esi, edi, mm0-mm7 and the flags without restoring them.
; Diagonal 4x4 blocks are transposed onto themselves; each off-diagonal block
; is transposed and swapped with its mirror block across the diagonal.
; The loads and stores are interleaved so that every line is read before it is
; overwritten - do not reorder instructions without re-checking that.
; ---------------------------------------------------------------------------
transpose:

        
		push ebx
		push ecx
		push edx

        mov             ebx, 8     ; ebx is x_size (words per line)
        mov             ecx, ebx
        mov             edi, esi        ; pointer to the matrix

        sal             ecx, 2
        mov             eax, ebx
        add             ecx, ebx
        sub             eax, 4          ; eax is the inner loop variable (words left in this band)

        add             ecx, ebx        ; ecx = 6*x_size = byte offset of the fourth line (3 lines * 2*x_size bytes)
        mov             edx, eax        ; edx is the outer loop variable
        
do_4x4_block_where_x_equals_y:
; Transpose the 4x4 block on the diagonal (at esi) onto itself.

        movq    mm0, [esi]              ; m03:m02|m01:m00 - first line

        movq    mm2, [esi+4*ebx]        ; m23:m22|m21:m20 - third line
        movq    mm6, mm0                ; copy first line

        punpcklwd mm0, [esi+2*ebx]      
        ; m11:m01|m10:m00 - interleave first and second lines
        movq    mm7, mm2                ; copy third line

        punpcklwd mm2, [esi+ecx]        
        ; m31:m21|m30:m20 - interleave third and fourth lines
        movq    mm4, mm0                ; copy first intermediate result

        movq    mm1, [esi+2*ebx]        ; m13:m12|m11:m10 - second line
        punpckldq mm0, mm2              
        ; m30:m20|m10:m00 - interleave to produce result 1

        movq    mm3, [esi+ecx]          ; m33:m32|m31:m30 - fourth line
        punpckhdq mm4, mm2              
        ; m31:m21|m11:m01 - interleave to produce result 2

        movq    [esi], mm0              ; write result 1
        punpckhwd mm6, mm1              
        ; m13:m03|m12:m02 - interleave first and second lines

        movq    [esi+2*ebx], mm4        ; write result 2
        punpckhwd mm7, mm3              
        ; m33:m23|m32:m22 - interleave third and fourth lines

        movq    mm5, mm6                ; copy first intermediate result
        punpckldq mm6, mm7              
        ; m32:m22|m12:m02 - interleave to produce result 3

        lea     edi, [edi+8*ebx]                
        ; advance edi by 8*ebx bytes = four lines, to the 4x4 block below
        punpckhdq mm5, mm7              
        ; m33:m23|m13:m03 - interleave to produce result 4

        movq    [esi+4*ebx], mm6        ; write result 3

        movq    [esi+ecx], mm5          ; write result 4

        cmp     edx, 0                  
        ; check to see if the outer loop count has reached zero
        je      all_done_ready_to_exit  
        ; the last diagonal block has no off-diagonal partners: finished

do_4x4_blocks_x_and_y_not_equal:

; transpose the two mirror image 4x4 sets so that the writes 
; can be done without overwriting unused data 
; "m" block: at [esi+8], right of the current position; its transpose is
;            written to the "n" location at [edi].
; "n" block: at [edi], below the diagonal; its transpose is written to the
;            "m" location at [esi+8].

        movq    mm0, [esi+8]            ; m03:m02|m01:m00 - first line

        movq    mm2, [esi+4*ebx+8]      ; m23:m22|m21:m20 - third line
        movq    mm6, mm0                ; copy first line

        punpcklwd mm0, [esi+2*ebx+8]    
        ; m11:m01|m10:m00 - interleave first and second lines
        movq    mm7, mm2                ; copy third line

        punpcklwd mm2, [esi+ecx+8]      
        ; m31:m21|m30:m20 - interleave third and fourth lines
        movq    mm4, mm0                ; copy first intermediate result
; all references for second 4 x 4 block are referred by "n" instead of "m"
        movq    mm1, [edi]              ; n03:n02|n01:n00 - first line 
        punpckldq mm0, mm2              
        ; m30:m20|m10:m00 - interleave to produce first result

        movq    mm3, [edi+4*ebx]        ; n23:n22|n21:n20 - third line
        punpckhdq mm4, mm2              
        ; m31:m21|m11:m01 - interleave to produce second result

        punpckhwd mm6, [esi+2*ebx+8]    
        ; m13:m03|m12:m02 - interleave first and second lines
        movq    mm2, mm1                ; copy first line

        punpckhwd mm7, [esi+ecx+8]      
        ; m33:m23|m32:m22 - interleave third and fourth lines
        movq    mm5, mm6                ; copy first intermediate result

        movq    [edi], mm0              ; write result 1 (n first line is already held in mm1/mm2)
        punpckhdq mm5, mm7              
        ; m33:m23|m13:m03 - produce fourth result

        punpcklwd mm1, [edi+2*ebx]      
        ; n11:n01|n10:n00 - interleave first and second lines
        movq    mm0, mm3                ; copy third line

        punpckhwd mm2, [edi+2*ebx]      
        ; n13:n03|n12:n02 - interleave first and second lines

        movq    [edi+2*ebx], mm4        ; write result 2 out (n second line has been consumed above)
        punpckldq mm6, mm7              
        ; m32:m22|m12:m02 - produce third result

        punpcklwd mm3, [edi+ecx]        
        ; n31:n21|n30:n20 - interleave third and fourth lines
        movq    mm4, mm1                ; copy first intermediate result

        movq    [edi+4*ebx], mm6        ; write result 3 out
        punpckldq mm1, mm3              
        ; n30:n20|n10:n00 - produce first result

        punpckhwd mm0, [edi+ecx]        
        ; n33:n23|n32:n22 - interleave third and fourth lines
        movq    mm6, mm2                ; copy second intermediate result

        movq    [edi+ecx], mm5          ; write result 4 out
        punpckhdq mm4, mm3              
        ; n31:n21|n11:n01- produce second result

        movq    [esi+8], mm1            
        ; write result 5 out - (first result for other 4 x 4 block)
        punpckldq mm2, mm0              
        ; n32:n22|n12:n02- produce third result

        movq    [esi+2*ebx+8], mm4      ; write result 6 out
        punpckhdq mm6, mm0              
        ; n33:n23|n13:n03 - produce fourth result

        movq    [esi+4*ebx+8], mm2      ; write result 7 out

        movq    [esi+ecx+8], mm6        ; write result 8 out
        
        add     esi, 8                  
        ; increment esi to point to next 4 x 4 block in same row
        lea     edi, [edi+8*ebx]                
        ; increment edi to point to next 4 x 4 block below current one

        sub     eax, 4                  ; decrement inner loop variable
        jnz     do_4x4_blocks_x_and_y_not_equal
        ; inner counter reached zero: this band of 4x4 blocks is finished
        
        sal     edx, 1                  ; outer count: words -> bytes
        lea     esi, [esi+8*ebx+8]      ; esi: four lines down and four words to the right

        sub     esi, edx                
        ; step back by the bytes walked along this band, so that
        ; now we point to spot where row = col 
        sub     edx, 8                  ; 8 bytes = 4 words fewer left to process

        sar     edx, 1                  ; outer count: bytes -> words
        mov     edi, esi                ; edi restarts at the new diagonal block

        mov     eax, edx                        
        ; reload the inner loop variable from the outer one to start a new band
        jmp     do_4x4_block_where_x_equals_y

all_done_ready_to_exit:
		pop edx
		pop ecx
		pop ebx


		jmp cont                ; resume at "cont" (defined outside this excerpt)


_dct8x8aan_mmx ENDP
_TEXT ENDS

END


⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?