dctaan_mmx.asm
来自「Intel AN&N FAST dct MMX and X」· 汇编 代码 · 共 757 行 · 第 1/2 页
ASM
757 行
movq mm0,mm4 ; mm0: v06
paddw mm4,scratch4 ; v15: v05+v06
paddw mm0,mm2 ; v16: v07+v06
pmulhw mm4,mmword ptr WA3 ; v35': WA3*v15
psllw mm4,16-NSHIFT ; v35: compensate the coeefient scale
movq mm6,mm4 ; duplicate v35
paddw mm4,mm2 ; v45: v07+v35
psubw mm2,mm6 ; v47: v07-v35
paddw mm3,mm5 ; v22: v12+v13
pmulhw mm3,mmword ptr WA1 ; v32': WA3*v15
psllw mm3,16-NSHIFT ; v32: compensate the coeefient scale
movq mm6,mm5 ; duplicate v13
paddw mm5,mm3 ; v13+v32
psubw mm6,mm3 ; v13-v32
movq mmword ptr [edi+16*2],mm5 ; out2: v13+v32
movq mmword ptr [edi+16*6],mm6 ; out6: v13-v32
paddw mm7,scratch4 ; v14n: v04+v05
movq mm5,mm0 ; duplicate v16
psubw mm0,mm7 ; va1: v16-v14n
pmulhw mm5,mmword ptr WA4 ; v36'': v16*WA4
pmulhw mm7,mmword ptr WA2 ; v34'': v14n*WA2
pmulhw mm0,mmword ptr WA5 ; va0': va1*WA5
psllw mm5,16-WA4_SHIFT ; v36: compensate the coeefient scale. Note that WA$ is scaled one bit less
psllw mm7,16-NSHIFT ; v34: compensate the coeefient scale
;psllw mm0,16-WA5_SHIFT
psubw mm5,mm0 ; v36': v36''-va0'
psubw mm7,mm0 ; v34': v34''-va0'
movq mm0,mm4 ; duplicate v45
paddw mm4,mm5 ; v45+v36
psubw mm0,mm5 ; v45-v36
movq mmword ptr [edi+16*1],mm4 ; out1: v45+v36
movq mmword ptr [edi+16*7],mm0 ; out7: v45-v36
movq mm5,mm2 ; duplicate v47
paddw mm2,mm7 ; v47+v34
psubw mm5,mm7 ; v47-v34
movq mmword ptr [edi+16*5],mm2 ; out5: v47+v34
movq mmword ptr [edi+16*3],mm5 ; out3: v47-v34
; ---------------------------------------------------------------------------
; Second pass of the butterfly network.
;
; Source and destination pointers are each advanced by 8 bytes (four 16-bit
; lanes) and the same instruction sequence as the pass above is repeated.
; Every movq moves four 16-bit lanes at once, so paddw/psubw/pmulhw below
; process four independent 8-point transforms in parallel.
;
; Inputs  : v0..v7 are read from [esi + 16*k], k = 0..7 (16-byte row stride).
; Outputs : out0..out7 are written to [edi + 16*k].
; Scratch : scratch1..scratch4 are memory spill slots (declared outside this
;           excerpt); all eight MMX registers are live during the butterflies.
; Constants WA1..WA5, NSHIFT and WA4_SHIFT are defined outside this excerpt.
;
; The commented-out pinsrw groups are a disabled alternative load that would
; gather one word from each of four successive rows into a register. Their
; trailing "(r,c)->lane" annotations were copy-pasted and are not all accurate.
; ---------------------------------------------------------------------------
add esi,8 ; source: step 8 bytes = four 16-bit lanes (the leftover "16*4" in the original note suggests an earlier variant stepped four rows - TODO confirm)
add edi,8 ; destination: step four 16-bit lanes as well (original note: "second 4 columns (transposed)")
; --- stage 0: load the eight inputs pairwise and form sums / differences ---
movq mm0, mmword ptr [esi] ; v0
;pinsrw mm0,[esi+16],1 ; (0,1)->1
;pinsrw mm0,[esi+16*2],2 ; (0,2)->2
;pinsrw mm0,[esi+16*3],3 ; (0,3)->3
movq mm1, mmword ptr [esi+16*7] ; v7
;pinsrw mm1,[esi+7*2],0 ; (7,0)->0
;pinsrw mm1,[esi+16+7*2],1 ; (7,1)->1
;pinsrw mm1,[esi+16*2+7*2],2 ; (7,1)->2
;pinsrw mm1,[esi+16*3+7*2],3 ; (7,1)->3
movq mm2,mm0 ; duplicate v0
paddw mm0,mm1 ; v00 = v0+v7
psubw mm2,mm1 ; v07 = v0-v7
movq mm1, mmword ptr [esi+16] ; v1
;pinsrw mm1,[esi+1*2],0 ; (7,0)->0
;pinsrw mm1,[esi+16+1*2],1 ; (7,1)->1
;pinsrw mm1,[esi+16*2+1*2],2 ; (7,1)->2
;pinsrw mm1,[esi+16*3+1*2],3 ; (7,1)->3
movq mm3, mmword ptr [esi+16*6] ; v6
;pinsrw mm3,[esi+6*2],0 ; (7,0)->0
;pinsrw mm3,[esi+16+6*2],1 ; (7,1)->1
;pinsrw mm3,[esi+16*2+6*2],2 ; (7,1)->2
;pinsrw mm3,[esi+16*3+6*2],3 ; (7,1)->3
movq mm4,mm1 ; duplicate v1
paddw mm1,mm3 ; v01 = v1+v6
psubw mm4,mm3 ; v06 = v1-v6
movq mm3, mmword ptr [esi+16*2] ; v2
;pinsrw mm3,[esi+2*2],0 ; (7,0)->0
;pinsrw mm3,[esi+16+2*2],1 ; (7,1)->1
;pinsrw mm3,[esi+16*2+2*2],2 ; (7,1)->2
;pinsrw mm3,[esi+16*3+2*2],3 ; (7,1)->3
movq mm5, mmword ptr [esi+16*5] ; v5
;pinsrw mm5,[esi+5*2],0 ; (7,0)->0
;pinsrw mm5,[esi+16+5*2],1 ; (7,1)->1
;pinsrw mm5,[esi+16*2+5*2],2 ; (7,1)->2
;pinsrw mm5,[esi+16*3+5*2],3 ; (7,1)->3
movq mm6,mm3 ; duplicate v2
paddw mm3,mm5 ; v02 = v2+v5
psubw mm6,mm5 ; v05 = v2-v5
movq mm5, mmword ptr [esi+16*3] ; v3
;pinsrw mm5,[esi+3*2],0 ; (7,0)->0
;pinsrw mm5,[esi+16+3*2],1 ; (7,1)->1
;pinsrw mm5,[esi+16*2+3*2],2 ; (7,1)->2
;pinsrw mm5,[esi+16*3+3*2],3 ; (7,1)->3
movq mm7, mmword ptr [esi+16*4] ; v4
;pinsrw mm7,[esi+4*2],0 ; (7,0)->0
;pinsrw mm7,[esi+16+4*2],1 ; (7,1)->1
;pinsrw mm7,[esi+16*2+4*2],2 ; (7,1)->2
;pinsrw mm7,[esi+16*3+4*2],3 ; (7,1)->3
movq mmword ptr scratch1,mm7 ; scratch1 = v4 (spilled: mm0-mm7 are all live at this point)
movq mm7,mm5 ; duplicate v3
paddw mm5,scratch1 ; v03 = v3+v4
psubw mm7,scratch1 ; v04 = v3-v4
; --- even part: produces out0, out4 and (after the WA1 multiply) out2, out6 ---
movq mmword ptr scratch2,mm5 ; scratch2 = v03
movq mm5,mm0 ; mm5 = v00
paddw mm0,scratch2 ; v10 = v00+v03
psubw mm5,scratch2 ; v13 = v00-v03
movq mmword ptr scratch3,mm3 ; scratch3 = v02
movq mm3,mm1 ; duplicate v01
paddw mm1,scratch3 ; v11 = v01+v02
psubw mm3,scratch3 ; v12 = v01-v02
movq mmword ptr scratch4,mm6 ; scratch4 = v05 (read again below for v15 and v14n)
movq mm6,mm0 ; duplicate v10
paddw mm0,mm1 ; v10+v11
psubw mm6,mm1 ; v10-v11
movq mmword ptr [edi],mm0 ; out0 = v10+v11
movq mmword ptr [edi+16*4],mm6 ; out4 = v10-v11
; --- odd part, first half: v35 = WA3*(v05+v06) combined with v07 ---
movq mm0,mm4 ; mm0 = v06
paddw mm4,scratch4 ; v15 = v05+v06
paddw mm0,mm2 ; v16 = v07+v06
pmulhw mm4,mmword ptr WA3 ; v35' = high 16 bits of WA3*v15
psllw mm4,16-NSHIFT ; v35: shift left to compensate the coefficient scale (presumably WA3 is stored scaled by 2^NSHIFT - confirm at its definition)
movq mm6,mm4 ; duplicate v35
paddw mm4,mm2 ; v45 = v07+v35
psubw mm2,mm6 ; v47 = v07-v35
; --- even part, second half: v32 = WA1*(v12+v13) ---
paddw mm3,mm5 ; v22 = v12+v13
pmulhw mm3,mmword ptr WA1 ; v32' = high 16 bits of WA1*v22
psllw mm3,16-NSHIFT ; v32: compensate the coefficient scale
movq mm6,mm5 ; duplicate v13
paddw mm5,mm3 ; v13+v32
psubw mm6,mm3 ; v13-v32
movq mmword ptr [edi+16*2],mm5 ; out2 = v13+v32
movq mmword ptr [edi+16*6],mm6 ; out6 = v13-v32
; --- odd part, second half: three multiplies sharing the common term va0' ---
paddw mm7,scratch4 ; v14n = v04+v05
movq mm5,mm0 ; duplicate v16
psubw mm0,mm7 ; va1 = v16-v14n
pmulhw mm5,mmword ptr WA4 ; v36'' = high 16 bits of v16*WA4
pmulhw mm7,mmword ptr WA2 ; v34'' = high 16 bits of v14n*WA2
pmulhw mm0,mmword ptr WA5 ; va0' = high 16 bits of va1*WA5
psllw mm5,16-WA4_SHIFT ; v36: compensate the coefficient scale. Original author's note: WA4 is scaled one bit less, hence its own shift constant
psllw mm7,16-NSHIFT ; v34: compensate the coefficient scale
;psllw mm0,16-WA5_SHIFT ; disabled: va0' is used below without a rescaling shift
psubw mm5,mm0 ; v36' = v36''-va0'
psubw mm7,mm0 ; v34' = v34''-va0'
movq mm0,mm4 ; duplicate v45
paddw mm4,mm5 ; v45+v36
psubw mm0,mm5 ; v45-v36
movq mmword ptr [edi+16*1],mm4 ; out1 = v45+v36
movq mmword ptr [edi+16*7],mm0 ; out7 = v45-v36
movq mm5,mm2 ; duplicate v47
paddw mm2,mm7 ; v47+v34
psubw mm5,mm7 ; v47-v34
movq mmword ptr [edi+16*5],mm2 ; out5 = v47+v34
movq mmword ptr [edi+16*3],mm5 ; out3 = v47-v34
; --- epilogue: restore registers (the matching pushes are presumably in the
; prologue, outside this excerpt) and return without popping stack arguments.
; NOTE(review): no emms is visible between the MMX code above and this ret;
; confirm that the caller, or code outside this excerpt, clears the MMX state.
pop edi
pop esi
pop ebp
ret 0
; ---------------------------------------------------------------------------
; transpose: in-place transpose of a square matrix of 16-bit words, worked
; in 4x4 tiles with MMX unpack instructions.
;
; In:      esi = address of the matrix. x_size is hard-coded to 8 below, so
;                one row is 2*ebx = 16 bytes and the matrix is 8x8 words.
; Saved:   ebx, ecx, edx (pushed on entry, popped before leaving).
; Changed: eax, esi, edi, mm0-mm7, flags. esi is left at the last diagonal
;          tile and edi four rows below it; neither is restored here.
; Exit:    leaves through `jmp cont`, not `ret`. Presumably it is also
;          entered by a jump - the entry site is outside this excerpt.
;
; Method:  for each tile row, the tile on the diagonal is transposed onto
;          itself; then every tile to its right (addressed through esi) is
;          transposed and exchanged with its mirror tile below the diagonal
;          (addressed through edi). Loads of one tile are interleaved with
;          stores of the other so no word is overwritten before it is read.
; Lane notation in the comments: high word first, e.g. m03:m02|m01:m00.
; ---------------------------------------------------------------------------
transpose:
push ebx
push ecx
push edx
mov ebx, 8 ; ebx = x_size: words per row (row stride in bytes is 2*ebx)
mov ecx, ebx
mov edi, esi ; edi = esi = pointer to the matrix
sal ecx, 2 ; ecx = 4*ebx
mov eax, ebx
add ecx, ebx ; ecx = 5*ebx
sub eax, 4 ; eax = inner loop counter: words left to the right of the diagonal tile
add ecx, ebx ; ecx = 6*ebx: byte offset of the fourth row of a tile (3 rows * 2*ebx bytes)
mov edx, eax ; edx = outer loop counter (same quantity, kept for the next tile row)
do_4x4_block_where_x_equals_y:
; Tile on the diagonal: transpose onto itself. On entry edi == esi.
movq mm0, [esi] ; m03:m02|m01:m00 - first line
movq mm2, [esi+4*ebx] ; m23:m22|m21:m20 - third line
movq mm6, mm0 ; copy first line
punpcklwd mm0, [esi+2*ebx]
; m11:m01|m10:m00 - interleave first and second lines
movq mm7, mm2 ; copy third line
punpcklwd mm2, [esi+ecx]
; m31:m21|m30:m20 - interleave third and fourth lines
movq mm4, mm0 ; copy first intermediate result
movq mm1, [esi+2*ebx] ; m13:m12|m11:m10 - second line
punpckldq mm0, mm2
; m30:m20|m10:m00 - interleave to produce result 1
movq mm3, [esi+ecx] ; m33:m32|m31:m30 - fourth line
punpckhdq mm4, mm2
; m31:m21|m11:m01 - interleave to produce result 2
movq [esi], mm0 ; write result 1
punpckhwd mm6, mm1
; m13:m03|m12:m02 - interleave first and second lines
movq [esi+2*ebx], mm4 ; write result 2 (second line was already loaded into mm1)
punpckhwd mm7, mm3
; m33:m23|m32:m22 - interleave third and fourth lines
movq mm5, mm6 ; copy first intermediate result
punpckldq mm6, mm7
; m32:m22|m12:m02 - interleave to produce result 3
lea edi, [edi+8*ebx]
; edi = diagonal tile + 4 rows: the mirror tile paired with the tile at esi+8
punpckhdq mm5, mm7
; m33:m23|m13:m03 - interleave to produce result 4
movq [esi+4*ebx], mm6 ; write result 3
movq [esi+ecx], mm5 ; write result 4
cmp edx, 0
; edx == 0: nothing remains to the right of this diagonal tile
je all_done_ready_to_exit
; so this was the last tile and the transpose is complete
do_4x4_blocks_x_and_y_not_equal:
; Off-diagonal pair: "m" tile at [esi+8] (right of the current position) and
; "n" tile at [edi] (below the diagonal). Each is transposed and stored in
; the other's place; reads and writes are ordered so that the writes
; can be done without overwriting unused data
movq mm0, [esi+8] ; m03:m02|m01:m00 - first line
movq mm2, [esi+4*ebx+8] ; m23:m22|m21:m20 - third line
movq mm6, mm0 ; copy first line
punpcklwd mm0, [esi+2*ebx+8]
; m11:m01|m10:m00 - interleave first and second lines
movq mm7, mm2 ; copy third line
punpcklwd mm2, [esi+ecx+8]
; m31:m21|m30:m20 - interleave third and fourth lines
movq mm4, mm0 ; copy first intermediate result
; all references for second 4 x 4 block are referred by "n" instead of "m"
movq mm1, [edi] ; n03:n02|n01:n00 - first line
punpckldq mm0, mm2
; m30:m20|m10:m00 - interleave to produce result 1
movq mm3, [edi+4*ebx] ; n23:n22|n21:n20 - third line
punpckhdq mm4, mm2
; m31:m21|m11:m01 - interleave to produce result 2
punpckhwd mm6, [esi+2*ebx+8]
; m13:m03|m12:m02 - interleave first and second lines
movq mm2, mm1 ; copy first line of n
punpckhwd mm7, [esi+ecx+8]
; m33:m23|m32:m22 - interleave third and fourth lines
movq mm5, mm6 ; copy first intermediate result
movq [edi], mm0 ; write result 1 (first n line is already held in mm1/mm2)
punpckhdq mm5, mm7
; m33:m23|m13:m03 - produce result 4 (stored to [edi+ecx] below)
punpcklwd mm1, [edi+2*ebx]
; n11:n01|n10:n00 - interleave first and second lines
movq mm0, mm3 ; copy third line of n
punpckhwd mm2, [edi+2*ebx]
; n13:n03|n12:n02 - interleave first and second lines
movq [edi+2*ebx], mm4 ; write result 2 out (second n line has now been consumed)
punpckldq mm6, mm7
; m32:m22|m12:m02 - produce result 3 (stored to [edi+4*ebx] below)
punpcklwd mm3, [edi+ecx]
; n31:n21|n30:n20 - interleave third and fourth lines
movq mm4, mm1 ; copy first intermediate result
movq [edi+4*ebx], mm6 ; write result 3 out (third n line is held in mm0/mm3)
punpckldq mm1, mm3
; n30:n20|n10:n00 - produce first result
punpckhwd mm0, [edi+ecx]
; n33:n23|n32:n22 - interleave third and fourth lines
movq mm6, mm2 ; copy second intermediate result
movq [edi+ecx], mm5 ; write result 4 out (fourth n line has now been consumed)
punpckhdq mm4, mm3
; n31:n21|n11:n01 - produce second result
movq [esi+8], mm1
; write result 5 out - (first result for other 4 x 4 block)
punpckldq mm2, mm0
; n32:n22|n12:n02 - produce third result
movq [esi+2*ebx+8], mm4 ; write result 6 out
punpckhdq mm6, mm0
; n33:n23|n13:n03 - produce fourth result
movq [esi+4*ebx+8], mm2 ; write result 7 out
movq [esi+ecx+8], mm6 ; write result 8 out
add esi, 8
; advance esi to the next 4 x 4 block in the same tile row
lea edi, [edi+8*ebx]
; advance edi to the next 4 x 4 block below the current one
sub eax, 4 ; four more columns done; ZF set when the tile row is finished
jnz do_4x4_blocks_x_and_y_not_equal
; Tile row finished: move esi to the next tile on the diagonal.
; esi has advanced 2*edx bytes along the row during the inner loop.
sal edx, 1 ; edx = bytes advanced along this tile row
lea esi, [esi+8*ebx+8] ; esi: four rows down and one tile (8 bytes) to the right
sub esi, edx
; undo the bytes advanced along the row
; now we point to spot where row = col
sub edx, 8 ; next tile row has one tile (8 bytes) less to its right
sar edx, 1 ; back from bytes to words
mov edi, esi ; diagonal block expects edi == esi on entry
mov eax, edx
; reload the inner loop counter from the outer one for the new tile row
jmp do_4x4_block_where_x_equals_y
all_done_ready_to_exit:
pop edx
pop ecx
pop ebx
jmp cont ; resume at label cont (defined outside this excerpt); eax, esi, edi stay modified
_dct8x8aan_mmx ENDP
_TEXT ENDS
END
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?