; fdct_sse2_skal.asm
movdqa xmm3, [sqrt2]
movdqa xmm7, xmm4
movdqa xmm6, xmm2
psubsw xmm4, xmm1 ; tp17-tp35 = t1
psubsw xmm2, xmm0 ; tm17-tm35 = b3
paddsw xmm1, xmm7 ; tp17+tp35 = b0
paddsw xmm0, xmm6 ; tm17+tm35 = t2
; xmm1 = b0, xmm2 = b3. preserved
movdqa xmm6, xmm4
psubsw xmm4, xmm0 ; t1-t2
paddsw xmm0, xmm6 ; t1+t2
pmulhw xmm4, xmm3 ; (t1-t2)/(2.sqrt2)
pmulhw xmm0, xmm3 ; (t1+t2)/(2.sqrt2)
paddsw xmm0, xmm0 ; 2.(t1+t2) = b1
paddsw xmm4, xmm4 ; 2.(t1-t2) = b2
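; Net effect of the block above: b1 = (t1+t2)*cos(pi/4) and b2 = (t1-t2)*cos(pi/4).
; pmulhw keeps only the high 16 bits of the product, i.e. roughly x/(2*sqrt(2))
; assuming [sqrt2] holds cos(pi/4) in 1.15 fixed point (the constant table is
; defined elsewhere in this file); the paddsw doubling restores the factor of 2
; lost to the high multiply.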
movdqa xmm7, [tan2] ; t2
movdqa xmm3, [%1+2*16] ; x2
movdqa xmm6, [%1+6*16] ; x6
movdqa xmm5, xmm7 ; t2
pmulhw xmm7, xmm6 ; x6*t2
pmulhw xmm5, xmm3 ; x2*t2
paddsw xmm7, xmm3 ; x2+x6*t2 = tp26
psubsw xmm5, xmm6 ; x2*t2-x6 = tm26
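; tp26/tm26 is the pi/8 rotation of the even part applied to x2/x6. Assuming
; [tan2] stores tan(pi/8) in 0.16 fixed point, pmulhw yields x*tan(pi/8); the
; missing cos(pi/8) factor is presumably absorbed into the row-pass tables.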
; use: xmm3,xmm5,xmm6,xmm7 frozen: xmm0,xmm4,xmm1,xmm2
movdqa xmm3, [%1+0*16] ; x0
movdqa xmm6, [%1+4*16] ; x4
psubsw xmm3, xmm6 ; x0-x4 = tm04
paddsw xmm6, xmm6 ; 2.x4
paddsw xmm6, xmm3 ; x0+x4 = tp04
psubsw xmm3, xmm5 ; tm04-tm26 = a2
psubsw xmm6, xmm7 ; tp04-tp26 = a3
paddsw xmm5, xmm5 ; 2.tm26
paddsw xmm7, xmm7 ; 2.tp26
paddsw xmm5, xmm3 ; tm04+tm26 = a1
paddsw xmm7, xmm6 ; tp04+tp26 = a0
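; Even half done: a0..a3 combine tp04/tm04 with tp26/tm26. The recurring
; "subtract, double the subtrahend, add the difference back" pattern recovers
; the sum after the difference has overwritten one operand, so the whole pass
; fits in the eight XMM registers.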
psubsw xmm5, xmm0 ; a1-b1
psubsw xmm3, xmm4 ; a2-b2
paddsw xmm0, xmm0 ; 2.b1
paddsw xmm4, xmm4 ; 2.b2
paddsw xmm0, xmm5 ; a1+b1
paddsw xmm4, xmm3 ; a2+b2
psraw xmm5, 6 ; out6
psraw xmm3, 6 ; out5
psraw xmm0, 6 ; out1
psraw xmm4, 6 ; out2
movdqa [%1+6*16], xmm5
movdqa [%1+5*16], xmm3
movdqa [%1+1*16], xmm0
movdqa [%1+2*16], xmm4
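; out1/out2/out5/out6 are a1+/-b1 and a2+/-b2, shifted right by 6 to drop the
; fixed-point scaling accumulated over both passes, and stored back in place.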
; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3
movdqa xmm0, xmm7
movdqa xmm4, xmm6
psubsw xmm7, xmm1 ; a0-b0
psubsw xmm6, xmm2 ; a3-b3
paddsw xmm1, xmm0 ; a0+b0
paddsw xmm2, xmm4 ; a3+b3
psraw xmm1, 6 ; out0
psraw xmm7, 6 ; out7
psraw xmm2, 6 ; out3
psraw xmm6, 6 ; out4
movdqa [%1+0*16], xmm1
movdqa [%1+3*16], xmm2
movdqa [%1+4*16], xmm6
movdqa [%1+7*16], xmm7
%endmacro
;-----------------------------------------------------------------------------
; Function idct (the straightforward version)
;-----------------------------------------------------------------------------
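; cdecl entry point: [esp+4] is the pointer to the 8x8 block of 16-bit
; coefficients, transformed in place. Each iMTX_MULT call (macro defined
; earlier in this file) applies the 1-D IDCT to one row as a matrix multiply
; with its own rounding constant; iLLM_PASS then runs the column butterflies.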
ALIGN 16
idct_sse2_skal:
mov ecx, [esp+4]
iMTX_MULT 0, iTab1, Idct_Rnd0, 11
iMTX_MULT 1, iTab2, Idct_Rnd1, 11
iMTX_MULT 2, iTab3, Idct_Rnd2, 11
iMTX_MULT 3, iTab4, Idct_Rnd3, 11
iMTX_MULT 4, iTab1, Idct_Rnd4, 11
iMTX_MULT 5, iTab4, Idct_Rnd5, 11
iMTX_MULT 6, iTab3, Idct_Rnd6, 11
iMTX_MULT 7, iTab2, Idct_Rnd7, 11
iLLM_PASS ecx+0
ret
;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;-----------------------------------------------------------------------------
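; ORs the row's 16 bytes (8 words) together and branches to %2 when they are
; all zero, so the caller can skip the row transform entirely.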
%macro TEST_ROW 2 ; %1: src row, %2: label to jump to when the row is null
mov eax, [%1 ]
mov edx, [%1+ 8]
or eax, [%1+ 4]
or edx, [%1+12]
or eax, edx
jz near %2
%endmacro
;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;-----------------------------------------------------------------------------
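; Same as above, but each all-zero row bypasses its iMTX_MULT. For rows 0-2 the
; row is overwritten with the Idct_Sparse_Rnd* constant (what the row transform
; would have produced from a null input); rows 3-7 are simply left untouched,
; presumably because their rounders contribute nothing for a null row.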
ALIGN 16
idct_sse2_sparse_skal:
mov ecx, [esp+ 4] ; Src
TEST_ROW ecx, .Row0_Round
iMTX_MULT 0, iTab1, Idct_Rnd0, 11
jmp .Row1
.Row0_Round
movq mm0, [Idct_Sparse_Rnd0]
movq [ecx ], mm0
movq [ecx+8], mm0
.Row1
TEST_ROW ecx+16, .Row1_Round
iMTX_MULT 1, iTab2, Idct_Rnd1, 11
jmp .Row2
.Row1_Round
movq mm0, [Idct_Sparse_Rnd1]
movq [ecx+16 ], mm0
movq [ecx+16+8], mm0
.Row2
TEST_ROW ecx+32, .Row2_Round
iMTX_MULT 2, iTab3, Idct_Rnd2, 11
jmp .Row3
.Row2_Round
movq mm0, [Idct_Sparse_Rnd2]
movq [ecx+32 ], mm0
movq [ecx+32+8], mm0
.Row3
TEST_ROW ecx+48, .Row4
iMTX_MULT 3, iTab4, Idct_Rnd3, 11
.Row4
TEST_ROW ecx+64, .Row5
iMTX_MULT 4, iTab1, Idct_Rnd4, 11
.Row5
TEST_ROW ecx+80, .Row6
iMTX_MULT 5, iTab4, Idct_Rnd5, 11
.Row6
TEST_ROW ecx+96, .Row7
iMTX_MULT 6, iTab3, Idct_Rnd6, 11
.Row7
TEST_ROW ecx+112, .End
iMTX_MULT 7, iTab2, Idct_Rnd7, 11
.End
iLLM_PASS ecx+0
ret
;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;-----------------------------------------------------------------------------
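; Forward-DCT column pass, a butterfly factorisation in the style of
; Loeffler/Ligtenberg/Moshytz (presumably the source of the LLM name).
; %2 pre-shifts the intermediates left to gain precision; the row pass
; divides the extra scaling back out.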
%macro fLLM_PASS 2 ; %1: src/dst, %2:Shift
movdqa xmm0, [%1+0*16] ; In0
movdqa xmm2, [%1+2*16] ; In2
movdqa xmm3, xmm0
movdqa xmm4, xmm2
movdqa xmm7, [%1+7*16] ; In7
movdqa xmm5, [%1+5*16] ; In5
psubsw xmm0, xmm7 ; t7 = In0-In7
paddsw xmm7, xmm3 ; t0 = In0+In7
psubsw xmm2, xmm5 ; t5 = In2-In5
paddsw xmm5, xmm4 ; t2 = In2+In5
movdqa xmm3, [%1+3*16] ; In3
movdqa xmm4, [%1+4*16] ; In4
movdqa xmm1, xmm3
psubsw xmm3, xmm4 ; t4 = In3-In4
paddsw xmm4, xmm1 ; t3 = In3+In4
movdqa xmm6, [%1+6*16] ; In6
movdqa xmm1, [%1+1*16] ; In1
psubsw xmm1, xmm6 ; t6 = In1-In6
paddsw xmm6, [%1+1*16] ; t1 = In1+In6
psubsw xmm7, xmm4 ; tm03 = t0-t3
psubsw xmm6, xmm5 ; tm12 = t1-t2
paddsw xmm4, xmm4 ; 2.t3
paddsw xmm5, xmm5 ; 2.t2
paddsw xmm4, xmm7 ; tp03 = t0+t3
paddsw xmm5, xmm6 ; tp12 = t1+t2
psllw xmm2, %2+1 ; shift t5 (shift +1 to..
psllw xmm1, %2+1 ; shift t6 ..compensate cos4/2)
psllw xmm4, %2 ; shift t3
psllw xmm5, %2 ; shift t2
psllw xmm7, %2 ; shift t0
psllw xmm6, %2 ; shift t1
psllw xmm3, %2 ; shift t4
psllw xmm0, %2 ; shift t7
psubsw xmm4, xmm5 ; out4 = tp03-tp12
psubsw xmm1, xmm2 ; xmm1: t6-t5
paddsw xmm5, xmm5
paddsw xmm2, xmm2
paddsw xmm5, xmm4 ; out0 = tp03+tp12
movdqa [%1+4*16], xmm4 ; => out4
paddsw xmm2, xmm1 ; xmm2: t6+t5
movdqa [%1+0*16], xmm5 ; => out0
movdqa xmm4, [tan2] ; xmm4 <= tan2
pmulhw xmm4, xmm7 ; tm03*tan2
movdqa xmm5, [tan2] ; xmm5 <= tan2
psubsw xmm4, xmm6 ; out6 = tm03*tan2 - tm12
pmulhw xmm5, xmm6 ; tm12*tan2
paddsw xmm5, xmm7 ; out2 = tm12*tan2 + tm03
movdqa xmm6, [sqrt2]
movdqa xmm7, [Rounder1]
pmulhw xmm2, xmm6 ; xmm2: tp65 = (t6 + t5)*cos4
por xmm5, xmm7 ; correct out2
por xmm4, xmm7 ; correct out6
pmulhw xmm1, xmm6 ; xmm1: tm65 = (t6 - t5)*cos4
por xmm2, xmm7 ; correct tp65
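; The por with [Rounder1] (assumed to hold 1 in every word) forces the low bit
; of these pmulhw results to 1, countering the truncation bias of the high
; multiply before the values feed the row pass.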
movdqa [%1+2*16], xmm5 ; => out2
movdqa xmm5, xmm3 ; save t4
movdqa [%1+6*16], xmm4 ; => out6
movdqa xmm4, xmm0 ; save t7
psubsw xmm3, xmm1 ; xmm3: tm465 = t4 - tm65
psubsw xmm0, xmm2 ; xmm0: tm765 = t7 - tp65
paddsw xmm2, xmm4 ; xmm2: tp765 = t7 + tp65
paddsw xmm1, xmm5 ; xmm1: tp465 = t4 + tm65
movdqa xmm4, [tan3] ; tan3 - 1
movdqa xmm5, [tan1] ; tan1
movdqa xmm7, xmm3 ; save tm465
pmulhw xmm3, xmm4 ; tm465*(tan3-1)
movdqa xmm6, xmm1 ; save tp465
pmulhw xmm1, xmm5 ; tp465*tan1
paddsw xmm3, xmm7 ; tm465*tan3
pmulhw xmm4, xmm0 ; tm765*(tan3-1)
paddsw xmm4, xmm0 ; tm765*tan3
pmulhw xmm5, xmm2 ; tp765*tan1
paddsw xmm1, xmm2 ; out1 = tp765 + tp465*tan1
psubsw xmm0, xmm3 ; out3 = tm765 - tm465*tan3
paddsw xmm7, xmm4 ; out5 = tm465 + tm765*tan3
psubsw xmm5, xmm6 ; out7 =-tp465 + tp765*tan1
movdqa [%1+1*16], xmm1 ; => out1
movdqa [%1+3*16], xmm0 ; => out3
movdqa [%1+5*16], xmm7 ; => out5
movdqa [%1+7*16], xmm5 ; => out7
%endmacro
;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;-----------------------------------------------------------------------------
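; Row pass of the forward DCT: one 8-coefficient row is folded into sums
; a[i] = x[i]+x[7-i] and differences b[i] = x[i]-x[7-i], then four pmaddwd
; against the coefficient table %2 produce all eight outputs at once; they are
; rounded with %3 and shifted down into the [-2048, 2047] range.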
%macro fMTX_MULT 3 ; %1=src, %2 = Coeffs, %3=rounders
movdqa xmm0, [ecx+%1*16+0] ; xmm0 = [0123][4567]
pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654]
pshufd xmm0, xmm0, 01000100b
pshufd xmm1, xmm1, 11101110b
movdqa xmm2, xmm0
paddsw xmm0, xmm1 ; xmm0 = [a0 a1 a2 a3]
psubsw xmm2, xmm1 ; xmm2 = [b0 b1 b2 b3]
punpckldq xmm0, xmm2 ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
pshufd xmm2, xmm0, 01001110b ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]
; [M00 M01 M16 M17] [M06 M07 M22 M23] x mm0 = [0 /1 /2'/3']
; [M02 M03 M18 M19] [M04 M05 M20 M21] x mm2 = [0'/1'/2 /3 ]
; [M08 M09 M24 M25] [M14 M15 M30 M31] x mm0 = [4 /5 /6'/7']
; [M10 M11 M26 M27] [M12 M13 M28 M29] x mm2 = [4'/5'/6 /7 ]
movdqa xmm1, [%2+16]
movdqa xmm3, [%2+32]
pmaddwd xmm1, xmm2
pmaddwd xmm3, xmm0
pmaddwd xmm2, [%2+48]
pmaddwd xmm0, [%2+ 0]
paddd xmm0, xmm1 ; [ out0 | out1 ][ out2 | out3 ]
paddd xmm2, xmm3 ; [ out4 | out5 ][ out6 | out7 ]
psrad xmm0, 16
psrad xmm2, 16
packssdw xmm0, xmm2 ; [ out0 .. out7 ]
paddsw xmm0, [%3] ; Round
psraw xmm0, 4 ; => [-2048, 2047]
movdqa [ecx+%1*16+0], xmm0
%endmacro
;-----------------------------------------------------------------------------
; Function Forward DCT
;-----------------------------------------------------------------------------
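; Forward DCT entry point: the column butterflies run first with a pre-shift
; of 3 bits, then fMTX_MULT finishes each row with its matrix multiply; the
; per-row Fdct_Rnd* constants give each row its own rounding behaviour.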
ALIGN 16
fdct_sse2_skal:
mov ecx, [esp+4]
fLLM_PASS ecx+0, 3
fMTX_MULT 0, fTab1, Fdct_Rnd0
fMTX_MULT 1, fTab2, Fdct_Rnd2
fMTX_MULT 2, fTab3, Fdct_Rnd1
fMTX_MULT 3, fTab4, Fdct_Rnd1
fMTX_MULT 4, fTab1, Fdct_Rnd0
fMTX_MULT 5, fTab4, Fdct_Rnd1
fMTX_MULT 6, fTab3, Fdct_Rnd1
fMTX_MULT 7, fTab2, Fdct_Rnd1
ret