📄 skl_dct_sse2.asm
  paddsw xmm0, xmm5         ; a1+b1
  paddsw xmm4, xmm3         ; a2+b2
%else
    ; we spill 1 reg to perform safe butterflies
  movdqa [%1 ], xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm6         ; x0-x4 = tm04
  paddsw xmm6, xmm2         ; x0+x4 = tp04
  movdqa xmm2, xmm6
  psubsw xmm6, xmm7
  paddsw xmm7, xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm5
  paddsw xmm5, xmm2
  movdqa xmm2, xmm5
  psubsw xmm5, xmm0
  paddsw xmm0, xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm4
  paddsw xmm4, xmm2
  movdqa xmm2, [%1]
%endif

  psraw xmm5, 6             ; out6
  psraw xmm3, 6             ; out5
  psraw xmm0, 6             ; out1
  psraw xmm4, 6             ; out2

%if (%2==0)
  movdqa [%1+6*16], xmm5
  movdqa [%1+5*16], xmm3
  movdqa [%1+1*16], xmm0
  movdqa [%1+2*16], xmm4
%elif (%2==2)
  movdqa [%1   ], xmm0      ; spill
  movdqa [%1+16], xmm4      ; spill
  ADD_TO_DST [%1], eax+ edx, [%1+16], eax+2*edx   ; #1 - #2
%else
  packuswb xmm0,xmm0
  packuswb xmm4,xmm4
  packuswb xmm3,xmm3
  packuswb xmm5,xmm5
  movq [eax+  edx], xmm0    ; #1
  movq [eax+2*edx], xmm4    ; #2
    ; keep xmm3 and xmm5 for later
%endif

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3
  movdqa xmm0, xmm7
  movdqa xmm4, xmm6
  psubsw xmm7, xmm1         ; a0-b0
  psubsw xmm6, xmm2         ; a3-b3
  paddsw xmm1, xmm0         ; a0+b0
  paddsw xmm2, xmm4         ; a3+b3

  psraw xmm1, 6             ; out0
  psraw xmm7, 6             ; out7
  psraw xmm2, 6             ; out3
  psraw xmm6, 6             ; out4

    ; combine result
%if (%2==0)
  movdqa [%1+0*16], xmm1
  movdqa [%1+3*16], xmm2
  movdqa [%1+4*16], xmm6
  movdqa [%1+7*16], xmm7
%elif (%2==2)
  ADD_TO_DST xmm1, eax,       xmm6, eax+4*edx   ; #0 - #4
  lea eax, [eax+2*edx]                          ; -> #2
  ADD_TO_DST xmm2, eax+  edx, xmm5, eax+4*edx   ; #3 - #6
  lea eax, [eax+  edx]                          ; -> #3
  ADD_TO_DST xmm3, eax+2*edx, xmm7, eax+4*edx   ; #5 - #7
%else
  packuswb xmm1,xmm1
  packuswb xmm2,xmm2
  packuswb xmm6,xmm6
  packuswb xmm7,xmm7
  movq [eax       ], xmm1   ; #0
  movq [eax+4*edx], xmm6    ; #4
  lea eax, [eax+2*edx]      ; -> #2
  movq [eax+  edx], xmm2    ; #3
  movq [eax+4*edx], xmm5    ; #6
  lea eax, [eax+  edx]      ; -> #3
  movq [eax+2*edx], xmm3    ; #5
  movq [eax+4*edx], xmm7    ; #7
%endif
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
Skl_IDct16_SSE2:
  mov ecx, [esp+4]
  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11
  iLLM_PASS ecx+0, 0
  ret

;//////////////////////////////////////////////////////////////////////
;// Sparseness-testing version

%macro TEST_ROW 2           ; %1:src, %2:label x8
  mov eax, [%1   ]
  mov edx, [%1+ 8]
  or  eax, [%1+ 4]
  or  edx, [%1+12]
  or  eax, edx
  jz near %2
%endmacro

%macro IDCT_IMPL 1          ; %1: 0=16b store, 1=put, 2=add
  mov ecx, [esp+ 4]         ; Src

  TEST_ROW ecx, .Row0_Round
  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  jmp .Row1
.Row0_Round
  movdqa xmm0, [Idct_Sparse_Rnd0]
  movdqa [ecx   ], xmm0
.Row1
  TEST_ROW ecx+16, .Row1_Round
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  jmp .Row2
.Row1_Round
  movdqa xmm0, [Idct_Sparse_Rnd1]
  movdqa [ecx+16 ], xmm0
.Row2
  TEST_ROW ecx+32, .Row2_Round
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  jmp .Row3
.Row2_Round
  movdqa xmm0, [Idct_Sparse_Rnd2]
  movdqa [ecx+32 ], xmm0
.Row3
  TEST_ROW ecx+48, .Row4
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
.Row4
  TEST_ROW ecx+64, .Row5
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
.Row5
  TEST_ROW ecx+80, .Row6
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11
.Row6
  TEST_ROW ecx+96, .Row7
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11
.Row7
  TEST_ROW ecx+112, .End
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11
.End

%if (%1!=0)
  mov eax, [esp+ 8]         ; Dst
  mov edx, [esp+12]         ; BpS
%endif
  iLLM_PASS ecx, %1
%endmacro

align 16
Skl_IDct16_Sparse_SSE2:
  IDCT_IMPL 0
  ret

align 16
Skl_IDct16_Put_SSE2:
  IDCT_IMPL 1
  ret

align 16
Skl_IDct16_Add_SSE2:
  IDCT_IMPL 2
  ret

;//////////////////////////////////////////////////////////////////////
;// fLLM_PASS
;//////////////////////////////////////////////////////////////////////

%macro fLLM_PASS 1          ; %1: Shift
  movdqa xmm0, [ecx+0*16]   ; In0
  movdqa xmm2, [ecx+2*16]   ; In2
  movdqa xmm3, xmm0
  movdqa xmm4, xmm2
  movdqa xmm7, [ecx+7*16]   ; In7
  movdqa xmm5, [ecx+5*16]   ; In5

  psubsw xmm0, xmm7         ; t7 = In0-In7
  paddsw xmm7, xmm3         ; t0 = In0+In7
  psubsw xmm2, xmm5         ; t5 = In2-In5
  paddsw xmm5, xmm4         ; t2 = In2+In5

  movdqa xmm3, [ecx+3*16]   ; In3
  movdqa xmm4, [ecx+4*16]   ; In4
  movdqa xmm1, xmm3
  psubsw xmm3, xmm4         ; t4 = In3-In4
  paddsw xmm4, xmm1         ; t3 = In3+In4
  movdqa xmm6, [ecx+6*16]   ; In6
  movdqa xmm1, [ecx+1*16]   ; In1
  psubsw xmm1, xmm6         ; t6 = In1-In6
  paddsw xmm6, [ecx+1*16]   ; t1 = In1+In6

  psubsw xmm7, xmm4         ; tm03 = t0-t3
  psubsw xmm6, xmm5         ; tm12 = t1-t2
  paddsw xmm4, xmm4         ; 2.t3
  paddsw xmm5, xmm5         ; 2.t2
  paddsw xmm4, xmm7         ; tp03 = t0+t3
  paddsw xmm5, xmm6         ; tp12 = t1+t2

  psllw xmm2, %1+1          ; shift t5 (shift +1 to..
  psllw xmm1, %1+1          ; shift t6  ..compensate cos4/2)
  psllw xmm4, %1            ; shift t3
  psllw xmm5, %1            ; shift t2
  psllw xmm7, %1            ; shift t0
  psllw xmm6, %1            ; shift t1
  psllw xmm3, %1            ; shift t4
  psllw xmm0, %1            ; shift t7

  psubsw xmm4, xmm5         ; out4 = tp03-tp12
  psubsw xmm1, xmm2         ; xmm1: t6-t5
  paddsw xmm5, xmm5
  paddsw xmm2, xmm2
  paddsw xmm5, xmm4         ; out0 = tp03+tp12
  movdqa [ecx+4*16], xmm4   ; => out4
  paddsw xmm2, xmm1         ; xmm2: t6+t5
  movdqa [ecx+0*16], xmm5   ; => out0

  movdqa xmm4, [tan2]       ; xmm4 <= tan2
  pmulhw xmm4, xmm7         ; tm03*tan2
  movdqa xmm5, [tan2]       ; xmm5 <= tan2
  psubsw xmm4, xmm6         ; out6 = tm03*tan2 - tm12
  pmulhw xmm5, xmm6         ; tm12*tan2
  paddsw xmm5, xmm7         ; out2 = tm12*tan2 + tm03

  movdqa xmm6, [sqrt2]
  movdqa xmm7, [Rounder1]

  pmulhw xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
  por    xmm5, xmm7         ; correct out2
  por    xmm4, xmm7         ; correct out6
  pmulhw xmm1, xmm6         ; xmm1: tm65 = (t6 - t5)*cos4
  por    xmm2, xmm7         ; correct tp65

  movdqa [ecx+2*16], xmm5   ; => out2
  movdqa xmm5, xmm3         ; save t4
  movdqa [ecx+6*16], xmm4   ; => out6
  movdqa xmm4, xmm0         ; save t7

  psubsw xmm3, xmm1         ; xmm3: tm465 = t4 - tm65
  psubsw xmm0, xmm2         ; xmm0: tm765 = t7 - tp65
  paddsw xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
  paddsw xmm1, xmm5         ; xmm1: tp465 = t4 + tm65

  movdqa xmm4, [tan3]       ; tan3 - 1
  movdqa xmm5, [tan1]       ; tan1

  movdqa xmm7, xmm3         ; save tm465
  pmulhw xmm3, xmm4         ; tm465*(tan3-1)
  movdqa xmm6, xmm1         ; save tp465
  pmulhw xmm1, xmm5         ; tp465*tan1

  paddsw xmm3, xmm7         ; tm465*tan3
  pmulhw xmm4, xmm0         ; tm765*(tan3-1)
  paddsw xmm4, xmm0         ; tm765*tan3
  pmulhw xmm5, xmm2         ; tp765*tan1

  paddsw xmm1, xmm2         ; out1 = tp765 + tp465*tan1
  psubsw xmm0, xmm3         ; out3 = tm765 - tm465*tan3
  paddsw xmm7, xmm4         ; out5 = tm465 + tm765*tan3
  psubsw xmm5, xmm6         ; out7 =-tp465 + tp765*tan1

  movdqa [ecx+1*16], xmm1   ; => out1
  movdqa [ecx+3*16], xmm0   ; => out3
  movdqa [ecx+5*16], xmm7   ; => out5
  movdqa [ecx+7*16], xmm5   ; => out7
%endmacro

;//////////////////////////////////////////////////////////////////////
;// fMTX_MULT
;//////////////////////////////////////////////////////////////////////

%macro fMTX_MULT 6          ; %1/%2/%3 = src/coeffs/rounders, %4/%5/%6 = src/coeffs/rounders
  movdqa xmm0, [ecx+%1*16+0]      ; xmm0 = [0123][4567]
  movdqa xmm2, [ecx+%4*16+0]
  pshufhw xmm0, xmm0, 00011011b   ; xmm0 = [0123][7654]
  pshufhw xmm2, xmm2, 00011011b   ; xmm2 = [0123][7654]
  movdqa xmm4, xmm0
  shufps xmm0, xmm2, 01000100b    ; xmm0 = [0123][0123']
  shufps xmm4, xmm2, 11101110b    ; xmm4 = [7654][7654']
  movdqa xmm2, xmm0
  paddsw xmm0, xmm4               ; xmm0 = [a0 a1 a2 a3][a0 a1 a2 a3']
  psubsw xmm2, xmm4               ; xmm2 = [b0 b1 b2 b3][b0 b1 b2 b3']
  movdqa xmm4, xmm0
  punpckldq xmm0, xmm2            ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
  punpckhdq xmm4, xmm2
  pshufd xmm2, xmm0, 01001110b    ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]
  pshufd xmm6, xmm4, 01001110b

    ; [M00 M01 M16 M17] [M06 M07 M22 M23]  x mm0 = [0 /1 /2'/3']
    ; [M02 M03 M18 M19] [M04 M05 M20 M21]  x mm2 = [0'/1'/2 /3 ]
    ; [M08 M09 M24 M25] [M14 M15 M30 M31]  x mm0 = [4 /5 /6'/7']
    ; [M10 M11 M26 M27] [M12 M13 M28 M29]  x mm2 = [4'/5'/6 /7 ]

  movdqa xmm1, [%2+16]
  movdqa xmm5, [%5+16]
  movdqa xmm3, [%2+32]
  movdqa xmm7, [%5+32]
  pmaddwd xmm1, xmm2
  pmaddwd xmm5, xmm6
  pmaddwd xmm3, xmm0
  pmaddwd xmm7, xmm4
  pmaddwd xmm2, [%2+48]
  pmaddwd xmm6, [%5+48]
  pmaddwd xmm0, [%2+ 0]
  pmaddwd xmm4, [%5+ 0]

  paddd xmm0, xmm1                ; [ out0 | out1 ][ out2 | out3 ]
  paddd xmm4, xmm5
  paddd xmm2, xmm3                ; [ out4 | out5 ][ out6 | out7 ]
  paddd xmm6, xmm7

  psrad xmm0, 16
  psrad xmm4, 16
  psrad xmm2, 16
  psrad xmm6, 16

  packssdw xmm0, xmm2             ; [ out0 .. out7 ]
  packssdw xmm4, xmm6
  paddsw xmm0, [%3]               ; Round
  paddsw xmm4, [%6]               ; Round
  psraw xmm0, 4                   ; => [-2048, 2047]
  movdqa [ecx+%1*16+0], xmm0
  psraw xmm4, 4                   ; => [-2048, 2047]
  movdqa [ecx+%4*16+0], xmm4
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
Skl_Dct16_SSE2:
  mov ecx, [esp+4]
  fLLM_PASS 3
  fMTX_MULT  0, fTab1, Fdct_Rnd0,  1, fTab2, Fdct_Rnd2
  fMTX_MULT  2, fTab3, Fdct_Rnd1,  3, fTab4, Fdct_Rnd1
  fMTX_MULT  4, fTab1, Fdct_Rnd0,  5, fTab4, Fdct_Rnd1
  fMTX_MULT  6, fTab3, Fdct_Rnd1,  7, fTab2, Fdct_Rnd1
  ret

;//////////////////////////////////////////////////////////////////////
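;//////////////////////////////////////////////////////////////////////
;// Calling-convention note: the exported routines above read their
;// arguments from the stack ([esp+4] = Src; for the Put/Add variants
;// also [esp+8] = Dst and [esp+12] = BpS) and do not pop them, i.e. a
;// cdecl-style call with caller stack cleanup. The wrapper below is
;// only a minimal, hypothetical sketch of such a call; its name is
;// illustrative and it is not part of the original interface.

align 16
Skl_Example_IDct16_Add_Call:
  mov ecx, [esp+ 4]         ; Src: 64 x int16 coefficients, 16-byte aligned
  mov eax, [esp+ 8]         ; Dst: 8x8 destination block
  mov edx, [esp+12]         ; BpS: destination stride, in bytes
  push edx
  push eax
  push ecx
  call Skl_IDct16_Add_SSE2
  add esp, 12               ; cdecl: caller restores the stack
  ret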