📄 skl_dct_sse2.asm
字号:
; trashes xmm0,xmm4
  movdqu    xmm0, [%2]
  movdqu    xmm4, [%4]
  punpcklbw xmm0, xmm0
  psrlw     xmm0, 8               ; will zero the high words
  punpcklbw xmm4, xmm4
  psrlw     xmm4, 8
  paddsw    xmm0, %1
  paddsw    xmm4, %3
  packuswb  xmm0, xmm0
  packuswb  xmm4, xmm4
  movq      [%2], xmm0
  movq      [%4], xmm4
%endmacro

;//////////////////////////////////////////////////////////////////////
; iLLM_PASS  %1, %2
;
;   %1 : address expression of the coefficient block. Eight 16-byte
;        rows x0..x7 are read with movdqa from [%1+16*k], k = 0..7,
;        so the block must be 16-byte aligned.
;   %2 : combine func, selected at assembly time:
;          0 : store the eight 16-bit result rows back to [%1+16*k]
;          1 : put - pack to unsigned bytes (packuswb) and store
;              8 bytes per line with movq
;          2 : add - add to the existing 8-bit lines via ADD_TO_DST
;        For %2 != 0 the caller must have set
;          eax = address of destination line #0
;          edx = byte distance between destination lines
;
;   Every arithmetic instruction works lane-wise on 8 signed words,
;   i.e. the butterflies combine rows x0..x7 with each other.
;   All results are shifted right by 6 (psraw) before being combined.
;
;   Uses the constants tan1, tan2, tan3 and sqrt2 (defined outside this
;   macro).
;
;   Clobbers : xmm0-xmm7, flags untouched.
;              %2 != 0 : eax is left pointing at line #3 (eax + 3*edx).
;              %2 == 2 : [%1] and [%1+16] are overwritten (spill area).
;              IEEE_COMPLIANT defined : [%1] is used as a temporary
;              (row x0 has already been loaded at that point).
;//////////////////////////////////////////////////////////////////////

%macro iLLM_PASS 2  ; %1: src/dst   %2: combine func (0:store, 1:put, 2:add)

    ; ---- odd part: x1, x3, x5, x7 -----------------------------------

  movdqa xmm0, [tan3]             ; t3-1
  movdqa xmm3, [%1+16*3]          ; x3
  movdqa xmm1, xmm0               ; t3-1
  movdqa xmm5, [%1+16*5]          ; x5

  movdqa xmm4, [tan1]             ; t1
  movdqa xmm6, [%1+16*1]          ; x1
  movdqa xmm7, [%1+16*7]          ; x7
  movdqa xmm2, xmm4               ; t1

  pmulhw xmm0, xmm3               ; x3*(t3-1)
  pmulhw xmm1, xmm5               ; x5*(t3-1)
  paddsw xmm0, xmm3               ; x3*t3
  paddsw xmm1, xmm5               ; x5*t3
  psubsw xmm0, xmm5               ; x3*t3-x5 = tm35
  paddsw xmm1, xmm3               ; x3+x5*t3 = tp35

  pmulhw xmm4, xmm7               ; x7*t1
  pmulhw xmm2, xmm6               ; x1*t1
  paddsw xmm4, xmm6               ; x1+t1*x7 = tp17
  psubsw xmm2, xmm7               ; x1*t1-x7 = tm17

  movdqa xmm3, [sqrt2]
  movdqa xmm7, xmm4
  movdqa xmm6, xmm2
  psubsw xmm4, xmm1               ; tp17-tp35 = t1
  psubsw xmm2, xmm0               ; tm17-tm35 = b3
  paddsw xmm1, xmm7               ; tp17+tp35 = b0
  paddsw xmm0, xmm6               ; tm17+tm35 = t2

    ; xmm1 = b0, xmm2 = b3. preserved

  movdqa xmm6, xmm4
  psubsw xmm4, xmm0               ; t1-t2
  paddsw xmm0, xmm6               ; t1+t2

  pmulhw xmm4, xmm3               ; (t1-t2)/(2.sqrt2)
  pmulhw xmm0, xmm3               ; (t1+t2)/(2.sqrt2)

  paddsw xmm0, xmm0               ; 2.(t1+t2) = b1
  paddsw xmm4, xmm4               ; 2.(t1-t2) = b2

    ; ---- even part: x0, x2, x4, x6 ----------------------------------

  movdqa xmm7, [tan2]             ; t2
  movdqa xmm3, [%1+2*16]          ; x2
  movdqa xmm6, [%1+6*16]          ; x6
  movdqa xmm5, xmm7               ; t2

  pmulhw xmm7, xmm6               ; x6*t2
  pmulhw xmm5, xmm3               ; x2*t2

  paddsw xmm7, xmm3               ; x2+x6*t2 = tp26
  psubsw xmm5, xmm6               ; x2*t2-x6 = tm26

    ; use:xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2

  movdqa xmm3, [%1+0*16]          ; x0
  movdqa xmm6, [%1+4*16]          ; x4

%ifndef IEEE_COMPLIANT

    ; butterflies done in place: a-b first, then (a-b)+2b = a+b

  psubsw xmm3, xmm6               ; x0-x4 = tm04
  paddsw xmm6, xmm6               ; 2.x4
  paddsw xmm6, xmm3               ; x0+x4 = tp04

  psubsw xmm3, xmm5               ; tm04-tm26 = a2
  psubsw xmm6, xmm7               ; tp04-tp26 = a3
  paddsw xmm5, xmm5               ; 2.tm26
  paddsw xmm7, xmm7               ; 2.tp26
  paddsw xmm5, xmm3               ; tm04+tm26 = a1
  paddsw xmm7, xmm6               ; tp04+tp26 = a0

  psubsw xmm5, xmm0               ; a1-b1
  psubsw xmm3, xmm4               ; a2-b2
  paddsw xmm0, xmm0               ; 2.b1
  paddsw xmm4, xmm4               ; 2.b2
  paddsw xmm0, xmm5               ; a1+b1
  paddsw xmm4, xmm3               ; a2+b2

%else

    ; we spill 1 reg to perform safe butterflies
    ; (xmm2 = b3 is parked in [%1] and used as the scratch copy)

  movdqa [%1   ], xmm2

  movdqa xmm2, xmm3
  psubsw xmm3, xmm6               ; x0-x4 = tm04
  paddsw xmm6, xmm2               ; x0+x4 = tp04

  movdqa xmm2, xmm6
  psubsw xmm6, xmm7               ; tp04-tp26 = a3
  paddsw xmm7, xmm2               ; tp04+tp26 = a0

  movdqa xmm2, xmm3
  psubsw xmm3, xmm5               ; tm04-tm26 = a2
  paddsw xmm5, xmm2               ; tm04+tm26 = a1

  movdqa xmm2, xmm5
  psubsw xmm5, xmm0               ; a1-b1
  paddsw xmm0, xmm2               ; a1+b1

  movdqa xmm2, xmm3
  psubsw xmm3, xmm4               ; a2-b2
  paddsw xmm4, xmm2               ; a2+b2

  movdqa xmm2, [%1]               ; restore b3

%endif

  psraw  xmm5, 6                  ; out6
  psraw  xmm3, 6                  ; out5
  psraw  xmm0, 6                  ; out1
  psraw  xmm4, 6                  ; out2

%if (%2==0)

  movdqa [%1+6*16], xmm5
  movdqa [%1+5*16], xmm3
  movdqa [%1+1*16], xmm0
  movdqa [%1+2*16], xmm4

%elif (%2==2)

    ; ADD_TO_DST trashes xmm0/xmm4, so out1/out2 are passed through memory

  movdqa [%1   ], xmm0            ; spill
  movdqa [%1+16], xmm4            ; spill
  ADD_TO_DST [%1], eax+  edx, [%1+16], eax+2*edx  ; #1 - #2

%else

  packuswb xmm0,xmm0
  packuswb xmm4,xmm4
  packuswb xmm3,xmm3
  packuswb xmm5,xmm5
  movq [eax+  edx], xmm0          ; #1
  movq [eax+2*edx], xmm4          ; #2
    ; keep xmm3 and xmm5 for later

%endif

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3

  movdqa xmm0, xmm7
  movdqa xmm4, xmm6
  psubsw xmm7, xmm1               ; a0-b0
  psubsw xmm6, xmm2               ; a3-b3
  paddsw xmm1, xmm0               ; a0+b0
  paddsw xmm2, xmm4               ; a3+b3

  psraw  xmm1, 6                  ; out0
  psraw  xmm7, 6                  ; out7
  psraw  xmm2, 6                  ; out3
  psraw  xmm6, 6                  ; out4

    ; combine result

%if (%2==0)

  movdqa [%1+0*16], xmm1
  movdqa [%1+3*16], xmm2
  movdqa [%1+4*16], xmm6
  movdqa [%1+7*16], xmm7

%elif (%2==2)

  ADD_TO_DST xmm1, eax,       xmm6, eax+4*edx     ; #0 - #4
  lea eax, [eax+2*edx]                            ; -> #2
  ADD_TO_DST xmm2, eax+  edx, xmm5, eax+4*edx     ; #3 - #6
  lea eax, [eax+  edx]                            ; -> #3
  ADD_TO_DST xmm3, eax+2*edx, xmm7, eax+4*edx     ; #5 - #7

%else

  packuswb xmm1,xmm1
  packuswb xmm2,xmm2
  packuswb xmm6,xmm6
  packuswb xmm7,xmm7
  movq [eax      ], xmm1          ; #0
  movq [eax+4*edx], xmm6          ; #4
  lea eax, [eax+2*edx]            ; -> #2
  movq [eax+  edx], xmm2          ; #3
  movq [eax+4*edx], xmm5          ; #6
  lea eax, [eax+  edx]            ; -> #3
  movq [eax+2*edx], xmm3          ; #5
  movq [eax+4*edx], xmm7          ; #7

%endif

%endmacro

;//////////////////////////////////////////////////////////////////////
; Skl_IDct16_SSE2
;
;   In-place variant: the single stack argument at [esp+4] is the
;   address of the coefficient block; results are written back to it
;   as 16-bit rows (iLLM_PASS mode 0).
;   iMTX_MULT is defined outside this section; it is invoked once per
;   row index 0..7 (presumably: row, table, rounder, shift - confirm
;   against its definition).
;   Clobbers: ecx, xmm0-xmm7, plus whatever iMTX_MULT clobbers.
;//////////////////////////////////////////////////////////////////////

align 16
Skl_IDct16_SSE2:

  mov ecx, [esp+4]                ; block pointer

  iMTX_MULT 0, iTab1, Idct_Rnd0, 11
  iMTX_MULT 1, iTab2, Idct_Rnd1, 11
  iMTX_MULT 2, iTab3, Idct_Rnd2, 11
  iMTX_MULT 3, iTab4, Idct_Rnd3, 11
  iMTX_MULT 4, iTab1, Idct_Rnd4, 11
  iMTX_MULT 5, iTab4, Idct_Rnd5, 11
  iMTX_MULT 6, iTab3, Idct_Rnd6, 11
  iMTX_MULT 7, iTab2, Idct_Rnd7, 11

  iLLM_PASS ecx+0, 0

  ret

;//////////////////////////////////////////////////////////////////////
;// Sparseness-testing version

; TEST_ROW: ORs together the four dwords (16 bytes) at %1 and jumps to
; %2 when all of them are zero. Clobbers eax, edx and the flags.

%macro TEST_ROW 2     ; %1:src,  %2:label x8
  mov eax, [%1   ]
  mov edx, [%1+ 8]
  or  eax, [%1+ 4]
  or  edx, [%1+12]
  or  eax, edx
  jz near %2
%endmacro

; IDCT_IMPL: body shared by the put/add entry points.
;
;   Stack arguments as read below:
;     [esp+ 4] Dst   (only read when %1 != 0)
;     [esp+ 8] BpS   (only read when %1 != 0)
;     [esp+12] Src   (16-bit coefficient block, processed in place)
;
;   Each row is tested with TEST_ROW before iMTX_MULT is applied:
;     rows 0..2 : an all-zero row is replaced by the constant row
;                 Idct_Sparse_Rnd0..2 instead of being transformed
;     rows 3..7 : an all-zero row is skipped and left as it is
;
;   Dst/BpS are loaded only after the row pass because TEST_ROW uses
;   eax and edx as scratch.
;
;   The local labels are scoped to the global label preceding each
;   expansion, so the macro must be expanded at most once per global
;   label.

%macro IDCT_IMPL 1    ; %1: combine func (0:16b store, 1:put, 2:add)

  mov ecx, [esp+12]               ; Src

  TEST_ROW ecx, .Row0_Round
  iMTX_MULT 0, iTab1, Idct_Rnd0, 11
  jmp .Row1
.Row0_Round:
  movdqa xmm0, [Idct_Sparse_Rnd0]
  movdqa [ecx  ], xmm0

.Row1:
  TEST_ROW ecx+16, .Row1_Round
  iMTX_MULT 1, iTab2, Idct_Rnd1, 11
  jmp .Row2
.Row1_Round:
  movdqa xmm0, [Idct_Sparse_Rnd1]
  movdqa [ecx+16 ], xmm0

.Row2:
  TEST_ROW ecx+32, .Row2_Round
  iMTX_MULT 2, iTab3, Idct_Rnd2, 11
  jmp .Row3
.Row2_Round:
  movdqa xmm0, [Idct_Sparse_Rnd2]
  movdqa [ecx+32 ], xmm0

.Row3:
  TEST_ROW ecx+48, .Row4
  iMTX_MULT 3, iTab4, Idct_Rnd3, 11

.Row4:
  TEST_ROW ecx+64, .Row5
  iMTX_MULT 4, iTab1, Idct_Rnd4, 11

.Row5:
  TEST_ROW ecx+80, .Row6
  iMTX_MULT 5, iTab4, Idct_Rnd5, 11

.Row6:
  TEST_ROW ecx+96, .Row7
  iMTX_MULT 6, iTab3, Idct_Rnd6, 11

.Row7:
  TEST_ROW ecx+112, .End
  iMTX_MULT 7, iTab2, Idct_Rnd7, 11

.End:

%if (%1!=0)
  mov eax, [esp+ 4]               ; Dst
  mov edx, [esp+ 8]               ; BpS
%endif

  iLLM_PASS ecx, %1

%endmacro

; Skl_IDct16_Put_SSE2: stack args (Dst, BpS, Src); the result is packed
; to unsigned bytes and written to Dst (IDCT_IMPL mode 1).
align 16
Skl_IDct16_Put_SSE2:
  IDCT_IMPL 1
  ret

; Skl_IDct16_Add_SSE2: stack args (Dst, BpS, Src); the result is added
; to the bytes already in Dst (IDCT_IMPL mode 2).
align 16
Skl_IDct16_Add_SSE2:
  IDCT_IMPL 2
  ret
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -