📄 skl_dct_sse2.asm
  paddsw xmm0, xmm5         ; a1+b1
  paddsw xmm4, xmm3         ; a2+b2
%else
    ; we spill 1 reg to perform safe butterflies
  movdqa [%1 ], xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm6         ; x0-x4 = tm04
  paddsw xmm6, xmm2         ; x0+x4 = tp04
  movdqa xmm2, xmm6
  psubsw xmm6, xmm7
  paddsw xmm7, xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm5
  paddsw xmm5, xmm2
  movdqa xmm2, xmm5
  psubsw xmm5, xmm0
  paddsw xmm0, xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm4
  paddsw xmm4, xmm2
  movdqa xmm2, [%1]
%endif

  psraw xmm5, 6             ; out6
  psraw xmm3, 6             ; out5
  psraw xmm0, 6             ; out1
  psraw xmm4, 6             ; out2

%if (%2==0)
  movdqa [%1+6*16], xmm5
  movdqa [%1+5*16], xmm3
  movdqa [%1+1*16], xmm0
  movdqa [%1+2*16], xmm4
%elif (%2==2)
  movdqa [%1   ], xmm0      ; spill
  movdqa [%1+16], xmm4      ; spill
  ADD_TO_DST [%1], eax+ edx, [%1+16], eax+2*edx   ; #1 - #2
%else
  packuswb xmm0,xmm0
  packuswb xmm4,xmm4
  packuswb xmm3,xmm3
  packuswb xmm5,xmm5
  movq [eax+  edx], xmm0    ; #1
  movq [eax+2*edx], xmm4    ; #2
    ; keep xmm3 and xmm5 for later
%endif

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3
  movdqa xmm0, xmm7
  movdqa xmm4, xmm6
  psubsw xmm7, xmm1         ; a0-b0
  psubsw xmm6, xmm2         ; a3-b3
  paddsw xmm1, xmm0         ; a0+b0
  paddsw xmm2, xmm4         ; a3+b3

  psraw xmm1, 6             ; out0
  psraw xmm7, 6             ; out7
  psraw xmm2, 6             ; out3
  psraw xmm6, 6             ; out4

    ; combine result
%if (%2==0)
  movdqa [%1+0*16], xmm1
  movdqa [%1+3*16], xmm2
  movdqa [%1+4*16], xmm6
  movdqa [%1+7*16], xmm7
%elif (%2==2)
  ADD_TO_DST xmm1, eax,       xmm6, eax+4*edx   ; #0 - #4
  lea eax, [eax+2*edx]                          ; -> #2
  ADD_TO_DST xmm2, eax+  edx, xmm5, eax+4*edx   ; #3 - #6
  lea eax, [eax+  edx]                          ; -> #3
  ADD_TO_DST xmm3, eax+2*edx, xmm7, eax+4*edx   ; #5 - #7
%else
  packuswb xmm1,xmm1
  packuswb xmm2,xmm2
  packuswb xmm6,xmm6
  packuswb xmm7,xmm7
  movq [eax       ], xmm1   ; #0
  movq [eax+4*edx], xmm6    ; #4
  lea eax, [eax+2*edx]      ; -> #2
  movq [eax+  edx], xmm2    ; #3
  movq [eax+4*edx], xmm5    ; #6
  lea eax, [eax+  edx]      ; -> #3
  movq [eax+2*edx], xmm3    ; #5
  movq [eax+4*edx], xmm7    ; #7
%endif
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
Skl_IDct16_SSE2:
  mov ecx, [esp+4]
  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11
  iLLM_PASS ecx+0, 0
  ret

;//////////////////////////////////////////////////////////////////////
;// Sparseness-testing version

%macro TEST_ROW 2           ; %1:src, %2:label x8
  mov eax, [%1   ]
  mov edx, [%1+ 8]
  or  eax, [%1+ 4]
  or  edx, [%1+12]
  or  eax, edx
  jz near %2
%endmacro

%macro IDCT_IMPL 1          ; %1: 0=16b store, 1=put, 2=add
  mov ecx, [esp+ 4]         ; Src

  TEST_ROW ecx, .Row0_Round
  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  jmp .Row1
.Row0_Round
  movdqa xmm0, [Idct_Sparse_Rnd0]
  movdqa [ecx   ], xmm0
.Row1
  TEST_ROW ecx+16, .Row1_Round
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  jmp .Row2
.Row1_Round
  movdqa xmm0, [Idct_Sparse_Rnd1]
  movdqa [ecx+16 ], xmm0
.Row2
  TEST_ROW ecx+32, .Row2_Round
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  jmp .Row3
.Row2_Round
  movdqa xmm0, [Idct_Sparse_Rnd2]
  movdqa [ecx+32 ], xmm0
.Row3
  TEST_ROW ecx+48, .Row4
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
.Row4
  TEST_ROW ecx+64, .Row5
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
.Row5
  TEST_ROW ecx+80, .Row6
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11
.Row6
  TEST_ROW ecx+96, .Row7
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11
.Row7
  TEST_ROW ecx+112, .End
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11
.End

%if (%1!=0)
  mov eax, [esp+ 8]         ; Dst
  mov edx, [esp+12]         ; BpS
%endif
  iLLM_PASS ecx, %1
%endmacro

align 16
Skl_IDct16_Sparse_SSE2:
  IDCT_IMPL 0
  ret

align 16
Skl_IDct16_Put_SSE2:
  IDCT_IMPL 1
  ret

align 16
Skl_IDct16_Add_SSE2:
  IDCT_IMPL 2
  ret

;//////////////////////////////////////////////////////////////////////
;// fLLM_PASS
;//////////////////////////////////////////////////////////////////////

%macro fLLM_PASS 1          ; %1: Shift
  movdqa xmm0, [ecx+0*16]   ; In0
  movdqa xmm2, [ecx+2*16]   ; In2
  movdqa xmm3, xmm0
  movdqa xmm4, xmm2
  movdqa xmm7, [ecx+7*16]   ; In7
  movdqa xmm5, [ecx+5*16]   ; In5

  psubsw xmm0, xmm7         ; t7 = In0-In7
  paddsw xmm7, xmm3         ; t0 = In0+In7
  psubsw xmm2, xmm5         ; t5 = In2-In5
  paddsw xmm5, xmm4         ; t2 = In2+In5

  movdqa xmm3, [ecx+3*16]   ; In3
  movdqa xmm4, [ecx+4*16]   ; In4
  movdqa xmm1, xmm3
  psubsw xmm3, xmm4         ; t4 = In3-In4
  paddsw xmm4, xmm1         ; t3 = In3+In4
  movdqa xmm6, [ecx+6*16]   ; In6
  movdqa xmm1, [ecx+1*16]   ; In1
  psubsw xmm1, xmm6         ; t6 = In1-In6
  paddsw xmm6, [ecx+1*16]   ; t1 = In1+In6

  psubsw xmm7, xmm4         ; tm03 = t0-t3
  psubsw xmm6, xmm5         ; tm12 = t1-t2
  paddsw xmm4, xmm4         ; 2.t3
  paddsw xmm5, xmm5         ; 2.t2
  paddsw xmm4, xmm7         ; tp03 = t0+t3
  paddsw xmm5, xmm6         ; tp12 = t1+t2

  psllw xmm2, %1+1          ; shift t5 (shift +1 to..
  psllw xmm1, %1+1          ; shift t6  ..compensate cos4/2)
  psllw xmm4, %1            ; shift t3
  psllw xmm5, %1            ; shift t2
  psllw xmm7, %1            ; shift t0
  psllw xmm6, %1            ; shift t1
  psllw xmm3, %1            ; shift t4
  psllw xmm0, %1            ; shift t7

  psubsw xmm4, xmm5         ; out4 = tp03-tp12
  psubsw xmm1, xmm2         ; xmm1: t6-t5
  paddsw xmm5, xmm5
  paddsw xmm2, xmm2
  paddsw xmm5, xmm4         ; out0 = tp03+tp12
  movdqa [ecx+4*16], xmm4   ; => out4
  paddsw xmm2, xmm1         ; xmm2: t6+t5
  movdqa [ecx+0*16], xmm5   ; => out0

  movdqa xmm4, [tan2]       ; xmm4 <= tan2
  pmulhw xmm4, xmm7         ; tm03*tan2
  movdqa xmm5, [tan2]       ; xmm5 <= tan2
  psubsw xmm4, xmm6         ; out6 = tm03*tan2 - tm12
  pmulhw xmm5, xmm6         ; tm12*tan2
  paddsw xmm5, xmm7         ; out2 = tm12*tan2 + tm03

  movdqa xmm6, [sqrt2]
  movdqa xmm7, [Rounder1]

  pmulhw xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
  por    xmm5, xmm7         ; correct out2
  por    xmm4, xmm7         ; correct out6
  pmulhw xmm1, xmm6         ; xmm1: tm65 = (t6 - t5)*cos4
  por    xmm2, xmm7         ; correct tp65

  movdqa [ecx+2*16], xmm5   ; => out2
  movdqa xmm5, xmm3         ; save t4
  movdqa [ecx+6*16], xmm4   ; => out6
  movdqa xmm4, xmm0         ; save t7

  psubsw xmm3, xmm1         ; xmm3: tm465 = t4 - tm65
  psubsw xmm0, xmm2         ; xmm0: tm765 = t7 - tp65
  paddsw xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
  paddsw xmm1, xmm5         ; xmm1: tp465 = t4 + tm65

  movdqa xmm4, [tan3]       ; tan3 - 1
  movdqa xmm5, [tan1]       ; tan1

  movdqa xmm7, xmm3         ; save tm465
  pmulhw xmm3, xmm4         ; tm465*(tan3-1)
  movdqa xmm6, xmm1         ; save tp465
  pmulhw xmm1, xmm5         ; tp465*tan1

  paddsw xmm3, xmm7         ; tm465*tan3
  pmulhw xmm4, xmm0         ; tm765*(tan3-1)
  paddsw xmm4, xmm0         ; tm765*tan3
  pmulhw xmm5, xmm2         ; tp765*tan1

  paddsw xmm1, xmm2         ; out1 = tp765 + tp465*tan1
  psubsw xmm0, xmm3         ; out3 = tm765 - tm465*tan3
  paddsw xmm7, xmm4         ; out5 = tm465 + tm765*tan3
  psubsw xmm5, xmm6         ; out7 =-tp465 + tp765*tan1

  movdqa [ecx+1*16], xmm1   ; => out1
  movdqa [ecx+3*16], xmm0   ; => out3
  movdqa [ecx+5*16], xmm7   ; => out5
  movdqa [ecx+7*16], xmm5   ; => out7
%endmacro

;//////////////////////////////////////////////////////////////////////
;// fMTX_MULT
;//////////////////////////////////////////////////////////////////////

%macro fMTX_MULT 6          ; %1/%2/%3 = src/coeffs/rounders, %4/%5/%6 = src/coeffs/rounders
  movdqa xmm0, [ecx+%1*16+0]      ; xmm0 = [0123][4567]
  movdqa xmm2, [ecx+%4*16+0]
  pshufhw xmm0, xmm0, 00011011b   ; xmm0 = [0123][7654]
  pshufhw xmm2, xmm2, 00011011b   ; xmm2 = [0123][7654]
  movdqa xmm4, xmm0
  shufps xmm0, xmm2, 01000100b    ; xmm0 = [0123][0123']
  shufps xmm4, xmm2, 11101110b    ; xmm4 = [7654][7654']
  movdqa xmm2, xmm0
  paddsw xmm0, xmm4               ; xmm0 = [a0 a1 a2 a3][a0 a1 a2 a3']
  psubsw xmm2, xmm4               ; xmm2 = [b0 b1 b2 b3][b0 b1 b2 b3']
  movdqa xmm4, xmm0
  punpckldq xmm0, xmm2            ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
  punpckhdq xmm4, xmm2
  pshufd xmm2, xmm0, 01001110b    ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]
  pshufd xmm6, xmm4, 01001110b

    ; [M00 M01 M16 M17] [M06 M07 M22 M23]  x mm0 = [0 /1 /2'/3']
    ; [M02 M03 M18 M19] [M04 M05 M20 M21]  x mm2 = [0'/1'/2 /3 ]
    ; [M08 M09 M24 M25] [M14 M15 M30 M31]  x mm0 = [4 /5 /6'/7']
    ; [M10 M11 M26 M27] [M12 M13 M28 M29]  x mm2 = [4'/5'/6 /7 ]

  movdqa xmm1, [%2+16]
  movdqa xmm5, [%5+16]
  movdqa xmm3, [%2+32]
  movdqa xmm7, [%5+32]
  pmaddwd xmm1, xmm2
  pmaddwd xmm5, xmm6
  pmaddwd xmm3, xmm0
  pmaddwd xmm7, xmm4
  pmaddwd xmm2, [%2+48]
  pmaddwd xmm6, [%5+48]
  pmaddwd xmm0, [%2+ 0]
  pmaddwd xmm4, [%5+ 0]

  paddd xmm0, xmm1                ; [ out0 | out1 ][ out2 | out3 ]
  paddd xmm4, xmm5
  paddd xmm2, xmm3                ; [ out4 | out5 ][ out6 | out7 ]
  paddd xmm6, xmm7

  psrad xmm0, 16
  psrad xmm4, 16
  psrad xmm2, 16
  psrad xmm6, 16

  packssdw xmm0, xmm2             ; [ out0 .. out7 ]
  packssdw xmm4, xmm6
  paddsw xmm0, [%3]               ; Round
  paddsw xmm4, [%6]               ; Round
  psraw xmm0, 4                   ; => [-2048, 2047]
  movdqa [ecx+%1*16+0], xmm0
  psraw xmm4, 4                   ; => [-2048, 2047]
  movdqa [ecx+%4*16+0], xmm4
%endmacro

;//////////////////////////////////////////////////////////////////////

align 16
Skl_Dct16_SSE2:
  mov ecx, [esp+4]
  fLLM_PASS 3
  fMTX_MULT  0, fTab1, Fdct_Rnd0,  1, fTab2, Fdct_Rnd2
  fMTX_MULT  2, fTab3, Fdct_Rnd1,  3, fTab4, Fdct_Rnd1
  fMTX_MULT  4, fTab1, Fdct_Rnd0,  5, fTab4, Fdct_Rnd1
  fMTX_MULT  6, fTab3, Fdct_Rnd1,  7, fTab2, Fdct_Rnd1
  ret

;//////////////////////////////////////////////////////////////////////
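;//////////////////////////////////////////////////////////////////////
;// Calling-convention note: the exported routines above read their
;// arguments from the stack ([esp+4] = Src; for the Put/Add variants
;// also [esp+8] = Dst and [esp+12] = BpS) and do not pop them, i.e. a
;// cdecl-style call with caller stack cleanup. The wrapper below is
;// only a minimal, hypothetical sketch of such a call; its name is
;// illustrative and it is not part of the original interface.

align 16
Skl_Example_IDct16_Add_Call:
  mov ecx, [esp+ 4]         ; Src: 64 x int16 coefficients, 16-byte aligned
  mov eax, [esp+ 8]         ; Dst: 8x8 destination block
  mov edx, [esp+12]         ; BpS: destination stride, in bytes
  push edx
  push eax
  push ecx
  call Skl_IDct16_Add_SSE2
  add esp, 12               ; cdecl: caller restores the stack
  ret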