⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 skl_dct_sse.asm

📁 mpeg4 video codec mpeg4 video codec
💻 ASM
📖 第 1 页 / 共 3 页
字号:
  movq   [%1+0*16], mm1
  movq   [%1+7*16], mm7
  movq   [%1+3*16], mm2
  movq   [%1+4*16], mm6
%elif (%2==2)
  ; Combine mode 2: results are merged into the destination at eax
  ; (stride edx, byte offset %3) through ADD_TO_DST, two rows per call.
  ADD_TO_DST mm1, eax      +%3, mm6, eax+4*edx+%3  ; #0 - #4
  lea eax, [eax+2*edx]                             ; -> #2
  ADD_TO_DST mm2, eax+  edx+%3, mm5, eax+4*edx+%3  ; #3 - #6
  lea eax, [eax+  edx]                             ; -> #3
  ADD_TO_DST mm3, eax+2*edx+%3, mm7, eax+4*edx+%3  ; #5 - #7
%else
  ; Remaining combine mode: pack the words of this pass together with
  ; the words stored 8 bytes further in each row, and store the
  ; resulting 8 bytes to the destination at eax (stride edx).
  packuswb mm1,[%1+0*16+8]
  packuswb mm6,[%1+4*16+8]
  packuswb mm2,[%1+3*16+8]
  packuswb mm5,[%1+6*16+8]
  packuswb mm3,[%1+5*16+8]
  packuswb mm7,[%1+7*16+8]
  movq [eax      ], mm1   ; #0
  movq [eax+4*edx], mm6   ; #4
  lea eax, [eax+2*edx]    ; -> #2
  movq [eax+  edx], mm2   ; #3
  movq [eax+4*edx], mm5   ; #6
  lea eax, [eax+  edx]    ; -> #3
  movq [eax+2*edx], mm3   ; #5
  movq [eax+4*edx], mm7   ; #7
%endif
%endmacro

;//////////////////////////////////////////////////////////////////////
;// basic IDCTs...
;
; Skl_IDct16_SSE / Skl_IDct16_MMX
;   In:    [esp+4] = pointer to the coefficient block (copied to ecx)
;   Work:  eight row passes (iMTX_MULT / iMTX_MULT_MMX, one per 16-byte
;          row), then two iLLM_PASS passes at ecx+0 and ecx+8 with
;          combine arguments 0,0.
;   Uses:  ecx, plus whatever the invoked macros use (they are defined
;          outside this excerpt).
;//////////////////////////////////////////////////////////////////////

; Nic - Changed to fastcall convention
; NOTE(review): despite the remark above, the pointer is still fetched
; from the stack (cdecl style) below; the original trailing comment was
; truncated ("ecx is the pointer anyway with"). Confirm which calling
; convention the callers really use before touching this load.

align 16
Skl_IDct16_SSE:  ; 249c
  mov ecx, [esp+4]  ; Old cdecl way of getting the block pointer
  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11
  iLLM_PASS ecx+0, 0,0
  iLLM_PASS ecx+8, 0,0
  ret

align 16
Skl_IDct16_MMX:  ; 288c
  mov ecx, [esp+4]  ; block pointer (first stack argument)
  iMTX_MULT_MMX  0, iTab1_MMX, Idct_Rnd0, 11
  iMTX_MULT_MMX  1, iTab2_MMX, Idct_Rnd1, 11
  iMTX_MULT_MMX  2, iTab3_MMX, Idct_Rnd2, 11
  iMTX_MULT_MMX  3, iTab4_MMX, Idct_Rnd3, 11
  iMTX_MULT_MMX  4, iTab1_MMX, Idct_Rnd4, 11
  iMTX_MULT_MMX  5, iTab4_MMX, Idct_Rnd5, 11
  iMTX_MULT_MMX  6, iTab3_MMX, Idct_Rnd6, 11
  iMTX_MULT_MMX  7, iTab2_MMX, Idct_Rnd7, 11
  iLLM_PASS ecx+0, 0,0
  iLLM_PASS ecx+8, 0,0
  ret

;//////////////////////////////////////////////////////////////////////
;// Optimized Sparse/Put/Add MMX/SSE IDCTs
;//////////////////////////////////////////////////////////////////////

;----------------------------------------------------------------------
; TEST_ROW src, label_x8, label_x4
;   Classifies one 16-byte row at [%1]:
;     - all 16 bytes zero        -> jumps to %2
;     - bytes 8..15 zero only    -> jumps to %3
;     - otherwise                -> falls through
;   Clobbers: eax, edx, flags.
;----------------------------------------------------------------------
%macro TEST_ROW 3     ; %1:src,  %2:label x8, %3: label x4
  mov eax, [%1   ]
  mov edx, [%1+ 8]
  or  eax, [%1+ 4]         ; eax = OR of bytes 0..7
  or  edx, [%1+12]         ; edx = OR of bytes 8..15
  or  eax, edx             ; eax = OR of the whole row
  jz near %2               ; whole row is zero
  test edx, 0xffffffff
  jz near %3               ; only the first half can be non-zero
%endmacro

;----------------------------------------------------------------------
; IDCT_IMPL combine, HPASS, HPASS_03, VPASS
;   Stack arguments read by the expansion:
;     [esp+ 4] = Dst  (only for combine 1 and 2, loaded into eax)
;     [esp+ 8] = BpS  (only for combine 1 and 2, loaded into edx)
;     [esp+12] = Src  (loaded into ecx)
;   Row handling (each row is classified with TEST_ROW first):
;     - general row              -> %2 (HPASS)
;     - second half of row zero  -> %3 (HPASS-03)
;     - rows 0..2 entirely zero  -> row is filled with Idct_Sparse_RndN
;     - rows 3..7 entirely zero  -> row is skipped and left untouched
;   The vertical pass %4 is then expanded twice, with arguments that
;   depend on the combine mode %1.
;   Uses ITAB1..ITAB4, which must be %define'd before the expansion.
;   The local labels attach to the global label preceding the
;   expansion, so use at most one expansion per global label.
;   Clobbers: eax, ecx, edx, mm0, flags, plus whatever %2/%3/%4 use.
;----------------------------------------------------------------------
%macro IDCT_IMPL  4   ; %1: combine func (0:none, 1:Put,  2:Add)
                        ; %2:HPASS macro, %3:HPASS-03 macro, %4:VPASS macro
  mov ecx, [esp+12]  ; Src

  TEST_ROW ecx, .Row0_Round, .Row0_4
  %2  0, ITAB1, Idct_Rnd0, 11
  jmp .Row1
.Row0_4:
  %3  0, ITAB1, Idct_Rnd0, 11
  jmp .Row1
.Row0_Round:
  movq mm0, [Idct_Sparse_Rnd0]
  movq [ecx  ], mm0
  movq [ecx+8], mm0

.Row1:
  TEST_ROW ecx+16, .Row1_Round, .Row1_4
  %2  1, ITAB2, Idct_Rnd1, 11
  jmp .Row2
.Row1_4:
  %3  1, ITAB2, Idct_Rnd1, 11
  jmp .Row2
.Row1_Round:
  movq mm0, [Idct_Sparse_Rnd1]
  movq [ecx+16  ], mm0
  movq [ecx+16+8], mm0

.Row2:
  TEST_ROW ecx+32, .Row2_Round, .Row2_4
  %2  2, ITAB3, Idct_Rnd2, 11
  jmp .Row3
.Row2_4:
  %3  2, ITAB3, Idct_Rnd2, 11
  jmp .Row3
.Row2_Round:
  movq mm0, [Idct_Sparse_Rnd2]
  movq [ecx+32  ], mm0
  movq [ecx+32+8], mm0

.Row3:
  TEST_ROW ecx+48, .Row4, .Row3_4
  %2  3, ITAB4, Idct_Rnd3, 11
  jmp .Row4
.Row3_4:
  %3  3, ITAB4, Idct_Rnd3, 11

.Row4:
  TEST_ROW ecx+64, .Row5, .Row4_4
  %2  4, ITAB1, Idct_Rnd4, 11
  jmp .Row5
.Row4_4:
  %3  4, ITAB1, Idct_Rnd4, 11

.Row5:
  TEST_ROW ecx+80, .Row6, .Row5_4
  %2  5, ITAB4, Idct_Rnd5, 11
  jmp .Row6
.Row5_4:
  %3  5, ITAB4, Idct_Rnd5, 11

.Row6:
  TEST_ROW ecx+96, .Row7, .Row6_4
  %2  6, ITAB3, Idct_Rnd6, 11
  jmp .Row7
.Row6_4:
  %3  6, ITAB3, Idct_Rnd6, 11

.Row7:
  TEST_ROW ecx+112, .End, .Row7_4
  %2  7, ITAB2, Idct_Rnd7, 11
  jmp .End
.Row7_4:
  %3  7, ITAB2, Idct_Rnd7, 11

.End:
  ; eax/edx are free again here: TEST_ROW was their last user.
%if (%1==0)
  %4 ecx+0, 0,0
  %4 ecx+8, 0,0
%elif (%1==1)
  mov eax, [esp+ 4]  ; Dst
  mov edx, [esp+ 8]  ; BpS
  %4 ecx+8, 0,0      ; second half first: its output is read at +8 by the next pass
  %4 ecx+0, 1,0
%else
  mov eax, [esp+ 4]  ; Dst
  mov edx, [esp+ 8]  ; BpS
  %4 ecx+0, 2,0
  mov eax, [esp+ 4]  ; reload Dst (the pass above advanced eax)
  %4 ecx+8, 2,4
%endif
%endmacro

;//////////////////////////////////////////////////////////////////////
; SSE entry points: arguments on the stack are Dst, BpS, Src
; (see IDCT_IMPL). ITAB1..4 select the SSE coefficient tables.

%define ITAB1 iTab1
%define ITAB2 iTab2
%define ITAB3 iTab3
%define ITAB4 iTab4

align 16
Skl_IDct16_Put_SSE:
  IDCT_IMPL 1, iMTX_MULT, iMTX_MULT_03, iLLM_PASS
  ret

align 16
Skl_IDct16_Add_SSE:
  IDCT_IMPL 2, iMTX_MULT, iMTX_MULT_03, iLLM_PASS
  ret

;//////////////////////////////////////////////////////////////////////
; MMX entry points: same arguments; ITAB1..4 are redefined to the MMX
; tables and the full row pass is used for both HPASS variants.

%define ITAB1 iTab1_MMX
%define ITAB2 iTab2_MMX
%define ITAB3 iTab3_MMX
%define ITAB4 iTab4_MMX

align 16
Skl_IDct16_Put_MMX:
  IDCT_IMPL 1, iMTX_MULT_MMX, iMTX_MULT_MMX, iLLM_PASS
  ret

align 16
Skl_IDct16_Add_MMX:
  IDCT_IMPL 2, iMTX_MULT_MMX, iMTX_MULT_MMX, iLLM_PASS
  ret

;//////////////////////////////////////////////////////////////////////
;// fLLM_PASS (~39c)
;//
;// In-place pass over eight rows spaced 16 bytes apart, processing the
;// four 16-bit words found at [%1 + row*16] of every row.
;//   %1 : base address expression (read as In0..In7, written as
;//        out0..out7)
;//   %2 : left shift applied to the intermediate terms
;// Reads the constants tan1, tan2, tan3, sqrt2 and MMX_One (defined
;// elsewhere in the file). Clobbers mm0-mm7.
;//////////////////////////////////////////////////////////////////////

%macro fLLM_PASS 2  ; %1: src/dst, %2:Shift

  movq   mm0, [%1+0*16]   ; In0
  movq   mm2, [%1+2*16]   ; In2
  movq   mm3, mm0
  movq   mm4, mm2
  movq   mm7, [%1+7*16]   ; In7
  movq   mm5, [%1+5*16]   ; In5

  psubsw mm0, mm7         ; t7 = In0-In7
  paddsw mm7, mm3         ; t0 = In0+In7
  psubsw mm2, mm5         ; t5 = In2-In5
  paddsw mm5, mm4         ; t2 = In2+In5

  movq   mm3, [%1+3*16]   ; In3
  movq   mm4, [%1+4*16]   ; In4
  movq   mm1, mm3
  psubsw mm3, mm4         ; t4 = In3-In4
  paddsw mm4, mm1         ; t3 = In3+In4
  movq   mm6, [%1+6*16]   ; In6
  movq   mm1, [%1+1*16]   ; In1
  psubsw mm1, mm6         ; t6 = In1-In6
  paddsw mm6, [%1+1*16]   ; t1 = In1+In6

  psubsw mm7, mm4         ; tm03 = t0-t3
  psubsw mm6, mm5         ; tm12 = t1-t2
  paddsw mm4, mm4         ; 2.t3
  paddsw mm5, mm5         ; 2.t2
  paddsw mm4, mm7         ; tp03 = t0+t3
  paddsw mm5, mm6         ; tp12 = t1+t2

  psllw  mm2, %2+1        ; shift t5 (shift +1 to..
  psllw  mm1, %2+1        ; shift t6  ..compensate cos4/2)
  psllw  mm4, %2          ; shift t3
  psllw  mm5, %2          ; shift t2
  psllw  mm7, %2          ; shift t0
  psllw  mm6, %2          ; shift t1
  psllw  mm3, %2          ; shift t4
  psllw  mm0, %2          ; shift t7

  psubsw mm4, mm5         ; out4 = tp03-tp12
  psubsw mm1, mm2         ; mm1: t6-t5
  paddsw mm5, mm5
  paddsw mm2, mm2
  paddsw mm5, mm4         ; out0 = tp03+tp12
  movq   [%1+4*16], mm4   ; => out4
  paddsw mm2, mm1         ; mm2: t6+t5
  movq   [%1+0*16], mm5   ; => out0

  movq   mm4, [tan2]      ; mm4 <= tan2
  pmulhw mm4, mm7         ; tm03*tan2
  movq   mm5, [tan2]      ; mm5 <= tan2
  psubsw mm4, mm6         ; out6 = tm03*tan2 - tm12
  pmulhw mm5, mm6         ; tm12*tan2
  paddsw mm5, mm7         ; out2 = tm12*tan2 + tm03

  movq   mm6, [sqrt2]
  movq   mm7, [MMX_One]

  pmulhw mm2, mm6         ; mm2: tp65 = (t6 + t5)*cos4
  por    mm5, mm7         ; correct out2
  por    mm4, mm7         ; correct out6
  pmulhw mm1, mm6         ; mm1: tm65 = (t6 - t5)*cos4
  por    mm2, mm7         ; correct tp65

  movq   [%1+2*16], mm5   ; => out2
  movq   mm5, mm3         ; save t4
  movq   [%1+6*16], mm4   ; => out6
  movq   mm4, mm0         ; save t7

  psubsw mm3, mm1         ; mm3: tm465 = t4 - tm65
  psubsw mm0, mm2         ; mm0: tm765 = t7 - tp65
  paddsw mm2, mm4         ; mm2: tp765 = t7 + tp65
  paddsw mm1, mm5         ; mm1: tp465 = t4 + tm65

  movq   mm4, [tan3]      ; tan3 - 1
  movq   mm5, [tan1]      ; tan1

  movq   mm7, mm3         ; save tm465
  pmulhw mm3, mm4         ; tm465*(tan3-1)
  movq   mm6, mm1         ; save tp465
  pmulhw mm1, mm5         ; tp465*tan1

  paddsw mm3, mm7         ; tm465*tan3
  pmulhw mm4, mm0         ; tm765*(tan3-1)
  paddsw mm4, mm0         ; tm765*tan3
  pmulhw mm5, mm2         ; tp765*tan1

  paddsw mm1, mm2         ; out1 = tp765 + tp465*tan1
  psubsw mm0, mm3         ; out3 = tm765 - tm465*tan3
  paddsw mm7, mm4         ; out5 = tm465 + tm765*tan3
  psubsw mm5, mm6         ; out7 =-tp465 + tp765*tan1

  movq   [%1+1*16], mm1   ; => out1
  movq   [%1+3*16], mm0   ; => out3
  movq   [%1+5*16], mm7   ; => out5
  movq   [%1+7*16], mm5   ; => out7

%endmacro

;//////////////////////////////////////////////////////////////////////
;// fMTX_MULT (~20c)  (~26c for MMX)
;//
;// In-place transform of the 16-byte row number %1 of the block at ecx
;// (eight 16-bit words, named 0..7 in the comments below).
;//   %1    : row index (row address is ecx+%1*16)
;//   %2    : address of the 64-byte coefficient table (M00..M31)
;//   %3/%4 : addresses of the rounders added to out0..3 / out4..7
;//   %5    : 0 selects the 'pshufw' (SSE) prologue, anything else the
;//           MMX-only prologue
;// The 32-bit sums are shifted right by 16, packed with signed
;// saturation, rounded, then shifted right by 4.
;// Clobbers mm0-mm7; ecx is only read.
;//////////////////////////////////////////////////////////////////////

%macro fMTX_MULT 5   ; %1=src, %2 = Coeffs, %3/%4=rounders, %5=MMX-Only

%if %5==0      ; SSE version ('pshufw')

  movq    mm0, [ecx+%1*16+0]  ; mm0 = [0123]
  pshufw  mm1, [ecx+%1*16+8], 00011011b ; mm1 = [7654]
  movq    mm7, mm0

%else      ; MMX-only version (~10% slower overall)

  movd    mm1, [ecx+%1*16+8+4]  ; [67..]
  movq    mm0, [ecx+%1*16+0]    ; mm0 = [0123]
  movq    mm7, mm0
  punpcklwd mm1, [ecx+%1*16+8]  ; [6475]
  movq    mm2, mm1
  psrlq   mm1, 32               ; [75..]
  punpcklwd mm1,mm2             ; [7654]

%endif

  paddsw  mm0, mm1      ; mm0 = [a0 a1 a2 a3]
  psubsw  mm7, mm1      ; mm7 = [b0 b1 b2 b3]

  movq      mm1, mm0
  punpckldq mm0, mm7    ; mm0 = [a0 a1 b0 b1]
  punpckhdq mm1, mm7    ; mm1 = [a2 a3 b2 b3]

  movq    mm2, [%2+ 0]  ;  [   M00    M01      M16    M17]
  movq    mm3, [%2+ 8]  ;  [   M02    M03      M18    M19]
  pmaddwd mm2, mm0      ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
  movq    mm4, [%2+16]  ;  [   M04    M05      M20    M21]
  pmaddwd mm3, mm1      ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
  movq    mm5, [%2+24]  ;  [   M06    M07      M22    M23]
  pmaddwd mm4, mm0      ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
  movq    mm6, [%2+32]  ;  [   M08    M09      M24    M25]
  pmaddwd mm5, mm1      ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
  movq    mm7, [%2+40]  ;  [   M10    M11      M26    M27]
  pmaddwd mm6, mm0      ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
  paddd   mm2, mm3      ;  [ out0 | out1 ]
  pmaddwd mm7, mm1      ;  [a2.M10+a3.M11 | b2.M26+b3.M27]
  psrad   mm2, 16
  pmaddwd mm0, [%2+48]  ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
  paddd   mm4, mm5      ;  [ out2 | out3 ]
  pmaddwd mm1, [%2+56]  ;  [a2.M14+a3.M15 | b2.M30+b3.M31]
  psrad   mm4, 16

  paddd   mm6, mm7            ;  [ out4 | out5 ]
  psrad   mm6, 16
  paddd   mm0, mm1            ;  [ out6 | out7 ]
  psrad   mm0, 16

  packssdw mm2, mm4           ;  [ out0|out1|out2|out3 ]
  paddsw   mm2, [%3]          ;  Round
  packssdw mm6, mm0           ;  [ out4|out5|out6|out7 ]
  paddsw   mm6, [%4]          ;  Round

  psraw   mm2, 4              ; => [-2048, 2047]
  psraw   mm6, 4

  movq    [ecx+%1*16+0], mm2
  movq    [ecx+%1*16+8], mm6
%endmacro

;//////////////////////////////////////////////////////////////////////

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -