;//////////////////////////////////////////////////////////////////////
;// skl_dct_sse.asm
;// (page-scrape header: "📄 skl_dct_sse.asm", "字号:" = "font size")
;//////////////////////////////////////////////////////////////////////
movq [%1+0*16], mm1 movq [%1+7*16], mm7 movq [%1+3*16], mm2 movq [%1+4*16], mm6%elif (%2==2) ADD_TO_DST mm1, eax +%3, mm6, eax+4*edx+%3 ; #0 - #4 lea eax, [eax+2*edx] ; -> #2 ADD_TO_DST mm2, eax+ edx+%3, mm5, eax+4*edx+%3 ; #3 - #6 lea eax, [eax+ edx] ; -> #3 ADD_TO_DST mm3, eax+2*edx+%3, mm7, eax+4*edx+%3 ; #5 - #7%else packuswb mm1,[%1+0*16+8] packuswb mm6,[%1+4*16+8] packuswb mm2,[%1+3*16+8] packuswb mm5,[%1+6*16+8] packuswb mm3,[%1+5*16+8] packuswb mm7,[%1+7*16+8] movq [eax ], mm1 ; #0 movq [eax+4*edx], mm6 ; #4 lea eax, [eax+2*edx] ; -> #2 movq [eax+ edx], mm2 ; #3 movq [eax+4*edx], mm5 ; #6 lea eax, [eax+ edx] ; -> #3 movq [eax+2*edx], mm3 ; #5 movq [eax+4*edx], mm7 ; #7%endif%endmacro;//////////////////////////////////////////////////////////////////////;// basic IDCTs...; Nic - Changed to fastcall conventionalign 16Skl_IDct16_SSE: ; 249c mov ecx, [esp+4] ; Old cdecl way, ecx is the pointer anyway with iMTX_MULT 0, iTab1, Idct_Rnd0, 11 iMTX_MULT 1, iTab2, Idct_Rnd1, 11 iMTX_MULT 2, iTab3, Idct_Rnd2, 11 iMTX_MULT 3, iTab4, Idct_Rnd3, 11 iMTX_MULT 4, iTab1, Idct_Rnd4, 11 iMTX_MULT 5, iTab4, Idct_Rnd5, 11 iMTX_MULT 6, iTab3, Idct_Rnd6, 11 iMTX_MULT 7, iTab2, Idct_Rnd7, 11 iLLM_PASS ecx+0, 0,0 iLLM_PASS ecx+8, 0,0 retalign 16Skl_IDct16_MMX: ; 288c mov ecx, [esp+4] iMTX_MULT_MMX 0, iTab1_MMX, Idct_Rnd0, 11 iMTX_MULT_MMX 1, iTab2_MMX, Idct_Rnd1, 11 iMTX_MULT_MMX 2, iTab3_MMX, Idct_Rnd2, 11 iMTX_MULT_MMX 3, iTab4_MMX, Idct_Rnd3, 11 iMTX_MULT_MMX 4, iTab1_MMX, Idct_Rnd4, 11 iMTX_MULT_MMX 5, iTab4_MMX, Idct_Rnd5, 11 iMTX_MULT_MMX 6, iTab3_MMX, Idct_Rnd6, 11 iMTX_MULT_MMX 7, iTab2_MMX, Idct_Rnd7, 11 iLLM_PASS ecx+0, 0,0 iLLM_PASS ecx+8, 0,0 ret;//////////////////////////////////////////////////////////////////////;// Optimized Sparse/Put/Add MMX/SSE IDCTs%macro TEST_ROW 3 ; %1:src, %2:label x8, %3: label x4 mov eax, [%1 ] mov edx, [%1+ 8] or eax, [%1+ 4] or edx, [%1+12] or eax, edx jz near %2 test edx, 0xffffffff jz near %3%endmacro%macro IDCT_IMPL 4 ; %1: combine 
func (0:none, 1:Put, 2:Add) ; %2:HPASS macro, %3:HPASS-03 macro, %4:VPASS macro mov ecx, [esp+12] ; Src TEST_ROW ecx, .Row0_Round, .Row0_4 %2 0, ITAB1, Idct_Rnd0, 11 jmp .Row1.Row0_4 %3 0, ITAB1, Idct_Rnd0, 11 jmp .Row1.Row0_Round movq mm0, [Idct_Sparse_Rnd0] movq [ecx ], mm0 movq [ecx+8], mm0.Row1 TEST_ROW ecx+16, .Row1_Round, .Row1_4 %2 1, ITAB2, Idct_Rnd1, 11 jmp .Row2.Row1_4 %3 1, ITAB2, Idct_Rnd1, 11 jmp .Row2.Row1_Round movq mm0, [Idct_Sparse_Rnd1] movq [ecx+16 ], mm0 movq [ecx+16+8], mm0.Row2 TEST_ROW ecx+32, .Row2_Round, .Row2_4 %2 2, ITAB3, Idct_Rnd2, 11 jmp .Row3.Row2_4 %3 2, ITAB3, Idct_Rnd2, 11 jmp .Row3.Row2_Round movq mm0, [Idct_Sparse_Rnd2] movq [ecx+32 ], mm0 movq [ecx+32+8], mm0.Row3 TEST_ROW ecx+48, .Row4, .Row3_4 %2 3, ITAB4, Idct_Rnd3, 11 jmp .Row4.Row3_4 %3 3, ITAB4, Idct_Rnd3, 11.Row4 TEST_ROW ecx+64, .Row5, .Row4_4 %2 4, ITAB1, Idct_Rnd4, 11 jmp .Row5.Row4_4: %3 4, ITAB1, Idct_Rnd4, 11.Row5 TEST_ROW ecx+80, .Row6, .Row5_4 %2 5, ITAB4, Idct_Rnd5, 11 jmp .Row6.Row5_4 %3 5, ITAB4, Idct_Rnd5, 11.Row6 TEST_ROW ecx+96, .Row7, .Row6_4 %2 6, ITAB3, Idct_Rnd6, 11 jmp .Row7.Row6_4 %3 6, ITAB3, Idct_Rnd6, 11.Row7 TEST_ROW ecx+112, .End, .Row7_4 %2 7, ITAB2, Idct_Rnd7, 11 jmp .End.Row7_4 %3 7, ITAB2, Idct_Rnd7, 11.End%if (%1==0) %4 ecx+0, 0,0 %4 ecx+8, 0,0%elif (%1==1) mov eax, [esp+ 4] ; Dst mov edx, [esp+ 8] ; BpS %4 ecx+8, 0,0 %4 ecx+0, 1,0%else mov eax, [esp+ 4] ; Dst mov edx, [esp+ 8] ; BpS %4 ecx+0, 2,0 mov eax, [esp+ 4] ; reload Dst %4 ecx+8, 2,4%endif%endmacro;//////////////////////////////////////////////////////////////////////%define ITAB1 iTab1%define ITAB2 iTab2%define ITAB3 iTab3%define ITAB4 iTab4align 16Skl_IDct16_Put_SSE: IDCT_IMPL 1, iMTX_MULT, iMTX_MULT_03, iLLM_PASS retalign 16Skl_IDct16_Add_SSE: IDCT_IMPL 2, iMTX_MULT, iMTX_MULT_03, iLLM_PASS ret;//////////////////////////////////////////////////////////////////////%define ITAB1 iTab1_MMX%define ITAB2 iTab2_MMX%define ITAB3 iTab3_MMX%define ITAB4 iTab4_MMXalign 16Skl_IDct16_Put_MMX: 
IDCT_IMPL 1, iMTX_MULT_MMX, iMTX_MULT_MMX, iLLM_PASS retalign 16Skl_IDct16_Add_MMX: IDCT_IMPL 2, iMTX_MULT_MMX, iMTX_MULT_MMX, iLLM_PASS ret;//////////////////////////////////////////////////////////////////////;// fLLM_PASS (~39c);//////////////////////////////////////////////////////////////////////%macro fLLM_PASS 2 ; %1: src/dst, %2:Shift movq mm0, [%1+0*16] ; In0 movq mm2, [%1+2*16] ; In2 movq mm3, mm0 movq mm4, mm2 movq mm7, [%1+7*16] ; In7 movq mm5, [%1+5*16] ; In5 psubsw mm0, mm7 ; t7 = In0-In7 paddsw mm7, mm3 ; t0 = In0+In7 psubsw mm2, mm5 ; t5 = In2-In5 paddsw mm5, mm4 ; t2 = In2+In5 movq mm3, [%1+3*16] ; In3 movq mm4, [%1+4*16] ; In4 movq mm1, mm3 psubsw mm3, mm4 ; t4 = In3-In4 paddsw mm4, mm1 ; t3 = In3+In4 movq mm6, [%1+6*16] ; In6 movq mm1, [%1+1*16] ; In1 psubsw mm1, mm6 ; t6 = In1-In6 paddsw mm6, [%1+1*16] ; t1 = In1+In6 psubsw mm7, mm4 ; tm03 = t0-t3 psubsw mm6, mm5 ; tm12 = t1-t2 paddsw mm4, mm4 ; 2.t3 paddsw mm5, mm5 ; 2.t2 paddsw mm4, mm7 ; tp03 = t0+t3 paddsw mm5, mm6 ; tp12 = t1+t2 psllw mm2, %2+1 ; shift t5 (shift +1 to.. 
psllw mm1, %2+1 ; shift t6 ..compensate cos4/2) psllw mm4, %2 ; shift t3 psllw mm5, %2 ; shift t2 psllw mm7, %2 ; shift t0 psllw mm6, %2 ; shift t1 psllw mm3, %2 ; shift t4 psllw mm0, %2 ; shift t7 psubsw mm4, mm5 ; out4 = tp03-tp12 psubsw mm1, mm2 ; mm1: t6-t5 paddsw mm5, mm5 paddsw mm2, mm2 paddsw mm5, mm4 ; out0 = tp03+tp12 movq [%1+4*16], mm4 ; => out4 paddsw mm2, mm1 ; mm2: t6+t5 movq [%1+0*16], mm5 ; => out0 movq mm4, [tan2] ; mm4 <= tan2 pmulhw mm4, mm7 ; tm03*tan2 movq mm5, [tan2] ; mm5 <= tan2 psubsw mm4, mm6 ; out6 = tm03*tan2 - tm12 pmulhw mm5, mm6 ; tm12*tan2 paddsw mm5, mm7 ; out2 = tm12*tan2 + tm03 movq mm6, [sqrt2] movq mm7, [MMX_One] pmulhw mm2, mm6 ; mm2: tp65 = (t6 + t5)*cos4 por mm5, mm7 ; correct out2 por mm4, mm7 ; correct out6 pmulhw mm1, mm6 ; mm1: tm65 = (t6 - t5)*cos4 por mm2, mm7 ; correct tp65 movq [%1+2*16], mm5 ; => out2 movq mm5, mm3 ; save t4 movq [%1+6*16], mm4 ; => out6 movq mm4, mm0 ; save t7 psubsw mm3, mm1 ; mm3: tm465 = t4 - tm65 psubsw mm0, mm2 ; mm0: tm765 = t7 - tp65 paddsw mm2, mm4 ; mm2: tp765 = t7 + tp65 paddsw mm1, mm5 ; mm1: tp465 = t4 + tm65 movq mm4, [tan3] ; tan3 - 1 movq mm5, [tan1] ; tan1 movq mm7, mm3 ; save tm465 pmulhw mm3, mm4 ; tm465*(tan3-1) movq mm6, mm1 ; save tp465 pmulhw mm1, mm5 ; tp465*tan1 paddsw mm3, mm7 ; tm465*tan3 pmulhw mm4, mm0 ; tm765*(tan3-1) paddsw mm4, mm0 ; tm765*tan3 pmulhw mm5, mm2 ; tp765*tan1 paddsw mm1, mm2 ; out1 = tp765 + tp465*tan1 psubsw mm0, mm3 ; out3 = tm765 - tm465*tan3 paddsw mm7, mm4 ; out5 = tm465 + tm765*tan3 psubsw mm5, mm6 ; out7 =-tp465 + tp765*tan1 movq [%1+1*16], mm1 ; => out1 movq [%1+3*16], mm0 ; => out3 movq [%1+5*16], mm7 ; => out5 movq [%1+7*16], mm5 ; => out7%endmacro;//////////////////////////////////////////////////////////////////////;// fMTX_MULT (~20c) (~26c for MMX);//////////////////////////////////////////////////////////////////////%macro fMTX_MULT 5 ; %1=src, %2 = Coeffs, %3/%4=rounders, %5=MMX-Only%if %5==0 ; SSE version ('pshufw') movq mm0, 
[ecx+%1*16+0] ; mm0 = [0123] pshufw mm1, [ecx+%1*16+8], 00011011b ; mm1 = [7654] movq mm7, mm0%else ; MMX-only version (~10% slower overall) movd mm1, [ecx+%1*16+8+4] ; [67..] movq mm0, [ecx+%1*16+0] ; mm0 = [0123] movq mm7, mm0 punpcklwd mm1, [ecx+%1*16+8] ; [6475] movq mm2, mm1 psrlq mm1, 32 ; [75..] punpcklwd mm1,mm2 ; [7654]%endif paddsw mm0, mm1 ; mm0 = [a0 a1 a2 a3] psubsw mm7, mm1 ; mm7 = [b0 b1 b2 b3] movq mm1, mm0 punpckldq mm0, mm7 ; mm0 = [a0 a1 b0 b1] punpckhdq mm1, mm7 ; mm1 = [a2 a3 b2 b3] movq mm2, [%2+ 0] ; [ M00 M01 M16 M17] movq mm3, [%2+ 8] ; [ M02 M03 M18 M19] pmaddwd mm2, mm0 ; [a0.M00+a1.M01 | b0.M16+b1.M17] movq mm4, [%2+16] ; [ M04 M05 M20 M21] pmaddwd mm3, mm1 ; [a2.M02+a3.M03 | b2.M18+b3.M19] movq mm5, [%2+24] ; [ M06 M07 M22 M23] pmaddwd mm4, mm0 ; [a0.M04+a1.M05 | b0.M20+b1.M21] movq mm6, [%2+32] ; [ M08 M09 M24 M25] pmaddwd mm5, mm1 ; [a2.M06+a3.M07 | b2.M22+b3.M23] movq mm7, [%2+40] ; [ M10 M11 M26 M27] pmaddwd mm6, mm0 ; [a0.M08+a1.M09 | b0.M24+b1.M25] paddd mm2, mm3 ; [ out0 | out1 ] pmaddwd mm7, mm1 ; [a0.M10+a1.M11 | b0.M26+b1.M27] psrad mm2, 16 pmaddwd mm0, [%2+48] ; [a0.M12+a1.M13 | b0.M28+b1.M29] paddd mm4, mm5 ; [ out2 | out3 ] pmaddwd mm1, [%2+56] ; [a0.M14+a1.M15 | b0.M30+b1.M31] psrad mm4, 16 paddd mm6, mm7 ; [ out4 | out5 ] psrad mm6, 16 paddd mm0, mm1 ; [ out6 | out7 ] psrad mm0, 16 packssdw mm2, mm4 ; [ out0|out1|out2|out3 ] paddsw mm2, [%3] ; Round packssdw mm6, mm0 ; [ out4|out5|out6|out7 ] paddsw mm6, [%4] ; Round psraw mm2, 4 ; => [-2048, 2047] psraw mm6, 4 movq [ecx+%1*16+0], mm2 movq [ecx+%1*16+8], mm6%endmacro;//////////////////////////////////////////////////////////////////////
; --- page-scrape footer (code-viewer keyboard-shortcut help, translated;
;     not part of the original source) ---
;   Copy code:        Ctrl + C
;   Search code:      Ctrl + F
;   Fullscreen mode:  F11
;   Toggle theme:     Ctrl + Shift + D
;   Show shortcuts:   ?
;   Increase font:    Ctrl + =
;   Decrease font:    Ctrl + -