📄 skl_dct_sse.asm
字号:
movq [ecx+%1*16+8], mm7%endmacro;//////////////////////////////////////////////////////////////////////; iMTX_MULT_MMX (~27c);//////////////////////////////////////////////////////////////////////%macro iMTX_MULT_MMX 4 ; %1=src, %2 = Table to use, %3=rounder, %4=Shift movq mm0, [ecx+%1*16+0] ; [0123] movq mm1, [ecx+%1*16+8] ; [4567] movq mm2, mm0 punpcklwd mm0, mm1 ; [0415] punpckhwd mm2, mm1 ; [2637] movq mm1, mm0 movq mm3, mm2 punpckldq mm0, mm0 ; [0404] punpckldq mm2, mm2 ; [2626] punpckhdq mm1, mm1 ; [1515] punpckhdq mm3, mm3 ; [3737] movq mm4, [%2+ 0] ; [ M00 M02 M04 M06] movq mm6, [%2+ 8] ; [ M01 M03 M05 M07] pmaddwd mm4, mm0 ; [i0.M00+i4.M02 | i0.M04+i4.M06] movq mm5, [%2+32] ; [ M16 M18 M20 M22] movq mm7, [%2+40] ; [ M17 M19 M21 M23] pmaddwd mm6, mm2 ; [i2.M01+i6.M03 | i2.M05+i6.M07] pmaddwd mm5, mm1 ; [i1.M16+i5.M18 | i1.M20+i5.M22] pmaddwd mm7, mm3 ; [i3.M17+i7.M19 | i3.M21+i7.M23] pmaddwd mm0, [%2+16] ; [i0.M08+i4.M10 | i0.M12+i4.M14] pmaddwd mm2, [%2+24] ; [i2.M09+i6.M11 | i2.M13+i6.M15] pmaddwd mm1, [%2+48] ; [i1.M24+i5.M26 | i1.M28+i5.M30] pmaddwd mm3, [%2+56] ; [i3.M25+i7.M27 | i3.M29+i7.M31] paddd mm0, [%3] ; Round paddd mm1, mm3 ; => b2 | b3 paddd mm4, [%3] ; Round paddd mm5, mm7 ; => b0 | b1 paddd mm4, mm6 ; => a0 | a1 movq mm6, mm4 ; a0 | a1 paddd mm0, mm2 ; => a2 | a3 movq mm2, mm0 ; a2 | a3 paddd mm4, mm5 ; a0+b0 | a1+b1 psrad mm4, %4 ; => out0 | out1 paddd mm0, mm1 ; a2+b2 | a3+b3 psrad mm0, %4 ; => out2 | out3 psubd mm6, mm5 ; a0-b0 | a1-b1 psubd mm2, mm1 ; a2-b2 | a3-b3 psrad mm6, %4 ; => out7 | out6 psrad mm2, %4 ; => out5 | out4 packssdw mm2, mm6 ; [5476] packssdw mm4, mm0 ; [0123] movq mm7, mm2 psrld mm2, 16 ; [4-6-] pslld mm7, 16 ; [-5-7] movq [ecx+%1*16+0], mm4 por mm2, mm7 movq [ecx+%1*16+8], mm2%endmacro;//////////////////////////////////////////////////////////////////////;// iLLM_PASS (~42c);//////////////////////////////////////////////////////////////////////%macro ADD_TO_DST 4 ; %1:src1 %2:dst1 %3:src2 %4:dst2 ; trashes mm0,mm4 punpcklbw mm0, [%2] punpcklbw mm4, [%4] psrlw mm0, 8 ; will zero the high words psrlw mm4, 8 paddsw mm0, %1 paddsw mm4, %3 packuswb mm0, mm0 packuswb mm4, mm4 movd [%2], mm0 movd [%4], mm4%endmacro%macro iLLM_PASS 3 ; %1: src/dst, %2: combine func (0:none, 1:Put, 2:Add) ; %3: dst offset (only for Add) movq mm0, [tan3] ; t3-1 movq mm3, [%1+16*3] ; x3 movq mm1, mm0 ; t3-1 movq mm5, [%1+16*5] ; x5 movq mm4, [tan1] ; t1 movq mm6, [%1+16*1] ; x1 movq mm7, [%1+16*7] ; x7 movq mm2, mm4 ; t1 pmulhw mm0, mm3 ; x3*(t3-1) pmulhw mm1, mm5 ; x5*(t3-1) paddsw mm0, mm3 ; x3*t3 paddsw mm1, mm5 ; x5*t3 psubsw mm0, mm5 ; x3*t3-x5 = tm35 paddsw mm1, mm3 ; x3+x5*t3 = tp35 pmulhw mm4, mm7 ; x7*t1 pmulhw mm2, mm6 ; x1*t1 paddsw mm4, mm6 ; x1+t1*x7 = tp17 psubsw mm2, mm7 ; x1*t1-x7 = tm17 movq mm3, [sqrt2] movq mm7, mm4 movq mm6, mm2 psubsw mm4, mm1 ; tp17-tp35 = t1 psubsw mm2, mm0 ; tm17-tm35 = b3 paddsw mm1, mm7 ; tp17+tp35 = b0 paddsw mm0, mm6 ; tm17+tm35 = t2 ; mm1 = b0, mm2 = b3. preserved movq mm6, mm4 psubsw mm4, mm0 ; t1-t2 paddsw mm0, mm6 ; t1+t2 pmulhw mm4, mm3 ; (t1-t2)/(2.sqrt2) pmulhw mm0, mm3 ; (t1+t2)/(2.sqrt2) paddsw mm0, mm0 ; 2.(t1+t2) = b1 paddsw mm4, mm4 ; 2.(t1-t2) = b2 movq mm7, [tan2] ; t2 movq mm3, [%1+2*16] ; x2 movq mm6, [%1+6*16] ; x6 movq mm5, mm7 ; t2 pmulhw mm7, mm6 ; x6*t2 pmulhw mm5, mm3 ; x2*t2 paddsw mm7, mm3 ; x2+x6*t2 = tp26 psubsw mm5, mm6 ; x2*t2-x6 = tm26 ; use:mm3,mm5,mm6,mm7 frozen: mm0,mm4,mm1,mm2 movq mm3, [%1+0*16] ; x0 movq mm6, [%1+4*16] ; x4%ifndef IEEE_COMPLIANT psubsw mm3, mm6 ; x0-x4 = tm04 paddsw mm6, mm6 paddsw mm6, mm3 ; x0+x4 = tp04 psubsw mm3, mm5 ; tm04-tm26 = a2 psubsw mm6, mm7 ; tp04-tp26 = a3 paddsw mm5, mm5 ; 2.tm26 paddsw mm7, mm7 ; 2.tp26 paddsw mm5, mm3 ; tm04+tm26 = a1 paddsw mm7, mm6 ; tp04+tp26 = a0 psubsw mm5, mm0 ; a1-b1 psubsw mm3, mm4 ; a2-b2 paddsw mm0, mm0 ; 2.b1 paddsw mm4, mm4 ; 2.b2 paddsw mm0, mm5 ; a1+b1 paddsw mm4, mm3 ; a2+b2%else ; we spill 1 reg to perform safe butterflies movq [%1 ], mm2 movq mm2, mm3 psubsw mm3, mm6 ; x0-x4 = tm04 paddsw mm6, mm2 ; x0+x4 = tp04 movq mm2, mm6 psubsw mm6, mm7 paddsw mm7, mm2 movq mm2, mm3 psubsw mm3, mm5 paddsw mm5, mm2 movq mm2, mm5 psubsw mm5, mm0 paddsw mm0, mm2 movq mm2, mm3 psubsw mm3, mm4 paddsw mm4, mm2 movq mm2, [%1]%endif psraw mm5, 6 ; out6 psraw mm3, 6 ; out5 psraw mm0, 6 ; out1 psraw mm4, 6 ; out2%if (%2==0) movq [%1+5*16], mm3 movq [%1+6*16], mm5 movq [%1+1*16], mm0 movq [%1+2*16], mm4%elif (%2==2) movq [%1 ], mm0 ; spill movq [%1+16], mm4 ; spill ADD_TO_DST [%1], eax+ edx+%3, [%1+16], eax+2*edx+%3 ; #1 - #2%else packuswb mm0,[%1+1*16+8] packuswb mm4,[%1+2*16+8] movq [eax+ edx], mm0 ; #1 movq [eax+2*edx], mm4 ; #2%endif ; reminder: mm1=b0, mm2=b3, mm7=a0, mm6=a3 movq mm0, mm7 movq mm4, mm6 psubsw mm7, mm1 ; a0-b0 psubsw mm6, mm2 ; a3-b3 paddsw mm1, mm0 ; a0+b0 paddsw mm2, mm4 ; a3+b3 psraw mm1, 6 ; out0 psraw mm7, 6 ; out7 psraw mm2, 6 ; out3 psraw mm6, 6 ; out4%if (%2==0) movq [%1+0*16], mm1 movq [%1+7*16], mm7 movq [%1+3*16], mm2 movq [%1+4*16], mm6%elif (%2==2) ADD_TO_DST mm1, eax +%3, mm6, eax+4*edx+%3 ; #0 - #4 lea eax, [eax+2*edx] ; -> #2 ADD_TO_DST mm2, eax+ edx+%3, mm5, eax+4*edx+%3 ; #3 - #6 lea eax, [eax+ edx] ; -> #3 ADD_TO_DST mm3, eax+2*edx+%3, mm7, eax+4*edx+%3 ; #5 - #7%else packuswb mm1,[%1+0*16+8] packuswb mm6,[%1+4*16+8] packuswb mm2,[%1+3*16+8] packuswb mm5,[%1+6*16+8] packuswb mm3,[%1+5*16+8] packuswb mm7,[%1+7*16+8] movq [eax ], mm1 ; #0 movq [eax+4*edx], mm6 ; #4 lea eax, [eax+2*edx] ; -> #2 movq [eax+ edx], mm2 ; #3 movq [eax+4*edx], mm5 ; #6 lea eax, [eax+ edx] ; -> #3 movq [eax+2*edx], mm3 ; #5 movq [eax+4*edx], mm7 ; #7%endif%endmacro%macro iLLM_PASS_03 3 ; %1: src/dst, %2: combine func (0:none, 1:Put, 2:Add) ; %3: dst offset (only for Add) movq mm0, [tan3] ; t3-1 movq mm1, [%1+16*3] ; x3 movq mm2, [%1+16*1] ; x1 pmulhw mm0, mm1 ; x3*(t3-1) movq mm4, mm2 pmulhw mm2, [tan1] ; x1*t1 => mm2:tm17, mm4:tp17 paddsw mm0, mm1 ; x3*t3 => mm0:tm35, mm1:tp35 movq mm3, [sqrt2] movq mm7, mm4 movq mm6, mm2 psubsw mm4, mm1 ; tp17-tp35 = t1 psubsw mm2, mm0 ; tm17-tm35 = b3 paddsw mm1, mm7 ; tp17+tp35 = b0 paddsw mm0, mm6 ; tm17+tm35 = t2 ; mm1 = b0, mm2 = b3. preserved movq mm6, mm4 psubsw mm4, mm0 ; t1-t2 paddsw mm0, mm6 ; t1+t2 pmulhw mm4, mm3 ; (t1-t2)/(2.sqrt2) pmulhw mm0, mm3 ; (t1+t2)/(2.sqrt2) paddsw mm0, mm0 ; 2.(t1+t2) = b1 paddsw mm4, mm4 ; 2.(t1-t2) = b2 movq mm5, [tan2] ; t2 movq mm3, [%1+0*16] ; x0 => mm3:tm04 movq mm7, [%1+2*16] ; x2 => mm7:tp26 pmulhw mm5, mm7 ; x2*t2 => mm5:tm26 movq mm6, mm3 ; mm6:tp04%ifndef IEEE_COMPLIANT psubsw mm3, mm5 ; tm04-tm26 = a2 psubsw mm6, mm7 ; tp04-tp26 = a3 paddsw mm5, mm5 ; 2.tm26 paddsw mm7, mm7 ; 2.tp26 paddsw mm5, mm3 ; tm04+tm26 = a1 paddsw mm7, mm6 ; tp04+tp26 = a0 psubsw mm5, mm0 ; a1-b1 psubsw mm3, mm4 ; a2-b2 paddsw mm0, mm0 ; 2.b1 paddsw mm4, mm4 ; 2.b2 paddsw mm0, mm5 ; a1+b1 paddsw mm4, mm3 ; a2+b2%else ; we spill 1 reg to perform safe butterflies movq [%1 ], mm2 movq mm2, mm6 psubsw mm6, mm7 paddsw mm7, mm2 movq mm2, mm3 psubsw mm3, mm5 paddsw mm5, mm2 movq mm2, mm5 psubsw mm5, mm0 paddsw mm0, mm2 movq mm2, mm3 psubsw mm3, mm4 paddsw mm4, mm2 movq mm2, [%1]%endif psraw mm5, 6 ; out6 psraw mm3, 6 ; out5 psraw mm0, 6 ; out1 psraw mm4, 6 ; out2%if (%2==0) movq [%1+5*16], mm3 movq [%1+6*16], mm5 movq [%1+1*16], mm0 movq [%1+2*16], mm4%elif (%2==2) movq [%1 ], mm0 ; spill movq [%1+16], mm4 ; spill ADD_TO_DST [%1], eax+ edx+%3, [%1+16], eax+2*edx+%3 ; #1 - #2%else packuswb mm0,[%1+1*16+8] packuswb mm4,[%1+2*16+8] movq [eax+ edx], mm0 ; #1 movq [eax+2*edx], mm4 ; #2%endif ; reminder: mm1=b0, mm2=b3, mm7=a0, mm6=a3 movq mm0, mm7 movq mm4, mm6 psubsw mm7, mm1 ; a0-b0 psubsw mm6, mm2 ; a3-b3 paddsw mm1, mm0 ; a0+b0 paddsw mm2, mm4 ; a3+b3 psraw mm1, 6 ; out0 psraw mm7, 6 ; out7 psraw mm2, 6 ; out3 psraw mm6, 6 ; out4%if (%2==0)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -