📄 skl_dct.c
字号:
; finish the a1/b1 and a2/b2 butterflies (out1, out2, out5, out6)
  psubsw mm3, mm4           ; a2-b2
  paddsw mm0, mm0           ; 2.b1
  paddsw mm4, mm4           ; 2.b2
  paddsw mm0, mm5           ; a1+b1
  paddsw mm4, mm3           ; a2+b2

  psraw  mm5, 6             ; out6
  psraw  mm3, 6             ; out5
  psraw  mm0, 6             ; out1
  psraw  mm4, 6             ; out2

%if (%2==0)

  movq [%1+5*16], mm3
  movq [%1+6*16], mm5
  movq [%1+1*16], mm0
  movq [%1+2*16], mm4

%elif (%2==2)

  movq [%1   ], mm0         ; spill
  movq [%1+16], mm4         ; spill
  ADD_TO_DST [%1], eax+  edx+%3, [%1+16], eax+2*edx+%3   ; #1 - #2

%else

  packuswb mm0,[%1+1*16+8]
  packuswb mm4,[%1+2*16+8]
  movq [eax+  edx], mm0     ; #1
  movq [eax+2*edx], mm4     ; #2

%endif

  ; reminder: mm1=b0, mm2=b3, mm7=a0, mm6=a3

  movq   mm0, mm7
  movq   mm4, mm6
  psubsw mm7, mm1           ; a0-b0
  psubsw mm6, mm2           ; a3-b3
  paddsw mm1, mm0           ; a0+b0
  paddsw mm2, mm4           ; a3+b3

  psraw  mm1, 6             ; out0
  psraw  mm7, 6             ; out7
  psraw  mm2, 6             ; out3
  psraw  mm6, 6             ; out4

%if (%2==0)

  movq [%1+0*16], mm1
  movq [%1+7*16], mm7
  movq [%1+3*16], mm2
  movq [%1+4*16], mm6

%elif (%2==2)

  ADD_TO_DST mm1, eax      +%3, mm6, eax+4*edx+%3   ; #0 - #4
  lea eax, [eax+2*edx]                              ; -> #2
  ADD_TO_DST mm2, eax+  edx+%3, mm5, eax+4*edx+%3   ; #3 - #6
  lea eax, [eax+  edx]                              ; -> #3
  ADD_TO_DST mm3, eax+2*edx+%3, mm7, eax+4*edx+%3   ; #5 - #7

%else

  packuswb mm1,[%1+0*16+8]
  packuswb mm6,[%1+4*16+8]
  packuswb mm2,[%1+3*16+8]
  packuswb mm5,[%1+6*16+8]
  packuswb mm3,[%1+5*16+8]
  packuswb mm7,[%1+7*16+8]
  movq [eax      ], mm1     ; #0
  movq [eax+4*edx], mm6     ; #4
  lea eax, [eax+2*edx]      ; -> #2
  movq [eax+  edx], mm2     ; #3
  movq [eax+4*edx], mm5     ; #6
  lea eax, [eax+  edx]      ; -> #3
  movq [eax+2*edx], mm3     ; #5
  movq [eax+4*edx], mm7     ; #7

%endif

%endmacro

;//////////////////////////////////////////////////////////////////////
;// Skl_IDct16_SSE / Skl_IDct16_MMX
;//  ecx <- dword at [esp+4]; it is the base address passed on to the
;//  macros (presumably the coefficient block pointer -- confirm
;//  against the C prototype).
;//  Sequence: iMTX_MULT (resp. iMTX_MULT_MMX) for indices 0..7, then
;//  iLLM_PASS on ecx+0 and on ecx+8 in mode 0, i.e. the branch that
;//  stores its results back to [%1+k*16].

align 16
Skl_IDct16_SSE:  ; 249c

  mov ecx, [esp+4]

  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11

  iLLM_PASS ecx+0, 0,0
  iLLM_PASS ecx+8, 0,0

  ret

align 16
Skl_IDct16_MMX:  ; 288c

  mov ecx, [esp+4]

  iMTX_MULT_MMX  0, iTab1_MMX, Idct_Rnd0, 11
  iMTX_MULT_MMX  1, iTab2_MMX, Idct_Rnd1, 11
  iMTX_MULT_MMX  2, iTab3_MMX, Idct_Rnd2, 11
  iMTX_MULT_MMX  3, iTab4_MMX, Idct_Rnd3, 11
  iMTX_MULT_MMX  4, iTab1_MMX, Idct_Rnd4, 11
  iMTX_MULT_MMX  5, iTab4_MMX, Idct_Rnd5, 11
  iMTX_MULT_MMX  6, iTab3_MMX, Idct_Rnd6, 11
  iMTX_MULT_MMX  7, iTab2_MMX, Idct_Rnd7, 11

  iLLM_PASS ecx+0, 0,0
  iLLM_PASS ecx+8, 0,0

  ret

;//////////////////////////////////////////////////////////////////////
;// fLLM_PASS (~39c)
;//  In-place pass over the eight 16-byte-spaced rows at %1+k*16,
;//  four 16-bit lanes at a time. Uses mm0-mm7 and reads the constants
;//  tan1, tan2, tan3, sqrt2 and MMX_One.
;//////////////////////////////////////////////////////////////////////

%macro fLLM_PASS 2  ; %1: src/dst, %2:Shift

  movq   mm0, [%1+0*16]   ; In0
  movq   mm2, [%1+2*16]   ; In2
  movq   mm3, mm0
  movq   mm4, mm2
  movq   mm7, [%1+7*16]   ; In7
  movq   mm5, [%1+5*16]   ; In5

  psubsw mm0, mm7         ; t7 = In0-In7
  paddsw mm7, mm3         ; t0 = In0+In7
  psubsw mm2, mm5         ; t5 = In2-In5
  paddsw mm5, mm4         ; t2 = In2+In5

  movq   mm3, [%1+3*16]   ; In3
  movq   mm4, [%1+4*16]   ; In4
  movq   mm1, mm3
  psubsw mm3, mm4         ; t4 = In3-In4
  paddsw mm4, mm1         ; t3 = In3+In4
  movq   mm6, [%1+6*16]   ; In6
  movq   mm1, [%1+1*16]   ; In1
  psubsw mm1, mm6         ; t6 = In1-In6
  paddsw mm6, [%1+1*16]   ; t1 = In1+In6

  psubsw mm7, mm4         ; tm03 = t0-t3
  psubsw mm6, mm5         ; tm12 = t1-t2
  paddsw mm4, mm4         ; 2.t3
  paddsw mm5, mm5         ; 2.t2
  paddsw mm4, mm7         ; tp03 = t0+t3
  paddsw mm5, mm6         ; tp12 = t1+t2

  psllw  mm2, %2+1        ; shift t5 (shift +1 to..
  psllw  mm1, %2+1        ; shift t6  ..compensate cos4/2)
  psllw  mm4, %2          ; shift t3
  psllw  mm5, %2          ; shift t2
  psllw  mm7, %2          ; shift t0
  psllw  mm6, %2          ; shift t1
  psllw  mm3, %2          ; shift t4
  psllw  mm0, %2          ; shift t7

  psubsw mm4, mm5         ; out4 = tp03-tp12
  psubsw mm1, mm2         ; mm1: t6-t5
  paddsw mm5, mm5
  paddsw mm2, mm2
  paddsw mm5, mm4         ; out0 = tp03+tp12
  movq   [%1+4*16], mm4   ; => out4
  paddsw mm2, mm1         ; mm2: t6+t5
  movq   [%1+0*16], mm5   ; => out0

  movq   mm4, [tan2]      ; mm4 <= tan2
  pmulhw mm4, mm7         ; tm03*tan2
  movq   mm5, [tan2]      ; mm5 <= tan2
  psubsw mm4, mm6         ; out6 = tm03*tan2 - tm12
  pmulhw mm5, mm6         ; tm12*tan2
  paddsw mm5, mm7         ; out2 = tm12*tan2 + tm03

  movq   mm6, [sqrt2]
  movq   mm7, [MMX_One]

  pmulhw mm2, mm6         ; mm2: tp65 = (t6 + t5)*cos4
  por    mm5, mm7         ; correct out2
  por    mm4, mm7         ; correct out6
  pmulhw mm1, mm6         ; mm1: tm65 = (t6 - t5)*cos4
  por    mm2, mm7         ; correct tp65

  movq   [%1+2*16], mm5   ; => out2
  movq   mm5, mm3         ; save t4
  movq   [%1+6*16], mm4   ; => out6
  movq   mm4, mm0         ; save t7

  psubsw mm3, mm1         ; mm3: tm465 = t4 - tm65
  psubsw mm0, mm2         ; mm0: tm765 = t7 - tp65
  paddsw mm2, mm4         ; mm2: tp765 = t7 + tp65
  paddsw mm1, mm5         ; mm1: tp465 = t4 + tm65

  movq   mm4, [tan3]      ; tan3 - 1
  movq   mm5, [tan1]      ; tan1

  movq   mm7, mm3         ; save tm465
  pmulhw mm3, mm4         ; tm465*(tan3-1)
  movq   mm6, mm1         ; save tp465
  pmulhw mm1, mm5         ; tp465*tan1

  paddsw mm3, mm7         ; tm465*tan3
  pmulhw mm4, mm0         ; tm765*(tan3-1)
  paddsw mm4, mm0         ; tm765*tan3
  pmulhw mm5, mm2         ; tp765*tan1

  paddsw mm1, mm2         ; out1 = tp765 + tp465*tan1
  psubsw mm0, mm3         ; out3 = tm765 - tm465*tan3
  paddsw mm7, mm4         ; out5 = tm465 + tm765*tan3
  psubsw mm5, mm6         ; out7 =-tp465 + tp765*tan1

  movq   [%1+1*16], mm1   ; => out1
  movq   [%1+3*16], mm0   ; => out3
  movq   [%1+5*16], mm7   ; => out5
  movq   [%1+7*16], mm5   ; => out7

%endmacro

;//////////////////////////////////////////////////////////////////////
;// fMTX_MULT (~20c)  (~26c for MMX)
;//  Transforms, in place, the 8-word row at ecx+%1*16.
;//   %1 = row index, %2 = coefficient table (64 bytes are read),
;//   %3 = rounder added to out0..out3, %4 = rounder added to out4..out7,
;//   %5 = 0 selects the 'pshufw' path, anything else the MMX-only path.
;//  Uses mm0-mm7.
;//////////////////////////////////////////////////////////////////////

%macro fMTX_MULT 5   ; %1=src, %2 = Coeffs, %3/%4=rounders, %5=MMX-Only

%if %5==0
    ; SSE version ('pshufw')

  movq      mm0, [ecx+%1*16+0]              ; mm0 = [0123]
  pshufw    mm1, [ecx+%1*16+8], 00011011b   ; mm1 = [7654]
  movq      mm7, mm0

%else
    ; MMX-only version (~10% slower overall)

  movd      mm1, [ecx+%1*16+8+4]   ; [67..]
  movq      mm0, [ecx+%1*16+0]     ; mm0 = [0123]
  movq      mm7, mm0
  punpcklwd mm1, [ecx+%1*16+8]     ; [6475]
  movq      mm2, mm1
  psrlq     mm1, 32                ; [75..]
  punpcklwd mm1,mm2                ; [7654]

%endif

  paddsw    mm0, mm1          ; mm0 = [a0 a1 a2 a3]
  psubsw    mm7, mm1          ; mm7 = [b0 b1 b2 b3]

  movq      mm1, mm0
  punpckldq mm0, mm7          ; mm0 = [a0 a1 b0 b1]
  punpckhdq mm1, mm7          ; mm1 = [a2 a3 b2 b3]

  movq      mm2, [%2+ 0]      ;  [   M00    M01      M16    M17]
  movq      mm3, [%2+ 8]      ;  [   M02    M03      M18    M19]
  pmaddwd   mm2, mm0          ;  [a0.M00+a1.M01 | b0.M16+b1.M17]
  movq      mm4, [%2+16]      ;  [   M04    M05      M20    M21]
  pmaddwd   mm3, mm1          ;  [a2.M02+a3.M03 | b2.M18+b3.M19]
  movq      mm5, [%2+24]      ;  [   M06    M07      M22    M23]
  pmaddwd   mm4, mm0          ;  [a0.M04+a1.M05 | b0.M20+b1.M21]
  movq      mm6, [%2+32]      ;  [   M08    M09      M24    M25]
  pmaddwd   mm5, mm1          ;  [a2.M06+a3.M07 | b2.M22+b3.M23]
  movq      mm7, [%2+40]      ;  [   M10    M11      M26    M27]
  pmaddwd   mm6, mm0          ;  [a0.M08+a1.M09 | b0.M24+b1.M25]
  paddd     mm2, mm3          ;  [ out0 | out1 ]
  pmaddwd   mm7, mm1          ;  [a2.M10+a3.M11 | b2.M26+b3.M27]
  psrad     mm2, 16
  pmaddwd   mm0, [%2+48]      ;  [a0.M12+a1.M13 | b0.M28+b1.M29]
  paddd     mm4, mm5          ;  [ out2 | out3 ]
  pmaddwd   mm1, [%2+56]      ;  [a2.M14+a3.M15 | b2.M30+b3.M31]
  psrad     mm4, 16

  paddd     mm6, mm7          ;  [ out4 | out5 ]
  psrad     mm6, 16
  paddd     mm0, mm1          ;  [ out6 | out7 ]
  psrad     mm0, 16

  packssdw  mm2, mm4          ;  [ out0|out1|out2|out3 ]
  paddsw    mm2, [%3]         ;  Round
  packssdw  mm6, mm0          ;  [ out4|out5|out6|out7 ]
  paddsw    mm6, [%4]         ;  Round

  psraw     mm2, 4            ; => [-2048, 2047]
  psraw     mm6, 4

  movq      [ecx+%1*16+0], mm2
  movq      [ecx+%1*16+8], mm6

%endmacro

;//////////////////////////////////////////////////////////////////////
;// Skl_Dct16_SSE / Skl_Dct16_MMX
;//  ecx <- dword at [esp+4]; base address of the eight 16-byte rows
;//  that fLLM_PASS and fMTX_MULT read and write in place.
;//  Sequence: fLLM_PASS on ecx+0 and ecx+8 (shift 3), then fMTX_MULT on
;//  rows 0..7. The two entry points differ only in the last fMTX_MULT
;//  argument (0 = pshufw path, 1 = MMX-only path).

align 16
Skl_Dct16_SSE:  ; ~240c

  mov ecx, [esp+4]

  fLLM_PASS ecx+0, 3
  fLLM_PASS ecx+8, 3
  fMTX_MULT  0, fTab1, Fdct_Rnd0, Fdct_Rnd0, 0
  fMTX_MULT  1, fTab2, Fdct_Rnd2, Fdct_Rnd1, 0
  fMTX_MULT  2, fTab3, Fdct_Rnd1, Fdct_Rnd1, 0
  fMTX_MULT  3, fTab4, Fdct_Rnd1, Fdct_Rnd1, 0
  fMTX_MULT  4, fTab1, Fdct_Rnd0, Fdct_Rnd0, 0
  fMTX_MULT  5, fTab4, Fdct_Rnd1, Fdct_Rnd1, 0
  fMTX_MULT  6, fTab3, Fdct_Rnd1, Fdct_Rnd1, 0
  fMTX_MULT  7, fTab2, Fdct_Rnd1, Fdct_Rnd1, 0

  ret

align 16
Skl_Dct16_MMX:  ; ~269c

  mov ecx, [esp+4]

  fLLM_PASS ecx+0, 3
  fLLM_PASS ecx+8, 3
  fMTX_MULT  0, fTab1, Fdct_Rnd0, Fdct_Rnd0, 1
  fMTX_MULT  1, fTab2, Fdct_Rnd2, Fdct_Rnd1, 1
  fMTX_MULT  2, fTab3, Fdct_Rnd1, Fdct_Rnd1, 1
  fMTX_MULT  3, fTab4, Fdct_Rnd1, Fdct_Rnd1, 1
  fMTX_MULT  4, fTab1, Fdct_Rnd0, Fdct_Rnd0, 1
  fMTX_MULT  5, fTab4, Fdct_Rnd1, Fdct_Rnd1, 1
  fMTX_MULT  6, fTab3, Fdct_Rnd1, Fdct_Rnd1, 1
  fMTX_MULT  7, fTab2, Fdct_Rnd1, Fdct_Rnd1, 1

  ret

//////////////////////////////////////////////////////////*/

//////////////////////////////////////////////////////////
// - Data flow schematics for FDCT -
// Output is scaled by 2.sqrt(2)
// Initial butterflies (in0/in7, etc.) are not fully depicted.
// Note: Rot6 coeffs are multiplied by sqrt(2).
//
// NOTE(review): the whitespace of the diagrams in this file had been
// lost; the column alignment below was rebuilt from the connector
// positions on the horizontal rails. Verify against upstream.
//////////////////////////////////////////////////////////
/*
 <---------Stage1 =even part=----------->

 in3 mm3 +_____.___-___________.____* out6
  x             \ /            |
 in4 mm4         \             |
                / \            |
 in0 mm0 +_____o___+__.___-___ | ___* out4
  x                    \ /     |
 in7 mm7                \   (Rot6)
                       / \     |
 in1 mm1 +_____o___+__o___+___ | ___* out0
  x             \ /            |
 in6 mm6         /             |
                / \            |
 in2 mm2 +_____.___-___________o____* out2
  x
 in5 mm5

 <---------Stage2 =odd part=---------------->

 mm7*___._________.___-___[xSqrt2]___* out3
        |          \ /
     (Rot3)         \
        |          / \
 mm5*__ | ___o____o___+___.___-______* out7
        |    |             \ /
        | (Rot1)            \
        |    |             / \
 mm6*__ |____.____o___+___o___+______* out1
        |          \ /
        |           /
        |          / \
 mm4*___o_________.___-___[xSqrt2]___* out5

 Alternative schematics for stage 2:
 -----------------------------------

 mm7 *___[xSqrt2]____o___+____o_______* out1
                      \ /     |
                       /   (Rot1)
                      / \     |
 mm6 *____o___+______.___-___ | __.___* out5
           \ /                |   |
            /                 |   |
           / \                |   |
 mm5 *____.___-______.___-____.__ | __* out7
                      \ /         |
                       \       (Rot3)
                      / \         |
 mm4 *___[xSqrt2]____o___+________o___* out3
*/
//////////////////////////////////////////////////////////
// - Data flow schematics for IDCT -
// Output is scaled by 2.sqrt(2)
// Note: Rot6 coeffs are multiplied by sqrt(2).
//
// NOTE(review): diagram alignment reconstructed, as for the FDCT
// schematics; the trailing register name after each "-> outX/outY"
// in Stage2 is kept where it appeared in the recovered text.
//////////////////////////////////////////////////////////
/*
 <---------Stage 1 =even part=---------------->

 in3 *___[xSqrt2]____.___-________o____* mm7
                      \ /         |
                       /          |
                      / \         |
 in1 *____o___+______o___+____.__ | ___* mm3
           \ /                |   |
            /               (Rot3)|
           / \                |   |
 in7 *____.___-______.___-____o__ | ___* mm1
                      \ /         |
                       \       (Rot1)
                      / \         |
 in5 *___[xSqrt2]____o___+________.____* mm5

 Alternative schematics for stage 1:
 -----------------------------------

 in1 *________.____.___-_____[xSqrt2]__* mm1
              |     \ /
           (Rot1)    \
              |     / \
 in5 *___o___ | ___o___+_____.___-_____* mm5
         |    |               \ /
         |    |                \
         |    |               / \
 in7 *__ | ___o____o___+__ __o___+_____* mm7
         |          \ /
      (Rot3)         /
         |          / \
 in3 *___._________.___-_____[xSqrt2]__* mm3

 <---------Stage2 =odd part=--------->

 in6 *__o___________o___+___* mm6 -> butterfly with mm3 -> out3/out4 mm6
        |            \ /
        |             /
        |            / \
 in4 *_ | ___o___+__.___-___* mm4 -> butf. w/ mm1 -> out7/out0 mm4
        |     \ /
     (Rot6)    /
        |     / \
 in0 *_ | ___.___-__.___-___* mm0 -> butf. w/ mm5 -> out6/out1 mm0
        |            \ /
        |             \
        |            / \
 in2 *__.___________o___+___* mm2 -> butf. w/ mm7 -> out2/out5 mm2
*/
//////////////////////////////////////////////////////////
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -