📄 skl_dct.c
字号:
; 0.007 0.006 0.005 0.007 0.006 0.007 0.005 0.007 [0.006]
; 0.006 0.008 0.007 0.007 0.007 0.008 0.008 0.008 [0.007]
; 0.008 0.008 0.008 0.008 0.007 0.009 0.010 0.007 [0.008]
; 0.007 0.007 0.006 0.007 0.008 0.007 0.006 0.008 [0.007]
; 0.007 0.006 0.006 0.006 0.006 0.005 0.006 0.006 [0.006]
; 0.008 0.007 0.006 0.008 0.007 0.008 0.009 0.009 [0.008]
; 0.008 0.006 0.010 0.008 0.008 0.008 0.007 0.007 [0.008]
; 0.007 0.006 0.006 0.007 0.007 0.006 0.006 0.007 [0.006]
;
; == Abs Mean errors ==
; 0.001 0.000 0.000 0.001 0.001 0.000 0.000 0.000 [0.000]
; 0.000 0.002 0.002 0.000 0.001 0.001 0.000 0.002 [0.000]
; 0.001 0.002 0.001 0.001 0.001 0.001 0.000 0.001 [-0.001]
; 0.000 0.002 0.000 0.000 0.001 0.000 0.000 0.001 [-0.000]
; 0.000 0.001 0.001 0.001 0.000 0.001 0.000 0.001 [0.000]
; 0.000 0.001 0.001 0.001 0.001 0.000 0.001 0.000 [0.000]
; 0.001 0.001 0.002 0.001 0.001 0.002 0.001 0.001 [0.001]
; 0.000 0.000 0.001 0.000 0.000 0.000 0.000 0.000 [0.000]
;
; =========================
;
; Peak error: 1.0000
; Peak MSE: 0.0092
; Overall MSE: 0.0071
; Peak ME: 0.0022
; Overall ME: -0.0002
;
;//////////////////////////////////////////////////////////////////////

SECTION .data

; Fixed-point constants for the column pass (iLLM_PASS). Each constant
; is replicated in the four 16-bit lanes of a quadword so it can be
; used directly as a pmulhw operand.
align 16
tan1:   dw 0x32ec,0x32ec,0x32ec,0x32ec      ; tan( pi/16)
tan2:   dw 0x6a0a,0x6a0a,0x6a0a,0x6a0a      ; tan(2pi/16)  (=sqrt(2)-1)
tan3:   dw 0xab0e,0xab0e,0xab0e,0xab0e      ; tan(3pi/16)-1
sqrt2:  dw 0x5a82,0x5a82,0x5a82,0x5a82      ; 0.5/sqrt(2)

;//////////////////////////////////////////////////////////////////////

; Coefficient tables for iMTX_MULT / iMTX_MULT_03.
; Each table is 8 rows of 4 words (64 bytes); the macros address the
; rows at byte offsets +0, +8, ... +56.
align 16
iTab1:
  dw 0x4000, 0x539f, 0x4000, 0x22a3
  dw 0x4000, 0x22a3, 0xc000, 0xac61
  dw 0x4000, 0xdd5d, 0x4000, 0xac61
  dw 0xc000, 0x539f, 0x4000, 0xdd5d
  dw 0x58c5, 0x4b42, 0x4b42, 0xee58
  dw 0x3249, 0x11a8, 0xa73b, 0xcdb7
  dw 0x3249, 0xa73b, 0x11a8, 0xcdb7
  dw 0x11a8, 0x4b42, 0x4b42, 0xa73b
iTab2:
  dw 0x58c5, 0x73fc, 0x58c5, 0x300b
  dw 0x58c5, 0x300b, 0xa73b, 0x8c04
  dw 0x58c5, 0xcff5, 0x58c5, 0x8c04
  dw 0xa73b, 0x73fc, 0x58c5, 0xcff5
  dw 0x7b21, 0x6862, 0x6862, 0xe782
  dw 0x45bf, 0x187e, 0x84df, 0xba41
  dw 0x45bf, 0x84df, 0x187e, 0xba41
  dw 0x187e, 0x6862, 0x6862, 0x84df
iTab3:
  dw 0x539f, 0x6d41, 0x539f, 0x2d41
  dw 0x539f, 0x2d41, 0xac61, 0x92bf
  dw 0x539f, 0xd2bf, 0x539f, 0x92bf
  dw 0xac61, 0x6d41, 0x539f, 0xd2bf
  dw 0x73fc, 0x6254, 0x6254, 0xe8ee
  dw 0x41b3, 0x1712, 0x8c04, 0xbe4d
  dw 0x41b3, 0x8c04, 0x1712, 0xbe4d
  dw 0x1712, 0x6254, 0x6254, 0x8c04
iTab4:
  dw 0x4b42, 0x6254, 0x4b42, 0x28ba
  dw 0x4b42, 0x28ba, 0xb4be, 0x9dac
  dw 0x4b42, 0xd746, 0x4b42, 0x9dac
  dw 0xb4be, 0x6254, 0x4b42, 0xd746
  dw 0x6862, 0x587e, 0x587e, 0xeb3d
  dw 0x3b21, 0x14c3, 0x979e, 0xc4df
  dw 0x3b21, 0x979e, 0x14c3, 0xc4df
  dw 0x14c3, 0x587e, 0x587e, 0x979e

; Same table size and row offsets as above, with the words arranged
; for the operand pairing used by iMTX_MULT_MMX ([0404] [2626] [1515]
; [3737] instead of [0202] [4646] [1313] [5757]).
align 16
iTab1_MMX:
  dw 0x4000, 0x4000, 0x4000, 0xc000
  dw 0x539f, 0x22a3, 0x22a3, 0xac61
  dw 0x4000, 0xc000, 0x4000, 0x4000
  dw 0xdd5d, 0x539f, 0xac61, 0xdd5d
  dw 0x58c5, 0x3249, 0x4b42, 0xa73b
  dw 0x4b42, 0x11a8, 0xee58, 0xcdb7
  dw 0x3249, 0x11a8, 0x11a8, 0x4b42
  dw 0xa73b, 0x4b42, 0xcdb7, 0xa73b
iTab2_MMX:
  dw 0x58c5, 0x58c5, 0x58c5, 0xa73b
  dw 0x73fc, 0x300b, 0x300b, 0x8c04
  dw 0x58c5, 0xa73b, 0x58c5, 0x58c5
  dw 0xcff5, 0x73fc, 0x8c04, 0xcff5
  dw 0x7b21, 0x45bf, 0x6862, 0x84df
  dw 0x6862, 0x187e, 0xe782, 0xba41
  dw 0x45bf, 0x187e, 0x187e, 0x6862
  dw 0x84df, 0x6862, 0xba41, 0x84df
iTab3_MMX:
  dw 0x539f, 0x539f, 0x539f, 0xac61
  dw 0x6d41, 0x2d41, 0x2d41, 0x92bf
  dw 0x539f, 0xac61, 0x539f, 0x539f
  dw 0xd2bf, 0x6d41, 0x92bf, 0xd2bf
  dw 0x73fc, 0x41b3, 0x6254, 0x8c04
  dw 0x6254, 0x1712, 0xe8ee, 0xbe4d
  dw 0x41b3, 0x1712, 0x1712, 0x6254
  dw 0x8c04, 0x6254, 0xbe4d, 0x8c04
iTab4_MMX:
  dw 0x4b42, 0x4b42, 0x4b42, 0xb4be
  dw 0x6254, 0x28ba, 0x28ba, 0x9dac
  dw 0x4b42, 0xb4be, 0x4b42, 0x4b42
  dw 0xd746, 0x6254, 0x9dac, 0xd746
  dw 0x6862, 0x3b21, 0x587e, 0x979e
  dw 0x587e, 0x14c3, 0xeb3d, 0xc4df
  dw 0x3b21, 0x14c3, 0x14c3, 0x587e
  dw 0x979e, 0x587e, 0xc4df, 0x979e

  ; the original rounding trick is by
  ; Michel Lespinasse (hi Walken!) <walken@zoy.org>

; Per-row rounders: two identical dwords each, matching the
; "paddd mmX, [%3]" use in the iMTX_MULT* macros.
align 16
Idct_Rnd0:        dd 65535, 65535
Idct_Rnd1:        dd  3612,  3612
Idct_Rnd2:        dd  2271,  2271
Idct_Rnd3:        dd  1203,  1203
Idct_Rnd4:        dd  1023,  1023
Idct_Rnd5:        dd   102,   102
Idct_Rnd6:        dd   398,   398
Idct_Rnd7:        dd   469,   469

Idct_Sparse_Rnd0: times 4 dw (65535>>11)
Idct_Sparse_Rnd1: times 4 dw ( 3612>>11)
Idct_Sparse_Rnd2: times 4 dw ( 2271>>11)
  ; other rounders are zero...

;//////////////////////////////////////////////////////////////////////

; Forward-transform coefficient tables (8 rows of 4 words each).
; Their consumer is not part of this excerpt.
align 16
fTab1:
  dw 0x4000, 0x4000, 0x58c5, 0x4b42
  dw 0x4000, 0x4000, 0x3249, 0x11a8
  dw 0x539f, 0x22a3, 0x4b42, 0xee58
  dw 0xdd5d, 0xac61, 0xa73b, 0xcdb7
  dw 0x4000, 0xc000, 0x3249, 0xa73b
  dw 0xc000, 0x4000, 0x11a8, 0x4b42
  dw 0x22a3, 0xac61, 0x11a8, 0xcdb7
  dw 0x539f, 0xdd5d, 0x4b42, 0xa73b
fTab2:
  dw 0x58c5, 0x58c5, 0x7b21, 0x6862
  dw 0x58c5, 0x58c5, 0x45bf, 0x187e
  dw 0x73fc, 0x300b, 0x6862, 0xe782
  dw 0xcff5, 0x8c04, 0x84df, 0xba41
  dw 0x58c5, 0xa73b, 0x45bf, 0x84df
  dw 0xa73b, 0x58c5, 0x187e, 0x6862
  dw 0x300b, 0x8c04, 0x187e, 0xba41
  dw 0x73fc, 0xcff5, 0x6862, 0x84df
fTab3:
  dw 0x539f, 0x539f, 0x73fc, 0x6254
  dw 0x539f, 0x539f, 0x41b3, 0x1712
  dw 0x6d41, 0x2d41, 0x6254, 0xe8ee
  dw 0xd2bf, 0x92bf, 0x8c04, 0xbe4d
  dw 0x539f, 0xac61, 0x41b3, 0x8c04
  dw 0xac61, 0x539f, 0x1712, 0x6254
  dw 0x2d41, 0x92bf, 0x1712, 0xbe4d
  dw 0x6d41, 0xd2bf, 0x6254, 0x8c04
fTab4:
  dw 0x4b42, 0x4b42, 0x6862, 0x587e
  dw 0x4b42, 0x4b42, 0x3b21, 0x14c3
  dw 0x6254, 0x28ba, 0x587e, 0xeb3d
  dw 0xd746, 0x9dac, 0x979e, 0xc4df
  dw 0x4b42, 0xb4be, 0x3b21, 0x979e
  dw 0xb4be, 0x4b42, 0x14c3, 0x587e
  dw 0x28ba, 0x9dac, 0x14c3, 0xc4df
  dw 0x6254, 0xd746, 0x587e, 0x979e

align 16
Fdct_Rnd0: dw  6,8,8,8
Fdct_Rnd1: dw  8,8,8,8
Fdct_Rnd2: dw 10,8,8,8
MMX_One:   dw  1,1,1,1

;//////////////////////////////////////////////////////////////////////

SECTION .text

;//////////////////////////////////////////////////////////////////////
; iMTX_MULT (~24c)
;
; In-place transform of one row of 8 words located at [ecx+%1*16].
;   %1 = row index, %2 = coefficient table, %3 = rounder (2 dwords),
;   %4 = right-shift count applied to the 32-bit sums
; Results are packed back to words with signed saturation.
; Uses pshufw. Clobbers mm0-mm7.
;//////////////////////////////////////////////////////////////////////

%macro iMTX_MULT 4   ; %1=src, %2 = Table to use, %3=rounder, %4=Shift

  movq      mm0, [ecx+%1*16+0]   ; mm0 = [0123]
  movq      mm1, [ecx+%1*16+8]   ; mm1 = [4567]

  movq      mm3, [%2+0]          ; [ M00 M01 M04 M05]
  pshufw    mm2, mm0, 11011101b  ; [1313]
  movq      mm4, [%2+8]          ; [ M02 M03 M06 M07]
  pshufw    mm0, mm0, 10001000b  ; [0202]
  movq      mm6, [%2+32]         ; [ M16 M17 M20 M21]
  pshufw    mm5, mm1, 11011101b  ; [5757]
  movq      mm7, [%2+40]         ; [ M18 M19 M22 M23]
  pshufw    mm1, mm1, 10001000b  ; [4646]

  pmaddwd   mm3, mm0             ; [i0.M00+i2.M01 | i0.M04+i2.M05]
  pmaddwd   mm6, mm2             ; [i1.M16+i3.M17 | i1.M20+i3.M21]
  pmaddwd   mm4, mm1             ; [i4.M02+i6.M03 | i4.M06+i6.M07]
  pmaddwd   mm7, mm5             ; [i5.M18+i7.M19 | i5.M22+i7.M23]
  pmaddwd   mm2, [%2+48]         ; [i1.M24+i3.M25 | i1.M28+i3.M29]
  pmaddwd   mm5, [%2+56]         ; [i5.M26+i7.M27 | i5.M30+i7.M31]
  pmaddwd   mm0, [%2+16]         ; [i0.M08+i2.M09 | i0.M12+i2.M13]
  pmaddwd   mm1, [%2+24]         ; [i4.M10+i6.M11 | i4.M14+i6.M15]

  paddd     mm3, [%3]            ; Round
  paddd     mm6, mm7             ; => b0 | b1
  paddd     mm0, [%3]            ; Round
  paddd     mm2, mm5             ; => b2 | b3
  paddd     mm3, mm4             ; => a0 | a1
  paddd     mm0, mm1             ; => a2 | a3

  movq      mm4, mm3             ; a0 | a1
  movq      mm7, mm0             ; a2 | a3
  paddd     mm3, mm6             ; a0+b0 | a1+b1
  psubd     mm4, mm6             ; a0-b0 | a1-b1
  psubd     mm7, mm2             ; a2-b2 | a3-b3
  paddd     mm0, mm2             ; a2+b2 | a3+b3

  psrad     mm3, %4              ; => out0 | out1
  psrad     mm4, %4              ; => out7 | out6
  psrad     mm0, %4              ; => out2 | out3
  psrad     mm7, %4              ; => out5 | out4

  packssdw  mm3, mm0             ; [0123]
  packssdw  mm7, mm4             ; [5476]

  movq      [ecx+%1*16+0], mm3
  pshufw    mm7, mm7, 10110001b  ; [4567]
  movq      [ecx+%1*16+8], mm7
%endmacro

; Same contract as iMTX_MULT, but only words [0123] of the row are
; read; the [4567] contributions are omitted from the sums.
; Clobbers mm0, mm2, mm3, mm4, mm6, mm7.
%macro iMTX_MULT_03 4   ; %1=src, %2 = Table to use, %3=rounder, %4=Shift
      ; this version assumes [4567] coeffs are zero...

  movq      mm0, [ecx+%1*16+0]   ; mm0 = [0123]

  movq      mm3, [%2+0]          ; [ M00 M01 M04 M05]
  pshufw    mm2, mm0, 11011101b  ; [1313]
  movq      mm6, [%2+32]         ; [ M16 M17 M20 M21]
  pshufw    mm0, mm0, 10001000b  ; [0202]

  pmaddwd   mm6, mm2             ; [i1.M16+i3.M17 | i1.M20+i3.M21]
  pmaddwd   mm3, mm0             ; [i0.M00+i2.M01 | i0.M04+i2.M05]
  pmaddwd   mm2, [%2+48]         ; [i1.M24+i3.M25 | i1.M28+i3.M29]
  pmaddwd   mm0, [%2+16]         ; [i0.M08+i2.M09 | i0.M12+i2.M13]

      ; mm2=b2|b3  mm6 = b0|b1

  paddd     mm3, [%3]            ; Round
  paddd     mm0, [%3]            ; Round

  movq      mm4, mm3             ; a0 | a1
  movq      mm7, mm0             ; a2 | a3
  paddd     mm3, mm6             ; a0+b0 | a1+b1
  psubd     mm4, mm6             ; a0-b0 | a1-b1
  psubd     mm7, mm2             ; a2-b2 | a3-b3
  paddd     mm0, mm2             ; a2+b2 | a3+b3

  psrad     mm3, %4              ; => out0 | out1
  psrad     mm4, %4              ; => out7 | out6
  psrad     mm0, %4              ; => out2 | out3
  psrad     mm7, %4              ; => out5 | out4

  packssdw  mm3, mm0             ; [0123]
  packssdw  mm7, mm4             ; [5476]

  movq      [ecx+%1*16+0], mm3
  pshufw    mm7, mm7, 10110001b  ; [4567]
  movq      [ecx+%1*16+8], mm7
%endmacro

;//////////////////////////////////////////////////////////////////////
; iMTX_MULT_MMX (~27c)
;
; Same contract as iMTX_MULT (row at [ecx+%1*16] transformed in
; place), written without pshufw: the word shuffles are done with
; punpck*/shift/por instead. %2 must be a table in the *_MMX layout.
; Clobbers mm0-mm7.
;//////////////////////////////////////////////////////////////////////

%macro iMTX_MULT_MMX 4   ; %1=src, %2 = Table to use, %3=rounder, %4=Shift

  movq      mm0, [ecx+%1*16+0]   ; [0123]
  movq      mm1, [ecx+%1*16+8]   ; [4567]

  movq      mm2, mm0
  punpcklwd mm0, mm1             ; [0415]
  punpckhwd mm2, mm1             ; [2637]
  movq      mm1, mm0
  movq      mm3, mm2

  punpckldq mm0, mm0             ; [0404]
  punpckldq mm2, mm2             ; [2626]
  punpckhdq mm1, mm1             ; [1515]
  punpckhdq mm3, mm3             ; [3737]

  movq      mm4, [%2+ 0]         ; [ M00 M02 M04 M06]
  movq      mm6, [%2+ 8]         ; [ M01 M03 M05 M07]
  pmaddwd   mm4, mm0             ; [i0.M00+i4.M02 | i0.M04+i4.M06]
  movq      mm5, [%2+32]         ; [ M16 M18 M20 M22]
  movq      mm7, [%2+40]         ; [ M17 M19 M21 M23]
  pmaddwd   mm6, mm2             ; [i2.M01+i6.M03 | i2.M05+i6.M07]
  pmaddwd   mm5, mm1             ; [i1.M16+i5.M18 | i1.M20+i5.M22]
  pmaddwd   mm7, mm3             ; [i3.M17+i7.M19 | i3.M21+i7.M23]
  pmaddwd   mm0, [%2+16]         ; [i0.M08+i4.M10 | i0.M12+i4.M14]
  pmaddwd   mm2, [%2+24]         ; [i2.M09+i6.M11 | i2.M13+i6.M15]
  pmaddwd   mm1, [%2+48]         ; [i1.M24+i5.M26 | i1.M28+i5.M30]
  pmaddwd   mm3, [%2+56]         ; [i3.M25+i7.M27 | i3.M29+i7.M31]

  paddd     mm0, [%3]            ; Round
  paddd     mm1, mm3             ; => b2 | b3
  paddd     mm4, [%3]            ; Round
  paddd     mm5, mm7             ; => b0 | b1

  paddd     mm4, mm6             ; => a0 | a1
  movq      mm6, mm4             ; a0 | a1
  paddd     mm0, mm2             ; => a2 | a3
  movq      mm2, mm0             ; a2 | a3

  paddd     mm4, mm5             ; a0+b0 | a1+b1
  psrad     mm4, %4              ; => out0 | out1
  paddd     mm0, mm1             ; a2+b2 | a3+b3
  psrad     mm0, %4              ; => out2 | out3

  psubd     mm6, mm5             ; a0-b0 | a1-b1
  psubd     mm2, mm1             ; a2-b2 | a3-b3
  psrad     mm6, %4              ; => out7 | out6
  psrad     mm2, %4              ; => out5 | out4

  packssdw  mm2, mm6             ; [5476]
  packssdw  mm4, mm0             ; [0123]

  ; swap the two words inside each dword: [5476] -> [4567]
  movq      mm7, mm2
  psrld     mm2, 16              ; [4-6-]
  pslld     mm7, 16              ; [-5-7]

  movq      [ecx+%1*16+0], mm4
  por       mm2, mm7
  movq      [ecx+%1*16+8], mm2
%endmacro

;//////////////////////////////////////////////////////////////////////
;// iLLM_PASS (~42c)
;//////////////////////////////////////////////////////////////////////

; ADD_TO_DST: add two groups of four signed words (%1, %3) to the four
; bytes stored at [%2] and [%4] respectively, and store the sums back
; as bytes with unsigned saturation.
%macro ADD_TO_DST 4   ; %1:src1   %2:dst1   %3:src2   %4:dst2
                      ; trashes mm0,mm4
  punpcklbw mm0, [%2]            ; dst bytes land in the high byte of each word
  punpcklbw mm4, [%4]
  psrlw     mm0, 8               ; will zero the high words
  psrlw     mm4, 8
  paddsw    mm0, %1
  paddsw    mm4, %3
  packuswb  mm0, mm0
  packuswb  mm4, mm4
  movd      [%2], mm0            ; only the low 4 bytes are written back
  movd      [%4], mm4
%endmacro

; iLLM_PASS: column pass over rows stored 16 bytes apart at %1,
; processing four 16-bit columns per invocation (one quadword per row).
%macro iLLM_PASS 3   ; %1: src/dst, %2: combine func (0:none, 1:Put, 2:Add)
                     ; %3: dst offset (only for Add)

  movq      mm0, [tan3]          ; t3-1
  movq      mm3, [%1+16*3]       ; x3
  movq      mm1, mm0             ; t3-1
  movq      mm5, [%1+16*5]       ; x5

  movq      mm4, [tan1]          ; t1
  movq      mm6, [%1+16*1]       ; x1
  movq      mm7, [%1+16*7]       ; x7
  movq      mm2, mm4             ; t1

  pmulhw    mm0, mm3             ; x3*(t3-1)
  pmulhw    mm1, mm5             ; x5*(t3-1)
  paddsw    mm0, mm3             ; x3*t3
  paddsw    mm1, mm5             ; x5*t3
  psubsw    mm0, mm5             ; x3*t3-x5 = tm35
  paddsw    mm1, mm3             ; x3+x5*t3 = tp35

  pmulhw    mm4, mm7             ; x7*t1
  pmulhw    mm2, mm6             ; x1*t1
  paddsw    mm4, mm6             ; x1+t1*x7 = tp17
  psubsw    mm2, mm7             ; x1*t1-x7 = tm17

  movq      mm3, [sqrt2]
  movq      mm7, mm4
  movq      mm6, mm2
  psubsw    mm4, mm1             ; tp17-tp35 = t1
  psubsw    mm2, mm0             ; tm17-tm35 = b3
  paddsw    mm1, mm7             ; tp17+tp35 = b0
  paddsw    mm0, mm6             ; tm17+tm35 = t2

    ; mm1 = b0, mm2 = b3. preserved

  movq      mm6, mm4
  psubsw    mm4, mm0             ; t1-t2
  paddsw    mm0, mm6             ; t1+t2

  pmulhw    mm4, mm3             ; (t1-t2)/(2.sqrt2)
  pmulhw    mm0, mm3             ; (t1+t2)/(2.sqrt2)

  paddsw    mm0, mm0             ; 2.(t1+t2) = b1
  paddsw    mm4, mm4             ; 2.(t1-t2) = b2

  movq      mm7, [tan2]          ; t2
  movq      mm3, [%1+2*16]       ; x2
  movq      mm6, [%1+6*16]       ; x6
  movq      mm5, mm7             ; t2

  pmulhw    mm7, mm6             ; x6*t2
  pmulhw    mm5, mm3             ; x2*t2

  paddsw    mm7, mm3             ; x2+x6*t2 = tp26
  psubsw    mm5, mm6             ; x2*t2-x6 = tm26

   ; use:mm3,mm5,mm6,mm7   frozen: mm0,mm4,mm1,mm2

  movq      mm3, [%1+0*16]       ; x0
  movq      mm6, [%1+4*16]       ; x4

  psubsw    mm3, mm6             ; x0-x4 = tm04
  paddsw    mm6, mm6
  paddsw    mm6, mm3             ; x0+x4 = tp04

  psubsw    mm3, mm5             ; tm04-tm26 = a2
  psubsw    mm6, mm7             ; tp04-tp26 = a3
  paddsw    mm5, mm5             ; 2.tm26
  paddsw    mm7, mm7             ; 2.tp26
  paddsw    mm5, mm3             ; tm04+tm26 = a1
  paddsw    mm7, mm6             ; tp04+tp26 = a0

  psubsw    mm5, mm0             ; a1-b1
  ; NOTE(review): this listing is cut off here -- the remainder of
  ; iLLM_PASS, including its %endmacro, is not part of this excerpt.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -