📄 skl_dct_llm.cpp
字号:
;//////////////////////////////////////////////////////////////////////
; Forward 8x8 DCT on 16-bit words, MMX, NASM syntax, 32-bit x86.
;
; Data symbols referenced here but defined outside this listing:
;   Halves, Mults, Rounder, Spill,
;   tan_1_16_16b, tan_2_16_16b, tan_3_16_16b, ucos_4_16_16b
;
; Block layout as used by the code below: 8 rows with a 16-byte row
; stride, each row holding 8 words (two quadwords).
;//////////////////////////////////////////////////////////////////////

;----------------------------------------------------------------------
; In-place transposes
;----------------------------------------------------------------------

; TRANSPOSE_4x4_MMX Ptr
;   Transposes, in place, the 4x4 block of words whose rows start at
;   Ptr + k*16 (k = 0..3).
;   Clobbers: mm0, mm1, mm2, mm3, mm4, mm5.
;   The two-digit tags in the comments are <row><column> of the source.
%macro TRANSPOSE_4x4_MMX 1
        movq        mm0, [%1+0*16]      ; 00 01 02 03
        movq        mm4, [%1+2*16]      ; 20 21 22 23
        movq        mm1, [%1+1*16]      ; 10 11 12 13
        movq        mm3, [%1+3*16]      ; 30 31 32 33
        movq        mm2, mm0
        movq        mm5, mm4
        punpcklwd   mm0, mm1            ; mm0 = 00 10 01 11
        punpckhwd   mm2, mm1            ; mm2 = 02 12 03 13
        punpcklwd   mm4, mm3            ; mm4 = 20 30 21 31
        punpckhwd   mm5, mm3            ; mm5 = 22 32 23 33
        movq        mm1, mm0
        punpckhdq   mm1, mm4            ; mm1 = 01 11 21 31
        movq        mm3, mm2
        punpckldq   mm0, mm4            ; mm0 = 00 10 20 30
        movq        [%1+1*16], mm1
        punpckldq   mm2, mm5            ; mm2 = 02 12 22 32
        movq        [%1+0*16], mm0
        punpckhdq   mm3, mm5            ; mm3 = 03 13 23 33
        movq        [%1+2*16], mm2
        movq        [%1+3*16], mm3
%endmacro

; TRANSPOSE_8x4_MMX Ptr
;   Two side-by-side 4x4 transposes over the same four rows:
;   the block at Ptr and the block 8 bytes to its right.
%macro TRANSPOSE_8x4_MMX 1
        TRANSPOSE_4x4_MMX %1
        TRANSPOSE_4x4_MMX %1 +8
%endmacro

; TRANSPOSE_4x8_MMX Ptr
;   Two stacked 4x4 transposes in the same four columns:
;   the block at Ptr and the block four rows (4*16 bytes) below it.
%macro TRANSPOSE_4x8_MMX 1
        TRANSPOSE_4x4_MMX %1
        TRANSPOSE_4x4_MMX %1 + 4*16
%endmacro

; MULT2 Ptr, i
;   Row i of the block at Ptr (both quadwords):
;       row = high_word( (row + Halves[i]) * Mults[i] )
;   Halves and Mults are indexed with the same 16-byte row stride.
;   Clobbers: mm0, mm1.
%macro MULT2 2
        movq        mm0, [%1 +%2*16 + 0]
        movq        mm1, [%1 +%2*16 + 8]
        paddw       mm0, [Halves+%2*16 + 0]
        paddw       mm1, [Halves+%2*16 + 8]
        pmulhw      mm0, [Mults+%2*16 + 0]
        pmulhw      mm1, [Mults+%2*16 + 8]
        movq        [%1 +%2*16 + 0], mm0
        movq        [%1 +%2*16 + 8], mm1
%endmacro

;//////////////////////////////////////////////////////////////////////
; Load / store helpers.
;   LOAD, LOAD_BUTF, STORE    index quadwords with an  8-byte stride.
;   LOAD2, LOAD2_BUTF, STORE2 index quadwords with a  16-byte stride.
;//////////////////////////////////////////////////////////////////////

; LOAD m1, a, In            m1 = In[a]                (stride 8)
%macro LOAD 3
        movq        %1, [%3 + %2*8]
%endmacro

; LOAD_BUTF m1, m2, a, b, In                          (stride 8)
;   Butterflied load: m1 = In[a] - In[b],  m2 = In[a] + In[b]
;   (wrap-around word arithmetic; In[a] is read from memory twice).
%macro LOAD_BUTF 5
        LOAD        %1, %3, %5          ; m1 = In[a]
        LOAD        %2, %4, %5          ; m2 = In[b]
        psubw       %1, %2
        paddw       %2, [%5 + %3*8]
%endmacro

; STORE m1, a, Out          Out[a] = m1               (stride 8)
%macro STORE 3
        movq        [%3 + %2*8], %1
%endmacro

; LOAD2 m1, a, In           m1 = In[a]                (stride 16)
%macro LOAD2 3
        movq        %1, [%3 + %2*16]
%endmacro

; LOAD2_BUTF m1, m2, a, b, In                         (stride 16)
;   Butterflied load: m1 = In[a] - In[b],  m2 = In[a] + In[b]
%macro LOAD2_BUTF 5
        LOAD2       %1, %3, %5          ; m1 = In[a]
        LOAD2       %2, %4, %5          ; m2 = In[b]
        psubw       %1, %2
        paddw       %2, [%5 + %3*16]
%endmacro

; STORE2 m1, a, Out         Out[a] = m1               (stride 16)
%macro STORE2 3
        movq        [%3 + %2*16], %1
%endmacro

;//////////////////////////////////////////////////////////////////////
; Arithmetic helpers
;//////////////////////////////////////////////////////////////////////

; BUTF a, b, tmp
;   Butterfly with signed saturation: (a, b) -> (a - b, a + b).
;   tmp receives the old b; callers below pass either a register or
;   the memory operand [Spill].
%macro BUTF 3
        movq        %3, %2
        paddsw      %2, %1
        psubsw      %1, %3
%endmacro

; BUTF_SAFE a, b, tmp
;   Halves both inputs with an arithmetic shift before the butterfly:
;   (a, b) -> (a/2 - b/2, a/2 + b/2).
%macro BUTF_SAFE 3
        psraw       %1, 1
        psraw       %2, 1
        BUTF        %1, %2, %3
%endmacro

; DBL_BUTF a, b, tmp, a', b', tmp'
;   Two interleaved butterflies using wrap-around (non-saturating)
;   adds/subs: (a, b) -> (a - b, a + b) and likewise for (a', b').
%macro DBL_BUTF 6
        movq        %3, %2
        movq        %6, %5
        paddw       %2, %1
        paddw       %5, %4
        psubw       %1, %3
        psubw       %4, %6
%endmacro

; ROTATE16 a, b, t, tmp1, tmp2
;   (a, b) -> (a + b.t, a.t - b)
;   t is the name of a memory constant. pmulhw keeps the high word of
;   the signed product, so "x.t" stands for (x*t) >> 16.
%macro ROTATE16 5
        movq        %4, %1
        movq        %5, %2
        pmulhw      %4, [%3]            ; a.t
        pmulhw      %5, [%3]            ; b.t
        paddw       %1, %5              ; a + b.t
        psubw       %4, %2              ; a.t - b
        movq        %2, %4              ; copy result back into b
                                        ; (author's note: "REMOVE!")
%endmacro

; ROTATE15 a, b, t, tmp1, tmp2
;   (a, b) -> (a + b + b.t, b - a - a.t), "x.t" as in ROTATE16.
%macro ROTATE15 5
        movq        %4, %1
        movq        %5, %2
        paddw       %1, %2              ; a <- a + b
        psubw       %2, %4              ; b <- b - a
        pmulhw      %4, [%3]            ; a.t
        pmulhw      %5, [%3]            ; b.t
        paddw       %1, %5              ; a + b + b.t
        psubw       %2, %4              ; b - a - a.t
%endmacro

; SHIFTL m, n               logical left shift of every word in m by n
%macro SHIFTL 2
        psllw       %1, %2
%endmacro

;//////////////////////////////////////////////////////////////////////
; Transform passes
;//////////////////////////////////////////////////////////////////////

; Left shift applied to the intermediate values of pass 1.
%define PASSB 2

; FOUR_COLS_FWD_PASS1 In
;   First pass over eight quadwords addressed with an 8-byte stride.
;   The author's "->n" notes name the logical output each slot holds.
;   Uses mm0..mm7 and the memory scratch [Spill].
;   Stores for one stage are deliberately interleaved with the
;   arithmetic of the next one; keep the instruction order.
%macro FOUR_COLS_FWD_PASS1 1

        ; ---- stage 1 : even part (author's estimate: ~20c) ----
        LOAD_BUTF   mm7, mm0, 0, 7, %1  ; 0, 7
        LOAD_BUTF   mm4, mm3, 6, 1, %1  ; 3, 4
        BUTF        mm0, mm3, mm6
        LOAD_BUTF   mm6, mm1, 2, 5, %1  ; 1, 6
        LOAD_BUTF   mm5, mm2, 4, 3, %1  ; 2, 5
        BUTF        mm1, mm2, [Spill]
        BUTF        mm3, mm2, [Spill]

        SHIFTL      mm0, PASSB
        SHIFTL      mm1, PASSB
        SHIFTL      mm2, PASSB
        SHIFTL      mm3, PASSB

        STORE       mm2, 0, %1          ; ->0
                                        ; mm3 stays live until its store

        ; C6 rotation between mm0 and mm1
        movq        mm2, mm0
        pmulhw      mm2, [tan_2_16_16b]
        STORE       mm3, 1, %1          ; ->4 (pending from above)
        paddw       mm2, mm0
        paddw       mm0, mm1
        movq        mm1, mm2            ; author's note: "TODO: Remove!"
        psubw       mm1, mm0
        paddw       mm0, mm2
        por         mm1, [Rounder]
        por         mm0, [Rounder]
                                        ; mm0, mm1 stay live until stored

        ; ---- stage 2 : odd part ----
        BUTF        mm6, mm5, mm2
        SHIFTL      mm4, PASSB+1
        SHIFTL      mm5, PASSB
        SHIFTL      mm6, PASSB
        SHIFTL      mm7, PASSB+1

        movq        mm2, mm5
        movq        mm3, mm6
        ; NOTE(review): PASS2 multiplies by ucos_4_16_16b at this point,
        ; PASS1 by tan_2_16_16b -- confirm this is intended.
        pmulhw      mm5, [tan_2_16_16b]
        STORE       mm0, 4, %1          ; ->2 (pending from stage 1)
        pmulhw      mm6, [tan_2_16_16b]
        STORE       mm1, 5, %1          ; ->6 (pending from stage 1)
        paddw       mm5, mm2
        paddw       mm6, mm3
        por         mm5, [Rounder]
        por         mm6, [Rounder]

        DBL_BUTF    mm7, mm5, mm0, mm4, mm6, mm1
        ROTATE16    mm5, mm6, tan_1_16_16b, mm0, mm1
        ROTATE15    mm4, mm7, tan_3_16_16b, mm2, mm3

        STORE       mm5, 2, %1          ; ->1
        STORE       mm6, 7, %1          ; ->7
        STORE       mm7, 6, %1          ; ->3
        STORE       mm4, 3, %1          ; ->5
%endmacro

; FOUR_COLS_FWD_PASS2 In, ScaleOffset
;   Second pass over eight quadwords addressed with a 16-byte stride.
;   Each output k is additionally scaled as
;       high_word( (x + Halves[k*16 + ScaleOffset])
;                      * Mults[k*16 + ScaleOffset] )
;   Uses mm0..mm7 and the memory scratch [Spill].
;   Instruction order is hand-interleaved; keep it as is.
%macro FOUR_COLS_FWD_PASS2 2

        ; ---- stage 1 : even part ----
        LOAD2_BUTF  mm7, mm0, 0, 7, %1  ; <- 0, 7
        LOAD2_BUTF  mm4, mm3, 3, 4, %1  ; <- 3, 4
        BUTF        mm0, mm3, mm6
        LOAD2_BUTF  mm6, mm1, 1, 6, %1  ; <- 1, 6
        LOAD2_BUTF  mm5, mm2, 2, 5, %1  ; <- 2, 5
        BUTF        mm1, mm2, [Spill]
        BUTF_SAFE   mm3, mm2, [Spill]   ; author's note: overflow-prone

        paddw       mm2, [Halves + 0*16 + %2]
        paddw       mm3, [Halves + 4*16 + %2]
        pmulhw      mm2, [Mults + 0*16 + %2]
        pmulhw      mm3, [Mults + 4*16 + %2]

        STORE2      mm2, 0, %1          ; ->0

        ; C6 rotation between mm0 and mm1
        movq        mm2, mm0
        pmulhw      mm2, [tan_2_16_16b]
        STORE2      mm3, 4, %1          ; ->4 (pending from above)
        paddw       mm2, mm0
        paddw       mm0, mm1
        movq        mm1, mm2            ; author's note: "TODO: Remove!"
        psubw       mm1, mm0
        paddw       mm0, mm2            ; author's note: overflow-prone,
                                        ; to be improved
        por         mm1, [Rounder]
        por         mm0, [Rounder]

        paddw       mm1, [Halves + 6*16 + %2]
        paddw       mm0, [Halves + 2*16 + %2]
        pmulhw      mm1, [Mults + 6*16 + %2]
        pmulhw      mm0, [Mults + 2*16 + %2]
                                        ; mm0, mm1 stay live until stored

        ; ---- stage 2 : odd part (C1/C3 rotations + 3 butterflies) ----
        BUTF        mm6, mm5, mm2

        movq        mm2, mm5
        movq        mm3, mm6
        pmulhw      mm5, [ucos_4_16_16b]
        pmulhw      mm6, [ucos_4_16_16b]
        STORE2      mm0, 2, %1          ; ->2 (pending from stage 1)
        STORE2      mm1, 6, %1          ; ->6
        paddw       mm5, mm2
        paddw       mm6, mm3
        ; NOTE(review): PASS1 also ORs Rounder into mm6 here -- confirm
        ; the asymmetry is intended.
        por         mm5, [Rounder]

        BUTF        mm7, mm5, mm0
        BUTF        mm4, mm6, mm0

        ROTATE16    mm5, mm6, tan_1_16_16b, mm0, mm1
        ROTATE15    mm4, mm7, tan_3_16_16b, mm2, mm3

        paddw       mm5, [Halves + 1*16 + %2]
        paddw       mm7, [Halves + 3*16 + %2]
        pmulhw      mm5, [Mults + 1*16 + %2]
        paddw       mm4, [Halves + 5*16 + %2]
        pmulhw      mm7, [Mults + 3*16 + %2]
        STORE2      mm5, 1, %1          ; ->1
        paddw       mm6, [Halves + 7*16 + %2]
        pmulhw      mm4, [Mults + 5*16 + %2]
        STORE2      mm7, 3, %1          ; ->3
        pmulhw      mm6, [Mults + 7*16 + %2]
        STORE2      mm4, 5, %1          ; ->5
        STORE2      mm6, 7, %1          ; ->7
%endmacro

;----------------------------------------------------------------------
; Skl_Dct16_LLM
;   In:     [esp+4] = pointer to the block, transformed in place
;           (8 rows of 8 words, 16-byte row stride).
;   Out:    nothing in registers; the block is overwritten.
;   Clobb:  ecx, mm0..mm7, the memory scratch at Spill.
;   Notes:  No emms is issued before ret.
;           NOTE(review): presumably the caller restores the FPU
;           state -- confirm.
;
;   Author's cycle estimates:
;       with interlaced final scale, total: 298c
;       transp. x 4:  80c
;       PASS1   x 2: 100c
;       PASS2   x 2: 121c
;       final scale:  46c
;       total: 347 cycles
;----------------------------------------------------------------------
Skl_Dct16_LLM:
        mov         ecx, [esp+4]        ; ecx = In

        ; pass 1, rows 0..3 then rows 4..7
        TRANSPOSE_8x4_MMX   ecx
        FOUR_COLS_FWD_PASS1 ecx
        TRANSPOSE_8x4_MMX   ecx + 4*16
        FOUR_COLS_FWD_PASS1 ecx+4*16

        ; pass 2, left four columns then right four columns
        TRANSPOSE_4x8_MMX   ecx
        FOUR_COLS_FWD_PASS2 ecx, 0
        TRANSPOSE_4x8_MMX   ecx+8
        FOUR_COLS_FWD_PASS2 ecx+8, 8

%if 0
        ; Disabled: separate descaling of rows 0..7. The author notes
        ; this phase could be fused with the quantizing stage.
%assign MULT2_ROW 0
%rep 8
        MULT2       ecx, MULT2_ROW
%assign MULT2_ROW MULT2_ROW+1
%endrep
%undef MULT2_ROW
%endif

        ret

; The C-style comment terminator at the end of the next line is kept
; verbatim from the original listing.
;//////////////////////////////////////////////////////////////////////*/
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -