; fdct_sse2_skal.asm
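;-----------------------------------------------------------------------------
; NB: this listing is a fragment. It opens inside the tail of the iLLM_PASS
; helper macro (the iDCT's final butterfly/descale pass); the macro's opening
; lines and the data tables it references (tan1/tan2/tan3, sqrt2, Rounder1,
; Walken_Idct_Rounders, iTab1-4, fTab1-4, Fdct_Rnd0-2) are defined elsewhere
; in the original file.
;-----------------------------------------------------------------------------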
    movdqa  xmm7, [tan2]       ; t2
    movdqa  xmm3, [%1+2*16]    ; x2
    movdqa  xmm6, [%1+6*16]    ; x6
    movdqa  xmm5, xmm7         ; t2
    pmulhw  xmm7, xmm6         ; x6*t2
    pmulhw  xmm5, xmm3         ; x2*t2
    paddsw  xmm7, xmm3         ; x2+x6*t2 = tp26
    psubsw  xmm5, xmm6         ; x2*t2-x6 = tm26

    ; use: xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2

    movdqa  xmm3, [%1+0*16]    ; x0
    movdqa  xmm6, [%1+4*16]    ; x4
    movdqa  [%1], xmm2         ; we spill 1 reg to perform safe butterflies
    movdqa  xmm2, xmm3
    psubsw  xmm3, xmm6         ; x0-x4 = tm04
    paddsw  xmm6, xmm2         ; x0+x4 = tp04
    movdqa  xmm2, xmm6
    psubsw  xmm6, xmm7
    paddsw  xmm7, xmm2
    movdqa  xmm2, xmm3
    psubsw  xmm3, xmm5
    paddsw  xmm5, xmm2
    movdqa  xmm2, xmm5
    psubsw  xmm5, xmm0
    paddsw  xmm0, xmm2
    movdqa  xmm2, xmm3
    psubsw  xmm3, xmm4
    paddsw  xmm4, xmm2
    movdqa  xmm2, [%1]
    psraw   xmm5, 6            ; out6
    psraw   xmm3, 6            ; out5
    psraw   xmm0, 6            ; out1
    psraw   xmm4, 6            ; out2
    movdqa  [%1+6*16], xmm5
    movdqa  [%1+5*16], xmm3
    movdqa  [%1+1*16], xmm0
    movdqa  [%1+2*16], xmm4

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3

    movdqa  xmm0, xmm7
    movdqa  xmm4, xmm6
    psubsw  xmm7, xmm1         ; a0-b0
    psubsw  xmm6, xmm2         ; a3-b3
    paddsw  xmm1, xmm0         ; a0+b0
    paddsw  xmm2, xmm4         ; a3+b3
    psraw   xmm1, 6            ; out0
    psraw   xmm7, 6            ; out7
    psraw   xmm2, 6            ; out3
    psraw   xmm6, 6            ; out4

    ; store result
    movdqa  [%1+0*16], xmm1
    movdqa  [%1+3*16], xmm2
    movdqa  [%1+4*16], xmm6
    movdqa  [%1+7*16], xmm7
%endmacro

;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;-----------------------------------------------------------------------------

%macro TEST_ROW 2 ; %1: src, %2: label x8
    mov  eax, [%1   ]
    mov  edx, [%1+ 8]
    or   eax, [%1+ 4]
    or   edx, [%1+12]
    or   eax, edx
    jz near %2
%endmacro

;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;-----------------------------------------------------------------------------
; IEEE1180 and Walken compatible version

ALIGN 16
idct_sse2_skal:
    mov ecx, [esp+4]           ; Src

    TEST_ROW ecx, .Row0_Round
    iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11
    jmp .Row1
.Row0_Round
    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*0]
    movdqa [ecx], xmm0

.Row1
    TEST_ROW ecx+16, .Row1_Round
    iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11
    jmp .Row2
.Row1_Round
    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
    movdqa [ecx+16], xmm0

.Row2
    TEST_ROW ecx+32, .Row2_Round
    iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11
    jmp .Row3
.Row2_Round
    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
    movdqa [ecx+32], xmm0

.Row3
    TEST_ROW ecx+48, .Row4
    iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11
.Row4
    TEST_ROW ecx+64, .Row5
    iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11
.Row5
    TEST_ROW ecx+80, .Row6
    iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11
.Row6
    TEST_ROW ecx+96, .Row7
    iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11
.Row7
    TEST_ROW ecx+112, .End
    iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11
.End
    iLLM_PASS ecx
    ret
.endfunc
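;-----------------------------------------------------------------------------
; Note: TEST_ROW ORs a row's eight 16-bit coefficients together as four
; 32-bit loads and branches to %2 when the result is zero, which lets
; idct_sse2_skal replace a whole row multiply by a single store of that
; row's rounder. A scalar C sketch of the same test (hypothetical helper
; name; the cast mirrors the 32-bit loads and assumes an aligned row):
;
;   #include <stdint.h>
;
;   static int row_is_zero(const int16_t *row)  /* row = 8 coefficients */
;   {
;       const uint32_t *p = (const uint32_t *)row;
;       return (p[0] | p[1] | p[2] | p[3]) == 0;
;   }
;-----------------------------------------------------------------------------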
;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;-----------------------------------------------------------------------------

%macro fLLM_PASS 2 ; %1: src/dst, %2: Shift
    movdqa  xmm0, [%1+0*16]    ; In0
    movdqa  xmm2, [%1+2*16]    ; In2
    movdqa  xmm3, xmm0
    movdqa  xmm4, xmm2
    movdqa  xmm7, [%1+7*16]    ; In7
    movdqa  xmm5, [%1+5*16]    ; In5

    psubsw  xmm0, xmm7         ; t7 = In0-In7
    paddsw  xmm7, xmm3         ; t0 = In0+In7
    psubsw  xmm2, xmm5         ; t5 = In2-In5
    paddsw  xmm5, xmm4         ; t2 = In2+In5

    movdqa  xmm3, [%1+3*16]    ; In3
    movdqa  xmm4, [%1+4*16]    ; In4
    movdqa  xmm1, xmm3
    psubsw  xmm3, xmm4         ; t4 = In3-In4
    paddsw  xmm4, xmm1         ; t3 = In3+In4
    movdqa  xmm6, [%1+6*16]    ; In6
    movdqa  xmm1, [%1+1*16]    ; In1
    psubsw  xmm1, xmm6         ; t6 = In1-In6
    paddsw  xmm6, [%1+1*16]    ; t1 = In1+In6

    psubsw  xmm7, xmm4         ; tm03 = t0-t3
    psubsw  xmm6, xmm5         ; tm12 = t1-t2
    paddsw  xmm4, xmm4         ; 2.t3
    paddsw  xmm5, xmm5         ; 2.t2
    paddsw  xmm4, xmm7         ; tp03 = t0+t3
    paddsw  xmm5, xmm6         ; tp12 = t1+t2

    psllw   xmm2, %2+1         ; shift t5 (shift +1 to..
    psllw   xmm1, %2+1         ; shift t6  ..compensate cos4/2)
    psllw   xmm4, %2           ; shift t3
    psllw   xmm5, %2           ; shift t2
    psllw   xmm7, %2           ; shift t0
    psllw   xmm6, %2           ; shift t1
    psllw   xmm3, %2           ; shift t4
    psllw   xmm0, %2           ; shift t7

    psubsw  xmm4, xmm5         ; out4 = tp03-tp12
    psubsw  xmm1, xmm2         ; xmm1: t6-t5
    paddsw  xmm5, xmm5
    paddsw  xmm2, xmm2
    paddsw  xmm5, xmm4         ; out0 = tp03+tp12
    movdqa  [%1+4*16], xmm4    ; => out4
    paddsw  xmm2, xmm1         ; xmm2: t6+t5
    movdqa  [%1+0*16], xmm5    ; => out0

    movdqa  xmm4, [tan2]       ; xmm4 <= tan2
    pmulhw  xmm4, xmm7         ; tm03*tan2
    movdqa  xmm5, [tan2]       ; xmm5 <= tan2
    psubsw  xmm4, xmm6         ; out6 = tm03*tan2 - tm12
    pmulhw  xmm5, xmm6         ; tm12*tan2
    paddsw  xmm5, xmm7         ; out2 = tm12*tan2 + tm03

    movdqa  xmm6, [sqrt2]
    movdqa  xmm7, [Rounder1]
    pmulhw  xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
    por     xmm5, xmm7         ; correct out2
    por     xmm4, xmm7         ; correct out6
    pmulhw  xmm1, xmm6         ; xmm1: tm65 = (t6 - t5)*cos4
    por     xmm2, xmm7         ; correct tp65
    movdqa  [%1+2*16], xmm5    ; => out2
    movdqa  xmm5, xmm3         ; save t4
    movdqa  [%1+6*16], xmm4    ; => out6
    movdqa  xmm4, xmm0         ; save t7

    psubsw  xmm3, xmm1         ; xmm3: tm465 = t4 - tm65
    psubsw  xmm0, xmm2         ; xmm0: tm765 = t7 - tp65
    paddsw  xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
    paddsw  xmm1, xmm5         ; xmm1: tp465 = t4 + tm65

    movdqa  xmm4, [tan3]       ; tan3 - 1
    movdqa  xmm5, [tan1]       ; tan1
    movdqa  xmm7, xmm3         ; save tm465
    pmulhw  xmm3, xmm4         ; tm465*(tan3-1)
    movdqa  xmm6, xmm1         ; save tp465
    pmulhw  xmm1, xmm5         ; tp465*tan1
    paddsw  xmm3, xmm7         ; tm465*tan3
    pmulhw  xmm4, xmm0         ; tm765*(tan3-1)
    paddsw  xmm4, xmm0         ; tm765*tan3
    pmulhw  xmm5, xmm2         ; tp765*tan1
    paddsw  xmm1, xmm2         ; out1 = tp765 + tp465*tan1
    psubsw  xmm0, xmm3         ; out3 = tm765 - tm465*tan3
    paddsw  xmm7, xmm4         ; out5 = tm465 + tm765*tan3
    psubsw  xmm5, xmm6         ; out7 = -tp465 + tp765*tan1

    movdqa  [%1+1*16], xmm1    ; => out1
    movdqa  [%1+3*16], xmm0    ; => out3
    movdqa  [%1+5*16], xmm7    ; => out5
    movdqa  [%1+7*16], xmm5    ; => out7
%endmacro
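;-----------------------------------------------------------------------------
; Note: pmulhw keeps only the high 16 bits of each signed 16x16 product,
; i.e. (x*c) >> 16, so the multiplier tables hold Q16 fixed-point constants
; and a factor of 0.5 or more cannot be stored directly. That is why [tan3]
; holds tan3-1 (the input is added back after the multiply), why [sqrt2]
; effectively carries cos4/2 (compensated by the extra pre-shift of t5/t6
; above), and why the results are or-ed with [Rounder1] (per the "correct"
; comments, a cheap rounding fix-up). A scalar C sketch of the tan3 trick
; (constant value assumed, not taken from this listing's tables):
;
;   #include <stdint.h>
;
;   static int16_t mul_tan3(int16_t x)
;   {
;       const int32_t TAN3_MINUS_1 = -21746; /* ~ (tan(3*pi/16)-1)*65536 */
;       int32_t hi = ((int32_t)x * TAN3_MINUS_1) >> 16;  /* pmulhw      */
;       return (int16_t)(hi + x);            /* x*(tan3-1) + x = x*tan3 */
;   }
;-----------------------------------------------------------------------------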
;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;-----------------------------------------------------------------------------

%macro fMTX_MULT 3 ; %1: src row, %2: Coeffs, %3: rounders
    movdqa  xmm0, [ecx+%1*16+0]     ; xmm0 = [0123][4567]
    pshufhw xmm1, xmm0, 00011011b   ; xmm1 = [----][7654]
    pshufd  xmm0, xmm0, 01000100b
    pshufd  xmm1, xmm1, 11101110b

    movdqa  xmm2, xmm0
    paddsw  xmm0, xmm1              ; xmm0 = [a0 a1 a2 a3]
    psubsw  xmm2, xmm1              ; xmm2 = [b0 b1 b2 b3]

    punpckldq xmm0, xmm2            ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
    pshufd  xmm2, xmm0, 01001110b   ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]

    ; [M00 M01 M16 M17] [M06 M07 M22 M23]   x mm0 = [0 /1 /2'/3']
    ; [M02 M03 M18 M19] [M04 M05 M20 M21]   x mm2 = [0'/1'/2 /3 ]
    ; [M08 M09 M24 M25] [M14 M15 M30 M31]   x mm0 = [4 /5 /6'/7']
    ; [M10 M11 M26 M27] [M12 M13 M28 M29]   x mm2 = [4'/5'/6 /7 ]

    movdqa  xmm1, [%2+16]
    movdqa  xmm3, [%2+32]
    pmaddwd xmm1, xmm2
    pmaddwd xmm3, xmm0
    pmaddwd xmm2, [%2+48]
    pmaddwd xmm0, [%2+ 0]
    paddd   xmm0, xmm1              ; [ out0 | out1 ][ out2 | out3 ]
    paddd   xmm2, xmm3              ; [ out4 | out5 ][ out6 | out7 ]
    psrad   xmm0, 16
    psrad   xmm2, 16
    packssdw xmm0, xmm2             ; [ out0 .. out7 ]
    paddsw  xmm0, [%3]              ; Round
    psraw   xmm0, 4                 ; => [-2048, 2047]
    movdqa  [ecx+%1*16+0], xmm0
%endmacro

;-----------------------------------------------------------------------------
; Function Forward DCT
;-----------------------------------------------------------------------------

ALIGN 16
fdct_sse2_skal:
    mov ecx, [esp+4]
    fLLM_PASS ecx+0, 3
    fMTX_MULT 0, fTab1, Fdct_Rnd0
    fMTX_MULT 1, fTab2, Fdct_Rnd2
    fMTX_MULT 2, fTab3, Fdct_Rnd1
    fMTX_MULT 3, fTab4, Fdct_Rnd1
    fMTX_MULT 4, fTab1, Fdct_Rnd0
    fMTX_MULT 5, fTab4, Fdct_Rnd1
    fMTX_MULT 6, fTab3, Fdct_Rnd1
    fMTX_MULT 7, fTab2, Fdct_Rnd1
    ret
.endfunc
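;-----------------------------------------------------------------------------
; Note: fMTX_MULT exploits the even/odd symmetry of the 8-point DCT. Each
; row is folded into four sums and four differences, after which each
; pmaddwd pair against the fTab* coefficients accumulates one 32-bit output
; per dword lane. A C sketch of the folding step (ignoring the saturation
; of paddsw/psubsw; names are illustrative only):
;
;   #include <stdint.h>
;
;   static void fold_row(const int16_t x[8], int16_t a[4], int16_t b[4])
;   {
;       for (int i = 0; i < 4; i++) {
;           a[i] = (int16_t)(x[i] + x[7 - i]); /* feeds even outputs */
;           b[i] = (int16_t)(x[i] - x[7 - i]); /* feeds odd outputs  */
;       }
;   }
;
; Both entry points read a single argument at [esp+4], i.e. a cdecl function
; taking a pointer to the 8x8 block of 16-bit coefficients, so a prototype
; such as  void fdct_sse2_skal(int16_t block[64]);  is implied (assumed, not
; shown in this listing).
;-----------------------------------------------------------------------------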