📄 fdct_sse2_skal.asm

📁 这是一个Xvid的源代码
💻 ASM
📖 第 1 页 / 共 2 页
字号:
上一页 12
  movdqa xmm3, [sqrt2]  movdqa xmm7, xmm4  movdqa xmm6, xmm2  psubsw xmm4, xmm1       ; tp17-tp35 = t1  psubsw xmm2, xmm0       ; tm17-tm35 = b3  paddsw xmm1, xmm7       ; tp17+tp35 = b0  paddsw xmm0, xmm6       ; tm17+tm35 = t2    ; xmm1 = b0, xmm2 = b3. preserved  movdqa xmm6, xmm4  psubsw xmm4, xmm0       ; t1-t2  paddsw xmm0, xmm6       ; t1+t2  pmulhw xmm4, xmm3       ; (t1-t2)/(2.sqrt2)  pmulhw xmm0, xmm3       ; (t1+t2)/(2.sqrt2)  paddsw xmm0, xmm0       ; 2.(t1+t2) = b1  paddsw xmm4, xmm4       ; 2.(t1-t2) = b2  movdqa xmm7, [tan2]    ; t2  movdqa xmm3, [%1+2*16] ; x2  movdqa xmm6, [%1+6*16] ; x6  movdqa xmm5, xmm7       ; t2  pmulhw xmm7, xmm6       ; x6*t2  pmulhw xmm5, xmm3       ; x2*t2  paddsw xmm7, xmm3       ; x2+x6*t2 = tp26  psubsw xmm5, xmm6       ; x2*t2-x6 = tm26  ; use:xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2  movdqa xmm3, [%1+0*16] ; x0  movdqa xmm6, [%1+4*16] ; x4  psubsw xmm3, xmm6   ; x0-x4 = tm04  paddsw xmm6, xmm6   ; 2.x4  paddsw xmm6, xmm3   ; x0+x4 = tp04  psubsw xmm3, xmm5   ; tm04-tm26 = a2  psubsw xmm6, xmm7   ; tp04-tp26 = a3  paddsw xmm5, xmm5   ; 2.tm26  paddsw xmm7, xmm7   ; 2.tp26  paddsw xmm5, xmm3   ; tm04+tm26 = a1  paddsw xmm7, xmm6   ; tp04+tp26 = a0  psubsw xmm5, xmm0   ; a1-b1  psubsw xmm3, xmm4   ; a2-b2  paddsw xmm0, xmm0   ; 2.b1  paddsw xmm4, xmm4   ; 2.b2  paddsw xmm0, xmm5   ; a1+b1  paddsw xmm4, xmm3   ; a2+b2  psraw  xmm5, 6     ; out6  psraw  xmm3, 6     ; out5  psraw  xmm0, 6     ; out1  psraw  xmm4, 6     ; out2  movdqa [%1+6*16], xmm5  movdqa [%1+5*16], xmm3  movdqa [%1+1*16], xmm0  movdqa [%1+2*16], xmm4    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3  movdqa xmm0, xmm7  movdqa xmm4, xmm6  psubsw xmm7, xmm1   ; a0-b0  psubsw xmm6, xmm2   ; a3-b3  paddsw xmm1, xmm0   ; a0+b0  paddsw xmm2, xmm4   ; a3+b3  psraw  xmm1, 6     ; out0  psraw  xmm7, 6     ; out7  psraw  xmm2, 6     ; out3  psraw  xmm6, 6     ; out4  movdqa [%1+0*16], xmm1  movdqa [%1+3*16], xmm2  movdqa [%1+4*16], xmm6  movdqa [%1+7*16], 
xmm7
%endmacro
; NOTE(review): the two lines above are the tail of a macro whose %macro
; header lies outside this view ("xmm7" completes the store begun on the
; previous line). They are kept token-for-token.

;-----------------------------------------------------------------------------
; Function idct (the straight forward version)
;
; In:     [esp+4] = pointer to the coefficient block, processed in place as
;                   eight 16-byte rows (row n at ecx+n*16).
; Flow:   one iMTX_MULT per row, then a single iLLM_PASS over the block.
;         Rows 0..7 use tables iTab1,2,3,4,1,4,3,2 and rounders Idct_Rnd0..7.
; Clobb:  ecx, plus whatever iMTX_MULT / iLLM_PASS clobber (both macros are
;         defined outside this view).
;-----------------------------------------------------------------------------

ALIGN 16
idct_sse2_skal:
  mov ecx, [esp+4]                    ; ecx = block pointer

  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11

  iLLM_PASS ecx+0
  ret

;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;
; %1: address expression of a 16-byte row
; %2: label jumped to (near) when all 16 bytes of the row are zero
; Clobb: eax, edx, flags
;-----------------------------------------------------------------------------

%macro TEST_ROW 2
  mov eax, [%1   ]
  mov edx, [%1+ 8]
  or  eax, [%1+ 4]
  or  edx, [%1+12]
  or  eax, edx                        ; eax == 0 <=> the four dwords are all zero
  jz near %2
%endmacro

;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;
; In:     [esp+4] = pointer to the coefficient block, processed in place.
; Flow:   each row is tested with TEST_ROW before its iMTX_MULT.
;           rows 0..2: a null row is overwritten with Idct_Sparse_Rnd<n>
;                      (8 bytes, stored to both halves of the row) instead
;                      of being transformed.
;           rows 3..7: a null row is simply left untouched.
;         A single iLLM_PASS then runs over the whole block.
; Clobb:  eax, ecx, edx, flags, xmm0, plus whatever iMTX_MULT / iLLM_PASS
;         clobber (both macros are defined outside this view).
;
; The null-row stores use the SSE2 form of movq (xmm register) rather than
; the MMX form: the 8-byte load and the two 8-byte stores are the same, but
; no MMX register is written, so these paths do not leave the x87/MMX state
; dirty (the routine contains no emms).
; NOTE(review): Idct_Sparse_Rnd0..2 are defined outside this view; presumably
; they hold the result iMTX_MULT would give for an all-zero row - confirm.
;-----------------------------------------------------------------------------

ALIGN 16
idct_sse2_sparse_skal:
  mov ecx, [esp+ 4]                   ; Src

  TEST_ROW ecx, .Row0_Round
  iMTX_MULT  0, iTab1, Idct_Rnd0, 11
  jmp .Row1
.Row0_Round:
  movq xmm0, [Idct_Sparse_Rnd0]
  movq [ecx  ], xmm0
  movq [ecx+8], xmm0

.Row1:
  TEST_ROW ecx+16, .Row1_Round
  iMTX_MULT  1, iTab2, Idct_Rnd1, 11
  jmp .Row2
.Row1_Round:
  movq xmm0, [Idct_Sparse_Rnd1]
  movq [ecx+16  ], xmm0
  movq [ecx+16+8], xmm0

.Row2:
  TEST_ROW ecx+32, .Row2_Round
  iMTX_MULT  2, iTab3, Idct_Rnd2, 11
  jmp .Row3
.Row2_Round:
  movq xmm0, [Idct_Sparse_Rnd2]
  movq [ecx+32  ], xmm0
  movq [ecx+32+8], xmm0

.Row3:
  TEST_ROW ecx+48, .Row4
  iMTX_MULT  3, iTab4, Idct_Rnd3, 11
                                      ; falls through to .Row4
.Row4:
  TEST_ROW ecx+64, .Row5
  iMTX_MULT  4, iTab1, Idct_Rnd4, 11
                                      ; falls through to .Row5
.Row5:
  TEST_ROW ecx+80, .Row6
  iMTX_MULT  5, iTab4, Idct_Rnd5, 11

.Row6:
  TEST_ROW ecx+96, .Row7
  iMTX_MULT  6, iTab3, Idct_Rnd6, 11

.Row7:
  TEST_ROW ecx+112, .End
  iMTX_MULT  7, iTab2, Idct_Rnd7, 11

.End:
  iLLM_PASS ecx+0
  ret

;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;
; %1: src/dst address expression; eight 16-byte vectors In0..In7 are read
;     from %1+n*16 with movdqa and overwritten with out0..out7.
; %2: left-shift count applied to the intermediate terms (t5 and t6 are
;     shifted by %2+1, see the inline comment).
; Reads the constants tan1, tan2, tan3, sqrt2 and Rounder1, which are
; defined outside this view.
; Clobb: xmm0-xmm7
;-----------------------------------------------------------------------------

%macro fLLM_PASS 2
  movdqa xmm0, [%1+0*16]              ; In0
  movdqa xmm2, [%1+2*16]              ; In2
  movdqa xmm3, xmm0
  movdqa xmm4, xmm2
  movdqa xmm7, [%1+7*16]              ; In7
  movdqa xmm5, [%1+5*16]              ; In5

  psubsw xmm0, xmm7                   ; t7 = In0-In7
  paddsw xmm7, xmm3                   ; t0 = In0+In7
  psubsw xmm2, xmm5                   ; t5 = In2-In5
  paddsw xmm5, xmm4                   ; t2 = In2+In5

  movdqa xmm3, [%1+3*16]              ; In3
  movdqa xmm4, [%1+4*16]              ; In4
  movdqa xmm1, xmm3
  psubsw xmm3, xmm4                   ; t4 = In3-In4
  paddsw xmm4, xmm1                   ; t3 = In3+In4
  movdqa xmm6, [%1+6*16]              ; In6
  movdqa xmm1, [%1+1*16]              ; In1
  psubsw xmm1, xmm6                   ; t6 = In1-In6
  paddsw xmm6, [%1+1*16]              ; t1 = In1+In6

  psubsw xmm7, xmm4                   ; tm03 = t0-t3
  psubsw xmm6, xmm5                   ; tm12 = t1-t2
  paddsw xmm4, xmm4                   ; 2.t3
  paddsw xmm5, xmm5                   ; 2.t2
  paddsw xmm4, xmm7                   ; tp03 = t0+t3
  paddsw xmm5, xmm6                   ; tp12 = t1+t2

  psllw  xmm2, %2+1                   ; shift t5 (shift +1 to..
  psllw  xmm1, %2+1                   ; shift t6  ..compensate cos4/2)
  psllw  xmm4, %2                     ; shift t3
  psllw  xmm5, %2                     ; shift t2
  psllw  xmm7, %2                     ; shift t0
  psllw  xmm6, %2                     ; shift t1
  psllw  xmm3, %2                     ; shift t4
  psllw  xmm0, %2                     ; shift t7

  psubsw xmm4, xmm5                   ; out4 = tp03-tp12
  psubsw xmm1, xmm2                   ; xmm1: t6-t5
  paddsw xmm5, xmm5
  paddsw xmm2, xmm2
  paddsw xmm5, xmm4                   ; out0 = tp03+tp12
  movdqa [%1+4*16], xmm4              ; => out4
  paddsw xmm2, xmm1                   ; xmm2: t6+t5
  movdqa [%1+0*16], xmm5              ; => out0

  movdqa xmm4, [tan2]                 ; xmm4 <= tan2
  pmulhw xmm4, xmm7                   ; tm03*tan2
  movdqa xmm5, [tan2]                 ; xmm5 <= tan2
  psubsw xmm4, xmm6                   ; out6 = tm03*tan2 - tm12
  pmulhw xmm5, xmm6                   ; tm12*tan2
  paddsw xmm5, xmm7                   ; out2 = tm12*tan2 + tm03

  movdqa xmm6, [sqrt2]
  movdqa xmm7, [Rounder1]

  pmulhw xmm2, xmm6                   ; xmm2: tp65 = (t6 + t5)*cos4
  por    xmm5, xmm7                   ; correct out2
  por    xmm4, xmm7                   ; correct out6
  pmulhw xmm1, xmm6                   ; xmm1: tm65 = (t6 - t5)*cos4
  por    xmm2, xmm7                   ; correct tp65

  movdqa [%1+2*16], xmm5              ; => out2
  movdqa xmm5, xmm3                   ; save t4
  movdqa [%1+6*16], xmm4              ; => out6
  movdqa xmm4, xmm0                   ; save t7

  psubsw xmm3, xmm1                   ; xmm3: tm465 = t4 - tm65
  psubsw xmm0, xmm2                   ; xmm0: tm765 = t7 - tp65
  paddsw xmm2, xmm4                   ; xmm2: tp765 = t7 + tp65
  paddsw xmm1, xmm5                   ; xmm1: tp465 = t4 + tm65

  movdqa xmm4, [tan3]                 ; tan3 - 1
  movdqa xmm5, [tan1]                 ; tan1

  movdqa xmm7, xmm3                   ; save tm465
  pmulhw xmm3, xmm4                   ; tm465*(tan3-1)
  movdqa xmm6, xmm1                   ; save tp465
  pmulhw xmm1, xmm5                   ; tp465*tan1

  paddsw xmm3, xmm7                   ; tm465*tan3
  pmulhw xmm4, xmm0                   ; tm765*(tan3-1)
  paddsw xmm4, xmm0                   ; tm765*tan3
  pmulhw xmm5, xmm2                   ; tp765*tan1

  paddsw xmm1, xmm2                   ; out1 = tp765 + tp465*tan1
  psubsw xmm0, xmm3                   ; out3 = tm765 - tm465*tan3
  paddsw xmm7, xmm4                   ; out5 = tm465 + tm765*tan3
  psubsw xmm5, xmm6                   ; out7 =-tp465 + tp765*tan1

  movdqa [%1+1*16], xmm1              ; => out1
  movdqa [%1+3*16], xmm0              ; => out3
  movdqa [%1+5*16], xmm7              ; => out5
  movdqa [%1+7*16], xmm5              ; => out7
%endmacro

;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;
; %1: row index; the row is read from and written back to [ecx+%1*16]
;     (ecx is used directly as the block base, it is not a macro parameter)
; %2: coefficient table, read as four 16-byte vectors at %2+0/16/32/48
; %3: address of the 16-byte rounder added before the final shift
; Clobb: xmm0-xmm3
;-----------------------------------------------------------------------------

%macro fMTX_MULT 3
  movdqa   xmm0, [ecx+%1*16+0]        ; xmm0 = [0123][4567]
  pshufhw  xmm1, xmm0, 00011011b      ; xmm1 = [----][7654]
  pshufd   xmm0, xmm0, 01000100b
  pshufd   xmm1, xmm1, 11101110b

  movdqa   xmm2, xmm0
  paddsw   xmm0, xmm1                 ; xmm0 = [a0 a1 a2 a3]
  psubsw   xmm2, xmm1                 ; xmm2 = [b0 b1 b2 b3]

  punpckldq xmm0, xmm2                ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
  pshufd    xmm2, xmm0, 01001110b     ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]

    ;  [M00 M01    M16 M17] [M06 M07    M22 M23]  x xmm0 = [0 /1 /2'/3']
    ;  [M02 M03    M18 M19] [M04 M05    M20 M21]  x xmm2 = [0'/1'/2 /3 ]
    ;  [M08 M09    M24 M25] [M14 M15    M30 M31]  x xmm0 = [4 /5 /6'/7']
    ;  [M10 M11    M26 M27] [M12 M13    M28 M29]  x xmm2 = [4'/5'/6 /7 ]

  movdqa  xmm1, [%2+16]
  movdqa  xmm3, [%2+32]
  pmaddwd xmm1, xmm2
  pmaddwd xmm3, xmm0
  pmaddwd xmm2, [%2+48]
  pmaddwd xmm0, [%2+ 0]

  paddd   xmm0, xmm1                  ;  [ out0 | out1 ][ out2 | out3 ]
  paddd   xmm2, xmm3                  ;  [ out4 | out5 ][ out6 | out7 ]
  psrad   xmm0, 16
  psrad   xmm2, 16

  packssdw xmm0, xmm2                 ;  [ out0 .. out7 ]
  paddsw   xmm0, [%3]                 ;  Round

  psraw    xmm0, 4                    ; => [-2048, 2047]

  movdqa  [ecx+%1*16+0], xmm0
%endmacro

;-----------------------------------------------------------------------------
; Function Forward DCT
;
; In:     [esp+4] = pointer to the block, transformed in place. It is
;                   accessed with movdqa, so it must be 16-byte aligned.
; Flow:   one fLLM_PASS (shift 3) over the whole block, then one fMTX_MULT
;         per row. Rows 0..7 use tables fTab1,2,3,4,1,4,3,2.
; Clobb:  ecx, xmm0-xmm7
;
; NOTE(review): row 1 uses Fdct_Rnd2 while rows 2,3,5,6,7 use Fdct_Rnd1 and
; rows 0,4 use Fdct_Rnd0. The rounder assignment is preserved exactly as
; found; confirm it is intentional.
;-----------------------------------------------------------------------------

ALIGN 16
fdct_sse2_skal:
  mov ecx, [esp+4]                    ; ecx = block pointer

  fLLM_PASS ecx+0, 3

  fMTX_MULT  0, fTab1, Fdct_Rnd0
  fMTX_MULT  1, fTab2, Fdct_Rnd2
  fMTX_MULT  2, fTab3, Fdct_Rnd1
  fMTX_MULT  3, fTab4, Fdct_Rnd1
  fMTX_MULT  4, fTab1, Fdct_Rnd0
  fMTX_MULT  5, fTab4, Fdct_Rnd1
  fMTX_MULT  6, fTab3, Fdct_Rnd1
  fMTX_MULT  7, fTab2, Fdct_Rnd1
  ret
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -