; fdct_sse2_skal.asm
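;-----------------------------------------------------------------------------
; NB: this listing is a fragment. It opens inside the tail of the iLLM_PASS
; helper macro (the iDCT's final butterfly/descale pass); the macro's opening
; lines and the data tables it references (tan1/tan2/tan3, sqrt2, Rounder1,
; Walken_Idct_Rounders, iTab1-4, fTab1-4, Fdct_Rnd0-2) are defined elsewhere
; in the original file.
;-----------------------------------------------------------------------------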
    movdqa  xmm7, [tan2]       ; t2
    movdqa  xmm3, [%1+2*16]    ; x2
    movdqa  xmm6, [%1+6*16]    ; x6
    movdqa  xmm5, xmm7         ; t2
    pmulhw  xmm7, xmm6         ; x6*t2
    pmulhw  xmm5, xmm3         ; x2*t2
    paddsw  xmm7, xmm3         ; x2+x6*t2 = tp26
    psubsw  xmm5, xmm6         ; x2*t2-x6 = tm26

    ; use: xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2

    movdqa  xmm3, [%1+0*16]    ; x0
    movdqa  xmm6, [%1+4*16]    ; x4
    movdqa  [%1], xmm2         ; we spill 1 reg to perform safe butterflies
    movdqa  xmm2, xmm3
    psubsw  xmm3, xmm6         ; x0-x4 = tm04
    paddsw  xmm6, xmm2         ; x0+x4 = tp04
    movdqa  xmm2, xmm6
    psubsw  xmm6, xmm7
    paddsw  xmm7, xmm2
    movdqa  xmm2, xmm3
    psubsw  xmm3, xmm5
    paddsw  xmm5, xmm2
    movdqa  xmm2, xmm5
    psubsw  xmm5, xmm0
    paddsw  xmm0, xmm2
    movdqa  xmm2, xmm3
    psubsw  xmm3, xmm4
    paddsw  xmm4, xmm2
    movdqa  xmm2, [%1]
    psraw   xmm5, 6            ; out6
    psraw   xmm3, 6            ; out5
    psraw   xmm0, 6            ; out1
    psraw   xmm4, 6            ; out2
    movdqa  [%1+6*16], xmm5
    movdqa  [%1+5*16], xmm3
    movdqa  [%1+1*16], xmm0
    movdqa  [%1+2*16], xmm4

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3

    movdqa  xmm0, xmm7
    movdqa  xmm4, xmm6
    psubsw  xmm7, xmm1         ; a0-b0
    psubsw  xmm6, xmm2         ; a3-b3
    paddsw  xmm1, xmm0         ; a0+b0
    paddsw  xmm2, xmm4         ; a3+b3
    psraw   xmm1, 6            ; out0
    psraw   xmm7, 6            ; out7
    psraw   xmm2, 6            ; out3
    psraw   xmm6, 6            ; out4

    ; store result
    movdqa  [%1+0*16], xmm1
    movdqa  [%1+3*16], xmm2
    movdqa  [%1+4*16], xmm6
    movdqa  [%1+7*16], xmm7
%endmacro

;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;-----------------------------------------------------------------------------

%macro TEST_ROW 2 ; %1: src, %2: label x8
    mov  eax, [%1   ]
    mov  edx, [%1+ 8]
    or   eax, [%1+ 4]
    or   edx, [%1+12]
    or   eax, edx
    jz near %2
%endmacro

;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;-----------------------------------------------------------------------------
; IEEE1180 and Walken compatible version

ALIGN 16
idct_sse2_skal:
    mov ecx, [esp+4]           ; Src

    TEST_ROW ecx, .Row0_Round
    iMTX_MULT 0, iTab1, Walken_Idct_Rounders + 16*0, 11
    jmp .Row1
.Row0_Round
    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*0]
    movdqa [ecx], xmm0

.Row1
    TEST_ROW ecx+16, .Row1_Round
    iMTX_MULT 1, iTab2, Walken_Idct_Rounders + 16*1, 11
    jmp .Row2
.Row1_Round
    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
    movdqa [ecx+16], xmm0

.Row2
    TEST_ROW ecx+32, .Row2_Round
    iMTX_MULT 2, iTab3, Walken_Idct_Rounders + 16*2, 11
    jmp .Row3
.Row2_Round
    movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
    movdqa [ecx+32], xmm0

.Row3
    TEST_ROW ecx+48, .Row4
    iMTX_MULT 3, iTab4, Walken_Idct_Rounders + 16*3, 11
.Row4
    TEST_ROW ecx+64, .Row5
    iMTX_MULT 4, iTab1, Walken_Idct_Rounders + 16*4, 11
.Row5
    TEST_ROW ecx+80, .Row6
    iMTX_MULT 5, iTab4, Walken_Idct_Rounders + 16*5, 11
.Row6
    TEST_ROW ecx+96, .Row7
    iMTX_MULT 6, iTab3, Walken_Idct_Rounders + 16*6, 11
.Row7
    TEST_ROW ecx+112, .End
    iMTX_MULT 7, iTab2, Walken_Idct_Rounders + 16*7, 11
.End
    iLLM_PASS ecx
    ret
.endfunc
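;-----------------------------------------------------------------------------
; Note: TEST_ROW ORs a row's eight 16-bit coefficients together as four
; 32-bit loads and branches to %2 when the result is zero, which lets
; idct_sse2_skal replace a whole row multiply by a single store of that
; row's rounder. A scalar C sketch of the same test (hypothetical helper
; name; the cast mirrors the 32-bit loads and assumes an aligned row):
;
;   #include <stdint.h>
;
;   static int row_is_zero(const int16_t *row)  /* row = 8 coefficients */
;   {
;       const uint32_t *p = (const uint32_t *)row;
;       return (p[0] | p[1] | p[2] | p[3]) == 0;
;   }
;-----------------------------------------------------------------------------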
;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;-----------------------------------------------------------------------------

%macro fLLM_PASS 2 ; %1: src/dst, %2: Shift
    movdqa  xmm0, [%1+0*16]    ; In0
    movdqa  xmm2, [%1+2*16]    ; In2
    movdqa  xmm3, xmm0
    movdqa  xmm4, xmm2
    movdqa  xmm7, [%1+7*16]    ; In7
    movdqa  xmm5, [%1+5*16]    ; In5

    psubsw  xmm0, xmm7         ; t7 = In0-In7
    paddsw  xmm7, xmm3         ; t0 = In0+In7
    psubsw  xmm2, xmm5         ; t5 = In2-In5
    paddsw  xmm5, xmm4         ; t2 = In2+In5

    movdqa  xmm3, [%1+3*16]    ; In3
    movdqa  xmm4, [%1+4*16]    ; In4
    movdqa  xmm1, xmm3
    psubsw  xmm3, xmm4         ; t4 = In3-In4
    paddsw  xmm4, xmm1         ; t3 = In3+In4
    movdqa  xmm6, [%1+6*16]    ; In6
    movdqa  xmm1, [%1+1*16]    ; In1
    psubsw  xmm1, xmm6         ; t6 = In1-In6
    paddsw  xmm6, [%1+1*16]    ; t1 = In1+In6

    psubsw  xmm7, xmm4         ; tm03 = t0-t3
    psubsw  xmm6, xmm5         ; tm12 = t1-t2
    paddsw  xmm4, xmm4         ; 2.t3
    paddsw  xmm5, xmm5         ; 2.t2
    paddsw  xmm4, xmm7         ; tp03 = t0+t3
    paddsw  xmm5, xmm6         ; tp12 = t1+t2

    psllw   xmm2, %2+1         ; shift t5 (shift +1 to..
    psllw   xmm1, %2+1         ; shift t6  ..compensate cos4/2)
    psllw   xmm4, %2           ; shift t3
    psllw   xmm5, %2           ; shift t2
    psllw   xmm7, %2           ; shift t0
    psllw   xmm6, %2           ; shift t1
    psllw   xmm3, %2           ; shift t4
    psllw   xmm0, %2           ; shift t7

    psubsw  xmm4, xmm5         ; out4 = tp03-tp12
    psubsw  xmm1, xmm2         ; xmm1: t6-t5
    paddsw  xmm5, xmm5
    paddsw  xmm2, xmm2
    paddsw  xmm5, xmm4         ; out0 = tp03+tp12
    movdqa  [%1+4*16], xmm4    ; => out4
    paddsw  xmm2, xmm1         ; xmm2: t6+t5
    movdqa  [%1+0*16], xmm5    ; => out0

    movdqa  xmm4, [tan2]       ; xmm4 <= tan2
    pmulhw  xmm4, xmm7         ; tm03*tan2
    movdqa  xmm5, [tan2]       ; xmm5 <= tan2
    psubsw  xmm4, xmm6         ; out6 = tm03*tan2 - tm12
    pmulhw  xmm5, xmm6         ; tm12*tan2
    paddsw  xmm5, xmm7         ; out2 = tm12*tan2 + tm03

    movdqa  xmm6, [sqrt2]
    movdqa  xmm7, [Rounder1]
    pmulhw  xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
    por     xmm5, xmm7         ; correct out2
    por     xmm4, xmm7         ; correct out6
    pmulhw  xmm1, xmm6         ; xmm1: tm65 = (t6 - t5)*cos4
    por     xmm2, xmm7         ; correct tp65
    movdqa  [%1+2*16], xmm5    ; => out2
    movdqa  xmm5, xmm3         ; save t4
    movdqa  [%1+6*16], xmm4    ; => out6
    movdqa  xmm4, xmm0         ; save t7

    psubsw  xmm3, xmm1         ; xmm3: tm465 = t4 - tm65
    psubsw  xmm0, xmm2         ; xmm0: tm765 = t7 - tp65
    paddsw  xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
    paddsw  xmm1, xmm5         ; xmm1: tp465 = t4 + tm65

    movdqa  xmm4, [tan3]       ; tan3 - 1
    movdqa  xmm5, [tan1]       ; tan1
    movdqa  xmm7, xmm3         ; save tm465
    pmulhw  xmm3, xmm4         ; tm465*(tan3-1)
    movdqa  xmm6, xmm1         ; save tp465
    pmulhw  xmm1, xmm5         ; tp465*tan1
    paddsw  xmm3, xmm7         ; tm465*tan3
    pmulhw  xmm4, xmm0         ; tm765*(tan3-1)
    paddsw  xmm4, xmm0         ; tm765*tan3
    pmulhw  xmm5, xmm2         ; tp765*tan1
    paddsw  xmm1, xmm2         ; out1 = tp765 + tp465*tan1
    psubsw  xmm0, xmm3         ; out3 = tm765 - tm465*tan3
    paddsw  xmm7, xmm4         ; out5 = tm465 + tm765*tan3
    psubsw  xmm5, xmm6         ; out7 = -tp465 + tp765*tan1

    movdqa  [%1+1*16], xmm1    ; => out1
    movdqa  [%1+3*16], xmm0    ; => out3
    movdqa  [%1+5*16], xmm7    ; => out5
    movdqa  [%1+7*16], xmm5    ; => out7
%endmacro
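;-----------------------------------------------------------------------------
; Note: pmulhw keeps only the high 16 bits of each signed 16x16 product,
; i.e. (x*c) >> 16, so the multiplier tables hold Q16 fixed-point constants
; and a factor of 0.5 or more cannot be stored directly. That is why [tan3]
; holds tan3-1 (the input is added back after the multiply), why [sqrt2]
; effectively carries cos4/2 (compensated by the extra pre-shift of t5/t6
; above), and why the results are or-ed with [Rounder1] (per the "correct"
; comments, a cheap rounding fix-up). A scalar C sketch of the tan3 trick
; (constant value assumed, not taken from this listing's tables):
;
;   #include <stdint.h>
;
;   static int16_t mul_tan3(int16_t x)
;   {
;       const int32_t TAN3_MINUS_1 = -21746; /* ~ (tan(3*pi/16)-1)*65536 */
;       int32_t hi = ((int32_t)x * TAN3_MINUS_1) >> 16;  /* pmulhw      */
;       return (int16_t)(hi + x);            /* x*(tan3-1) + x = x*tan3 */
;   }
;-----------------------------------------------------------------------------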
;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;-----------------------------------------------------------------------------

%macro fMTX_MULT 3 ; %1: src row, %2: Coeffs, %3: rounders
    movdqa  xmm0, [ecx+%1*16+0]     ; xmm0 = [0123][4567]
    pshufhw xmm1, xmm0, 00011011b   ; xmm1 = [----][7654]
    pshufd  xmm0, xmm0, 01000100b
    pshufd  xmm1, xmm1, 11101110b

    movdqa  xmm2, xmm0
    paddsw  xmm0, xmm1              ; xmm0 = [a0 a1 a2 a3]
    psubsw  xmm2, xmm1              ; xmm2 = [b0 b1 b2 b3]

    punpckldq xmm0, xmm2            ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
    pshufd  xmm2, xmm0, 01001110b   ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]

    ; [M00 M01 M16 M17] [M06 M07 M22 M23]   x mm0 = [0 /1 /2'/3']
    ; [M02 M03 M18 M19] [M04 M05 M20 M21]   x mm2 = [0'/1'/2 /3 ]
    ; [M08 M09 M24 M25] [M14 M15 M30 M31]   x mm0 = [4 /5 /6'/7']
    ; [M10 M11 M26 M27] [M12 M13 M28 M29]   x mm2 = [4'/5'/6 /7 ]

    movdqa  xmm1, [%2+16]
    movdqa  xmm3, [%2+32]
    pmaddwd xmm1, xmm2
    pmaddwd xmm3, xmm0
    pmaddwd xmm2, [%2+48]
    pmaddwd xmm0, [%2+ 0]
    paddd   xmm0, xmm1              ; [ out0 | out1 ][ out2 | out3 ]
    paddd   xmm2, xmm3              ; [ out4 | out5 ][ out6 | out7 ]
    psrad   xmm0, 16
    psrad   xmm2, 16
    packssdw xmm0, xmm2             ; [ out0 .. out7 ]
    paddsw  xmm0, [%3]              ; Round
    psraw   xmm0, 4                 ; => [-2048, 2047]
    movdqa  [ecx+%1*16+0], xmm0
%endmacro

;-----------------------------------------------------------------------------
; Function Forward DCT
;-----------------------------------------------------------------------------

ALIGN 16
fdct_sse2_skal:
    mov ecx, [esp+4]
    fLLM_PASS ecx+0, 3
    fMTX_MULT 0, fTab1, Fdct_Rnd0
    fMTX_MULT 1, fTab2, Fdct_Rnd2
    fMTX_MULT 2, fTab3, Fdct_Rnd1
    fMTX_MULT 3, fTab4, Fdct_Rnd1
    fMTX_MULT 4, fTab1, Fdct_Rnd0
    fMTX_MULT 5, fTab4, Fdct_Rnd1
    fMTX_MULT 6, fTab3, Fdct_Rnd1
    fMTX_MULT 7, fTab2, Fdct_Rnd1
    ret
.endfunc
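;-----------------------------------------------------------------------------
; Note: fMTX_MULT exploits the even/odd symmetry of the 8-point DCT. Each
; row is folded into four sums and four differences, after which each
; pmaddwd pair against the fTab* coefficients accumulates one 32-bit output
; per dword lane. A C sketch of the folding step (ignoring the saturation
; of paddsw/psubsw; names are illustrative only):
;
;   #include <stdint.h>
;
;   static void fold_row(const int16_t x[8], int16_t a[4], int16_t b[4])
;   {
;       for (int i = 0; i < 4; i++) {
;           a[i] = (int16_t)(x[i] + x[7 - i]); /* feeds even outputs */
;           b[i] = (int16_t)(x[i] - x[7 - i]); /* feeds odd outputs  */
;       }
;   }
;
; Both entry points read a single argument at [esp+4], i.e. a cdecl function
; taking a pointer to the 8x8 block of 16-bit coefficients, so a prototype
; such as  void fdct_sse2_skal(int16_t block[64]);  is implied (assumed, not
; shown in this listing).
;-----------------------------------------------------------------------------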