; fdct_sse2_skal.asm
movdqa xmm3, [sqrt2]
movdqa xmm7, xmm4
movdqa xmm6, xmm2
psubsw xmm4, xmm1 ; tp17-tp35 = t1
psubsw xmm2, xmm0 ; tm17-tm35 = b3
paddsw xmm1, xmm7 ; tp17+tp35 = b0
paddsw xmm0, xmm6 ; tm17+tm35 = t2
; xmm1 = b0, xmm2 = b3. preserved
movdqa xmm6, xmm4
psubsw xmm4, xmm0 ; t1-t2
paddsw xmm0, xmm6 ; t1+t2
pmulhw xmm4, xmm3 ; (t1-t2)/(2.sqrt2)
pmulhw xmm0, xmm3 ; (t1+t2)/(2.sqrt2)
paddsw xmm0, xmm0 ; 2.(t1+t2) = b1
paddsw xmm4, xmm4 ; 2.(t1-t2) = b2
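; Net effect of the block above: b1 = (t1+t2)*cos(pi/4) and b2 = (t1-t2)*cos(pi/4).
; pmulhw keeps only the high 16 bits of the product, i.e. roughly x/(2*sqrt(2))
; assuming [sqrt2] holds cos(pi/4) in 1.15 fixed point (the constant table is
; defined elsewhere in this file); the paddsw doubling restores the factor of 2
; lost to the high multiply.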
movdqa xmm7, [tan2] ; t2
movdqa xmm3, [%1+2*16] ; x2
movdqa xmm6, [%1+6*16] ; x6
movdqa xmm5, xmm7 ; t2
pmulhw xmm7, xmm6 ; x6*t2
pmulhw xmm5, xmm3 ; x2*t2
paddsw xmm7, xmm3 ; x2+x6*t2 = tp26
psubsw xmm5, xmm6 ; x2*t2-x6 = tm26
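; tp26/tm26 is the pi/8 rotation of the even part applied to x2/x6. Assuming
; [tan2] stores tan(pi/8) in 0.16 fixed point, pmulhw yields x*tan(pi/8); the
; missing cos(pi/8) factor is presumably absorbed into the row-pass tables.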
; use: xmm3,xmm5,xmm6,xmm7 frozen: xmm0,xmm4,xmm1,xmm2
movdqa xmm3, [%1+0*16] ; x0
movdqa xmm6, [%1+4*16] ; x4
psubsw xmm3, xmm6 ; x0-x4 = tm04
paddsw xmm6, xmm6 ; 2.x4
paddsw xmm6, xmm3 ; x0+x4 = tp04
psubsw xmm3, xmm5 ; tm04-tm26 = a2
psubsw xmm6, xmm7 ; tp04-tp26 = a3
paddsw xmm5, xmm5 ; 2.tm26
paddsw xmm7, xmm7 ; 2.tp26
paddsw xmm5, xmm3 ; tm04+tm26 = a1
paddsw xmm7, xmm6 ; tp04+tp26 = a0
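; Even half done: a0..a3 combine tp04/tm04 with tp26/tm26. The recurring
; "subtract, double the subtrahend, add the difference back" pattern recovers
; the sum after the difference has overwritten one operand, so the whole pass
; fits in the eight XMM registers.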
psubsw xmm5, xmm0 ; a1-b1
psubsw xmm3, xmm4 ; a2-b2
paddsw xmm0, xmm0 ; 2.b1
paddsw xmm4, xmm4 ; 2.b2
paddsw xmm0, xmm5 ; a1+b1
paddsw xmm4, xmm3 ; a2+b2
psraw xmm5, 6 ; out6
psraw xmm3, 6 ; out5
psraw xmm0, 6 ; out1
psraw xmm4, 6 ; out2
movdqa [%1+6*16], xmm5
movdqa [%1+5*16], xmm3
movdqa [%1+1*16], xmm0
movdqa [%1+2*16], xmm4
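; out1/out2/out5/out6 are a1+/-b1 and a2+/-b2, shifted right by 6 to drop the
; fixed-point scaling accumulated over both passes, and stored back in place.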
; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3
movdqa xmm0, xmm7
movdqa xmm4, xmm6
psubsw xmm7, xmm1 ; a0-b0
psubsw xmm6, xmm2 ; a3-b3
paddsw xmm1, xmm0 ; a0+b0
paddsw xmm2, xmm4 ; a3+b3
psraw xmm1, 6 ; out0
psraw xmm7, 6 ; out7
psraw xmm2, 6 ; out3
psraw xmm6, 6 ; out4
movdqa [%1+0*16], xmm1
movdqa [%1+3*16], xmm2
movdqa [%1+4*16], xmm6
movdqa [%1+7*16], xmm7
%endmacro
;-----------------------------------------------------------------------------
; Function idct (the straightforward version)
;-----------------------------------------------------------------------------
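; cdecl entry point: [esp+4] is the pointer to the 8x8 block of 16-bit
; coefficients, transformed in place. Each iMTX_MULT call (macro defined
; earlier in this file) applies the 1-D IDCT to one row as a matrix multiply
; with its own rounding constant; iLLM_PASS then runs the column butterflies.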
ALIGN 16
idct_sse2_skal:
mov ecx, [esp+4]
iMTX_MULT 0, iTab1, Idct_Rnd0, 11
iMTX_MULT 1, iTab2, Idct_Rnd1, 11
iMTX_MULT 2, iTab3, Idct_Rnd2, 11
iMTX_MULT 3, iTab4, Idct_Rnd3, 11
iMTX_MULT 4, iTab1, Idct_Rnd4, 11
iMTX_MULT 5, iTab4, Idct_Rnd5, 11
iMTX_MULT 6, iTab3, Idct_Rnd6, 11
iMTX_MULT 7, iTab2, Idct_Rnd7, 11
iLLM_PASS ecx+0
ret
;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;-----------------------------------------------------------------------------
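; ORs the row's 16 bytes (8 words) together and branches to %2 when they are
; all zero, so the caller can skip the row transform entirely.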
%macro TEST_ROW 2 ; %1: src row, %2: label to jump to when the row is null
mov eax, [%1 ]
mov edx, [%1+ 8]
or eax, [%1+ 4]
or edx, [%1+12]
or eax, edx
jz near %2
%endmacro
;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;-----------------------------------------------------------------------------
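; Same as above, but each all-zero row bypasses its iMTX_MULT. For rows 0-2 the
; row is overwritten with the Idct_Sparse_Rnd* constant (what the row transform
; would have produced from a null input); rows 3-7 are simply left untouched,
; presumably because their rounders contribute nothing for a null row.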
ALIGN 16
idct_sse2_sparse_skal:
mov ecx, [esp+ 4] ; Src
TEST_ROW ecx, .Row0_Round
iMTX_MULT 0, iTab1, Idct_Rnd0, 11
jmp .Row1
.Row0_Round
movq mm0, [Idct_Sparse_Rnd0]
movq [ecx ], mm0
movq [ecx+8], mm0
.Row1
TEST_ROW ecx+16, .Row1_Round
iMTX_MULT 1, iTab2, Idct_Rnd1, 11
jmp .Row2
.Row1_Round
movq mm0, [Idct_Sparse_Rnd1]
movq [ecx+16 ], mm0
movq [ecx+16+8], mm0
.Row2
TEST_ROW ecx+32, .Row2_Round
iMTX_MULT 2, iTab3, Idct_Rnd2, 11
jmp .Row3
.Row2_Round
movq mm0, [Idct_Sparse_Rnd2]
movq [ecx+32 ], mm0
movq [ecx+32+8], mm0
.Row3
TEST_ROW ecx+48, .Row4
iMTX_MULT 3, iTab4, Idct_Rnd3, 11
.Row4
TEST_ROW ecx+64, .Row5
iMTX_MULT 4, iTab1, Idct_Rnd4, 11
.Row5
TEST_ROW ecx+80, .Row6
iMTX_MULT 5, iTab4, Idct_Rnd5, 11
.Row6
TEST_ROW ecx+96, .Row7
iMTX_MULT 6, iTab3, Idct_Rnd6, 11
.Row7
TEST_ROW ecx+112, .End
iMTX_MULT 7, iTab2, Idct_Rnd7, 11
.End
iLLM_PASS ecx+0
ret
;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;-----------------------------------------------------------------------------
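; Forward-DCT column pass, a butterfly factorisation in the style of
; Loeffler/Ligtenberg/Moshytz (presumably the source of the LLM name).
; %2 pre-shifts the intermediates left to gain precision; the row pass
; divides the extra scaling back out.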
%macro fLLM_PASS 2 ; %1: src/dst, %2:Shift
movdqa xmm0, [%1+0*16] ; In0
movdqa xmm2, [%1+2*16] ; In2
movdqa xmm3, xmm0
movdqa xmm4, xmm2
movdqa xmm7, [%1+7*16] ; In7
movdqa xmm5, [%1+5*16] ; In5
psubsw xmm0, xmm7 ; t7 = In0-In7
paddsw xmm7, xmm3 ; t0 = In0+In7
psubsw xmm2, xmm5 ; t5 = In2-In5
paddsw xmm5, xmm4 ; t2 = In2+In5
movdqa xmm3, [%1+3*16] ; In3
movdqa xmm4, [%1+4*16] ; In4
movdqa xmm1, xmm3
psubsw xmm3, xmm4 ; t4 = In3-In4
paddsw xmm4, xmm1 ; t3 = In3+In4
movdqa xmm6, [%1+6*16] ; In6
movdqa xmm1, [%1+1*16] ; In1
psubsw xmm1, xmm6 ; t6 = In1-In6
paddsw xmm6, [%1+1*16] ; t1 = In1+In6
psubsw xmm7, xmm4 ; tm03 = t0-t3
psubsw xmm6, xmm5 ; tm12 = t1-t2
paddsw xmm4, xmm4 ; 2.t3
paddsw xmm5, xmm5 ; 2.t2
paddsw xmm4, xmm7 ; tp03 = t0+t3
paddsw xmm5, xmm6 ; tp12 = t1+t2
psllw xmm2, %2+1 ; shift t5 (shift +1 to..
psllw xmm1, %2+1 ; shift t6 ..compensate cos4/2)
psllw xmm4, %2 ; shift t3
psllw xmm5, %2 ; shift t2
psllw xmm7, %2 ; shift t0
psllw xmm6, %2 ; shift t1
psllw xmm3, %2 ; shift t4
psllw xmm0, %2 ; shift t7
psubsw xmm4, xmm5 ; out4 = tp03-tp12
psubsw xmm1, xmm2 ; xmm1: t6-t5
paddsw xmm5, xmm5
paddsw xmm2, xmm2
paddsw xmm5, xmm4 ; out0 = tp03+tp12
movdqa [%1+4*16], xmm4 ; => out4
paddsw xmm2, xmm1 ; xmm2: t6+t5
movdqa [%1+0*16], xmm5 ; => out0
movdqa xmm4, [tan2] ; xmm4 <= tan2
pmulhw xmm4, xmm7 ; tm03*tan2
movdqa xmm5, [tan2] ; xmm5 <= tan2
psubsw xmm4, xmm6 ; out6 = tm03*tan2 - tm12
pmulhw xmm5, xmm6 ; tm12*tan2
paddsw xmm5, xmm7 ; out2 = tm12*tan2 + tm03
movdqa xmm6, [sqrt2]
movdqa xmm7, [Rounder1]
pmulhw xmm2, xmm6 ; xmm2: tp65 = (t6 + t5)*cos4
por xmm5, xmm7 ; correct out2
por xmm4, xmm7 ; correct out6
pmulhw xmm1, xmm6 ; xmm1: tm65 = (t6 - t5)*cos4
por xmm2, xmm7 ; correct tp65
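; The por with [Rounder1] (assumed to hold 1 in every word) forces the low bit
; of these pmulhw results to 1, countering the truncation bias of the high
; multiply before the values feed the row pass.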
movdqa [%1+2*16], xmm5 ; => out2
movdqa xmm5, xmm3 ; save t4
movdqa [%1+6*16], xmm4 ; => out6
movdqa xmm4, xmm0 ; save t7
psubsw xmm3, xmm1 ; xmm3: tm465 = t4 - tm65
psubsw xmm0, xmm2 ; xmm0: tm765 = t7 - tp65
paddsw xmm2, xmm4 ; xmm2: tp765 = t7 + tp65
paddsw xmm1, xmm5 ; xmm1: tp465 = t4 + tm65
movdqa xmm4, [tan3] ; tan3 - 1
movdqa xmm5, [tan1] ; tan1
movdqa xmm7, xmm3 ; save tm465
pmulhw xmm3, xmm4 ; tm465*(tan3-1)
movdqa xmm6, xmm1 ; save tp465
pmulhw xmm1, xmm5 ; tp465*tan1
paddsw xmm3, xmm7 ; tm465*tan3
pmulhw xmm4, xmm0 ; tm765*(tan3-1)
paddsw xmm4, xmm0 ; tm765*tan3
pmulhw xmm5, xmm2 ; tp765*tan1
paddsw xmm1, xmm2 ; out1 = tp765 + tp465*tan1
psubsw xmm0, xmm3 ; out3 = tm765 - tm465*tan3
paddsw xmm7, xmm4 ; out5 = tm465 + tm765*tan3
psubsw xmm5, xmm6 ; out7 =-tp465 + tp765*tan1
movdqa [%1+1*16], xmm1 ; => out1
movdqa [%1+3*16], xmm0 ; => out3
movdqa [%1+5*16], xmm7 ; => out5
movdqa [%1+7*16], xmm5 ; => out7
%endmacro
;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;-----------------------------------------------------------------------------
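; Row pass of the forward DCT: one 8-coefficient row is folded into sums
; a[i] = x[i]+x[7-i] and differences b[i] = x[i]-x[7-i], then four pmaddwd
; against the coefficient table %2 produce all eight outputs at once; they are
; rounded with %3 and shifted down into the [-2048, 2047] range.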
%macro fMTX_MULT 3 ; %1=src, %2 = Coeffs, %3=rounders
movdqa xmm0, [ecx+%1*16+0] ; xmm0 = [0123][4567]
pshufhw xmm1, xmm0, 00011011b ; xmm1 = [----][7654]
pshufd xmm0, xmm0, 01000100b
pshufd xmm1, xmm1, 11101110b
movdqa xmm2, xmm0
paddsw xmm0, xmm1 ; xmm0 = [a0 a1 a2 a3]
psubsw xmm2, xmm1 ; xmm2 = [b0 b1 b2 b3]
punpckldq xmm0, xmm2 ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
pshufd xmm2, xmm0, 01001110b ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]
; [M00 M01 M16 M17] [M06 M07 M22 M23] x mm0 = [0 /1 /2'/3']
; [M02 M03 M18 M19] [M04 M05 M20 M21] x mm2 = [0'/1'/2 /3 ]
; [M08 M09 M24 M25] [M14 M15 M30 M31] x mm0 = [4 /5 /6'/7']
; [M10 M11 M26 M27] [M12 M13 M28 M29] x mm2 = [4'/5'/6 /7 ]
movdqa xmm1, [%2+16]
movdqa xmm3, [%2+32]
pmaddwd xmm1, xmm2
pmaddwd xmm3, xmm0
pmaddwd xmm2, [%2+48]
pmaddwd xmm0, [%2+ 0]
paddd xmm0, xmm1 ; [ out0 | out1 ][ out2 | out3 ]
paddd xmm2, xmm3 ; [ out4 | out5 ][ out6 | out7 ]
psrad xmm0, 16
psrad xmm2, 16
packssdw xmm0, xmm2 ; [ out0 .. out7 ]
paddsw xmm0, [%3] ; Round
psraw xmm0, 4 ; => [-2048, 2047]
movdqa [ecx+%1*16+0], xmm0
%endmacro
;-----------------------------------------------------------------------------
; Function Forward DCT
;-----------------------------------------------------------------------------
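; Forward DCT entry point: the column butterflies run first with a pre-shift
; of 3 bits, then fMTX_MULT finishes each row with its matrix multiply; the
; per-row Fdct_Rnd* constants give each row its own rounding behaviour.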
ALIGN 16
fdct_sse2_skal:
mov ecx, [esp+4]
fLLM_PASS ecx+0, 3
fMTX_MULT 0, fTab1, Fdct_Rnd0
fMTX_MULT 1, fTab2, Fdct_Rnd2
fMTX_MULT 2, fTab3, Fdct_Rnd1
fMTX_MULT 3, fTab4, Fdct_Rnd1
fMTX_MULT 4, fTab1, Fdct_Rnd0
fMTX_MULT 5, fTab4, Fdct_Rnd1
fMTX_MULT 6, fTab3, Fdct_Rnd1
fMTX_MULT 7, fTab2, Fdct_Rnd1
ret