
skl_fdct_sse2.asm

H264 decoder converted from FFMPEG, compiles under VC..
   ; use:xmm3,xmm5,xmm6,xmm7   frozen: xmm0,xmm4,xmm1,xmm2

  movdqa xmm3, [%1+0*16] ; x0
  movdqa xmm6, [%1+4*16] ; x4

  movdqa [%1   ], xmm2  ; we spill 1 reg to perform safe butterflies

  movdqa xmm2, xmm3
  psubsw xmm3, xmm6   ; x0-x4 = tm04
  paddsw xmm6, xmm2   ; x0+x4 = tp04

  movdqa xmm2, xmm6
  psubsw xmm6, xmm7
  paddsw xmm7, xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm5
  paddsw xmm5, xmm2

  movdqa xmm2, xmm5
  psubsw xmm5, xmm0
  paddsw xmm0, xmm2
  movdqa xmm2, xmm3
  psubsw xmm3, xmm4
  paddsw xmm4, xmm2

  movdqa xmm2, [%1]

  psraw  xmm5, 6      ; out6
  psraw  xmm3, 6      ; out5
  psraw  xmm0, 6      ; out1
  psraw  xmm4, 6      ; out2

  movdqa [%1+6*16], xmm5
  movdqa [%1+5*16], xmm3
  movdqa [%1+1*16], xmm0
  movdqa [%1+2*16], xmm4

    ; reminder: xmm1=b0, xmm2=b3, xmm7=a0, xmm6=a3

  movdqa xmm0, xmm7
  movdqa xmm4, xmm6
  psubsw xmm7, xmm1   ; a0-b0
  psubsw xmm6, xmm2   ; a3-b3
  paddsw xmm1, xmm0   ; a0+b0
  paddsw xmm2, xmm4   ; a3+b3

  psraw  xmm1, 6      ; out0
  psraw  xmm7, 6      ; out7
  psraw  xmm2, 6      ; out3
  psraw  xmm6, 6      ; out4

    ; store result

  movdqa [%1+0*16], xmm1
  movdqa [%1+3*16], xmm2
  movdqa [%1+4*16], xmm6
  movdqa [%1+7*16], xmm7

%endmacro
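
For readers following the column-pass tail above: the butterflies are "safe" because psubsw/paddsw saturate to the 16-bit range, and every output is descaled by an arithmetic shift right of 6 before being stored. A minimal C sketch of those two building blocks (helper names are mine, not from the original source):

#include <stdint.h>

/* Saturate a 32-bit intermediate to the int16_t range, as paddsw/psubsw do. */
static int16_t sat16(int32_t v)
{
    if (v >  32767) return  32767;
    if (v < -32768) return -32768;
    return (int16_t)v;
}

/* One "safe butterfly": (a, b) -> (a - b, a + b) with 16-bit saturation. */
static void butterfly(int16_t *a, int16_t *b)
{
    int32_t d = (int32_t)*a - *b;
    int32_t s = (int32_t)*a + *b;
    *a = sat16(d);
    *b = sat16(s);
}

/* Final descale, mirroring 'psraw xmm, 6' (arithmetic shift assumed). */
static int16_t descale6(int16_t v)
{
    return (int16_t)(v >> 6);
}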

;-----------------------------------------------------------------------------
; Helper macro TEST_ROW (test a null row)
;-----------------------------------------------------------------------------

%macro TEST_ROW 2     ; %1: src row (8 words),  %2: jump target if the row is all zero
  mov eax, [%1   ]
  mov edx, [%1+ 8]
  or  eax, [%1+ 4]
  or  edx, [%1+12]
  or  eax, edx
  jz near %2
%endmacro
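
A minimal C equivalent of TEST_ROW, assuming the same layout of eight 16-bit coefficients per row (the function name is hypothetical): rather than testing eight words one by one, four 32-bit chunks are OR-ed together and compared against zero once.

#include <stdint.h>
#include <string.h>

/* Returns non-zero when all eight int16 coefficients of a row are zero. */
static int row_is_null(const int16_t *row)
{
    uint32_t words[4];
    memcpy(words, row, sizeof(words));   /* four 32-bit chunks = 16 bytes */
    return (words[0] | words[1] | words[2] | words[3]) == 0;
}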

;-----------------------------------------------------------------------------
; Function idct (this one skips null rows)
;-----------------------------------------------------------------------------
; IEEE1180 and Walken compatible version

align 16
idct_sse2_skal:

  mov ecx, [esp+ 4]  ; Src

  TEST_ROW ecx, .Row0_Round
  iMTX_MULT  0, iTab1, Walken_Idct_Rounders + 16*0, 11
  jmp .Row1
.Row0_Round
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*0]
  movdqa [ecx  ], xmm0

.Row1
  TEST_ROW ecx+16, .Row1_Round
  iMTX_MULT  1, iTab2, Walken_Idct_Rounders + 16*1, 11
  jmp .Row2
.Row1_Round
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*1]
  movdqa [ecx+16  ], xmm0

.Row2
  TEST_ROW ecx+32, .Row2_Round
  iMTX_MULT  2, iTab3, Walken_Idct_Rounders + 16*2, 11
  jmp .Row3
.Row2_Round
  movdqa xmm0, [Walken_Idct_Rounders + 16*8 + 16*2]
  movdqa [ecx+32  ], xmm0

.Row3
  TEST_ROW ecx+48, .Row4
  iMTX_MULT  3, iTab4, Walken_Idct_Rounders + 16*3, 11

.Row4
  TEST_ROW ecx+64, .Row5
  iMTX_MULT  4, iTab1, Walken_Idct_Rounders + 16*4, 11

.Row5
  TEST_ROW ecx+80, .Row6
  iMTX_MULT  5, iTab4, Walken_Idct_Rounders + 16*5, 11

.Row6
  TEST_ROW ecx+96, .Row7
  iMTX_MULT  6, iTab3, Walken_Idct_Rounders + 16*6, 11

.Row7
  TEST_ROW ecx+112, .End
  iMTX_MULT  7, iTab2, Walken_Idct_Rounders + 16*7, 11
.End

  iLLM_PASS ecx

  ret
.endfunc

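The row loop above can be summarized by the following C sketch (illustrative only; the helper and table names are mine). A null row skips its iMTX_MULT row transform entirely; for rows 0..2 a null row is overwritten with a precomputed row from the Walken_Idct_Rounders block at offset 16*8, presumably the value the transform would produce for all-zero input, while null rows 3..7 are simply left as zeros.

#include <stdint.h>

typedef void (*row_xform_fn)(int16_t *row);   /* hypothetical wrapper around one iMTX_MULT */

static void idct_row_stage(int16_t *block,                /* 8x8 block, row-major */
                           const int16_t dc_rows[3][8],   /* hypothetical: rounder rows 0..2 */
                           row_xform_fn xform[8])         /* one row transform per row */
{
    for (int r = 0; r < 8; r++) {
        int16_t *row = block + 8 * r;
        int is_null = 1;
        for (int c = 0; c < 8; c++)
            if (row[c] != 0) { is_null = 0; break; }

        if (!is_null) {
            xform[r](row);                    /* iMTX_MULT with the r-th coefficient table */
        } else if (r < 3) {
            for (int c = 0; c < 8; c++)
                row[c] = dc_rows[r][c];       /* stored rounder row replaces the transform */
        }
        /* null rows 3..7 are simply left as zeros */
    }
}
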
;-----------------------------------------------------------------------------
; Helper macro fLLM_PASS
;-----------------------------------------------------------------------------

%macro fLLM_PASS 2  ; %1: src/dst, %2:Shift

  movdqa xmm0, [%1+0*16]   ; In0
  movdqa xmm2, [%1+2*16]   ; In2
  movdqa xmm3, xmm0
  movdqa xmm4, xmm2
  movdqa xmm7, [%1+7*16]   ; In7
  movdqa xmm5, [%1+5*16]   ; In5

  psubsw xmm0, xmm7         ; t7 = In0-In7
  paddsw xmm7, xmm3         ; t0 = In0+In7
  psubsw xmm2, xmm5         ; t5 = In2-In5
  paddsw xmm5, xmm4         ; t2 = In2+In5

  movdqa xmm3, [%1+3*16]   ; In3
  movdqa xmm4, [%1+4*16]   ; In4
  movdqa xmm1, xmm3
  psubsw xmm3, xmm4         ; t4 = In3-In4
  paddsw xmm4, xmm1         ; t3 = In3+In4
  movdqa xmm6, [%1+6*16]   ; In6
  movdqa xmm1, [%1+1*16]   ; In1
  psubsw xmm1, xmm6         ; t6 = In1-In6
  paddsw xmm6, [%1+1*16]   ; t1 = In1+In6

  psubsw xmm7, xmm4         ; tm03 = t0-t3
  psubsw xmm6, xmm5         ; tm12 = t1-t2
  paddsw xmm4, xmm4         ; 2.t3
  paddsw xmm5, xmm5         ; 2.t2
  paddsw xmm4, xmm7         ; tp03 = t0+t3
  paddsw xmm5, xmm6         ; tp12 = t1+t2

  psllw  xmm2, %2+1        ; shift t5 (shift +1 to..
  psllw  xmm1, %2+1        ; shift t6  ..compensate cos4/2)
  psllw  xmm4, %2          ; shift t3
  psllw  xmm5, %2          ; shift t2
  psllw  xmm7, %2          ; shift t0
  psllw  xmm6, %2          ; shift t1
  psllw  xmm3, %2          ; shift t4
  psllw  xmm0, %2          ; shift t7

  psubsw xmm4, xmm5         ; out4 = tp03-tp12
  psubsw xmm1, xmm2         ; xmm1: t6-t5
  paddsw xmm5, xmm5
  paddsw xmm2, xmm2
  paddsw xmm5, xmm4         ; out0 = tp03+tp12
  movdqa [%1+4*16], xmm4   ; => out4
  paddsw xmm2, xmm1         ; xmm2: t6+t5
  movdqa [%1+0*16], xmm5   ; => out0

  movdqa xmm4, [tan2]      ; xmm4 <= tan2
  pmulhw xmm4, xmm7         ; tm03*tan2
  movdqa xmm5, [tan2]      ; xmm5 <= tan2
  psubsw xmm4, xmm6         ; out6 = tm03*tan2 - tm12
  pmulhw xmm5, xmm6         ; tm12*tan2
  paddsw xmm5, xmm7         ; out2 = tm12*tan2 + tm03

  movdqa xmm6, [sqrt2]
  movdqa xmm7, [Rounder1]

  pmulhw xmm2, xmm6         ; xmm2: tp65 = (t6 + t5)*cos4
  por    xmm5, xmm7         ; correct out2
  por    xmm4, xmm7         ; correct out6
  pmulhw xmm1, xmm6         ; xmm1: tm65 = (t6 - t5)*cos4
  por    xmm2, xmm7         ; correct tp65

  movdqa [%1+2*16], xmm5   ; => out2
  movdqa xmm5, xmm3         ; save t4
  movdqa [%1+6*16], xmm4   ; => out6
  movdqa xmm4, xmm0         ; save t7

  psubsw xmm3, xmm1         ; xmm3: tm465 = t4 - tm65
  psubsw xmm0, xmm2         ; xmm0: tm765 = t7 - tp65
  paddsw xmm2, xmm4         ; xmm2: tp765 = t7 + tp65
  paddsw xmm1, xmm5         ; xmm1: tp465 = t4 + tm65

  movdqa xmm4, [tan3]      ; tan3 - 1
  movdqa xmm5, [tan1]      ; tan1

  movdqa xmm7, xmm3         ; save tm465
  pmulhw xmm3, xmm4         ; tm465*(tan3-1)
  movdqa xmm6, xmm1         ; save tp465
  pmulhw xmm1, xmm5         ; tp465*tan1

  paddsw xmm3, xmm7         ; tm465*tan3
  pmulhw xmm4, xmm0         ; tm765*(tan3-1)
  paddsw xmm4, xmm0         ; tm765*tan3
  pmulhw xmm5, xmm2         ; tp765*tan1

  paddsw xmm1, xmm2         ; out1 = tp765 + tp465*tan1
  psubsw xmm0, xmm3         ; out3 = tm765 - tm465*tan3
  paddsw xmm7, xmm4         ; out5 = tm465 + tm765*tan3
  psubsw xmm5, xmm6         ; out7 =-tp465 + tp765*tan1

  movdqa [%1+1*16], xmm1   ; => out1
  movdqa [%1+3*16], xmm0   ; => out3
  movdqa [%1+5*16], xmm7   ; => out5
  movdqa [%1+7*16], xmm5   ; => out7

%endmacro
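
The multiplies in this pass use pmulhw, i.e. signed 16x16 products keeping only the high 16 bits, so each constant is stored as a Q16 fraction. tan(3*pi/16) ~ 0.668 does not fit a signed Q16 word; read back as a signed value it becomes tan3 - 1, and adding the input once more restores the full product, which is the tm465*(tan3-1) + tm465 sequence above. A lane-wise C sketch of that trick (the constant and names are illustrative, and 16-bit saturation is omitted):

#include <stdint.h>

/* One lane of pmulhw: signed 16x16 multiply, keep the high 16 bits. */
static int16_t mulhw(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}

/* x * tan(3*pi/16) via the "c - 1" trick: the Q16 constant for tan3 (~0.668)
 * exceeds 32767, so read as signed it wraps to tan3 - 1; multiplying by that
 * and adding x once restores x * tan3 (approximately). */
static int16_t mul_tan3(int16_t x)
{
    const int16_t TAN3_MINUS_1 = -21746;   /* ~ (0.66818 - 1.0) * 65536 */
    return (int16_t)(mulhw(x, TAN3_MINUS_1) + x);
}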

;-----------------------------------------------------------------------------
; Helper macro fMTX_MULT
;-----------------------------------------------------------------------------

%macro fMTX_MULT 3   ; %1 = row index, %2 = coefficient table, %3 = rounders

  movdqa   xmm0, [ecx+%1*16+0]   ; xmm0 = [0123][4567]
  pshufhw  xmm1, xmm0, 00011011b ; xmm1 = [----][7654]
  pshufd   xmm0, xmm0, 01000100b
  pshufd   xmm1, xmm1, 11101110b

  movdqa   xmm2, xmm0
  paddsw  xmm0, xmm1              ; xmm0 = [a0 a1 a2 a3]
  psubsw  xmm2, xmm1              ; xmm2 = [b0 b1 b2 b3]

  punpckldq xmm0, xmm2            ; xmm0 = [a0 a1 b0 b1][a2 a3 b2 b3]
  pshufd    xmm2, xmm0, 01001110b ; xmm2 = [a2 a3 b2 b3][a0 a1 b0 b1]

    ;  [M00 M01    M16 M17] [M06 M07    M22 M23]  x mm0 = [0 /1 /2'/3']
    ;  [M02 M03    M18 M19] [M04 M05    M20 M21]  x mm2 = [0'/1'/2 /3 ]
    ;  [M08 M09    M24 M25] [M14 M15    M30 M31]  x mm0 = [4 /5 /6'/7']
    ;  [M10 M11    M26 M27] [M12 M13    M28 M29]  x mm2 = [4'/5'/6 /7 ]

  movdqa  xmm1, [%2+16]
  movdqa  xmm3, [%2+32]
  pmaddwd xmm1, xmm2
  pmaddwd xmm3, xmm0
  pmaddwd xmm2, [%2+48]
  pmaddwd xmm0, [%2+ 0]

  paddd   xmm0, xmm1             ;  [ out0 | out1 ][ out2 | out3 ]
  paddd   xmm2, xmm3             ;  [ out4 | out5 ][ out6 | out7 ]
  psrad   xmm0, 16
  psrad   xmm2, 16

  packssdw xmm0, xmm2            ;  [ out0 .. out7 ]
  paddsw   xmm0, [%3]            ;  Round

  psraw    xmm0, 4               ; => [-2048, 2047]

  movdqa  [ecx+%1*16+0], xmm0
%endmacro
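
Conceptually, fMTX_MULT folds each row into symmetric sums a[i] = x[i] + x[7-i] and differences b[i] = x[i] - x[7-i], multiplies each 4-element half by a 4x4 coefficient matrix (the interleaved M00..M31 layout in the comment above is that matrix repacked for pmaddwd), then rounds and shifts the results back to 16 bits. A scalar model of the data flow, with hypothetical coefficient tables in place of fTab1..fTab4, arithmetic shifts assumed, and the saturation of packssdw/paddsw omitted:

#include <stdint.h>

/* Scalar model of one row of the forward DCT row pass.  Ca feeds the even
 * outputs from the symmetric sums, Cb feeds the odd outputs from the
 * differences; both tables are hypothetical stand-ins for fTab1..fTab4. */
static void fdct_row_model(int16_t x[8],
                           const int16_t Ca[4][4],
                           const int16_t Cb[4][4],
                           const int16_t rnd[8])
{
    int16_t a[4], b[4], out[8];

    for (int i = 0; i < 4; i++) {
        a[i] = (int16_t)(x[i] + x[7 - i]);   /* symmetric half     */
        b[i] = (int16_t)(x[i] - x[7 - i]);   /* antisymmetric half */
    }
    for (int i = 0; i < 4; i++) {
        int32_t even = 0, odd = 0;
        for (int j = 0; j < 4; j++) {
            even += (int32_t)Ca[i][j] * a[j];   /* pmaddwd + paddd */
            odd  += (int32_t)Cb[i][j] * b[j];
        }
        /* psrad 16, add rounder, psraw 4 (saturation omitted here) */
        out[2 * i]     = (int16_t)(((even >> 16) + rnd[2 * i])     >> 4);
        out[2 * i + 1] = (int16_t)(((odd  >> 16) + rnd[2 * i + 1]) >> 4);
    }
    for (int i = 0; i < 8; i++)
        x[i] = out[i];
}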

;-----------------------------------------------------------------------------
; Function Forward DCT
;-----------------------------------------------------------------------------

ALIGN 16
fdct_sse2_skal:
  mov ecx, [esp+4]
  fLLM_PASS ecx+0, 3
  fMTX_MULT  0, fTab1, Fdct_Rnd0
  fMTX_MULT  1, fTab2, Fdct_Rnd2
  fMTX_MULT  2, fTab3, Fdct_Rnd1
  fMTX_MULT  3, fTab4, Fdct_Rnd1
  fMTX_MULT  4, fTab1, Fdct_Rnd0
  fMTX_MULT  5, fTab4, Fdct_Rnd1
  fMTX_MULT  6, fTab3, Fdct_Rnd1
  fMTX_MULT  7, fTab2, Fdct_Rnd1
  ret
.endfunc
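
A hypothetical smoke test for the two entry points (this assumes a 32-bit cdecl build and that the labels above reach C under these exact names, which depends on the project's cglobal/name-decoration conventions). Since both transforms use fixed-point rounding, the forward/inverse round trip is compared with a small tolerance rather than bit-exactly:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

extern void fdct_sse2_skal(int16_t *block);   /* forward DCT, in place */
extern void idct_sse2_skal(int16_t *block);   /* inverse DCT, in place */

int main(void)
{
#if defined(_MSC_VER)
    __declspec(align(16)) int16_t blk[64];    /* movdqa requires 16-byte alignment */
#else
    int16_t blk[64] __attribute__((aligned(16)));
#endif
    int16_t ref[64];

    for (int i = 0; i < 64; i++)
        ref[i] = blk[i] = (int16_t)(rand() % 256 - 128);   /* pixel-range input */

    fdct_sse2_skal(blk);
    idct_sse2_skal(blk);

    for (int i = 0; i < 64; i++)
        if (abs(blk[i] - ref[i]) > 2)                      /* loose tolerance */
            printf("mismatch at %d: got %d, expected %d\n", i, blk[i], ref[i]);

    return 0;
}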




