; quantize_h263_3dne.asm
; (page 1 of 2 of a 787-line listing captured from a code-sharing web page;
;  the page chrome has been converted to comments so the file assembles)
;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dne Quantization/Dequantization -
; *
; *  Copyright(C) 2002-2003 Jaan Kalda
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_h263_3dne.asm,v 1.5 2004/08/29 10:02:38 edgomez Exp $
; *
; *************************************************************************/
;
; these 3dne functions are compatible with iSSE, but are optimized specifically for
; K7 pipelines

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

BITS 32

;-----------------------------------------------------------------------------
; cglobal 1 -- export a function symbol, honouring two build options:
;   PREFIX      - prepend an underscore to the exported name (a.out/Mach-O
;                 style C mangling); a %define alias lets the rest of the
;                 file keep using the bare name.
;   MARK_FUNCS  - emit the ELF "global sym:function sym.endfunc-sym" form so
;                 the symbol gets type STT_FUNC and an explicit size.
;
; Fix: the PREFIX+MARK_FUNCS branch used to do
;     %define %1 _%1:function %1.endfunc-%1
; which made every later bare use of the name (including the label line
; "name:") expand to the whole multi-token "global"-directive tail -- invalid
; anywhere a plain symbol is expected.  The ":function ..." annotation is
; NASM syntax for the "global" directive only; the alias must stay "_%1".
;-----------------------------------------------------------------------------
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;=============================================================================
; Local data
;=============================================================================

; COFF output omits the section alignment qualifier (presumably not supported
; by that format -- TODO confirm); everything else gets 16-byte aligned rodata.
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

; int_div[i] = 2^16/i + 1 for i = 1..255 (entry 0 is a dummy).
; Fixed-point reciprocal table for the DC coefficient: the quantizer does
; "imul eax, [int_div+4*edi]" followed by "sar eax, 16" to divide by dcscalar.
align 4
int_div:
	dd 0
%assign i 1
%rep 255
	dd  (1 << 16) / (i) + 1
	%assign i i+1
%endrep

; Eight words of 1 (not referenced in the visible portion of this file).
ALIGN 16
plus_one:
	times 8 dw 1

;-----------------------------------------------------------------------------
; subtract by Q/2 table
;-----------------------------------------------------------------------------

ALIGN 16
; mmx_sub[q] = q/2 replicated into 4 words, for q = 1..31 (8 bytes per entry,
; indexed by (quant-1)*8).  Not referenced in the visible portion of the file.
mmx_sub:
%assign i 1
%rep 31
	times 4 dw i / 2
	%assign i i+1
%endrep


;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated seperately
; (3dnow2 provides _pmulhuw_ which wont cause overflow)
;
;-----------------------------------------------------------------------------

ALIGN 16
; mmx_div[q] = 2^16/(2q) + 1 replicated into 4 words, for q = 1..31.
; quant_h263_intra_3dne points esi at mmx_div + quant*8 - 8 and the quant
; macros use it with pmulhw, so (|c| * entry) >> 16 ~= |c| / (2*quant).
; For q = 1 the multiplier would overflow signed pmulhw, hence the separate
; shift-based .q1loop path (see the header comment above).
mmx_div:
%assign i 1
%rep 31
	times 4 dw  (1 << 16) / (i * 2) + 1
	%assign i i+1
%endrep

;-----------------------------------------------------------------------------
; add by (odd(Q) ? Q : Q - 1) table
;-----------------------------------------------------------------------------

ALIGN 16
; mmx_add[q] = q when q is odd, q-1 when q is even (4 words per entry),
; q = 1..31.  Dequant rounding offset; not referenced in the visible portion.
mmx_add:
%assign i 1
%rep 31
	%if i % 2 != 0
	times 4 dw i
	%else
	times 4 dw i - 1
	%endif
	%assign i i+1
%endrep

;-----------------------------------------------------------------------------
; multiple by 2Q table
;-----------------------------------------------------------------------------

ALIGN 16
; mmx_mul[q] = 2q replicated into 4 words, q = 1..31 (dequant multiplier;
; not referenced in the visible portion of the file).
mmx_mul:
%assign i 1
%rep 31
	times 4 dw i * 2
	%assign i i+1
%endrep

;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------

ALIGN 8
; Bias constants for clamping dequantized words to [-2048, 2047] -- presumably
; via saturating add/subtract pairs; the dequant code is outside this chunk.
mmx_32768_minus_2048:
	times 4 dw (32768-2048)
mmx_32767_minus_2047:
	times 4 dw (32767-2047)

ALIGN 16
; Upper clamp value, replicated into 4 words.
mmx_2047:
	times 4 dw 2047

ALIGN 8
; 64-bit zero; quant_h263_intra_3dne keeps its address in ebx so the quant
; macros can load zero with "movq mmN, [ebx]".
mmzero:
	dd 0, 0
; Scalar DC clamp bounds (not referenced in the visible portion of the file).
int2047:
	dd 2047
int_2048:
	dd -2048

;=============================================================================
; Code
;=============================================================================

SECTION .text


;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_3dne(int16_t * coeff,
;                                const int16_t const * data,
;                                const uint32_t quant,
;                                const uint32_t dcscalar,
;                                const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
;This is Athlon-optimized code (ca 70 clk per call)

;-----------------------------------------------------------------------------
; quant_intra1 %1  -- one pass (index 0..3) of the quant==1 fast path.
;
; Each pass quantizes 16 coefficients in four 4-word mmx lanes:
;   A = [ecx+%1*32], B = [+8], C = [+16], D = [+24]
; with  level = sign(c) * (|c| >> 1)   (i.e. |c| / (2*quant) for quant == 1;
; a shift is used because the pmulhw multiplier would overflow, see mmx_div).
; |c| is built branchlessly: t = 0 - c (zero loaded from [ebx] = mmzero),
; |c| = pmaxsw(t, c); the sign is restored with the psraw-15 mask via
; pxor + psubw.  The code is software-pipelined: the C/D results of pass %1
; are stored at the start of pass %1+1, and the A/B inputs of pass %1+1 are
; loaded at the end of pass %1.  Trailing tags "An/Bn/Cn/Dn" = lane/stage.
;
; Register contract (set up by quant_h263_intra_3dne):
;   ecx = source data, edx = destination coeff, ebx = &mmzero,
;   eax = DC value (scaled by the int_div reciprocal in pass 3),
;   edi = dcscalar (int_div index, pass 3 only).
; The "db 0Fh,6Fh,..." lines are hand-encoded "movq mmN,[ecx+disp8]" forms,
; pinning the instruction length for the Athlon scheduling rules (see
; quant_intra).  NOTE(review): the disp8 actually encoded for C1/D1 is
; (%1*32+16)/(%1*32+24); the extra "+32" in their inline comments looks like
; a leftover -- verify against the stores of the following pass.
;-----------------------------------------------------------------------------
%macro quant_intra1  1
  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif

ALIGN 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq   mm4, [ecx + %1 * 32 +16+32] ;C1
  pmaxsw mm1, mm0   ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq   mm6, [ecx + %1 * 32 +24+32] ;D1
  pmaxsw mm3, mm2   ;B4


  psraw mm0, 15     ;A5  sign mask: 0 or 0xffff per word
  psraw mm2, 15     ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5 ;C9  deferred store of the previous pass
  movq [edx + %1 * 32 + 24-32], mm7 ;D9
%endif

  psrlw mm1, 1      ;A6  |c| / 2  (quant==1 divide, shift instead of pmulhw)
  psrlw mm3, 1      ;B6
  movq mm5, [ebx]   ;C2  zero from mmzero
  movq mm7, [ebx]   ;D2

  pxor mm1, mm0 ;A7  restore sign: (x ^ mask) - mask
  pxor mm3, mm2 ;B7

  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8

%if (%1 == 0)
  push ebp          ; free ebp for the caller's dcscalar bookkeeping
  movq mm0, [ecx + %1 * 32 +32]
%elif (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32)  ;movq   mm0, [ecx + %1 * 32 +32]    ;A1
%endif
  pmaxsw mm5, mm4   ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32)   ;movq   mm2, [ecx + %1 * 32 +8+32]  ;B1
%else
  cmp esp, esp      ; apparently a same-length filler for the dropped B1 load
%endif
  pmaxsw mm7, mm6   ;D4

  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte edx + %1 * 32], mm1    ;A9
  movq [edx + %1 * 32+8], mm3       ;B9


  psrlw mm5, 1      ;C6
  psrlw mm7, 1      ;D6
%if (%1 < 3)
  movq mm1, [ebx]   ;A2
  movq mm3, [ebx]   ;B2
%endif
%if (%1 == 3)
  imul eax, [int_div+4*edi] ; eax = DC' * (2^16/dcscalar + 1); caller does sar 16
%endif
  pxor mm5, mm4 ;C7
  pxor mm7, mm6 ;D7
%endm


;-----------------------------------------------------------------------------
; quant_intra %1  -- one pass (index 0..3) of the general-quant path.
;
; Identical pipeline structure to quant_intra1 (see its header: four lanes
; A/B/C/D of 4 words at [ecx+%1*32 .. +24], branchless |c| via pmaxsw, sign
; restored with the psraw-15 mask, C/D stores and A/B loads software-pipelined
; across passes).  The only functional difference is the divide:
;   level = sign(c) * ((|c| * mmx_div[quant]) >> 16)  ~=  sign(c)*|c|/(2*quant)
; implemented with "pmulhw mmN, [esi]", esi = &mmx_div[quant-1].
; The nop's are scheduling fillers; the "db 0Fh,6Fh,..." lines are
; hand-encoded movq's with fixed disp8 lengths (see rule 3 below).
; NOTE(review): as in quant_intra1, the encoded C1/D1 disp8 is (%1*32+16)/
; (%1*32+24); the "+32" in their inline comments looks stale -- verify.
;-----------------------------------------------------------------------------
%macro quant_intra  1
    ; Rules for athlon:
        ; 1) schedule latencies
        ; 2) add/mul and load/store in 2:1 proportion
        ; 3) avoid spliting >3byte instructions over 8byte boundaries

  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif

ALIGN 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq   mm4, [ecx + %1 * 32 +16+32] ;C1
  pmaxsw mm1, mm0   ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq   mm6, [ecx + %1 * 32 +24+32] ;D1
  pmaxsw mm3, mm2   ;B4


  psraw mm0, 15     ;A5  sign mask: 0 or 0xffff per word
  psraw mm2, 15     ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5 ;C9  deferred store of the previous pass
  movq [edx + %1 * 32 + 24-32], mm7 ;D9
%endif

  pmulhw mm1, [esi] ;A6  (|c| * (2^16/(2q)+1)) >> 16
  pmulhw mm3, [esi] ;B6
  movq mm5, [ebx]   ;C2  zero from mmzero
  movq mm7, [ebx]   ;D2

  nop               ; scheduling filler
  nop
  pxor mm1, mm0 ;A7  restore sign: (x ^ mask) - mask
  pxor mm3, mm2 ;B7

  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8


%if (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq    mm0, [ecx + %1 * 32 +32]    ;A1
%endif
  pmaxsw mm5, mm4     ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32]  ;B1
%else
  cmp esp, esp      ; apparently a same-length filler for the dropped B1 load
%endif
  pmaxsw mm7,mm6        ;D4

  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte edx + %1 * 32], mm1 ;A9
  movq [edx + %1 * 32+8], mm3     ;B9


  pmulhw mm5, [esi] ;C6
  pmulhw mm7, [esi] ;D6
%if (%1 < 3)
  movq mm1, [ebx]   ;A2
  movq mm3, [ebx]   ;B2
%endif
%if (%1 == 0)
  push ebp          ; free ebp for the caller's dcscalar bookkeeping
%elif (%1 < 3)
  nop
%endif
  nop
%if (%1 == 3)
  imul eax, [int_div+4*edi] ; eax = DC' * (2^16/dcscalar + 1); caller does sar 16
%endif
  pxor mm5, mm4 ;C7
  pxor mm7, mm6 ;D7
%endmacro


; cdecl entry point (prototype in the comment block above).  Quantizes one
; 8x8 block (4 passes x 16 coeffs), then overwrites coeff[0] with the scaled
; DC = (DC + sign(DC)*dcscalar/2) / dcscalar.  Always returns 0 in eax.
; Clobbers mm0-mm7 (no emms issued in the visible part); esi/ebx/edi are
; saved/restored on the stack; mpeg_matrices is not referenced here.
; NOTE(review): this listing is page 1 of 2 -- the .q1loop body is truncated.
ALIGN 16
cglobal quant_h263_intra_3dne
quant_h263_intra_3dne:

  mov eax, [esp + 12]       ; quant
  mov ecx, [esp + 8]        ; data
  mov edx, [esp + 4]        ; coeff
  cmp al, 1                 ; ZF = (quant == 1); nothing below writes eflags
                            ; before the jz consumes it
  pxor mm1, mm1
  pxor mm3, mm3
  movq mm0, [ecx]           ; mm0 = [1st]
  movq mm2, [ecx + 8]
  push esi
  lea esi, [mmx_div + eax*8 - 8] ; esi = &mmx_div[quant-1] (8 bytes per entry)

  push ebx
  mov ebx, mmzero           ; ebx = &mmzero (zero constant for the macros)
  push edi
  jz near .q1loop           ; quant == 1: shift path (pmulhw would overflow)

  quant_intra 0
  mov ebp, [esp + 16 + 16]      ; dcscalar
                                ; NB -- there are 3 pushes in the function preamble and one more
                                ; in "quant_intra 0", thus an added offset of 16 bytes
  movsx eax, word [byte ecx]    ; DC

  quant_intra 1
  mov edi, eax
  sar edi, 31                       ; sign(DC)
  shr ebp, byte 1                   ; ebp = dcscalar/2

  quant_intra 2
  sub eax, edi                      ; DC (+1)
  xor ebp, edi                      ; sign(DC) dcscalar /2  (-1)
  mov edi, [esp + 16 + 16]          ; dcscalar (int_div index for pass 3)
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar/2
  mov ebp, [byte esp]               ; reload caller ebp (pushed by quant_intra 0)

  quant_intra 3
  psubw mm5, mm4                    ;C8  last pass: C/D tail done inline,
  mov esi, [esp + 12]               ; pop back the register value
  mov edi, [esp + 4]                ; pop back the register value
  sar eax, 16                       ; finish fixed-point DC/dcscalar (see int_div)
  lea ebx, [byte eax + 1]           ; workaround for eax < 0
  cmovs eax, ebx                    ; conditionally move the corrected value
  mov [edx], ax                     ; coeff[0] = ax
  mov ebx, [esp + 8]                ; pop back the register value
  add esp, byte 16                  ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
  psubw mm7, mm6                    ;D8  since there is no pass 4 to defer to
  movq [edx + 3 * 32 + 16], mm5     ;C9
  movq [edx + 3 * 32 + 24], mm7     ;D9

  xor eax, eax                      ; return 0
  ret

ALIGN 16

; quant == 1 fast path; mirrors the structure above with quant_intra1.
; (Continues past the end of this listing page.)
.q1loop
  quant_intra1 0
  mov ebp, [esp + 16 + 16]          ; dcscalar
  movsx eax, word [byte ecx]        ; DC

  quant_intra1 1
  mov edi, eax
  sar edi, 31                       ; sign(DC)
  shr ebp, byte 1                   ; ebp = dcscalar /2

  quant_intra1 2
  sub eax, edi                      ; DC (+1)
  xor ebp, edi                      ; sign(DC) dcscalar /2  (-1)
  mov edi, [esp + 16 + 16]          ; dcscalar
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar /2

; [listing truncated here -- the remainder of .q1loop and the other
;  quant/dequant functions are on page 2 of the original 787-line file]