; quantize_mpeg_mmx.asm
;
; (code-viewer header) From "H.264 decoder converted from FFmpeg, builds under VC.." - assembly source - 669 lines in total - page 1 of 2
;
; ASM
; 669
; Font size
;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dne Quantization/Dequantization -
; *
; *  Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
; *                2002-2003 Michael Militzer <isibaar@xvid.org>
; *                2002-2003 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_mpeg_mmx.asm,v 1.8 2006/09/22 03:40:11 syskin Exp $
; *
; *************************************************************************/

; Build-time switch.
; NOTE(review): SATURATE is not tested anywhere in the part of the file that
; is visible here - presumably it is consumed further down; confirm.
%define SATURATE

; Everything below is assembled as 32-bit x86 code.
BITS 32

;-----------------------------------------------------------------------------
; cglobal name
;
; Exports the function <name> from this object file.
;
;   PREFIX     defined: the exported symbol is _<name>, and <name> is aliased
;              to _<name> so the rest of the file can keep using the plain
;              name (both for the label definition and for references).
;   MARK_FUNCS defined: the symbol is additionally typed as a function and
;              given a size, computed as the distance between the function
;              label and its local ".endfunc" label. Every function exported
;              this way must therefore end with a ".endfunc" label.
;
; When both switches are set, the size expression has to use the prefixed
; names: once <name> is aliased, the label is really defined as _<name>, and
; ".endfunc" becomes _<name>.endfunc. The alias itself must expand to the bare
; symbol only, otherwise "<name>:" would no longer be a valid label line.
;-----------------------------------------------------------------------------
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function _%1.endfunc-_%1
		%else
			global _%1
		%endif
		%define %1 _%1
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;-----------------------------------------------------------------------------
; cextern name
;
; Imports the external symbol <name>.
; Without PREFIX the symbol is declared as-is. With PREFIX the declared symbol
; is _<name>, and <name> is aliased to it so later references can keep using
; the plain name.
;-----------------------------------------------------------------------------
%macro cextern 1
	%ifndef PREFIX
		extern %1
	%else
		extern _%1
		%define %1 _%1
	%endif
%endmacro

;=============================================================================
; Local data (Read Only)
;=============================================================================

; Read-only data section. When FORMAT_COFF is defined the section is declared
; without an alignment attribute; otherwise 16-byte section alignment is
; requested.
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

; Four 16-bit words, each 1. quant_mpeg_inter_mmx feeds this to pmaddwd to add
; adjacent words of its running sum into 32-bit lanes.
mmx_one:
	times 4	dw	 1

;-----------------------------------------------------------------------------
; divide by 2Q table
;
; 31 entries, one per quant = 1..31; each entry is the same 16-bit word
; repeated four times (8 bytes). quant_mpeg_inter_mmx fetches the entry for a
; given quant with [mmx_div + quant*8 - 8].
;
; For quant >= 2 the word is (1<<17)/(2*quant) + 1. The caller multiplies by it
; with pmulhw (keeps the upper 16 bits of the signed product) and then shifts
; right by one more bit, i.e. a total shift of 17.
;
; The quant = 2 entry evaluates to 32769, which does not fit in a positive
; signed 16-bit word; quant_mpeg_inter_mmx branches to .q2loop for quant 2
; before it reads this table.
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
	times 4 dw 65535 ; the div by 2 formula will overflow for the case
	                 ; quant=1 but we don't care much because quant=1
	                 ; is handled by a different piece of code that
	                 ; doesn't use this table.
%assign quant 2
%rep 30
	times 4 dw  (1<<17) / (quant*2) + 1
	%assign quant quant+1
%endrep

; Numerator (VM18P) and denominator (VM18Q) of the 3/4 factor used to build
; the quantd table below.
%define VM18P 3
%define VM18Q 4


;-----------------------------------------------------------------------------
; quantd table
;
; 31 entries, one per quant = 1..31, each a 16-bit word repeated four times.
; The word is (VM18P*quant + VM18Q/2) / VM18Q in assembler integer arithmetic,
; i.e. 3*quant/4 rounded to the nearest integer.
; Not referenced in the part of the file visible here.
;-----------------------------------------------------------------------------

quantd:
%assign quant 1
%rep 31
	times 4 dw  ((VM18P*quant) + (VM18Q/2)) / VM18Q
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
; multiply by Q table
;
; 31 entries, one per quant = 1..31. Each entry holds quant itself (not
; 2*quant) as a 16-bit word repeated four times.
; Not referenced in the part of the file visible here.
;-----------------------------------------------------------------------------

mmx_mul_quant:
%assign quant 1
%rep 31
	times 4 dw  quant
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
; saturation limits
;
; Each constant is one 16-bit word repeated four times (one MMX register).
; None of them is referenced in the part of the file visible here.
; NOTE(review): presumably used by the dequantizers to clamp results to the
; range [-2048, 2047] - confirm against the second half of the file.
;-----------------------------------------------------------------------------

ALIGN 16

; 32767 - 2047 = 30720
mmx_32767_minus_2047:
	times 4 dw (32767-2047)
; 32768 - 2048 = 30720 (same value as the constant above)
mmx_32768_minus_2048:
	times 4 dw (32768-2048)
mmx_2047:
	times 4 dw 2047
mmx_minus_2048:
	times 4 dw (-2048)
; All-zero quadword.
zero:
	times 4 dw 0

;=============================================================================
; rounding
;
; Two 32-bit values of 1<<13, stored as little-endian word pairs
; (low word 1<<13, high word 0). quant_mpeg_intra_mmx loads this into mm7 and
; QUANT_MMX adds it to every 32-bit product before the arithmetic shift right
; by 14, so it is half of the divisor implied by that shift.
;=============================================================================

mmx_rounding:
	dw (1<<13)
	dw 0
	dw (1<<13)
	dw 0

;=============================================================================
; Code
;=============================================================================

SECTION .text

; Entry points exported by this file. cglobal (defined at the top of the file)
; applies the optional underscore prefix and function marking. Only the two
; quant_* routines appear in the part of the file visible here.
cglobal quant_mpeg_intra_mmx
cglobal quant_mpeg_inter_mmx
cglobal dequant_mpeg_intra_mmx
cglobal dequant_mpeg_inter_mmx


;-----------------------------------------------------------------------------
; QUANT_MMX row
;
; Processes the eight 16-bit values (16 bytes) at byte offset 16*row.
;
;   in:   eax = source pointer (data)
;         ecx = matrix base pointer; the multipliers are read from
;               ecx + 128 + 16*row
;         edi = destination pointer
;         mm7 = 32-bit value added to each product before the shift
;   out:  eight words at [edi + 16*row], each equal to
;         (data[i] * multiplier[i] + mm7) >> 14, shifted arithmetically and
;         packed back to 16 bits with signed saturation
;   uses: mm0, mm1, mm2, mm4, mm5, mm6 (overwritten)
;
; The two 4-word halves (mm0/mm1/mm2 and mm4/mm5/mm6) are processed in
; lock-step, one instruction of each half at a time.
;-----------------------------------------------------------------------------
%macro QUANT_MMX	1
	movq	mm0, [eax + 16*(%1)]			; data
	movq	mm2, [ecx + 16*(%1) + 128]		; intra_matrix_rec
	movq	mm4, [eax + 16*(%1) + 8]		; data
	movq	mm6, [ecx + 16*(%1) + 128 + 8]	; intra_matrix_rec
	
	; keep a second copy of the data: one copy yields the low 16 bits of
	; each product, the other the high 16 bits
	movq	mm1, mm0
	movq	mm5, mm4

	pmullw	mm0, mm2					; low results
	pmulhw	mm1, mm2					; high results
	pmullw	mm4, mm6					; low results
	pmulhw	mm5, mm6					; high results

	; duplicate the low halves so both unpack directions can be produced
	movq	mm2, mm0
	movq	mm6, mm4

	; interleave low/high words into full signed 32-bit products:
	; mm2/mm6 = products 0 and 1, mm0/mm4 = products 2 and 3 of each half
	punpckhwd mm0, mm1
	punpcklwd mm2, mm1
	punpckhwd mm4, mm5
	punpcklwd mm6, mm5

	; add the rounding term supplied by the caller in mm7
	paddd	mm2, mm7
	paddd	mm0, mm7
	paddd	mm6, mm7
	paddd	mm4, mm7

	; arithmetic shift right by 14
	psrad	mm2, 14
	psrad	mm0, 14
	psrad	mm6, 14
	psrad	mm4, 14
	
	; pack back to four signed words per register, with saturation, in the
	; original element order (products 0,1 first, then 2,3)
	packssdw mm2, mm0
	packssdw mm6, mm4

	movq	[edi + 16*(%1)], mm2
	movq	[edi + 16*(%1)+8], mm6
%endmacro

;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_intra_mmx(int16_t * coeff,
;                               const int16_t * data,
;                               const uint32_t quant,
;                               const uint32_t dcscalar,
;                               const uint16_t * mpeg_matrices);
;
; All arguments are read from the stack (32-bit code).
;
; The 64 words of the block are scaled with QUANT_MMX, eight words per
; invocation, using the multipliers stored 128 bytes into mpeg_matrices.
; coeff[0] is then overwritten with
;     (data[0] +/- dcscalar/2) / dcscalar
; where the half-divisor takes the sign of data[0].
;
; The quant argument is not read by this routine.
;
; Returns 0 in eax. Overwrites ecx, edx, mm0-mm2 and mm4-mm7; edi is saved
; and restored.
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_mpeg_intra_mmx:

	push	edi

	; stack offsets below include the 4 bytes just pushed
	mov	edi, [esp + 4 + 4]		; edi = coeff (destination)
	mov	eax, [esp + 4 + 8]		; eax = data (source)
	mov	ecx, [esp + 4 + 20]		; ecx = mpeg_matrices
	movq	mm7, [mmx_rounding]		; mm7 = rounding term for QUANT_MMX

	; eight rows of eight coefficients
	QUANT_MMX 0
	QUANT_MMX 1
	QUANT_MMX 2
	QUANT_MMX 3
	QUANT_MMX 4
	QUANT_MMX 5
	QUANT_MMX 6
	QUANT_MMX 7

	; ---- DC coefficient ----
	movsx	eax, word [eax]			; eax = data[0], sign-extended
	mov	edi, [esp + 4 + 16]		; edi = dcscalar (the divisor)
	mov	edx, eax
	mov	ecx, edi
	sar	edx, 31				; edx = 0 if data[0] >= 0, else -1
	shr	ecx, 1				; ecx = dcscalar / 2
	xor	ecx, edx			; with edx = -1 this pair negates ecx,
	sub	ecx, edx			; with edx = 0 it leaves ecx unchanged
	add	eax, ecx			; eax = data[0] +/- dcscalar/2

	; edx still holds the sign mask, so edx:eax is the 64-bit dividend
	mov	ecx, [esp + 4 + 4]		; ecx = coeff
	idiv	edi				; eax = edx:eax / dcscalar
	mov	[ecx], ax			; coeff[0] = quotient (low 16 bits)

	pop	edi

	xor	eax, eax			; return 0
	ret
.endfunc:


;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_inter_mmx(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_mpeg_inter_mmx:

  push ecx
  push esi
  push edi
  push ebx

  mov edi, [esp + 16 + 4]       ; coeff
  mov esi, [esp + 16 + 8]       ; data
  mov eax, [esp + 16 + 12]  ; quant
  mov ebx, [esp + 16 + 16]		; mpeg_quant_matrices

  xor ecx, ecx

  pxor mm5, mm5                 ; sum

  cmp al, 1
  jz near .q1loop

  cmp al, 2
  jz near .q2loop

  movq mm7, [mmx_div + eax * 8 - 8] ; divider

ALIGN 16
.loop
  movq mm0, [esi + 8*ecx]       ; mm0 = [1st]
  movq mm3, [esi + 8*ecx + 8]   ;
  pxor mm1, mm1                 ; mm1 = 0
  pxor mm4, mm4                 ;
  pcmpgtw mm1, mm0              ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3              ;
  pxor mm0, mm1                 ; mm0 = |mm0|
  pxor mm3, mm4                 ;
  psubw mm0, mm1                ; displace
  psubw mm3, mm4                ;
  psllw mm0, 4
  psllw mm3, 4
  movq mm2, [ebx + 512 + 8*ecx]
  psrlw mm2, 1
  paddw mm0, mm2
  movq mm2, [ebx + 768 + ecx*8]
  pmulhw mm0, mm2               ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
  movq mm2, [ebx + 512 + 8*ecx + 8]
  psrlw mm2, 1
  paddw mm3, mm2
  movq mm2, [ebx + 768 + ecx*8 + 8]
  pmulhw mm3, mm2
  pmulhw mm0, mm7               ; mm0 = (mm0 / 2Q) >> 16
  pmulhw mm3, mm7               ;
  psrlw mm0, 1                  ; additional shift by 1 => 16 + 1 = 17
  psrlw mm3, 1
  paddw mm5, mm0                ; sum += mm0
  pxor mm0, mm1                 ; mm0 *= sign(mm0)
  paddw mm5, mm3                ;
  pxor mm3, mm4                 ;
  psubw mm0, mm1                ; undisplace
  psubw mm3, mm4
  movq [edi + 8*ecx], mm0
  movq [edi + 8*ecx + 8], mm3

  add ecx, 2
  cmp ecx, 16
  jnz near .loop

.done
  pmaddwd mm5, [mmx_one]
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5
  movd eax, mm0                 ; return sum

  pop ebx
  pop edi
  pop esi
  pop ecx

  ret

ALIGN 16
.q1loop
  movq mm0, [esi + 8*ecx]       ; mm0 = [1st]
  movq mm3, [esi + 8*ecx+ 8]

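; (code-viewer footer) Keyboard shortcuts
;
;   Copy code:             Ctrl + C
;   Search code:           Ctrl + F
;   Full-screen mode:      F11
;   Increase font size:    Ctrl + =
;   Decrease font size:    Ctrl + -
;   Show shortcuts:        ?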