; quantize_mpeg_xmm.asm — XviD MPEG quantization/dequantization, x86-64 port.
; (This is the first half of an 810-line file; the remainder continues on a
;  second page not shown here.)
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dne Quantization/Dequantization -
; *
; *  Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
; *                2002      Jaan Kalda
; *		   2004 Andre Werthmann <wertmann@aei.mpg.de>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_mpeg_xmm.asm,v 1.1 2005/01/05 23:02:15 edgomez Exp $
; *
; ***************************************************************************/

; _3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

%define SATURATE

BITS 64

; cglobal name — declare an exported function symbol.
;   PREFIX     : prepend an underscore (a.out / Mach-O style C mangling) and
;                remap all internal uses of the name onto the prefixed symbol.
;   MARK_FUNCS : tag the symbol as an ELF :function with an explicit size
;                running up to its local .endfunc label (helps nm/objdump/
;                valgrind attribute addresses to functions).
; Fix vs. original: with both PREFIX and MARK_FUNCS defined, the size
; expression must use the *prefixed* label (_%1.endfunc-_%1 — the unprefixed
; one does not exist after mangling), and the %define must map the name to
; plain _%1; defining it as the whole "_%1:function …" string corrupted every
; later use of the name (labels, call targets).
%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function _%1.endfunc-_%1
			%define %1 _%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;=============================================================================
; Local data
;=============================================================================

; Read-only constant tables.  COFF output cannot take an align attribute on
; the SECTION directive itself, hence the conditional.
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 8
mmzero:				; one MMX register's worth of zero
	dd 0,0
mmx_one:			; 4 packed words of 1
	times 4 dw 1

;-----------------------------------------------------------------------------
; divide by 2Q table
;-----------------------------------------------------------------------------

ALIGN 16
; Reciprocal-multiply constants for dividing by quant, indexed as
; [mmx_divs + quant*8 - 8]: entry (q-1) holds 4 copies of ((1<<15)/q + 1),
; so pmulhuw (x >> 16 of the product) yields approximately x/(2q).
; Used on the quant <= 19 path (.loop below).
mmx_divs:		;i>2
%assign i 1
%rep 31
	times 4 dw  ((1 << 15) / i + 1)
	%assign i i+1
%endrep

ALIGN 16
; Same idea with 16-bit reciprocals: entry (q-1) holds 4 copies of
; ((1<<16)/q + 1), so pmulhuw gives ~x/q and the caller shifts right once
; more to get /(2q).  Used on the quant > 19 path (.lloop), where the
; 15-bit reciprocals above would lose precision.
mmx_div:		;quant>2
	times 4 dw 65535 ; the div by 2 formula will overflow for the case
	                 ; quant=1 but we don't care much because quant=1
	                 ; is handled by a different piece of code that
	                 ; doesn't use this table.
%assign quant 2
%rep 31
	times 4 dw  ((1 << 16) / quant + 1)
	%assign quant quant+1
%endrep

; FIXX q — emit one dw reciprocal ((1<<16)/q + 1).
; NOTE(review): not referenced in this half of the file; presumably used by
; the dequant code on the second page — confirm.
%macro FIXX 1
dw (1 << 16) / (%1) + 1
%endmacro

; Multi-byte padding "nops" inherited from the K7-tuned 32-bit version
; (used there to align the pipeline).  NOTE(review): these encode 32-bit
; idioms — in 64-bit mode writing esp/ebp zeroes the upper half of rsp/rbp,
; so they must not be reactivated as-is.  Every use below is commented out,
; so they are dead in this port.
%define nop4	db	08Dh, 074h, 026h,0
%define nop3	add	esp, byte 0
%define nop2	mov	esp, esp
%define nop7	db	08dh, 02ch, 02dh,0,0,0,0
%define nop6	add	ebp, dword 0

;-----------------------------------------------------------------------------
; quantd table
;-----------------------------------------------------------------------------

; Rounding bias added to the scaled level before the divide:
; quantd[i-1] = round(VM18P*i / VM18Q) = round(3*i/4), for quant i = 1..31.
; Indexed as [quantd + quant*8 - 8] (4 identical words per entry).
%define VM18P	3
%define VM18Q	4

ALIGN 16
quantd:
%assign i 1
%rep 31
	times 4 dw  (((VM18P*i) + (VM18Q/2)) / VM18Q)
	%assign i i+1
%endrep

;-----------------------------------------------------------------------------
; multiple by 2Q table
;-----------------------------------------------------------------------------

ALIGN 16
; Multiply-by-quant table: entry (i-1) holds 4 copies of i, for i = 1..31.
; NOTE(review): not referenced in this half of the file; presumably used by
; the dequant_mpeg_* routines on the second page — confirm.
mmx_mul_quant:
%assign i 1
%rep 31
	times 4 dw  i
	%assign i i+1
%endrep

;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------

ALIGN 16
; Saturation constants for clamping dequant output to [-2048, 2047] via the
; padd/psub displacement trick.  NOTE(review): not referenced in this half of
; the file; presumably used by the dequant routines on the second page.
mmx_32767_minus_2047:
	times 4 dw (32767-2047)
mmx_32768_minus_2048:
	times 4 dw (32768-2048)
mmx_2047:
	times 4 dw 2047
mmx_minus_2048:
	times 4 dw (-2048)
zero:
	times 4 dw 0

; 32-bit reciprocals ((1<<17)/i + 1) for i = 1..255, with a dummy entry at
; index 0.  Used by quant_mpeg_intra_x86_64 (.done below) to divide the DC
; coefficient by dcscalar without an idiv.
int_div:
dd 0
%assign i 1
%rep 255
	dd  (1 << 17) / ( i) + 1
	%assign i i+1
%endrep

;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

; Exported entry points (System V AMD64 calling convention).
cglobal quant_mpeg_intra_x86_64
cglobal quant_mpeg_inter_x86_64
cglobal dequant_mpeg_intra_x86_64
cglobal dequant_mpeg_inter_x86_64

;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_intra_x86_64(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint32_t dcscalar,
;                               const uint16_t *mpeg_matrices);
; Ported from its 32bit xmm cousin
;-----------------------------------------------------------------------------

ALIGN 16
;-----------------------------------------------------------------------------
; quant_mpeg_intra_x86_64
;   SysV AMD64 incoming args: rdi = coeff (dst), rsi = data (src),
;   rdx = quant, rcx = dcscalar, r8 = mpeg_matrices.
;   The first moves below remap them onto the register layout of the original
;   32-bit routine: rax = data, rdx = coeff, rcx = quant, r9 = dcscalar
;   (saved), rdi = matrices.  Returns 0 in rax.  Preserves rbx (callee-saved);
;   clobbers mm0-mm7, rcx, rsi, rdi, r11 and flags.
;-----------------------------------------------------------------------------
quant_mpeg_intra_x86_64:
  mov rax, rsi			; data
  mov r9, rcx			; save dcscalar
  mov rcx, rdx			; quant
  mov rdx, rdi			; coeff

  push rbx

  mov rdi, r8				; mpeg_quant_matrices

  mov rsi, -14		; counter -14..0 step 2: 8*rsi+112 scans byte
			; offsets 0..127, i.e. all 64 coefficients
  pxor mm0, mm0
  pxor mm3, mm3
  cmp rcx, byte 1
  je near .q1loop	; quant == 1: divide degenerates to a shift
  cmp rcx, byte 19
  jg near .lloop	; quant > 19: use the 16-bit reciprocal table

; Main loop (2 <= quant <= 19).  Per iteration, on two quadwords (8 coeffs):
;   level = (|data| << 4) + fix;  level /= matrix (reciprocal first approx
;   plus correction term);  level += quantd bias;  level /= 2*quant via
;   pmulhuw with mmx_divs;  then re-apply the sign of data.
; NOTE(review): matrix blocks at rdi+0 / +128 / +256 / +384 are assumed to be
; intra matrix / division fix / correction reciprocals / first-approximation
; reciprocals — confirm against the mpeg_matrices layout.
ALIGN 16
.loop
  movq mm1, [rax + 8*rsi+112]   ; mm0 = [1st]
  psubw mm0, mm1                ;-mm1 (mm0 was zero: 0 - src)
  movq mm4, [rax + 8*rsi + 120] ;
  psubw mm3, mm4                ;-mm4
  pmaxsw mm0, mm1               ;|src|
  pmaxsw mm3,mm4
;  nop2
  psraw mm1, 15     ;sign src (all-ones words where src < 0)
  psraw mm4, 15
  psllw mm0, 4      ;level << 4 ;
  psllw mm3, 4
  paddw mm0, [rdi + 128 + 8*rsi+112]	; + fix for exact division
  paddw mm3, [rdi + 128 + 8*rsi+120]
  movq mm5, [rdi + 384 + 8*rsi+112]	; matrix reciprocals
  movq mm7, [rdi + 384 + 8*rsi+120]
  pmulhuw mm5, mm0			; first approximation of level/matrix
  pmulhuw mm7, mm3
;  mov esp, esp
  movq mm2, [rdi + 8*rsi+112]		; matrix itself
  movq mm6, [rdi + 8*rsi+120]
  pmullw mm2, mm5			; approx * matrix <= original level
  pmullw mm6, mm7
  psubw mm0, mm2			; mismatch = level - approx*matrix
  psubw mm3, mm6
;  nop4
  lea r11, [quantd wrt rip]
  movq mm2, [r11 + rcx * 8 - 8]		; rounding bias for this quant
  lea r11, [mmx_divs wrt rip]
  movq mm6, [r11 + rcx * 8 - 8]		; reciprocal of 2*quant
  paddw mm5, mm2
  paddw mm7, mm2
;  mov esp, esp
  pmulhuw mm0, [rdi + 256 + 8*rsi+112]	; correction from the mismatch
  pmulhuw mm3, [rdi + 256 + 8*rsi+120]
  paddw mm5, mm0
  paddw mm7, mm3
  pxor mm0, mm0				; re-zero accumulators for next pass
  pxor mm3, mm3
  pmulhuw mm5, mm6      ; mm0 = (mm0 / 2Q) >> 16
  pmulhuw mm7, mm6      ;  (level + quantd) / quant (0<quant<32)
  pxor mm5, mm1         ; mm0 *= sign(mm0)
  pxor mm7, mm4         ;
  psubw mm5, mm1        ; undisplace
  psubw mm7, mm4        ;
  movq [rdx + 8*rsi+112], mm5
  movq [rdx + 8*rsi +120], mm7
  add rsi, byte 2
  jng near .loop	; loop while rsi <= 0 (signed)

; DC coefficient: coeff[0] = round(data[0] / dcscalar), computed branch-free
; with a precomputed (1<<17)/dcscalar reciprocal instead of idiv.
.done
; calculate  data[0] // (int32_t)dcscalar)
;  mov esi, [esp + 12 + 16]  ; dcscalar
  mov rsi, r9			; dcscalar
  movsx rcx, word [rax]		; rcx = data[0] (sign-extended)
  mov rdi, rcx
;  mov edx, [esp + 12 + 16]
  mov r11, rdx		; save rdx
  mov rdx, r9		;
  shr edx, 1            ; ebx = dcscalar /2  (stale comment: value is in edx)
  sar edi, 31           ; cdq is vectorpath  (edi = sign mask of data[0])
  xor edx, edi          ; ebx = eax V -eax -1  (bias = ±dcscalar/2 by sign)
  sub ecx, edi
  add ecx, edx		; ecx = data[0] rounded away from zero by dcscalar/2
;;  mov rdx, [dword esp + 12 + 4]
  mov rdx, r11		; restore rdx
  lea r11, [int_div wrt rip]
  mov rsi, [r11+4*rsi]	; NOTE(review): qword load from a dword table — only
			; esi is consumed below, the high half is the next
			; entry; fine while dcscalar < 255 — confirm
  imul ecx, esi		; * ((1<<17)/dcscalar + 1)
  sar ecx, 17
  lea rbx, [byte rcx + 1]
  cmovs rcx, rbx	; negative quotient: +1 to round toward zero
  ; idiv    cx          ; ecx = edi:ecx / dcscalar

;  mov ebx, [esp]
;  mov edi, [esp+4]
;  mov esi, [esp+8]
;  add esp, byte 12	; pops...
  pop rbx
;  mov [rdx], rcx     ; coeff[0] = ax
  mov [rdx], cx		; coeff[0] = cx

  xor rax, rax		; return 0
  ret

; quant == 1 variant: identical pipeline, but the final divide by 2*quant
; is just a shift right by 1 (no reciprocal multiply, no quantd table read
; beyond the bias).
ALIGN 16
.q1loop
  movq mm1, [rax + 8*rsi+112]               ; mm0 = [1st]
  psubw mm0, mm1                            ;-mm1
  movq mm4, [rax + 8*rsi+120]               ;
  psubw mm3, mm4                            ;-mm4
  pmaxsw mm0, mm1                           ;|src|
  pmaxsw mm3, mm4
;  nop2
  psraw mm1, 15                             ;sign src
  psraw mm4, 15
  psllw mm0, 4                              ; level << 4
  psllw mm3, 4
  paddw mm0, [rdi + 128 + 8*rsi+112]    ;mm0 is to be divided
  paddw mm3, [rdi + 128 + 8*rsi+120]    ;intra1 contains fix for division by 1
  movq mm5, [rdi + 384 + 8*rsi+112] ;with rounding down
  movq mm7, [rdi + 384 + 8*rsi+120]
  pmulhuw mm5, mm0
  pmulhuw mm7, mm3      ;mm7: first approx of division
;  mov esp, esp
  movq mm2, [rdi + 8*rsi+112]
  movq mm6, [rdi + 8*rsi+120]      ; divs for q<=16
  pmullw mm2, mm5       ;test value <= original
  pmullw mm6, mm7
  psubw mm0, mm2        ;mismatch
  psubw mm3, mm6
;  nop4
  lea r11, [quantd wrt rip]
  movq mm2, [r11 + rcx * 8 - 8]
  paddw mm5, mm2        ;first approx with quantd
  paddw mm7, mm2
;  mov esp, esp
  pmulhuw mm0, [rdi + 256 + 8*rsi+112]   ;correction
  pmulhuw mm3, [rdi + 256 + 8*rsi+120]
  paddw mm5, mm0        ;final result with quantd
  paddw mm7, mm3
  pxor mm0, mm0
  pxor mm3, mm3
;  mov esp, esp
  psrlw mm5, 1          ;  (level + quantd) /2  (quant = 1)
  psrlw mm7, 1
  pxor mm5, mm1         ; mm0 *= sign(mm0)
  pxor mm7, mm4         ;
  psubw mm5, mm1        ; undisplace
  psubw mm7, mm4        ;
  movq [rdx + 8*rsi+112], mm5
  movq [rdx + 8*rsi +120], mm7
  add rsi, byte 2
  jng near .q1loop
  jmp near .done

; quant > 19 variant: same pipeline, but uses the 16-bit reciprocals in
; mmx_div (approximate /quant) followed by an extra shift for the /2, since
; the 15-bit mmx_divs entries would underflow at large quant.
ALIGN 8
.lloop
  movq mm1, [rax + 8*rsi+112]       ; mm0 = [1st]
  psubw mm0, mm1        ;-mm1
  movq mm4, [rax + 8*rsi+120]
  psubw mm3, mm4        ;-mm4
  pmaxsw mm0, mm1       ;|src|
  pmaxsw mm3, mm4
;  nop2
  psraw mm1, 15         ;sign src
  psraw mm4, 15
  psllw mm0, 4          ; level << 4
  psllw mm3, 4          ;
  paddw mm0, [rdi + 128 + 8*rsi+112] ;mm0 is to be divided intra1 contains fix for division by 1
  paddw mm3, [rdi + 128 + 8*rsi+120]
  movq mm5, [rdi + 384 + 8*rsi+112]
  movq mm7, [rdi + 384 + 8*rsi+120]
  pmulhuw mm5, mm0
  pmulhuw mm7, mm3      ;mm7: first approx of division
;  mov esp, esp
  movq mm2, [rdi + 8*rsi+112]
  movq mm6, [rdi + 8*rsi+120]
  pmullw mm2, mm5       ;test value <= original
  pmullw mm6, mm7
  psubw mm0, mm2        ;mismatch
  psubw mm3, mm6
;  nop4
  lea r11, [quantd wrt rip]
  movq mm2, [r11 + rcx * 8 - 8]
  lea r11, [mmx_div wrt rip]
  movq mm6, [r11 + rcx * 8 - 8] ; divs for q<=16
  paddw mm5, mm2        ;first approx with quantd
  paddw mm7, mm2
;  mov esp, esp
  pmulhuw mm0, [rdi + 256 + 8*rsi+112] ;correction
  pmulhuw mm3, [rdi + 256 + 8*rsi+120]
  paddw mm5, mm0        ;final result with quantd
  paddw mm7, mm3
  pxor mm0, mm0
  pxor mm3, mm3
;  mov esp, esp
  pmulhuw mm5, mm6      ; mm0 = (mm0 / 2Q) >> 16
  pmulhuw mm7, mm6      ;  (level + quantd) / quant (0<quant<32)
  psrlw mm5, 1          ; (level + quantd) / (2*quant)
  psrlw mm7, 1
  pxor mm5, mm1         ; mm0 *= sign(mm0)
  pxor mm7, mm4         ;
  psubw mm5, mm1        ; undisplace
  psubw mm7, mm4        ;
  movq [rdx + 8*rsi+112], mm5
  movq [rdx + 8*rsi +120], mm7
  add rsi,byte 2
  jng near .lloop
  jmp near .done
.endfunc

;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_inter_x86_64(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint16_t *mpeg_matrices);
; Ported from its 32bit xmm cousin
;-----------------------------------------------------------------------------

ALIGN 16
quant_mpeg_inter_x86_64:
  mov rax, rsi			; data
  mov r8, rdi			; save coeff
  mov rdi, rcx			; mpeg_matrices
  mov rcx, rdx			; quant
  mov rdx, r8			; coeff

  push rbx

; [NOTE(review): the source was truncated at this point by the extraction —
;  the remainder of quant_mpeg_inter_x86_64 and the dequant_mpeg_intra/inter
;  routines continue on the second page of the original 810-line file.]