; quantize_mpeg_xmm.asm
; From "H.264 decoder converted from FFmpeg, builds under VC" - assembly code - 810 lines total - page 1 of 2
; ASM
; 810 lines
;/****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - 3dne Quantization/Dequantization -
; *
; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
; * 2002 Jaan Kalda
; * 2004 Andre Werthmann <wertmann@aei.mpg.de>
; *
; * This program is free software ; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation ; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program ; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_mpeg_xmm.asm,v 1.1 2005/01/05 23:02:15 edgomez Exp $
; *
; ***************************************************************************/
; _3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines
; SATURATE is defined unconditionally; no %ifdef in the visible part of this
; file tests it.
%define SATURATE
; Everything below is assembled as 64-bit code.
BITS 64
; cglobal <name>
;   Declares <name> as a global symbol.
;   - PREFIX defined:     the symbol is exported as _<name>, and <name> is
;                         redefined so that later uses expand to the
;                         prefixed form.
;   - MARK_FUNCS defined: the export carries a ":function" type and a size
;                         expression of <name>.endfunc - <name>, so every
;                         exported routine has to end with a local
;                         ".endfunc" label.
%macro cglobal 1
  %ifndef PREFIX
    %ifndef MARK_FUNCS
      global %1
    %else
      global %1:function %1.endfunc-%1
    %endif
  %else
    %ifndef MARK_FUNCS
      global _%1
      %define %1 _%1
    %else
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %endif
  %endif
%endmacro
;=============================================================================
; Local data
;=============================================================================
; Read-only data section. The COFF variant omits the explicit align=16
; attribute that the other output formats get.
%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif
ALIGN 8
; 8 bytes of zero (two dwords).
mmzero:
dd 0,0
; Four packed 16-bit words, each equal to 1.
mmx_one:
times 4 dw 1
;-----------------------------------------------------------------------------
; divide by 2Q table
;
; Both tables hold one 8-byte row (four identical 16-bit words) per
; quantizer and are addressed as [table + quant*8 - 8], i.e. row 0 belongs
; to quant = 1.
;   mmx_divs: rows for i = 1..31, value (1 << 15) / i + 1.
;             quant_mpeg_intra_x86_64 reads it on its quant <= 19 path and
;             uses the high word of the product directly.
;   mmx_div:  row for quant = 1 is 65535, then rows for quant = 2..32 with
;             value (1 << 16) / quant + 1.
;             quant_mpeg_intra_x86_64 reads it on its quant > 19 path and
;             shifts the high word of the product right by one more bit.
;-----------------------------------------------------------------------------
ALIGN 16
mmx_divs: ;i>2
%assign i 1
%rep 31
times 4 dw ((1 << 15) / i + 1)
%assign i i+1
%endrep
ALIGN 16
mmx_div: ;quant>2
times 4 dw 65535 ; the div by 2 formula will overflow for the case
; quant=1 but we don't care much because quant=1
; is handled by a different piece of code that
; doesn't use this table.
; NOTE: "quant" is a preprocessor symbol here and is left at 33 once the
; %rep below has finished.
%assign quant 2
%rep 31
times 4 dw ((1 << 16) / quant + 1)
%assign quant quant+1
%endrep
; FIXX <n>: emits a single 16-bit word holding (1 << 16) / n + 1.
; Not invoked in the visible part of this file.
%macro FIXX 1
dw (1 << 16) / (%1) + 1
%endmacro
; Padding "nops" of 2, 3, 4, 6 and 7 bytes. nop4 and nop7 are given as raw
; opcode bytes; the others are register-to-itself forms.
; NOTE(review): all of them target 32-bit registers (esp/ebp; the raw byte
; forms presumably encode lea with a 32-bit destination). Under BITS 64 a
; write to a 32-bit register zero-extends into the full 64-bit register, so
; these are not no-ops in 64-bit code. Every use in the visible part of the
; file is commented out - confirm before re-enabling any of them.
%define nop4 db 08Dh, 074h, 026h,0
%define nop3 add esp, byte 0
%define nop2 mov esp, esp
%define nop7 db 08dh, 02ch, 02dh,0,0,0,0
%define nop6 add ebp, dword 0
;-----------------------------------------------------------------------------
; quantd table
;
; One 8-byte row (four identical 16-bit words) per i = 1..31 holding
; ((VM18P * i) + (VM18Q / 2)) / VM18Q, i.e. (3*i + 2) / 4 in integer
; arithmetic. quant_mpeg_intra_x86_64 reads the row at
; [quantd + quant*8 - 8] and adds it to the quotient before the final
; division by the quantizer.
;-----------------------------------------------------------------------------
%define VM18P 3
%define VM18Q 4
ALIGN 16
quantd:
%assign i 1
%rep 31
times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q)
%assign i i+1
%endrep
;-----------------------------------------------------------------------------
; multiply by 2Q table
;
; One 8-byte row (four identical 16-bit words) per i = 1..31, each word
; holding i itself. Not referenced in the visible part of this file.
;-----------------------------------------------------------------------------
ALIGN 16
mmx_mul_quant:
%assign i 1
%rep 31
times 4 dw i
%assign i i+1
%endrep
;-----------------------------------------------------------------------------
; saturation limits
;
; Each constant below is four identical packed 16-bit words. None of them is
; referenced in the visible part of this file.
;-----------------------------------------------------------------------------
ALIGN 16
mmx_32767_minus_2047:
times 4 dw (32767-2047)
mmx_32768_minus_2048:
times 4 dw (32768-2048)
mmx_2047:
times 4 dw 2047
mmx_minus_2048:
times 4 dw (-2048)
zero:
times 4 dw 0
; int_div: 256 dword entries. Entry 0 is 0; entry i (1..255) holds
; (1 << 17) / i + 1. quant_mpeg_intra_x86_64 indexes it with dcscalar,
; multiplies by the entry and shifts right by 17 to divide the DC term.
int_div:
dd 0
%assign i 1
%rep 255
dd (1 << 17) / ( i) + 1
%assign i i+1
%endrep
;=============================================================================
; Code
;=============================================================================
SECTION .text align=16
; Exported entry points. cglobal applies the optional "_" prefix and, with
; MARK_FUNCS, a size expression that relies on each routine ending with a
; local ".endfunc" label.
cglobal quant_mpeg_intra_x86_64
cglobal quant_mpeg_inter_x86_64
cglobal dequant_mpeg_intra_x86_64
cglobal dequant_mpeg_inter_x86_64
;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_intra_x86_64(int16_t * coeff,
;                                  const int16_t * data,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t * mpeg_matrices);
; Ported from its 32bit xmm cousin
;
; ABI:   System V AMD64
; In:    rdi = coeff, rsi = data, edx = quant, ecx = dcscalar,
;        r8  = mpeg_matrices
; Out:   eax = 0; 64 words written to coeff (coeff[0] is the DC term,
;        computed separately from data[0] and dcscalar)
; Clobb: rcx, rdx, rsi, rdi, r8, r9, r10, r11, mm0-mm7, flags
;        No emms is executed here; MMX state is left live for the caller.
;
; Register roles after the prologue:
;   rax = data            rdx = coeff          rdi = mpeg_matrices
;   ecx = quant           r9d = dcscalar       rsi = loop index
;   r10 = &quantd[quant-1]
;   r11 = divisor-table row for quant (not used on the quant == 1 path)
;
; The loop index runs -14, -12, ..., 0; every iteration handles 16 bytes
; (8 coefficients) at byte offset 8*rsi + 112, so 8 iterations cover 128
; bytes. mpeg_matrices is read at byte offsets +0, +128, +256 and +384.
; Going by the original comments these are the multiplier used for the
; "test value", the rounding term ("intra1"), the correction multiplier and
; the first-approximation reciprocal - the layout is owned by the caller,
; TODO confirm.
;
; Per 16-bit lane each loop computes:
;   t = (|data| << 4) + m[+128]
;   q = hi16(t * m[+384])                 ; first approximation
;   r = t - m[+0] * q                     ; mismatch
;   q = q + quantd[quant-1] + hi16(r * m[+256])
;   q = q / (2 * quant)                   ; path-specific, see each loop
;   coeff = sign(data) * q
;
; Preconditions (not checked): quant in 1..31, dcscalar in 1..255.
;-----------------------------------------------------------------------------
ALIGN 16
quant_mpeg_intra_x86_64:
  mov rax, rsi                          ; rax = data
  ; quant and dcscalar are 32-bit arguments: the ABI leaves bits 63..32 of
  ; their registers undefined, so zero-extend before using them as indices.
  mov r9d, ecx                          ; r9 = dcscalar
  mov ecx, edx                          ; rcx = quant
  mov rdx, rdi                          ; rdx = coeff
  mov rdi, r8                           ; rdi = mpeg_matrices
  mov rsi, -14                          ; loop index, see header
  lea r10, [quantd wrt rip]
  lea r10, [r10 + rcx * 8 - 8]          ; r10 = &quantd[quant - 1]
  pxor mm0, mm0
  pxor mm3, mm3
  cmp ecx, byte 1
  je near .q1loop
  cmp ecx, byte 19
  ja near .lsetup
  lea r11, [mmx_divs wrt rip]
  lea r11, [r11 + rcx * 8 - 8]          ; r11 = &mmx_divs[quant - 1]

; ---- 2 <= quant <= 19: one multiply by (1 << 15) / quant + 1 --------------
ALIGN 16
.loop:
  movq mm1, [rax + 8*rsi + 112]         ; mm1 = data, lanes 0..3
  psubw mm0, mm1                        ; -mm1 (mm0 was zero)
  movq mm4, [rax + 8*rsi + 120]         ; mm4 = data, lanes 4..7
  psubw mm3, mm4                        ; -mm4 (mm3 was zero)
  pmaxsw mm0, mm1                       ; |src|
  pmaxsw mm3, mm4
  psraw mm1, 15                         ; sign mask of src
  psraw mm4, 15
  psllw mm0, 4                          ; level << 4
  psllw mm3, 4
  paddw mm0, [rdi + 128 + 8*rsi + 112]  ; t: value to be divided
  paddw mm3, [rdi + 128 + 8*rsi + 120]
  movq mm5, [rdi + 384 + 8*rsi + 112]
  movq mm7, [rdi + 384 + 8*rsi + 120]
  pmulhuw mm5, mm0                      ; q: first approximation
  pmulhuw mm7, mm3
  movq mm2, [rdi + 8*rsi + 112]
  movq mm6, [rdi + 8*rsi + 120]
  pmullw mm2, mm5                       ; test value <= original
  pmullw mm6, mm7
  psubw mm0, mm2                        ; r: mismatch
  psubw mm3, mm6
  movq mm2, [r10]                       ; quantd row
  movq mm6, [r11]                       ; mmx_divs row
  paddw mm5, mm2                        ; q + quantd
  paddw mm7, mm2
  pmulhuw mm0, [rdi + 256 + 8*rsi + 112] ; correction
  pmulhuw mm3, [rdi + 256 + 8*rsi + 120]
  paddw mm5, mm0                        ; final value before the division
  paddw mm7, mm3
  pxor mm0, mm0                         ; re-zero for the next iteration
  pxor mm3, mm3
  pmulhuw mm5, mm6                      ; hi16(q * ((1 << 15) / quant + 1))
  pmulhuw mm7, mm6                      ;   = q / (2 * quant)
  pxor mm5, mm1                         ; restore sign: (q ^ s) - s
  pxor mm7, mm4
  psubw mm5, mm1
  psubw mm7, mm4
  movq [rdx + 8*rsi + 112], mm5
  movq [rdx + 8*rsi + 120], mm7
  add rsi, byte 2
  jng near .loop                        ; continue while index <= 0

; ---- DC term: coeff[0] = (data[0] +/- dcscalar/2) / dcscalar --------------
.done:
  movsx ecx, word [rax]                 ; ecx = data[0]
  mov edi, ecx
  sar edi, 31                           ; edi = 0 or -1 (sign mask)
  mov r8d, r9d
  shr r8d, 1                            ; dcscalar / 2
  xor r8d, edi                          ; negative: -(dcscalar / 2) - 1
  sub ecx, edi                          ; negative: data[0] + 1
  add ecx, r8d                          ; data[0] +/- dcscalar / 2
  lea r11, [int_div wrt rip]
  ; int_div holds dwords: a 32-bit load keeps the access inside the table
  ; even for its last entry.
  mov esi, [r11 + 4*r9]                 ; (1 << 17) / dcscalar + 1
  imul ecx, esi
  sar ecx, 17                           ; sets SF for the cmovs below
  lea eax, [rcx + 1]                    ; lea leaves the flags untouched
  cmovs ecx, eax                        ; negative quotient: add 1
  mov [rdx], cx                         ; coeff[0] = cx
  xor eax, eax                          ; return 0
  ret

; ---- quant == 1: final division is a plain shift by one --------------------
ALIGN 16
.q1loop:
  movq mm1, [rax + 8*rsi + 112]         ; mm1 = data, lanes 0..3
  psubw mm0, mm1                        ; -mm1
  movq mm4, [rax + 8*rsi + 120]         ; mm4 = data, lanes 4..7
  psubw mm3, mm4                        ; -mm4
  pmaxsw mm0, mm1                       ; |src|
  pmaxsw mm3, mm4
  psraw mm1, 15                         ; sign mask of src
  psraw mm4, 15
  psllw mm0, 4                          ; level << 4
  psllw mm3, 4
  paddw mm0, [rdi + 128 + 8*rsi + 112]  ; t: value to be divided
  paddw mm3, [rdi + 128 + 8*rsi + 120]  ; intra1 contains fix for division by 1
  movq mm5, [rdi + 384 + 8*rsi + 112]   ; with rounding down
  movq mm7, [rdi + 384 + 8*rsi + 120]
  pmulhuw mm5, mm0                      ; q: first approximation
  pmulhuw mm7, mm3
  movq mm2, [rdi + 8*rsi + 112]
  movq mm6, [rdi + 8*rsi + 120]
  pmullw mm2, mm5                       ; test value <= original
  pmullw mm6, mm7
  psubw mm0, mm2                        ; r: mismatch
  psubw mm3, mm6
  movq mm2, [r10]                       ; quantd row for quant = 1
  paddw mm5, mm2                        ; q + quantd
  paddw mm7, mm2
  pmulhuw mm0, [rdi + 256 + 8*rsi + 112] ; correction
  pmulhuw mm3, [rdi + 256 + 8*rsi + 120]
  paddw mm5, mm0                        ; final value before the division
  paddw mm7, mm3
  pxor mm0, mm0                         ; re-zero for the next iteration
  pxor mm3, mm3
  psrlw mm5, 1                          ; q / 2 (quant = 1)
  psrlw mm7, 1
  pxor mm5, mm1                         ; restore sign: (q ^ s) - s
  pxor mm7, mm4
  psubw mm5, mm1
  psubw mm7, mm4
  movq [rdx + 8*rsi + 112], mm5
  movq [rdx + 8*rsi + 120], mm7
  add rsi, byte 2
  jng near .q1loop
  jmp near .done

; ---- quant > 19: multiply by (1 << 16) / quant + 1, then shift by one ------
.lsetup:
  lea r11, [mmx_div wrt rip]
  lea r11, [r11 + rcx * 8 - 8]          ; r11 = &mmx_div[quant - 1]
ALIGN 8
.lloop:
  movq mm1, [rax + 8*rsi + 112]         ; mm1 = data, lanes 0..3
  psubw mm0, mm1                        ; -mm1
  movq mm4, [rax + 8*rsi + 120]         ; mm4 = data, lanes 4..7
  psubw mm3, mm4                        ; -mm4
  pmaxsw mm0, mm1                       ; |src|
  pmaxsw mm3, mm4
  psraw mm1, 15                         ; sign mask of src
  psraw mm4, 15
  psllw mm0, 4                          ; level << 4
  psllw mm3, 4
  paddw mm0, [rdi + 128 + 8*rsi + 112]  ; t: value to be divided
  paddw mm3, [rdi + 128 + 8*rsi + 120]
  movq mm5, [rdi + 384 + 8*rsi + 112]
  movq mm7, [rdi + 384 + 8*rsi + 120]
  pmulhuw mm5, mm0                      ; q: first approximation
  pmulhuw mm7, mm3
  movq mm2, [rdi + 8*rsi + 112]
  movq mm6, [rdi + 8*rsi + 120]
  pmullw mm2, mm5                       ; test value <= original
  pmullw mm6, mm7
  psubw mm0, mm2                        ; r: mismatch
  psubw mm3, mm6
  movq mm2, [r10]                       ; quantd row
  movq mm6, [r11]                       ; mmx_div row
  paddw mm5, mm2                        ; q + quantd
  paddw mm7, mm2
  pmulhuw mm0, [rdi + 256 + 8*rsi + 112] ; correction
  pmulhuw mm3, [rdi + 256 + 8*rsi + 120]
  paddw mm5, mm0                        ; final value before the division
  paddw mm7, mm3
  pxor mm0, mm0                         ; re-zero for the next iteration
  pxor mm3, mm3
  pmulhuw mm5, mm6                      ; hi16(q * ((1 << 16) / quant + 1))
  pmulhuw mm7, mm6                      ;   = q / quant
  psrlw mm5, 1                          ; q / (2 * quant)
  psrlw mm7, 1
  pxor mm5, mm1                         ; restore sign: (q ^ s) - s
  pxor mm7, mm4
  psubw mm5, mm1
  psubw mm7, mm4
  movq [rdx + 8*rsi + 112], mm5
  movq [rdx + 8*rsi + 120], mm7
  add rsi, byte 2
  jng near .lloop
  jmp near .done
.endfunc:
;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_inter_x86_64(int16_t * coeff,
; const int16_t const * data,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
; Ported from its 32bit xmm cousin
;-----------------------------------------------------------------------------
ALIGN 16
quant_mpeg_inter_x86_64:
mov rax, rsi ; data
mov r8, rdi ; save coeff
mov rdi, rcx ; mpeg_matrices
mov rcx, rdx ; quant
mov rdx, r8 ; coeff
push rbx
; Keyboard shortcuts
; Copy code: Ctrl + C
; Search code: Ctrl + F
; Full-screen mode: F11
; Increase font size: Ctrl + =
; Decrease font size: Ctrl + -
; Show shortcuts: ?