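; quantize_h263_mmx.asm
; (code-viewer page header, not part of the original source: listed under
;  "H264 decoder converted from FFMPEG, builds under VC" - assembly code,
;  508 lines in total)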
;/*****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - MPEG4 Quantization H263 implementation / MMX optimized -
; *
; * Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
; * 2002-2003 Pascal Massimino <skal@planet-d.net>
; * 2004 Andre Werthmann <wertmann@aei.mpg.de>
; *
; * This program is free software ; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation ; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program ; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_h263_mmx.asm,v 1.1 2005/01/05 23:02:15 edgomez Exp $
; *
; ****************************************************************************/
; enable dequant saturate [-2048,2047], test purposes only.
; NOTE(review): no %ifdef in the visible part of this file tests SATURATE -
; confirm whether the define is still needed.
%define SATURATE
; Everything below is assembled as 64-bit code.
BITS 64
%macro cglobal 1
  ; cglobal <name>: export <name> as a global symbol.
  ;   PREFIX     - the exported symbol gets a leading underscore and <name>
  ;                is redefined so later uses pick up the prefixed form.
  ;   MARK_FUNCS - the symbol is declared as a function whose size is the
  ;                distance from <name> to its local label <name>.endfunc.
  %ifdef MARK_FUNCS
    %ifdef PREFIX
      ; NOTE(review): here <name> is redefined to text that still carries
      ; the ":function ..." suffix; confirm this combination is really used.
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global %1:function %1.endfunc-%1
    %endif
  %else
    %ifdef PREFIX
      global _%1
      %define %1 _%1
    %else
      global %1
    %endif
  %endif
%endmacro
;=============================================================================
; Read-only local data
;=============================================================================
; The section alignment attribute is only emitted when FORMAT_COFF is not
; defined.
%ifndef FORMAT_COFF
SECTION .rodata align=16
%else
SECTION .rodata
%endif

; Eight 16-bit words, each equal to 1.
ALIGN 16
plus_one:
        dw      1, 1, 1, 1, 1, 1, 1, 1
;-----------------------------------------------------------------------------
;
; "subtract Q/2" table
;
; One 8-byte row (4 identical words) per quantizer Q = 1..31, holding Q/2
; rounded down. Row for Q starts at mmx_sub + 8*(Q-1).
;
;-----------------------------------------------------------------------------
ALIGN 16
mmx_sub:
%assign sub_q 0
%rep 31
  %assign sub_q sub_q+1
        times 4 dw sub_q >> 1
%endrep
;-----------------------------------------------------------------------------
;
; "divide by 2Q" table
;
; One 8-byte row (4 identical words) per quantizer Q = 1..31, holding
; 65536/(2Q) + 1. Row for Q starts at mmx_div + 8*(Q-1).
;
; A shift of 16 is used to take full advantage of _pmulhw_.
; For Q=1 the entry is 32769, which _pmulhw_ would treat as negative, so
; that case is handled separately by the callers.
; (3dnow2 provides _pmulhuw_, which won't cause overflow)
;
;-----------------------------------------------------------------------------
ALIGN 16
mmx_div:
%assign div_q 1
%rep 31
        times 4 dw (65536 / (div_q + div_q)) + 1
  %assign div_q div_q+1
%endrep
;-----------------------------------------------------------------------------
;
; "add (odd(Q) ? Q : Q - 1)" table
;
; One 8-byte row (4 identical words) per quantizer Q = 1..31.
; (Q - 1) | 1 yields Q for odd Q and Q - 1 for even Q.
;
;-----------------------------------------------------------------------------
ALIGN 16
mmx_add:
%assign add_q 0
%rep 31
  %assign add_q add_q+1
        times 4 dw (add_q - 1) | 1
%endrep
;-----------------------------------------------------------------------------
;
; "multiply by 2Q" table
;
; One 8-byte row (4 identical words) per quantizer Q = 1..31, holding 2*Q.
;
;-----------------------------------------------------------------------------
ALIGN 16
mmx_mul:
%assign mul_q 0
%rep 31
  %assign mul_q mul_q+1
        times 4 dw mul_q << 1
%endrep
;-----------------------------------------------------------------------------
;
; saturation limits
;
; Packed 16-bit constants. Both "minus" constants evaluate to 30720.
;
;-----------------------------------------------------------------------------
ALIGN 16
sse2_2047:
        dw      2047, 2047, 2047, 2047, 2047, 2047, 2047, 2047

ALIGN 16
mmx_2047:
        dw      2047, 2047, 2047, 2047

ALIGN 8
mmx_32768_minus_2048:
        dw      32768-2048, 32768-2048, 32768-2048, 32768-2048
mmx_32767_minus_2047:
        dw      32767-2047, 32767-2047, 32767-2047, 32767-2047
;=============================================================================
; Code
;=============================================================================
SECTION .text align=16

; Exported entry points: dequantizers first, then quantizers.
cglobal dequant_h263_intra_x86_64
cglobal dequant_h263_inter_x86_64
cglobal quant_h263_intra_x86_64
cglobal quant_h263_inter_x86_64
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
;
; Port of the 32bit mmx cousin.
;
; In  : rdi = coeff (output, 64 words)    rsi = data (input, 64 words)
;       edx = quant (indexes a 31-row table, so 1..31)
;       ecx = dcscalar
;       The fifth argument (mpeg_matrices) is never read.
; Out : eax = 0
; Uses: rax, rcx, rdx, r8, r9, mm0, mm1, mm3, mm4, mm7, flags.
;       No emms is executed here.
;
; quant and dcscalar are 32-bit arguments, so only the low halves of
; rdx/rcx are read; bits 63:32 are not assumed to be zero.
;
;-----------------------------------------------------------------------------
ALIGN 16
quant_h263_intra_x86_64:
        mov     eax, edx                ; eax = quant (upper half of rax cleared)
        mov     r8d, ecx                ; r8d = dcscalar, needed again at .done
        xor     ecx, ecx                ; rcx = offset counter in 8-byte units
        cmp     eax, 1
        jz      .q1loop                 ; quant == 1: divide by shifting instead
        lea     r9, [mmx_div wrt rip]
        movq    mm7, [r9 + rax * 8 - 8] ; mm7 = 4 x (65536/(2*quant) + 1)

ALIGN 16
.loop:
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = 0xFFFF where input < 0
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmulhw  mm0, mm7                ; (|x| * mm7) >> 16, i.e. |x| / 2Q
        pmulhw  mm3, mm7
        pxor    mm0, mm1                ; put the sign back the same way
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3
        add     rcx, 2                  ; 16 bytes per pass
        cmp     rcx, 16                 ; 8 passes = 128 bytes = 64 words
        jnz     .loop

.done:
        ; coeff[0] = (data[0] +/- dcscalar/2) / (int32_t)dcscalar
        ; This overwrites the value the loop stored at coeff[0].
        mov     ecx, r8d                ; ecx = dcscalar
        mov     edx, ecx
        movsx   eax, word [rsi]         ; eax = data[0], sign-extended
        shr     edx, 1                  ; edx = dcscalar / 2
        cmp     eax, 0
        jg      .gtzero
        sub     eax, edx                ; data[0] <= 0: subtract the half step
        jmp     short .mul
.gtzero:
        add     eax, edx                ; data[0] > 0: add the half step
.mul:
        cdq                             ; sign-extend eax into edx:eax
        idiv    ecx                     ; eax = edx:eax / dcscalar
        mov     [rdi], ax               ; coeff[0] = quotient (low 16 bits)
        xor     eax, eax                ; return 0
        ret

ALIGN 16
.q1loop:
        ; Same as .loop, except that 2Q = 2 and the division is a shift.
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = 0xFFFF where input < 0
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psrlw   mm0, 1                  ; |x| / 2
        psrlw   mm3, 1
        pxor    mm0, mm1                ; put the sign back
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3
        add     rcx, 2
        cmp     rcx, 16
        jnz     .q1loop
        jmp     .done
.endfunc:
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
;
; Port of the 32bit mmx cousin.
;
; In  : rdi = coeff (output, 64 words)    rsi = data (input, 64 words)
;       edx = quant (indexes 31-row tables, so 1..31)
;       The fourth argument (mpeg_matrices) is never read.
; Out : eax = sum of the quantized magnitudes, accumulated in 16-bit lanes
;       and folded to one 32-bit value at .done
; Uses: rax, rcx, r9, mm0, mm1, mm3-mm7, flags.
;       No emms is executed here.
;
; quant is a 32-bit argument, so only edx is read; bits 63:32 of rdx are
; not assumed to be zero.
;
;-----------------------------------------------------------------------------
ALIGN 16
quant_h263_inter_x86_64:
        mov     eax, edx                ; eax = quant (upper half of rax cleared)
        xor     ecx, ecx                ; rcx = offset counter in 8-byte units
        pxor    mm5, mm5                ; mm5 = running sum, 4 word lanes
        lea     r9, [mmx_sub wrt rip]
        movq    mm6, [r9 + rax * 8 - 8] ; mm6 = 4 x (quant / 2)
        cmp     eax, 1
        jz      .q1loop                 ; quant == 1: divide by shifting instead
        lea     r9, [mmx_div wrt rip]
        movq    mm7, [r9 + rax * 8 - 8] ; mm7 = 4 x (65536/(2*quant) + 1)

ALIGN 8
.loop:
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = 0xFFFF where input < 0
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                ; |x| - quant/2, clamped at 0
        psubusw mm3, mm6
        pmulhw  mm0, mm7                ; (value * mm7) >> 16, i.e. value / 2Q
        pmulhw  mm3, mm7
        paddw   mm5, mm0                ; add magnitudes before restoring sign
        pxor    mm0, mm1                ; put the sign back
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3
        add     rcx, 2                  ; 16 bytes per pass
        cmp     rcx, 16                 ; 8 passes = 128 bytes = 64 words
        jnz     .loop

.done:
        pmaddwd mm5, [plus_one wrt rip] ; x1 and pairwise add: 4 words -> 2 dwords
        movq    mm0, mm5
        psrlq   mm5, 32
        paddd   mm0, mm5                ; low dword = total of all four lanes
        movd    eax, mm0                ; return only the 32-bit total
        ret

ALIGN 8
.q1loop:
        ; Same as .loop, except that 2Q = 2 and the division is a shift.
        movq    mm0, [rsi + 8*rcx]      ; mm0 = 4 input words
        movq    mm3, [rsi + 8*rcx + 8]  ; mm3 = next 4 input words
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = 0xFFFF where input < 0
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                ; (x ^ mask) - mask = |x|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        psubusw mm0, mm6                ; |x| - quant/2, clamped at 0
        psubusw mm3, mm6
        psrlw   mm0, 1                  ; value / 2
        psrlw   mm3, 1
        paddw   mm5, mm0                ; add magnitudes before restoring sign
        pxor    mm0, mm1                ; put the sign back
        paddw   mm5, mm3
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        movq    [rdi + 8*rcx], mm0
        movq    [rdi + 8*rcx + 8], mm3
        add     rcx, 2
        cmp     rcx, 16
        jnz     .q1loop
        jmp     .done
.endfunc:
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_intra_x86_64(int16_t *data,
;                                    const int16_t const *coeff,
;                                    const uint32_t quant,
;                                    const uint32_t dcscalar,
;                                    const uint16_t *mpeg_matrices);
;
; Port of the 32bit xmm cousin.
; this is the same as dequant_inter_mmx, except that we're
; saturating using 'pminsw' (saves 2 cycles/loop => ~5% faster)
;
; In  : rdi = data (output, 64 words)     rsi = coeff (input, 64 words)
;       edx = quant (indexes 31-row tables, so 1..31)
;       ecx = dcscalar
;       The fifth argument (mpeg_matrices) is never read.
; Out : eax = 0
; Uses: rax, rcx, rdx, r8, r9, mm0-mm7, flags. No stack memory is touched.
;       No emms is executed here.
;
; quant and dcscalar are 32-bit arguments, so only the low halves of
; rdx/rcx are read; bits 63:32 are not assumed to be zero.
;
;-----------------------------------------------------------------------------
ALIGN 16
dequant_h263_intra_x86_64:
        mov     eax, edx                ; eax = quant (upper half of rax cleared)
        mov     r8d, ecx                ; r8d = dcscalar, used for the DC term
        mov     rcx, rsi                ; rcx = coeff
        mov     rdx, rdi                ; rdx = data
        lea     r9, [mmx_add wrt rip]
        movq    mm6, [r9 + rax*8 - 8]   ; mm6 = 4 x (quant or quant-1)
        lea     r9, [mmx_mul wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x (2*quant)
        mov     rax, -16                ; index runs -16..-2, ends at zero

ALIGN 16
.loop:
        movq    mm0, [rcx+8*rax+8*16]      ; c  = 4 coefficients
        movq    mm3, [rcx+8*rax+8*16 + 8]  ; c' = next 4 coefficients
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = 0xFFFF where c < 0
        pcmpgtw mm4, mm3
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                ; mm2 = 0xFFFF where c == 0
        pcmpeqw mm5, mm3
        pandn   mm2, mm6                ; offset = (c == 0) ? 0 : quant_add
        pandn   mm5, mm6
        pxor    mm0, mm1                ; (c ^ mask) - mask = |c|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                ; |c| * 2Q
        pmullw  mm3, mm7
        paddw   mm0, mm2                ; + offset
        paddw   mm3, mm5
        paddw   mm0, mm1                ; minus 1 where c < 0, so the final
        paddw   mm3, mm4                ; xor below finishes the negation
        ; clamp to +2047; after the xor a clamped negative becomes -2048
        movq    mm2, [mmx_2047 wrt rip]
        pminsw  mm0, mm2
        add     rax, 2                  ; sets ZF for the jnz below; the MMX
                                        ; instructions in between keep flags
        pminsw  mm3, mm2
        pxor    mm0, mm1                ; restore the sign
        pxor    mm3, mm4
        movq    [rdx + 8*rax + 8*16 - 2*8], mm0     ; -2*8 undoes the early add
        movq    [rdx + 8*rax + 8*16+8 - 2*8], mm3
        jnz     near .loop

        ; DC term: data[0] = clamp(coeff[0] * dcscalar, -2048, 2047).
        ; This overwrites the value the loop stored at data[0].
        movd    mm0, [rcx]              ; low word = coeff[0]
        movd    mm2, r8d                ; low word = dcscalar
        pmullw  mm0, mm2                ; only the low word of the product is used
        movq    mm2, [mmx_32767_minus_2047 wrt rip]
        paddsw  mm0, mm2                ; saturating add, then subtract:
        psubsw  mm0, mm2                ; upper bound becomes 2047
        movq    mm2, [mmx_32768_minus_2048 wrt rip]
        psubsw  mm0, mm2                ; saturating subtract, then add:
        paddsw  mm0, mm2                ; lower bound becomes -2048
        movd    eax, mm0
        mov     [rdx], ax               ; data[0] = clamped DC value
        xor     eax, eax                ; return 0
        ret
.endfunc:
;-----------------------------------------------------------------------------
;
; uint32_t dequant_h263_inter_x86_64(int16_t * data,
;                                    const int16_t * const coeff,
;                                    const uint32_t quant,
;                                    const uint16_t *mpeg_matrices);
;
; Port of the 32bit xmm cousin.
; this is the same as dequant_inter_mmx,
; except that we're saturating using 'pminsw' (saves 2 cycles/loop)
;
; In  : rdi = data (output, 64 words)     rsi = coeff (input, 64 words)
;       edx = quant (indexes 31-row tables, so 1..31)
;       The fourth argument (mpeg_matrices) is never read.
; Out : eax = 0
; Uses: rax, rcx, rdx, r9, mm0-mm7, flags.
;       No emms is executed here.
;
; quant is a 32-bit argument, so only edx is read; bits 63:32 of rdx are
; not assumed to be zero.
;
;-----------------------------------------------------------------------------
ALIGN 16
dequant_h263_inter_x86_64:
        mov     eax, edx                ; eax = quant (upper half of rax cleared)
        mov     rcx, rsi                ; rcx = coeff
        mov     rdx, rdi                ; rdx = data
        lea     r9, [mmx_add wrt rip]
        movq    mm6, [r9 + rax*8 - 8]   ; mm6 = 4 x (quant or quant-1)
        lea     r9, [mmx_mul wrt rip]
        movq    mm7, [r9 + rax*8 - 8]   ; mm7 = 4 x (2*quant)
        mov     rax, -16                ; index runs -16..-2, ends at zero

ALIGN 16
.loop:
        movq    mm0, [rcx+8*rax+8*16]      ; c  = 4 coefficients
        movq    mm3, [rcx+8*rax+8*16 + 8]  ; c' = next 4 coefficients
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                ; mm1 = 0xFFFF where c < 0
        pcmpgtw mm4, mm3
        pxor    mm2, mm2
        pxor    mm5, mm5
        pcmpeqw mm2, mm0                ; mm2 = 0xFFFF where c == 0
        pcmpeqw mm5, mm3
        pandn   mm2, mm6                ; offset = (c == 0) ? 0 : quant_add
        pandn   mm5, mm6
        pxor    mm0, mm1                ; (c ^ mask) - mask = |c|
        pxor    mm3, mm4
        psubw   mm0, mm1
        psubw   mm3, mm4
        pmullw  mm0, mm7                ; |c| * 2Q
        pmullw  mm3, mm7
        paddw   mm0, mm2                ; + offset
        paddw   mm3, mm5
        paddw   mm0, mm1                ; start restoring sign: minus 1 where c < 0
        paddw   mm3, mm4
        ; clamp to +2047; after the xor a clamped negative becomes -2048
        movq    mm2, [mmx_2047 wrt rip]
        pminsw  mm0, mm2
        add     rax, 2                  ; sets ZF for the jnz below; the MMX
                                        ; instructions in between keep flags
        pminsw  mm3, mm2
        pxor    mm0, mm1                ; finish restoring sign
        pxor    mm3, mm4
        movq    [rdx + 8*rax + 8*16 - 2*8], mm0     ; -2*8 undoes the early add
        movq    [rdx + 8*rax + 8*16+8 - 2*8], mm3
        jnz     near .loop

        xor     eax, eax                ; return 0
        ret
.endfunc:
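; (code-viewer page footer, not part of the original source)
; Keyboard shortcuts:
;   copy code             Ctrl + C
;   search code           Ctrl + F
;   full-screen mode      F11
;   increase font size    Ctrl + =
;   decrease font size    Ctrl + -
;   show shortcuts        ?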