; quantize_mpeg_mmx.asm
;/**************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - 3dne Quantization/Dequantization -
; *
; * Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
; * 2002-2003 Michael Militzer <isibaar@xvid.org>
; * 2002-2003 Pascal Massimino <skal@planet-d.net>
; *
; * This program is free software ; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation ; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program ; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_mpeg_mmx.asm,v 1.1 2005/07/21 09:09:04 klschoef Exp $
; *
; *************************************************************************/
; Build-time switch. It is only defined here; nothing in the visible part of
; this file tests it (presumably consumed further down -- TODO confirm).
%define SATURATE

; All code in this file is assembled for 32-bit mode.
BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;   Export <name> as a global symbol. When PREFIX is defined the exported
;   symbol is _<name>, and <name> is remapped to _<name> for the remainder of
;   the file so that labels and references pick up the underscore too.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifndef PREFIX
    global %1
  %else
    global _%1
    %define %1 _%1
  %endif
%endmacro

;-----------------------------------------------------------------------------
; cextern <name>
;   Import <name> as an external symbol, applying the same PREFIX underscore
;   convention as cglobal.
;-----------------------------------------------------------------------------
%macro cextern 1
  %ifndef PREFIX
    extern %1
  %else
    extern _%1
    %define %1 _%1
  %endif
%endmacro
;=============================================================================
; Read-only constant tables
;
; Every entry is one 8-byte MMX operand made of four identical 16-bit words,
; so the per-quantizer tables are indexed as  table + (quant - 1) * 8.
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif

; Four words holding the constant 1.
mmx_one:
        times 4 dw 1

;-----------------------------------------------------------------------------
; mmx_div: multipliers that implement "divide by 2*quant"
;
; Entry for quant = 2..31 is (1 << 17) / (2 * quant) + 1; the users of this
; table multiply with it, keep the high 16 bits and shift right once more.
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
        times 4 dw 0xFFFF               ; quant = 1 slot: the formula would
                                        ; overflow 16 bits here, which does not
                                        ; matter because quant = 1 is handled
                                        ; by dedicated code that never reads
                                        ; this table.
%assign quant 2
%rep 30
        times 4 dw (1 << 17) / (2 * quant) + 1
  %assign quant quant + 1
%endrep

; Rounding ratio VM18P / VM18Q (= 3/4) used to build the quantd table below.
%define VM18P 3
%define VM18Q 4

;-----------------------------------------------------------------------------
; quantd: rounding term per quantizer, (3 * quant + 2) / 4 for quant = 1..31
;-----------------------------------------------------------------------------

quantd:
%assign quant 1
%rep 31
        times 4 dw (VM18P * quant + VM18Q / 2) / VM18Q
  %assign quant quant + 1
%endrep

;-----------------------------------------------------------------------------
; mmx_mul_quant: the quantizer value itself, for quant = 1..31
;-----------------------------------------------------------------------------

mmx_mul_quant:
%assign quant 1
%rep 31
        times 4 dw quant
  %assign quant quant + 1
%endrep

;-----------------------------------------------------------------------------
; Saturation limits and a zero vector
;-----------------------------------------------------------------------------

ALIGN 16

mmx_32767_minus_2047:
        times 4 dw 32767 - 2047

mmx_32768_minus_2048:
        times 4 dw 32768 - 2048

mmx_2047:
        times 4 dw 2047

mmx_minus_2048:
        times 4 dw -2048

zero:
        times 4 dw 0
;=============================================================================
; Code
;=============================================================================

SECTION .text

; Exported entry points, grouped as intra pair then inter pair.
cglobal quant_mpeg_intra_mmx
cglobal dequant_mpeg_intra_mmx
cglobal quant_mpeg_inter_mmx
cglobal dequant_mpeg_inter_mmx
;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_intra_mmx(int16_t * coeff,
;                               const int16_t * data,
;                               const uint32_t quant,
;                               const uint32_t dcscalar,
;                               const uint16_t *mpeg_matrices);
;
; Quantizes one block of 64 signed 16-bit coefficients from data[] into
; coeff[] using the intra matrix, then overwrites coeff[0] with the DC value
; data[0] / dcscalar (rounded). Always returns 0.
;
; ABI:    32-bit, all arguments on the stack (read at [esp + 16 + 4*n] after
;         the four pushes below).
; In:     coeff, data   - 64 words each
;         quant         - indexes 31-entry tables, so it must be 1..31
;         dcscalar      - divisor for the DC term, must be non-zero (idiv)
;         mpeg_matrices - byte offset 0   : intra_matrix[] words
;                         byte offset 256 : per-coefficient multipliers that
;                                           stand in for the division by
;                                           intra_matrix[i] (fixed-point
;                                           format not visible here -- TODO
;                                           confirm against the table builder)
; Out:    eax = 0
; Clobb:  edx, mm0-mm5, mm7 (mm7 only when quant > 2), flags.
;         ecx, esi, edi, ebx are saved and restored.
; Note:   no emms is executed here; the MMX state is left for the caller.
;
; Register roles inside the loops:
;         esi = data, edi = coeff, ebx = mpeg_matrices
;         ecx = qword index (0, 2, ... 14; two qwords = 8 coefficients per pass)
;         mm5 = quantd[quant] rounding term, mm7 = mmx_div[quant]
;         mm1 / mm4 = sign masks of the two qwords being processed
;
;-----------------------------------------------------------------------------

; QUANT_INTRA_SCALE
;   Loads 8 coefficients (two qwords at index ecx), splits them into magnitude
;   (mm0, mm3) and sign mask (mm1, mm4), and computes per word
;       ((|level| << 4) + (intra_matrix[i] >> 1)) * multiplier[i] >> 16 + quantd
;   leaving the results in mm0 / mm3. Uses mm2 as scratch.
%macro QUANT_INTRA_SCALE 0
        movq    mm0, [esi + 8*ecx]              ; first four coefficients
        movq    mm3, [esi + 8*ecx + 8]          ; next four coefficients
        pxor    mm1, mm1
        pxor    mm4, mm4
        pcmpgtw mm1, mm0                        ; mm1 = (0 > mm0) ? 0xFFFF : 0
        pcmpgtw mm4, mm3
        pxor    mm0, mm1                        ; conditional negate, step 1:
        pxor    mm3, mm4                        ;   flip bits of negatives
        psubw   mm0, mm1                        ; step 2: +1 -> mm0 = |mm0|
        psubw   mm3, mm4
        psllw   mm0, 4                          ; level << 4
        psllw   mm3, 4
        movq    mm2, [ebx + 8*ecx]
        psrlw   mm2, 1                          ; intra_matrix[i] >> 1
        paddw   mm0, mm2
        movq    mm2, [ebx + 256 + ecx*8]
        pmulhw  mm0, mm2                        ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
        movq    mm2, [ebx + 8*ecx + 8]
        psrlw   mm2, 1
        paddw   mm3, mm2
        movq    mm2, [ebx + 256 + ecx*8 + 8]
        pmulhw  mm3, mm2
        paddw   mm0, mm5                        ; + quantd
        paddw   mm3, mm5
%endmacro

; QUANT_INTRA_STORE
;   Re-applies the sign masks in mm1 / mm4 to mm0 / mm3, stores the 8 results
;   at coeff index ecx and advances ecx by two qwords. On exit ZF is set when
;   all 16 qwords (64 coefficients) have been processed.
%macro QUANT_INTRA_STORE 0
        pxor    mm0, mm1                        ; restore sign: flip bits ...
        pxor    mm3, mm4
        psubw   mm0, mm1                        ; ... and add 1 for negatives
        psubw   mm3, mm4
        movq    [edi + 8*ecx], mm0
        movq    [edi + 8*ecx + 8], mm3
        add     ecx, 2
        cmp     ecx, 16
%endmacro

ALIGN 16
quant_mpeg_intra_mmx:

        push    ecx
        push    esi
        push    edi
        push    ebx

        mov     edi, [esp + 16 + 4]             ; coeff
        mov     esi, [esp + 16 + 8]             ; data
        mov     eax, [esp + 16 + 12]            ; quant
        mov     ebx, [esp + 16 + 20]            ; mpeg_quant_matrices

        movq    mm5, [quantd + eax * 8 - 8]     ; quantd[quant] -> mm5

        xor     ecx, ecx                        ; qword index = 0

        ; quant 1 and 2 divide by 2 and 4, which are plain shifts; they also
        ; must not use mmx_div, whose entries for them do not fit a signed
        ; 16-bit multiplier.
        cmp     al, 1
        jz      near .q1loop

        cmp     al, 2
        jz      near .q2loop

        movq    mm7, [mmx_div + eax * 8 - 8]    ; multipliers[quant] -> mm7

ALIGN 16
.loop:
        QUANT_INTRA_SCALE
        pmulhw  mm0, mm7                        ; mm0 = (mm0 / 2Q) >> 16
        pmulhw  mm3, mm7
        psrlw   mm0, 1                          ; additional shift by 1 => 16 + 1 = 17
        psrlw   mm3, 1
        QUANT_INTRA_STORE
        jnz     near .loop

.done:
        ; DC term: coeff[0] = (data[0] +/- dcscalar/2) / dcscalar, where the
        ; half divisor is added for positive data[0] and subtracted otherwise,
        ; followed by a truncating signed division.
        mov     ecx, [esp + 16 + 16]            ; dcscalar
        mov     edx, ecx
        movsx   eax, word [esi]                 ; data[0]
        shr     edx, 1                          ; edx = dcscalar / 2
        test    eax, eax
        jg      .gtzero

        sub     eax, edx
        jmp     short .mul

.gtzero:
        add     eax, edx

.mul:
        cdq                                     ; sign-extend eax -> edx:eax
        idiv    ecx                             ; eax = edx:eax / dcscalar
        mov     [edi], ax                       ; coeff[0] = ax

        pop     ebx
        pop     edi
        pop     esi
        pop     ecx

        xor     eax, eax                        ; return(0);
        ret

ALIGN 16
.q1loop:
        QUANT_INTRA_SCALE
        psrlw   mm0, 1                          ; quant = 1: divide by 2Q = 2
        psrlw   mm3, 1
        QUANT_INTRA_STORE
        jnz     near .q1loop
        jmp     near .done

ALIGN 16
.q2loop:
        QUANT_INTRA_SCALE
        psrlw   mm0, 2                          ; quant = 2: divide by 2Q = 4
        psrlw   mm3, 2
        QUANT_INTRA_STORE
        jnz     near .q2loop
        jmp     near .done
;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_inter_mmx(int16_t * coeff,
; const int16_t const * data,
; const uint32_t quant,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
ALIGN 16
quant_mpeg_inter_mmx:
push ecx
push esi
push edi
push ebx
mov edi, [esp + 16 + 4] ; coeff
mov esi, [esp + 16 + 8] ; data
mov eax, [esp + 16 + 12] ; quant
mov ebx, [esp + 16 + 16] ; mpeg_quant_matrices
xor ecx, ecx
pxor mm5, mm5 ; sum
cmp al, 1
jz near .q1loop
cmp al, 2
jz near .q2loop
movq mm7, [mmx_div + eax * 8 - 8] ; divider
ALIGN 16
.loop
movq mm0, [esi + 8*ecx] ; mm0 = [1st]
movq mm3, [esi + 8*ecx + 8] ;
pxor mm1, mm1 ; mm1 = 0
pxor mm4, mm4 ;
pcmpgtw mm1, mm0 ; mm1 = (0 > mm0)
pcmpgtw mm4, mm3 ;
pxor mm0, mm1 ; mm0 = |mm0|
pxor mm3, mm4 ;
psubw mm0, mm1 ; displace
psubw mm3, mm4 ;
psllw mm0, 4
psllw mm3, 4
movq mm2, [ebx + 512 + 8*ecx]
psrlw mm2, 1
paddw mm0, mm2
movq mm2, [ebx + 768 + ecx*8]
pmulhw mm0, mm2 ; (level<<4 + inter_matrix[i]>>1) / inter_matrix[i]
movq mm2, [ebx + 512 + 8*ecx + 8]
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -