quantize_mpeg_xmm.asm
;/****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dne Quantization/Dequantization -
; *
; *  Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
; *                2002      Jaan Kalda
; *                2004      Andre Werthmann <wertmann@aei.mpg.de>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_mpeg_xmm.asm,v 1.1 2005/01/05 23:02:15 edgomez Exp $
; *
; ***************************************************************************/

; _3dne functions are compatible with iSSE, but are optimized specifically
; for K7 pipelines

%define SATURATE

BITS 64

%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global _%1
      %define %1 _%1
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
    %else
      global %1
    %endif
  %endif
%endmacro

;=============================================================================
; Local data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 8
mmzero:
  dd 0,0
mmx_one:
  times 4 dw 1

;-----------------------------------------------------------------------------
; divide by 2Q table
;-----------------------------------------------------------------------------

ALIGN 16
mmx_divs:        ;i>2
%assign i 1
%rep 31
  times 4 dw ((1 << 15) / i + 1)
  %assign i i+1
%endrep

ALIGN 16
mmx_div:         ;quant>2
  times 4 dw 65535 ; the div by 2 formula will overflow for the case
                   ; quant=1 but we don't care much because quant=1
                   ; is handled by a different piece of code that
                   ; doesn't use this table.
%assign quant 2
%rep 31
  times 4 dw ((1 << 16) / quant + 1)
  %assign quant quant+1
%endrep

%macro FIXX 1
dw (1 << 16) / (%1) + 1
%endmacro

%define nop4 db 08Dh, 074h, 026h,0
%define nop3 add esp, byte 0
%define nop2 mov esp, esp
%define nop7 db 08dh, 02ch, 02dh,0,0,0,0
%define nop6 add ebp, dword 0

;-----------------------------------------------------------------------------
; quantd table
;-----------------------------------------------------------------------------

%define VM18P 3
%define VM18Q 4

ALIGN 16
quantd:
%assign i 1
%rep 31
  times 4 dw (((VM18P*i) + (VM18Q/2)) / VM18Q)
  %assign i i+1
%endrep

;-----------------------------------------------------------------------------
; multiply by 2Q table
;-----------------------------------------------------------------------------

ALIGN 16
mmx_mul_quant:
%assign i 1
%rep 31
  times 4 dw i
  %assign i i+1
%endrep

;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------

ALIGN 16
mmx_32767_minus_2047:
  times 4 dw (32767-2047)
mmx_32768_minus_2048:
  times 4 dw (32768-2048)
mmx_2047:
  times 4 dw 2047
mmx_minus_2048:
  times 4 dw (-2048)
zero:
  times 4 dw 0

int_div:
  dd 0
%assign i 1
%rep 255
  dd (1 << 17) / (i) + 1
  %assign i i+1
%endrep
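;-----------------------------------------------------------------------------
; Note on the reciprocal tables (added comment, not part of the original
; source): mmx_divs, mmx_div and int_div replace integer division by a
; multiplication with a fixed-point reciprocal, so the quantisation loops
; never need a real DIV. A minimal worked example, assuming quant = 4 and a
; displaced level of 100 in the 1 < quant < 20 path:
;
;   mmx_divs entry = (1 << 15) / 4 + 1 = 8193
;   pmulhuw        -> (100 * 8193) >> 16 = 12    ; == 100 / (2*4), truncated
;
; The "+ 1" rounds the stored reciprocal up so the truncated high word does
; not come out one short of the exact quotient for in-range coefficients;
; int_div plays the same role for data[0] / dcscalar, paired with a 17-bit
; arithmetic shift.
;-----------------------------------------------------------------------------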
;=============================================================================
; Code
;=============================================================================

SECTION .text align=16

cglobal quant_mpeg_intra_x86_64
cglobal quant_mpeg_inter_x86_64
cglobal dequant_mpeg_intra_x86_64
cglobal dequant_mpeg_inter_x86_64

;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_intra_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint32_t dcscalar,
;                                  const uint16_t *mpeg_matrices);
;
; Ported from its 32bit xmm cousin
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_mpeg_intra_x86_64:
  mov rax, rsi                  ; data
  mov r9, rcx                   ; save dcscalar
  mov rcx, rdx                  ; quant
  mov rdx, rdi                  ; coeff
  push rbx

  mov rdi, r8                   ; mpeg_quant_matrices

  mov rsi, -14
  pxor mm0, mm0
  pxor mm3, mm3
  cmp rcx, byte 1
  je near .q1loop
  cmp rcx, byte 19
  jg near .lloop

ALIGN 16
.loop
  movq mm1, [rax + 8*rsi+112]   ; mm0 = [1st]
  psubw mm0, mm1                ;-mm1
  movq mm4, [rax + 8*rsi + 120] ;
  psubw mm3, mm4                ;-mm4
  pmaxsw mm0, mm1               ;|src|
  pmaxsw mm3, mm4
;  nop2
  psraw mm1, 15                 ;sign src
  psraw mm4, 15
  psllw mm0, 4                  ;level << 4
  psllw mm3, 4
  paddw mm0, [rdi + 128 + 8*rsi+112]
  paddw mm3, [rdi + 128 + 8*rsi+120]
  movq mm5, [rdi + 384 + 8*rsi+112]
  movq mm7, [rdi + 384 + 8*rsi+120]
  pmulhuw mm5, mm0
  pmulhuw mm7, mm3
;  mov esp, esp
  movq mm2, [rdi + 8*rsi+112]
  movq mm6, [rdi + 8*rsi+120]
  pmullw mm2, mm5
  pmullw mm6, mm7
  psubw mm0, mm2
  psubw mm3, mm6
;  nop4
  lea r11, [quantd wrt rip]
  movq mm2, [r11 + rcx * 8 - 8]
  lea r11, [mmx_divs wrt rip]
  movq mm6, [r11 + rcx * 8 - 8]
  paddw mm5, mm2
  paddw mm7, mm2
;  mov esp, esp
  pmulhuw mm0, [rdi + 256 + 8*rsi+112]
  pmulhuw mm3, [rdi + 256 + 8*rsi+120]
  paddw mm5, mm0
  paddw mm7, mm3
  pxor mm0, mm0
  pxor mm3, mm3
  pmulhuw mm5, mm6              ; mm0 = (mm0 / 2Q) >> 16
  pmulhuw mm7, mm6              ; (level + quantd) / quant (0<quant<32)
  pxor mm5, mm1                 ; mm0 *= sign(mm0)
  pxor mm7, mm4                 ;
  psubw mm5, mm1                ; undisplace
  psubw mm7, mm4                ;
  movq [rdx + 8*rsi+112], mm5
  movq [rdx + 8*rsi +120], mm7
  add rsi, byte 2
  jng near .loop

.done
; calculate  data[0] // (int32_t)dcscalar)
;  mov esi, [esp + 12 + 16]     ; dcscalar
  mov rsi, r9                   ; dcscalar
  movsx rcx, word [rax]
  mov rdi, rcx
;  mov edx, [esp + 12 + 16]
  mov r11, rdx                  ; save rdx
  mov rdx, r9
  shr edx, 1                    ; ebx = dcscalar /2
  sar edi, 31                   ; cdq is vectorpath
  xor edx, edi                  ; ebx = eax V -eax -1
  sub ecx, edi
  add ecx, edx
;;  mov rdx, [dword esp + 12 + 4]
  mov rdx, r11                  ; restore rdx
  lea r11, [int_div wrt rip]
  mov rsi, [r11+4*rsi]
  imul ecx, esi
  sar ecx, 17
  lea rbx, [byte rcx + 1]
  cmovs rcx, rbx
;  idiv cx                      ; ecx = edi:ecx / dcscalar
;  mov ebx, [esp]
;  mov edi, [esp+4]
;  mov esi, [esp+8]
;  add esp, byte 12             ; pops...
  pop rbx
;  mov [rdx], rcx               ; coeff[0] = ax
  mov [rdx], cx                 ; coeff[0] = cx
  xor rax, rax
  ret

ALIGN 16
.q1loop
  movq mm1, [rax + 8*rsi+112]   ; mm0 = [1st]
  psubw mm0, mm1                ;-mm1
  movq mm4, [rax + 8*rsi+120]   ;
  psubw mm3, mm4                ;-mm4
  pmaxsw mm0, mm1               ;|src|
  pmaxsw mm3, mm4
;  nop2
  psraw mm1, 15                 ;sign src
  psraw mm4, 15
  psllw mm0, 4                  ; level << 4
  psllw mm3, 4
  paddw mm0, [rdi + 128 + 8*rsi+112]    ;mm0 is to be divided
  paddw mm3, [rdi + 128 + 8*rsi+120]    ;intra1 contains fix for division by 1
  movq mm5, [rdi + 384 + 8*rsi+112]     ;with rounding down
  movq mm7, [rdi + 384 + 8*rsi+120]
  pmulhuw mm5, mm0
  pmulhuw mm7, mm3              ;mm7: first approx of division
;  mov esp, esp
  movq mm2, [rdi + 8*rsi+112]
  movq mm6, [rdi + 8*rsi+120]   ; divs for q<=16
  pmullw mm2, mm5               ;test value <= original
  pmullw mm6, mm7
  psubw mm0, mm2                ;mismatch
  psubw mm3, mm6
;  nop4
  lea r11, [quantd wrt rip]
  movq mm2, [r11 + rcx * 8 - 8]
  paddw mm5, mm2                ;first approx with quantd
  paddw mm7, mm2
;  mov esp, esp
  pmulhuw mm0, [rdi + 256 + 8*rsi+112]  ;correction
  pmulhuw mm3, [rdi + 256 + 8*rsi+120]
  paddw mm5, mm0                ;final result with quantd
  paddw mm7, mm3
  pxor mm0, mm0
  pxor mm3, mm3
;  mov esp, esp
  psrlw mm5, 1                  ; (level + quantd) /2 (quant = 1)
  psrlw mm7, 1
  pxor mm5, mm1                 ; mm0 *= sign(mm0)
  pxor mm7, mm4                 ;
  psubw mm5, mm1                ; undisplace
  psubw mm7, mm4                ;
  movq [rdx + 8*rsi+112], mm5
  movq [rdx + 8*rsi +120], mm7
  add rsi, byte 2
  jng near .q1loop
  jmp near .done

ALIGN 8
.lloop
  movq mm1, [rax + 8*rsi+112]   ; mm0 = [1st]
  psubw mm0, mm1                ;-mm1
  movq mm4, [rax + 8*rsi+120]
  psubw mm3, mm4                ;-mm4
  pmaxsw mm0, mm1               ;|src|
  pmaxsw mm3, mm4
;  nop2
  psraw mm1, 15                 ;sign src
  psraw mm4, 15
  psllw mm0, 4                  ; level << 4
  psllw mm3, 4                  ;
  paddw mm0, [rdi + 128 + 8*rsi+112]    ;mm0 is to be divided, intra1 contains fix for division by 1
  paddw mm3, [rdi + 128 + 8*rsi+120]
  movq mm5, [rdi + 384 + 8*rsi+112]
  movq mm7, [rdi + 384 + 8*rsi+120]
  pmulhuw mm5, mm0
  pmulhuw mm7, mm3              ;mm7: first approx of division
;  mov esp, esp
  movq mm2, [rdi + 8*rsi+112]
  movq mm6, [rdi + 8*rsi+120]
  pmullw mm2, mm5               ;test value <= original
  pmullw mm6, mm7
  psubw mm0, mm2                ;mismatch
  psubw mm3, mm6
;  nop4
  lea r11, [quantd wrt rip]
  movq mm2, [r11 + rcx * 8 - 8]
  lea r11, [mmx_div wrt rip]
  movq mm6, [r11 + rcx * 8 - 8] ; divs for q<=16
  paddw mm5, mm2                ;first approx with quantd
  paddw mm7, mm2
;  mov esp, esp
  pmulhuw mm0, [rdi + 256 + 8*rsi+112]  ;correction
  pmulhuw mm3, [rdi + 256 + 8*rsi+120]
  paddw mm5, mm0                ;final result with quantd
  paddw mm7, mm3
  pxor mm0, mm0
  pxor mm3, mm3
;  mov esp, esp
  pmulhuw mm5, mm6              ; mm0 = (mm0 / 2Q) >> 16
  pmulhuw mm7, mm6              ; (level + quantd) / quant (0<quant<32)
  psrlw mm5, 1                  ; (level + quantd) / (2*quant)
  psrlw mm7, 1
  pxor mm5, mm1                 ; mm0 *= sign(mm0)
  pxor mm7, mm4                 ;
  psubw mm5, mm1                ; undisplace
  psubw mm7, mm4                ;
  movq [rdx + 8*rsi+112], mm5
  movq [rdx + 8*rsi +120], mm7
  add rsi, byte 2
  jng near .lloop
  jmp near .done
.endfunc
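;-----------------------------------------------------------------------------
; Summary of the intra path above (added comment, not part of the original
; source). As far as the tables used suggest, each AC coefficient is formed
; roughly as
;
;   level    = |data[i]| << 4
;   coeff[i] = sign(data[i]) * (level / intra_matrix[i] + quantd[quant])
;                            / (2 * quant)
;
; where intra_matrix is the matrix stored at the start of mpeg_matrices, the
; division by its entries uses the pre-scaled copies at rdi+128/+256/+384,
; and the division by 2*quant uses mmx_divs or mmx_div; coeff[0] is handled
; separately with dcscalar in the .done block.
;-----------------------------------------------------------------------------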
;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_inter_x86_64(int16_t * coeff,
;                                  const int16_t const * data,
;                                  const uint32_t quant,
;                                  const uint16_t *mpeg_matrices);
;
; Ported from its 32bit xmm cousin
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_mpeg_inter_x86_64:
  mov rax, rsi                  ; data
  mov r8, rdi                   ; save coeff
  mov rdi, rcx                  ; mpeg_matrices
  mov rcx, rdx                  ; quant
  mov rdx, r8                   ; coeff
  push rbx