📄 quantize_mpeg_mmx.asm
字号:
;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dne Quantization/Dequantization -
; *
; *  Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
; *                2002-2003 Michael Militzer <isibaar@xvid.org>
; *                2002-2003 Pascal Massimino <skal@planet-d.net>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_mpeg_mmx.asm,v 1.5 2004/08/29 10:02:38 edgomez Exp $
; *
; *************************************************************************/

%define SATURATEBITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;   Exports <name> as a global symbol.  With PREFIX defined the exported
;   symbol gets a leading underscore; with MARK_FUNCS defined the symbol is
;   additionally tagged as a function whose size is (<name>.endfunc - <name>),
;   so every exported function must end with a local `.endfunc:` label.
;-----------------------------------------------------------------------------
%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global _%1
      %define %1 _%1
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
    %else
      global %1
    %endif
  %endif
%endmacro

;-----------------------------------------------------------------------------
; cextern <name>
;   Imports <name>, adding a leading underscore when PREFIX is defined.
;-----------------------------------------------------------------------------
%macro cextern 1
  %ifdef PREFIX
    extern _%1
    %define %1 _%1
  %else
    extern %1
  %endif
%endmacro

;=============================================================================
; Local data (Read Only)
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

mmx_one:
  times 4 dw 1

;-----------------------------------------------------------------------------
; divide by 2Q table
;   Entry (quant-1) holds 4 copies of (1<<17)/(2*quant) + 1, i.e. a 17-bit
;   fixed-point reciprocal of 2*quant, used with pmulhw followed by a further
;   right shift by 1.
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
  times 4 dw 65535         ; the div by 2 formula will overflow for the case
                           ; quant=1 but we don't care much because quant=1
                           ; is handled by a different piece of code that
                           ; doesn't use this table.
                           ; NOTE(review): the quant=2 entry evaluates to
                           ; 32769, which does not fit a signed word either;
                           ; presumably that is why quant=2 also has its own
                           ; shift-only loops -- confirm.
%assign quant 2
%rep 30
  times 4 dw  (1<<17) / (quant*2) + 1
  %assign quant quant+1
%endrep

%define VM18P 3
%define VM18Q 4

;-----------------------------------------------------------------------------
; quantd table
;   Entry (quant-1) holds 4 copies of (VM18P*quant + VM18Q/2) / VM18Q.
;-----------------------------------------------------------------------------

quantd:
%assign quant 1
%rep 31
  times 4 dw  ((VM18P*quant) + (VM18Q/2)) / VM18Q
  %assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
; multiply by quant table
;   Entry (quant-1) holds 4 copies of quant.
;-----------------------------------------------------------------------------

mmx_mul_quant:
%assign quant 1
%rep 31
  times 4 dw  quant
  %assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------

ALIGN 16

mmx_32767_minus_2047:
  times 4 dw (32767-2047)
mmx_32768_minus_2048:
  times 4 dw (32768-2048)
mmx_2047:
  times 4 dw 2047
mmx_minus_2048:
  times 4 dw (-2048)
zero:
  times 4 dw 0

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal quant_mpeg_intra_mmx
cglobal quant_mpeg_inter_mmx
cglobal dequant_mpeg_intra_mmx
cglobal dequant_mpeg_inter_mmx

;-----------------------------------------------------------------------------
; QUANT_INTRA_LOAD_SCALE
;   Shared front half of the three quant_mpeg_intra_mmx loops.
;
;   In:    esi = data, ebx = mpeg_matrices, ecx = index in 8-byte units,
;          mm5 = quantd[quant] (4 words)
;   Out:   mm0, mm3 = |data| << 4, plus half of the word read at
;                     [ebx + 8*ecx (+8)], pmulhw'ed by the word read at
;                     [ebx + 256 + 8*ecx (+8)], plus mm5
;          mm1, mm4 = sign masks of the loaded words (0xFFFF where negative)
;   Clobb: mm2
;
;   NOTE(review): the words at [ebx + 256 ...] are presumably 16-bit
;   fixed-point reciprocals of the intra matrix at [ebx ...] -- verify
;   against the code that builds mpeg_matrices.
;-----------------------------------------------------------------------------
%macro QUANT_INTRA_LOAD_SCALE 0
  movq mm0, [esi + 8*ecx]           ; mm0 = [1st]
  movq mm3, [esi + 8*ecx + 8]       ; mm3 = [2nd]
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3
  pxor mm0, mm1                     ; mm0 = |mm0| (xor with mask ...
  pxor mm3, mm4
  psubw mm0, mm1                    ; ... then subtract mask: displace)
  psubw mm3, mm4
  psllw mm0, 4                      ; level << 4
  psllw mm3, 4
  movq mm2, [ebx + 8*ecx]
  psrlw mm2, 1                      ; intra_matrix[i]>>1
  paddw mm0, mm2
  movq mm2, [ebx + 256 + ecx*8]
  pmulhw mm0, mm2                   ; (level<<4 + intra_matrix[i]>>1) / intra_matrix[i]
  movq mm2, [ebx + 8*ecx + 8]
  psrlw mm2, 1
  paddw mm3, mm2
  movq mm2, [ebx + 256 + ecx*8 + 8]
  pmulhw mm3, mm2
  paddw mm0, mm5                    ; + quantd
  paddw mm3, mm5
%endmacro

;-----------------------------------------------------------------------------
; QUANT_INTRA_SIGN_STORE
;   Shared back half of the three quant_mpeg_intra_mmx loops.
;
;   In:    mm0, mm3 = quantized magnitudes, mm1, mm4 = sign masks,
;          edi = coeff, ecx = index in 8-byte units
;   Out:   8 signed words written to [edi + 8*ecx .. edi + 8*ecx + 15]
;-----------------------------------------------------------------------------
%macro QUANT_INTRA_SIGN_STORE 0
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  pxor mm3, mm4
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4
  movq [edi + 8*ecx], mm0
  movq [edi + 8*ecx + 8], mm3
%endmacro

;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_intra_mmx(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint32_t dcscalar,
;                               const uint16_t *mpeg_matrices);
;
; Quantizes 64 intra coefficients from data[] into coeff[], 8 words per
; loop iteration, then overwrites coeff[0] with data[0] divided by dcscalar
; (rounded by adding/subtracting dcscalar/2 before a signed divide).
;
; Args:  read from the stack (32-bit code); the "16 +" in every argument
;        offset accounts for the four registers pushed on entry.
; Regs:  edi = coeff, esi = data, eax = quant, ebx = mpeg_matrices,
;        ecx = loop index (0,2,..,14; scaled by 8 in addresses),
;        mm5 = quantd[quant], mm7 = mmx_div[quant] (general path only)
; Out:   eax = 0 (always)
; Saved: ecx, esi, edi, ebx (push/pop)
; Clobb: edx, mm0-mm5, mm7, flags.  No emms is executed here, so the MMX
;        state is still active on return.
;
; quant indexes the 31-entry tables quantd/mmx_div with the full eax, while
; the special-case dispatch compares only al.
; NOTE(review): assumes 1 <= quant <= 31 -- confirm with callers.
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_mpeg_intra_mmx:

  push ecx
  push esi
  push edi
  push ebx

  mov edi, [esp + 16 + 4]           ; coeff
  mov esi, [esp + 16 + 8]           ; data
  mov eax, [esp + 16 + 12]          ; quant
  mov ebx, [esp + 16 + 20]          ; mpeg_quant_matrices

  movq mm5, [quantd + eax * 8 - 8]  ; quantd -> mm5

  xor ecx, ecx
  cmp al, 1
  jz near .q1loop

  cmp al, 2
  jz near .q2loop

  movq mm7, [mmx_div + eax * 8 - 8] ; multipliers[quant] -> mm7

  ; general path (quant >= 3): divide by 2*quant via reciprocal multiply
ALIGN 16
.loop:
  QUANT_INTRA_LOAD_SCALE
  pmulhw mm0, mm7                   ; mm0 = (mm0 / 2Q) >> 16
  pmulhw mm3, mm7
  psrlw mm0, 1                      ; additional shift by 1 => 16 + 1 = 17
  psrlw mm3, 1
  QUANT_INTRA_SIGN_STORE
  add ecx, 2
  cmp ecx, 16
  jnz near .loop

.done:
  ; calculate data[0] // (int32_t)dcscalar
  mov ecx, [esp + 16 + 16]          ; dcscalar
  mov edx, ecx
  movsx eax, word [esi]             ; data[0]
  shr edx, 1                        ; edx = dcscalar / 2
  test eax, eax                     ; same flags for jg as "cmp eax, 0"
  jg .gtzero

  sub eax, edx                      ; data[0] <= 0: round away from zero
  jmp short .mul

.gtzero:
  add eax, edx

.mul:
  cdq                               ; expand eax -> edx:eax
  idiv ecx                          ; eax = edx:eax / dcscalar
  mov [edi], ax                     ; coeff[0] = ax

  pop ebx
  pop edi
  pop esi
  pop ecx

  xor eax, eax                      ; return(0);
  ret

  ; quant == 1: 2*quant == 2, so the divide is a plain shift by 1
ALIGN 16
.q1loop:
  QUANT_INTRA_LOAD_SCALE
  psrlw mm0, 1                      ; mm0 >>= 1   (/2)
  psrlw mm3, 1
  QUANT_INTRA_SIGN_STORE
  add ecx, 2
  cmp ecx, 16
  jnz near .q1loop
  jmp near .done

  ; quant == 2: 2*quant == 4, so the divide is a plain shift by 2
ALIGN 16
.q2loop:
  QUANT_INTRA_LOAD_SCALE
  psrlw mm0, 2                      ; mm0 >>= 2   (/4)
  psrlw mm3, 2
  QUANT_INTRA_SIGN_STORE
  add ecx, 2
  cmp ecx, 16
  jnz near .q2loop
  jmp near .done
.endfunc:

;-----------------------------------------------------------------------------
;
; uint32_t quant_mpeg_inter_mmx(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_mpeg_inter_mmx:

  push ecx
  push esi
  push edi
  push ebx

  mov edi, [esp + 16 + 4]           ; coeff
  mov esi, [esp + 16 + 8]           ; data
  mov eax, [esp + 16 + 12]          ; quant
  mov ebx, [esp + 16 + 16]          ; mpeg_quant_matrices

  xor ecx, ecx

  pxor mm5, mm5                     ; sum

  cmp al, 1
  jz near .q1loop

  cmp al, 2
  jz near .q2loop

  movq mm7, [mmx_div + eax * 8 - 8] ; divider

ALIGN 16
.loop:
  movq mm0, [esi + 8*ecx]           ; mm0 = [1st]
  movq mm3, [esi + 8*ecx + 8]       ;
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4                     ;
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3                  ;
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4                    ;
  psllw mm0, 4
  psllw mm3, 4
  movq mm2, [ebx + 512 + 8*ecx]
  psrlw mm2, 1
  paddw mm0, mm2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -