📄 quantize_h263_3dne.asm
字号:
;/**************************************************************************; *; * XVID MPEG-4 VIDEO CODEC; * - 3dne Quantization/Dequantization -; *; * Copyright(C) 2002-2003 Jaan Kalda; *; * This program is free software ; you can redistribute it and/or modify; * it under the terms of the GNU General Public License as published by; * the Free Software Foundation ; either version 2 of the License, or; * (at your option) any later version.; *; * This program is distributed in the hope that it will be useful,; * but WITHOUT ANY WARRANTY ; without even the implied warranty of; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the; * GNU General Public License for more details.; *; * You should have received a copy of the GNU General Public License; * along with this program ; if not, write to the Free Software; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA; *; * $Id: quantize_h263_3dne.asm,v 1.5 2004/08/29 10:02:38 edgomez Exp $; *; *************************************************************************/;; these 3dne functions are compatible with iSSE, but are optimized specifically for; K7 pipelines; enable dequant saturate [-2048,2047], test purposes only.%define SATURATEBITS 32%macro cglobal 1 %ifdef PREFIX %ifdef MARK_FUNCS global _%1:function %1.endfunc-%1 %define %1 _%1:function %1.endfunc-%1 %else global _%1 %define %1 _%1 %endif %else %ifdef MARK_FUNCS global %1:function %1.endfunc-%1 %else global %1 %endif %endif%endmacro;=============================================================================; Local data;=============================================================================%ifdef FORMAT_COFFSECTION .rodata%elseSECTION .rodata align=16%endifalign 4int_div: dd 0%assign i 1%rep 255 dd (1 << 16) / (i) + 1 %assign i i+1%endrepALIGN 16plus_one: times 8 dw 1;-----------------------------------------------------------------------------; subtract by Q/2 table;-----------------------------------------------------------------------------ALIGN 16mmx_sub:%assign i 1%rep 31 times 4 dw i / 2 %assign i i+1%endrep;-----------------------------------------------------------------------------;; divide by 2Q table;; use a shift of 16 to take full advantage of _pmulhw_; for q=1, _pmulhw_ will overflow so it is treated seperately; (3dnow2 provides _pmulhuw_ which wont cause overflow);;-----------------------------------------------------------------------------ALIGN 16mmx_div:%assign i 1%rep 31 times 4 dw (1 << 16) / (i * 2) + 1 %assign i i+1%endrep;-----------------------------------------------------------------------------; add by (odd(Q) ? Q : Q - 1) table;-----------------------------------------------------------------------------ALIGN 16mmx_add:%assign i 1%rep 31 %if i % 2 != 0 times 4 dw i %else times 4 dw i - 1 %endif %assign i i+1%endrep;-----------------------------------------------------------------------------; multiple by 2Q table;-----------------------------------------------------------------------------ALIGN 16mmx_mul:%assign i 1%rep 31 times 4 dw i * 2 %assign i i+1%endrep;-----------------------------------------------------------------------------; saturation limits;-----------------------------------------------------------------------------ALIGN 8mmx_32768_minus_2048: times 4 dw (32768-2048)mmx_32767_minus_2047: times 4 dw (32767-2047)ALIGN 16mmx_2047: times 4 dw 2047ALIGN 8mmzero: dd 0, 0int2047: dd 2047int_2048: dd -2048;=============================================================================; Code;=============================================================================SECTION .text;-----------------------------------------------------------------------------;; uint32_t quant_h263_intra_3dne(int16_t * coeff,; const int16_t const * data,; const uint32_t quant,; const uint32_t dcscalar,; const uint16_t *mpeg_matrices);;;-----------------------------------------------------------------------------;This is Athlon-optimized code (ca 70 clk per call)%macro quant_intra1 1 psubw mm1, mm0 ;A3 psubw mm3, mm2 ;B3%if (%1) psubw mm5, mm4 ;C8 psubw mm7, mm6 ;D8%endifALIGN 8 db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 pmaxsw mm1, mm0 ;A4 db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 pmaxsw mm3, mm2 ;B4 psraw mm0, 15 ;A5 psraw mm2, 15 ;B5%if (%1) movq [edx + %1 * 32 + 16-32], mm5 ;C9 movq [edx + %1 * 32 + 24-32], mm7 ;D9%endif psrlw mm1, 1 ;A6 psrlw mm3, 1 ;B6 movq mm5, [ebx] ;C2 movq mm7, [ebx] ;D2 pxor mm1, mm0 ;A7 pxor mm3, mm2 ;B7 psubw mm5, mm4 ;C3 psubw mm7, mm6 ;D3 psubw mm1, mm0 ;A8 psubw mm3, mm2 ;B8%if (%1 == 0) push ebp movq mm0, [ecx + %1 * 32 +32]%elif (%1 < 3) db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1%endif pmaxsw mm5, mm4 ;C4%if (%1 < 3) db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1%else cmp esp, esp%endif pmaxsw mm7, mm6 ;D4 psraw mm4, 15 ;C5 psraw mm6, 15 ;D5 movq [byte edx + %1 * 32], mm1 ;A9 movq [edx + %1 * 32+8], mm3 ;B9 psrlw mm5, 1 ;C6 psrlw mm7, 1 ;D6%if (%1 < 3) movq mm1, [ebx] ;A2 movq mm3, [ebx] ;B2%endif%if (%1 == 3) imul eax, [int_div+4*edi]%endif pxor mm5, mm4 ;C7 pxor mm7, mm6 ;D7%endm%macro quant_intra 1 ; Rules for athlon: ; 1) schedule latencies ; 2) add/mul and load/store in 2:1 proportion ; 3) avoid spliting >3byte instructions over 8byte boundaries psubw mm1, mm0 ;A3 psubw mm3, mm2 ;B3%if (%1) psubw mm5, mm4 ;C8 psubw mm7, mm6 ;D8%endifALIGN 8 db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16+32] ;C1 pmaxsw mm1, mm0 ;A4 db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24+32] ;D1 pmaxsw mm3, mm2 ;B4 psraw mm0, 15 ;A5 psraw mm2, 15 ;B5%if (%1) movq [edx + %1 * 32 + 16-32], mm5 ;C9 movq [edx + %1 * 32 + 24-32], mm7 ;D9%endif pmulhw mm1, [esi] ;A6 pmulhw mm3, [esi] ;B6 movq mm5, [ebx] ;C2 movq mm7, [ebx] ;D2 nop nop pxor mm1, mm0 ;A7 pxor mm3, mm2 ;B7 psubw mm5, mm4 ;C3 psubw mm7, mm6 ;D3 psubw mm1, mm0 ;A8 psubw mm3, mm2 ;B8%if (%1 < 3) db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1%endif pmaxsw mm5, mm4 ;C4%if (%1 < 3) db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1%else cmp esp, esp%endif pmaxsw mm7,mm6 ;D4 psraw mm4, 15 ;C5 psraw mm6, 15 ;D5 movq [byte edx + %1 * 32], mm1 ;A9 movq [edx + %1 * 32+8], mm3 ;B9 pmulhw mm5, [esi] ;C6 pmulhw mm7, [esi] ;D6%if (%1 < 3) movq mm1, [ebx] ;A2 movq mm3, [ebx] ;B2%endif%if (%1 == 0) push ebp%elif (%1 < 3) nop%endif nop%if (%1 == 3) imul eax, [int_div+4*edi]%endif pxor mm5, mm4 ;C7 pxor mm7, mm6 ;D7%endmacroALIGN 16cglobal quant_h263_intra_3dnequant_h263_intra_3dne: mov eax, [esp + 12] ; quant mov ecx, [esp + 8] ; data mov edx, [esp + 4] ; coeff cmp al, 1 pxor mm1, mm1 pxor mm3, mm3 movq mm0, [ecx] ; mm0 = [1st] movq mm2, [ecx + 8] push esi lea esi, [mmx_div + eax*8 - 8] push ebx mov ebx, mmzero push edi jz near .q1loop quant_intra 0 mov ebp, [esp + 16 + 16] ; dcscalar ; NB -- there are 3 pushes in the function preambule and one more ; in "quant_intra 0", thus an added offset of 16 bytes movsx eax, word [byte ecx] ; DC quant_intra 1 mov edi, eax sar edi, 31 ; sign(DC) shr ebp, byte 1 ; ebp = dcscalar/2 quant_intra 2 sub eax, edi ; DC (+1) xor ebp, edi ; sign(DC) dcscalar /2 (-1) mov edi, [esp + 16 + 16] ; dscalar lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2 mov ebp, [byte esp] quant_intra 3 psubw mm5, mm4 ;C8 mov esi, [esp + 12] ; pop back the register value mov edi, [esp + 4] ; pop back the register value sar eax, 16 lea ebx, [byte eax + 1] ; workaround for eax < 0 cmovs eax, ebx ; conditionnaly move the corrected value mov [edx], ax ; coeff[0] = ax mov ebx, [esp + 8] ; pop back the register value add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16 psubw mm7, mm6 ;D8 movq [edx + 3 * 32 + 16], mm5 ;C9 movq [edx + 3 * 32 + 24], mm7 ;D9 xor eax, eax retALIGN 16.q1loop quant_intra1 0 mov ebp, [esp + 16 + 16] ; dcscalar movsx eax, word [byte ecx] ; DC quant_intra1 1 mov edi, eax sar edi, 31 ; sign(DC) shr ebp, byte 1 ; ebp = dcscalar /2 quant_intra1 2 sub eax, edi ; DC (+1) xor ebp, edi ; sign(DC) dcscalar /2 (-1) mov edi, [esp + 16 + 16] ; dcscalar lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -