⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 quantize_h263_3dne.asm

📁 这是一个压缩解压包,用C语言进行编程的,里面有详细的源代码.
💻 ASM
📖 第 1 页 / 共 2 页
字号:
;/**************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - 3dne Quantization/Dequantization -
; *
; *  Copyright(C) 2002-2003 Jaan Kalda
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_h263_3dne.asm,v 1.5 2004/08/29 10:02:38 edgomez Exp $
; *
; *************************************************************************/
;
; these 3dne functions are compatible with iSSE, but are optimized specifically for
; K7 pipelines

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

BITS 32

;-----------------------------------------------------------------------------
; cglobal <name>
;
; Exports <name> as a global symbol.
;   PREFIX     defined: the exported symbol gets a leading underscore and
;                       <name> is %define'd to the decorated form.
;   MARK_FUNCS defined: the symbol is declared as ":function" with a size of
;                       <name>.endfunc - <name>, so the function body must
;                       provide a local ".endfunc" label.
;-----------------------------------------------------------------------------

%macro cglobal 1
  %ifdef PREFIX
    %ifdef MARK_FUNCS
      global _%1:function %1.endfunc-%1
      %define %1 _%1:function %1.endfunc-%1
    %else
      global _%1
      %define %1 _%1
    %endif
  %else
    %ifdef MARK_FUNCS
      global %1:function %1.endfunc-%1
    %else
      global %1
    %endif
  %endif
%endmacro

;=============================================================================
; Local data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

;-----------------------------------------------------------------------------
; int_div[n] = (1 << 16) / n + 1 for n = 1..255, int_div[0] = 0
; (256 dwords; indexed with the DC scaler by "imul eax, [int_div+4*edi]")
;-----------------------------------------------------------------------------

align 4
int_div:
  dd 0
%assign i 1
%rep 255
  dd  (1 << 16) / (i) + 1
  %assign i i+1
%endrep

ALIGN 16
plus_one:
  times 8 dw 1

;-----------------------------------------------------------------------------
; subtract by Q/2 table
; (31 entries of 4 words each, one 8-byte entry per quantizer Q = 1..31)
;-----------------------------------------------------------------------------

ALIGN 16
mmx_sub:
%assign i 1
%rep 31
  times 4 dw i / 2
  %assign i i+1
%endrep

;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (the q=1 entry is (1 << 16) / 2 + 1 = 32769, which does not fit in a
;  signed 16-bit word)
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
%assign i 1
%rep 31
  times 4 dw  (1 << 16) / (i * 2) + 1
  %assign i i+1
%endrep

;-----------------------------------------------------------------------------
; add by (odd(Q) ? Q : Q - 1) table
;-----------------------------------------------------------------------------

ALIGN 16
mmx_add:
%assign i 1
%rep 31
  %if i % 2 != 0
  times 4 dw i
  %else
  times 4 dw i - 1
  %endif
  %assign i i+1
%endrep

;-----------------------------------------------------------------------------
; multiply by 2Q table
;-----------------------------------------------------------------------------

ALIGN 16
mmx_mul:
%assign i 1
%rep 31
  times 4 dw i * 2
  %assign i i+1
%endrep

;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------

ALIGN 8
mmx_32768_minus_2048:
  times 4 dw (32768-2048)
mmx_32767_minus_2047:
  times 4 dw (32767-2047)

ALIGN 16
mmx_2047:
  times 4 dw 2047

ALIGN 8
mmzero:                         ; 8 zero bytes, loaded with movq to clear an MMX register
  dd 0, 0
int2047:
  dd 2047
int_2048:
  dd -2048

;=============================================================================
; Code
;=============================================================================

SECTION .text

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_3dne(int16_t * coeff,
;                                const int16_t const * data,
;                                const uint32_t quant,
;                                const uint32_t dcscalar,
;                                const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
;This is Athlon-optimized code (ca 70 clk per call)

;-----------------------------------------------------------------------------
; quant_intra1 <n>        (n = 0..3)   -- quant == 1 variant
;
; One step of a 4-step software pipeline; each step handles 16 coefficients
; (32 bytes) as four groups of 4 words: A (+0), B (+8), C (+16), D (+24).
; Per-group stages, as tagged in the trailing comments:
;   1 load x            2 load zero from [ebx]   3 0 - x
;   4 max(x, -x) = |x|  5 sign mask (x >> 15)    6 |x| >> 1
;   7/8 re-apply sign (xor mask, subtract mask)  9 store to coeff
;
; Step n finishes C/D of step n-1 (stages 8, 9), runs A/B of step n through
; stages 3..9, runs C/D of step n through stages 1..7, and for n < 3 starts
; A/B of step n+1 (stages 1, 2).
;
; In:    ecx = data, edx = coeff, ebx = address of an 8-byte zero
;        mm0/mm2 = A/B input words, mm1/mm3 = 0
;        (for n > 0 also mm4..mm7 from the previous step)
; Side effects besides the MMX work:
;        n == 0 : pushes ebp (callers must add 4 to later esp offsets)
;        n == 3 : eax *= int_div[edi]
;
; The "db 0Fh, 6Fh, ..." lines are hand-encoded movq loads using a ModRM+SIB
; form with an 8-bit displacement; the equivalent mnemonic is given in the
; trailing comment.  NOTE(review): presumably chosen to control instruction
; length/alignment -- confirm before replacing them with plain movq.
;-----------------------------------------------------------------------------

%macro quant_intra1  1
  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif

ALIGN 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq   mm4, [ecx + %1 * 32 +16]    ;C1
  pmaxsw mm1, mm0   ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq   mm6, [ecx + %1 * 32 +24]    ;D1
  pmaxsw mm3, mm2   ;B4

  psraw mm0, 15     ;A5
  psraw mm2, 15     ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5 ;C9 (previous step's C group)
  movq [edx + %1 * 32 + 24-32], mm7 ;D9 (previous step's D group)
%endif

  psrlw mm1, 1      ;A6
  psrlw mm3, 1      ;B6
  movq mm5, [ebx]   ;C2
  movq mm7, [ebx]   ;D2
  pxor mm1, mm0     ;A7
  pxor mm3, mm2     ;B7

  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8

%if (%1 == 0)
  push ebp
  movq mm0, [ecx + %1 * 32 +32]                                             ;A1
%elif (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32)  ;movq   mm0, [ecx + %1 * 32 +32]    ;A1
%endif
  pmaxsw mm5, mm4   ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32)   ;movq   mm2, [ecx + %1 * 32 +8+32]  ;B1
%else
  cmp esp, esp      ; no next-step load on the last step
                    ; NOTE(review): looks like a filler instruction -- confirm
%endif
  pmaxsw mm7, mm6   ;D4

  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte edx + %1 * 32], mm1    ;A9
  movq [edx + %1 * 32+8], mm3       ;B9

  psrlw mm5, 1      ;C6
  psrlw mm7, 1      ;D6
%if (%1 < 3)
  movq mm1, [ebx]   ;A2
  movq mm3, [ebx]   ;B2
%endif
%if (%1 == 3)
  imul eax, [int_div+4*edi]
%endif
  pxor mm5, mm4     ;C7
  pxor mm7, mm6     ;D7
%endmacro

;-----------------------------------------------------------------------------
; quant_intra <n>         (n = 0..3)   -- quant > 1 variant
;
; Same pipeline and register contract as quant_intra1, except that stage 6
; is "pmulhw reg, [esi]": |x| is multiplied by the 4-word mmx_div entry that
; esi points to and the high 16 bits of each product are kept.
;
; In:    as quant_intra1, plus esi = address of the mmx_div entry to use
; Side effects besides the MMX work:
;        n == 0 : pushes ebp (callers must add 4 to later esp offsets)
;        n == 3 : eax *= int_div[edi]
;-----------------------------------------------------------------------------

%macro quant_intra  1
    ; Rules for athlon:
        ; 1) schedule latencies
        ; 2) add/mul and load/store in 2:1 proportion
        ; 3) avoid splitting >3byte instructions over 8byte boundaries

  psubw mm1, mm0    ;A3
  psubw mm3, mm2    ;B3
%if (%1)
  psubw mm5, mm4    ;C8
  psubw mm7, mm6    ;D8
%endif

ALIGN 8
  db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16)  ;movq   mm4, [ecx + %1 * 32 +16]    ;C1
  pmaxsw mm1, mm0   ;A4
  db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24)  ;movq   mm6, [ecx + %1 * 32 +24]    ;D1
  pmaxsw mm3, mm2   ;B4

  psraw mm0, 15     ;A5
  psraw mm2, 15     ;B5
%if (%1)
  movq [edx + %1 * 32 + 16-32], mm5 ;C9 (previous step's C group)
  movq [edx + %1 * 32 + 24-32], mm7 ;D9 (previous step's D group)
%endif

  pmulhw mm1, [esi] ;A6
  pmulhw mm3, [esi] ;B6
  movq mm5, [ebx]   ;C2
  movq mm7, [ebx]   ;D2

  nop
  nop
  pxor mm1, mm0     ;A7
  pxor mm3, mm2     ;B7

  psubw mm5, mm4    ;C3
  psubw mm7, mm6    ;D3
  psubw mm1, mm0    ;A8
  psubw mm3, mm2    ;B8

%if (%1 < 3)
  db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq    mm0, [ecx + %1 * 32 +32]    ;A1
%endif
  pmaxsw mm5, mm4   ;C4
%if (%1 < 3)
  db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32]  ;B1
%else
  cmp esp, esp      ; no next-step load on the last step
                    ; NOTE(review): looks like a filler instruction -- confirm
%endif
  pmaxsw mm7,mm6    ;D4

  psraw mm4, 15     ;C5
  psraw mm6, 15     ;D5
  movq [byte edx + %1 * 32], mm1    ;A9
  movq [edx + %1 * 32+8], mm3       ;B9

  pmulhw mm5, [esi] ;C6
  pmulhw mm7, [esi] ;D6
%if (%1 < 3)
  movq mm1, [ebx]   ;A2
  movq mm3, [ebx]   ;B2
%endif
%if (%1 == 0)
  push ebp
%elif (%1 < 3)
  nop
%endif
  nop
%if (%1 == 3)
  imul eax, [int_div+4*edi]
%endif
  pxor mm5, mm4     ;C7
  pxor mm7, mm6     ;D7
%endmacro

;-----------------------------------------------------------------------------
; quant_h263_intra_3dne
;
; Args (32-bit, all on the stack at entry):
;   [esp +  4] coeff          output, 64 int16 written
;   [esp +  8] data           input, 64 int16 read
;   [esp + 12] quant          only AL is tested against 1; also used to index
;                             mmx_div (8 bytes per entry, entry = quant - 1)
;   [esp + 16] dcscalar       used to index int_div (4 bytes per entry)
;   [esp + 20] mpeg_matrices  not read by the code below
; Returns: eax = 0 on the quant != 1 path.
;
; Register roles:
;   ecx = data, edx = coeff, esi = &mmx_div[quant - 1], ebx = &mmzero
;   eax = quant, then the DC term;  edi = sign(DC), then dcscalar
;   ebp = dcscalar / 2 with the sign of DC folded in
; esi, ebx and edi are pushed here and ebp is pushed inside step 0 of the
; macro; all four are reloaded from the stack with mov and the 16 bytes are
; dropped with a single "add esp, 16".
;
; AC terms: coeff[i] = sign(data[i]) * ((|data[i]| * mmx_div[quant-1]) >> 16)
;           (quant == 1: sign(data[i]) * (|data[i]| >> 1))
; DC term:  coeff[0] = ((data[0] +/- dcscalar/2) * int_div[dcscalar]) >> 16,
;           plus 1 when that result is negative.  The macros also store an
;           AC-style value into coeff[0]; it is overwritten afterwards.
;
; NOTE(review): no emms is issued on the return path below -- presumably the
; caller is responsible for it; confirm.
;-----------------------------------------------------------------------------

ALIGN 16
cglobal quant_h263_intra_3dne
quant_h263_intra_3dne:

  mov eax, [esp + 12]       ; quant
  mov ecx, [esp + 8]        ; data
  mov edx, [esp + 4]        ; coeff
  cmp al, 1                 ; ZF is consumed by the jz below; nothing in between writes flags
  pxor mm1, mm1
  pxor mm3, mm3
  movq mm0, [ecx]           ; mm0 = [1st]
  movq mm2, [ecx + 8]
  push esi
  lea esi, [mmx_div + eax*8 - 8]

  push ebx
  mov ebx, mmzero
  push edi
  jz near .q1loop

  quant_intra 0
  mov ebp, [esp + 16 + 16]      ; dcscalar
                                ; NB -- there are 3 pushes in the function preamble and one more
                                ; in "quant_intra 0", thus an added offset of 16 bytes
  movsx eax, word [byte ecx]    ; DC

  quant_intra 1
  mov edi, eax
  sar edi, 31                       ; sign(DC): 0 or -1
  shr ebp, byte 1                   ; ebp = dcscalar/2

  quant_intra 2
  sub eax, edi                      ; DC (+1 if DC < 0)
  xor ebp, edi                      ; sign(DC) dcscalar/2  (-1 if DC < 0; cancels the +1 above)
  mov edi, [esp + 16 + 16]          ; dcscalar
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar/2
  mov ebp, [byte esp]               ; reload the ebp saved by "quant_intra 0"

  quant_intra 3                     ; also does eax *= int_div[dcscalar]
  psubw mm5, mm4                    ;C8
  mov esi, [esp + 12]               ; reload saved esi
  mov edi, [esp + 4]                ; reload saved edi
  sar eax, 16                       ; sets SF for the cmovs below (lea leaves flags alone)
  lea ebx, [byte eax + 1]           ; workaround for eax < 0
  cmovs eax, ebx                    ; conditionally move the corrected value
  mov [edx], ax                     ; coeff[0] = ax
  mov ebx, [esp + 8]                ; reload saved ebx
  add esp, byte 16                  ; drop the four saved dwords (ebp was already reloaded above)
  psubw mm7, mm6                    ;D8
  movq [edx + 3 * 32 + 16], mm5     ;C9
  movq [edx + 3 * 32 + 24], mm7     ;D9

  xor eax, eax
  ret

ALIGN 16

.q1loop:
  quant_intra1 0
  mov ebp, [esp + 16 + 16]          ; dcscalar (3 pushes above + 1 in "quant_intra1 0")
  movsx eax, word [byte ecx]        ; DC

  quant_intra1 1
  mov edi, eax
  sar edi, 31                       ; sign(DC): 0 or -1
  shr ebp, byte 1                   ; ebp = dcscalar /2

  quant_intra1 2
  sub eax, edi                      ; DC (+1 if DC < 0)
  xor ebp, edi                      ; sign(DC) dcscalar /2  (-1 if DC < 0; cancels the +1 above)
  mov edi, [esp + 16 + 16]          ; dcscalar
  lea eax, [byte eax + ebp]         ; DC + sign(DC) dcscalar /2

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -