;/**************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * - 3dne Quantization/Dequantization -
; *
; * Copyright(C) 2002-2003 Jaan Kalda
; *
; * This program is free software ; you can redistribute it and/or modify
; * it under the terms of the GNU General Public License as published by
; * the Free Software Foundation ; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY ; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program ; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * $Id: quantize_h263_3dne.asm,v 1.1 2005/07/21 09:09:03 klschoef Exp $
; *
; *************************************************************************/
;
; these 3dne functions are compatible with iSSE, but are optimized specifically for
; K7 pipelines
; enable dequant saturation to [-2048,2047], test purposes only.
%define SATURATE
BITS 32
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
;=============================================================================
; Local data
;=============================================================================
%ifdef FORMAT_COFF
SECTION .rodata data
%else
SECTION .rodata data align=16
%endif
align 4
int_div:
dd 0
%assign i 1
%rep 255
dd (1 << 16) / (i) + 1
%assign i i+1
%endrep
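; A rough C-style sketch of how this table appears to be used (based on the
; DC path below: "imul eax, [int_div+4*edi]" with edi = dcscalar, followed by
; "sar eax, 16"); the division is replaced by a reciprocal multiply:
;
;   /* illustrative only: q_dc ~= dc / dcscalar */
;   int32_t q_dc = (dc * ((1 << 16) / dcscalar + 1)) >> 16;
;
; e.g. dcscalar = 8 gives a table entry of 8193, and 100*8193 >> 16 = 12 = 100/8.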
ALIGN 16
plus_one:
times 8 dw 1
;-----------------------------------------------------------------------------
; subtract by Q/2 table
;-----------------------------------------------------------------------------
ALIGN 16
mmx_sub:
%assign i 1
%rep 31
times 4 dw i / 2
%assign i i+1
%endrep
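; e.g. for q = 7 each of the four words above is 3 (7/2, truncated).
; This intra routine does not reference mmx_sub; presumably the inter
; quantizer in the full file uses it as a dead-zone offset, roughly:
;
;   /* illustrative only; the quotient then comes from the mmx_div table */
;   int16_t a = (int16_t)(abs(coeff) - q / 2);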
;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;-----------------------------------------------------------------------------
ALIGN 16
mmx_div:
%assign i 1
%rep 31
times 4 dw (1 << 16) / (i * 2) + 1
%assign i i+1
%endrep
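; Worked example of the pmulhw trick described above: pmulhw keeps the high
; 16 bits of a signed 16x16 multiply, i.e. (a*b) >> 16, so with the table
; entry (1 << 16)/(2*q) + 1:
;
;   /* illustrative only: level ~= |coeff| / (2*q) */
;   int16_t level = ((int32_t)abs(coeff) * ((1 << 16) / (2 * q) + 1)) >> 16;
;
; e.g. q = 2: entry = 16385 and 1000*16385 >> 16 = 250 = 1000/(2*2).
; For q = 1 the entry would be 32769, which does not fit in a signed word,
; hence the separate .q1loop below that shifts right by 1 instead.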
;-----------------------------------------------------------------------------
; add by (odd(Q) ? Q : Q - 1) table
;-----------------------------------------------------------------------------
ALIGN 16
mmx_add:
%assign i 1
%rep 31
%if i % 2 != 0
times 4 dw i
%else
times 4 dw i - 1
%endif
%assign i i+1
%endrep
;-----------------------------------------------------------------------------
; multiply by 2Q table
;-----------------------------------------------------------------------------
ALIGN 16
mmx_mul:
%assign i 1
%rep 31
times 4 dw i * 2
%assign i i+1
%endrep
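; Taken together, mmx_mul and mmx_add match the usual H.263 inverse
; quantization (presumably used by the dequant routines of the full file,
; which are not part of this excerpt):
;
;   /* illustrative only, for level != 0; sgn() = sign of level */
;   int16_t coeff = sgn(level) * (2 * q * abs(level) + ((q & 1) ? q : q - 1));
;
; i.e. mmx_mul holds the 2*q factor and mmx_add the odd/even rounding term.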
;-----------------------------------------------------------------------------
; saturation limits
;-----------------------------------------------------------------------------
ALIGN 8
mmx_32768_minus_2048:
times 4 dw (32768-2048)
mmx_32767_minus_2047:
times 4 dw (32767-2047)
ALIGN 16
mmx_2047:
times 4 dw 2047
ALIGN 8
mmzero:
dd 0, 0
int2047:
dd 2047
int_2048:
dd -2048
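; The two biased word constants above support the usual MMX saturating clamp
; to [-2048, 2047] (the dequant code that would use them is not part of this
; excerpt). Upper bound, with 32767-2047 = 30720:
;
;   x <= 2047:  paddsw gives x + 30720 <= 32767 (no saturation),
;               and psubsw of 30720 restores x
;   x >  2047:  paddsw saturates to 32767, psubsw gives 32767 - 30720 = 2047
;
; The lower bound at -2048 works the same way with 32768-2048; mmx_2047,
; int2047 and int_2048 are presumably plain clamp limits for the SATURATE path.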
;=============================================================================
; Code
;=============================================================================
SECTION .text
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_3dne(int16_t * coeff,
; const int16_t * const data,
; const uint32_t quant,
; const uint32_t dcscalar,
; const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
; This is Athlon-optimized code (ca. 70 clk per call)
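;
; A rough C-style sketch of what the routine below computes (derived from
; reading the code; illustrative only, sgn() = sign of its argument):
;
;   coeff[0] = (data[0] + sgn(data[0]) * (dcscalar/2)) / dcscalar;   /* DC */
;   for (i = 1; i < 64; i++)                                         /* AC */
;       coeff[i] = sgn(data[i]) * (abs(data[i]) / (2*quant));
;
; The AC division uses the mmx_div reciprocal table (a plain 1-bit shift in
; the quant==1 path), the DC division uses the int_div table, and in the
; path shown eax is cleared so the function returns 0.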
%macro quant_intra1 1
psubw mm1, mm0 ;A3
psubw mm3, mm2 ;B3
%if (%1)
psubw mm5, mm4 ;C8
psubw mm7, mm6 ;D8
%endif
ALIGN 8
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1
pmaxsw mm1, mm0 ;A4
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1
pmaxsw mm3, mm2 ;B4
psraw mm0, 15 ;A5
psraw mm2, 15 ;B5
%if (%1)
movq [edx + %1 * 32 + 16-32], mm5 ;C9
movq [edx + %1 * 32 + 24-32], mm7 ;D9
%endif
psrlw mm1, 1 ;A6
psrlw mm3, 1 ;B6
movq mm5, [ebx] ;C2
movq mm7, [ebx] ;D2
pxor mm1, mm0 ;A7
pxor mm3, mm2 ;B7
psubw mm5, mm4 ;C3
psubw mm7, mm6 ;D3
psubw mm1, mm0 ;A8
psubw mm3, mm2 ;B8
%if (%1 == 0)
push ebp
movq mm0, [ecx + %1 * 32 +32]
%elif (%1 < 3)
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
pmaxsw mm5, mm4 ;C4
%if (%1 < 3)
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
cmp esp, esp
%endif
pmaxsw mm7, mm6 ;D4
psraw mm4, 15 ;C5
psraw mm6, 15 ;D5
movq [byte edx + %1 * 32], mm1 ;A9
movq [edx + %1 * 32+8], mm3 ;B9
psrlw mm5, 1 ;C6
psrlw mm7, 1 ;D6
%if (%1 < 3)
movq mm1, [ebx] ;A2
movq mm3, [ebx] ;B2
%endif
%if (%1 == 3)
imul eax, [int_div+4*edi]
%endif
pxor mm5, mm4 ;C7
pxor mm7, mm6 ;D7
%endm
%macro quant_intra 1
; Rules for Athlon:
; 1) schedule latencies
; 2) add/mul and load/store in 2:1 proportion
; 3) avoid splitting >3-byte instructions over 8-byte boundaries
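; Note on rule 3: the "db 0Fh, 6Fh, ..." lines below emit movq loads as raw
; bytes; presumably hand-encoding them (together with the ALIGN 8) keeps the
; instruction placement under control so no long instruction straddles an
; 8-byte fetch boundary. For reference (d8 = the 8-bit displacement byte):
;   db 0Fh, 6Fh, 64h, 21h, d8   ; encodes movq mm4, [ecx + d8]
;   db 0Fh, 6Fh, 74h, 21h, d8   ; encodes movq mm6, [ecx + d8]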
psubw mm1, mm0 ;A3
psubw mm3, mm2 ;B3
%if (%1)
psubw mm5, mm4 ;C8
psubw mm7, mm6 ;D8
%endif
ALIGN 8
db 0Fh, 6Fh, 64h, 21h, (%1 * 32 +16) ;movq mm4, [ecx + %1 * 32 +16] ;C1
pmaxsw mm1, mm0 ;A4
db 0Fh, 6Fh, 74h, 21h, (%1 * 32 +24) ;movq mm6, [ecx + %1 * 32 +24] ;D1
pmaxsw mm3, mm2 ;B4
psraw mm0, 15 ;A5
psraw mm2, 15 ;B5
%if (%1)
movq [edx + %1 * 32 + 16-32], mm5 ;C9
movq [edx + %1 * 32 + 24-32], mm7 ;D9
%endif
pmulhw mm1, [esi] ;A6
pmulhw mm3, [esi] ;B6
movq mm5, [ebx] ;C2
movq mm7, [ebx] ;D2
nop
nop
pxor mm1, mm0 ;A7
pxor mm3, mm2 ;B7
psubw mm5, mm4 ;C3
psubw mm7, mm6 ;D3
psubw mm1, mm0 ;A8
psubw mm3, mm2 ;B8
%if (%1 < 3)
db 0Fh, 6Fh, 44h, 21h, (%1 * 32 +32) ;movq mm0, [ecx + %1 * 32 +32] ;A1
%endif
pmaxsw mm5, mm4 ;C4
%if (%1 < 3)
db 0Fh, 6Fh, 54h, 21h, ( %1 * 32 +8+32) ;movq mm2, [ecx + %1 * 32 +8+32] ;B1
%else
cmp esp, esp
%endif
pmaxsw mm7,mm6 ;D4
psraw mm4, 15 ;C5
psraw mm6, 15 ;D5
movq [byte edx + %1 * 32], mm1 ;A9
movq [edx + %1 * 32+8], mm3 ;B9
pmulhw mm5, [esi] ;C6
pmulhw mm7, [esi] ;D6
%if (%1 < 3)
movq mm1, [ebx] ;A2
movq mm3, [ebx] ;B2
%endif
%if (%1 == 0)
push ebp
%elif (%1 < 3)
nop
%endif
nop
%if (%1 == 3)
imul eax, [int_div+4*edi]
%endif
pxor mm5, mm4 ;C7
pxor mm7, mm6 ;D7
%endmacro
ALIGN 16
cglobal quant_h263_intra_3dne
quant_h263_intra_3dne:
mov eax, [esp + 12] ; quant
mov ecx, [esp + 8] ; data
mov edx, [esp + 4] ; coeff
cmp al, 1
pxor mm1, mm1
pxor mm3, mm3
movq mm0, [ecx] ; mm0 = [1st]
movq mm2, [ecx + 8]
push esi
lea esi, [mmx_div + eax*8 - 8]
push ebx
mov ebx, mmzero
push edi
jz near .q1loop
quant_intra 0
mov ebp, [esp + 16 + 16] ; dcscalar
; NB -- there are 3 pushes in the function preamble and one more
; in "quant_intra 0", thus an added offset of 16 bytes
movsx eax, word [byte ecx] ; DC
quant_intra 1
mov edi, eax
sar edi, 31 ; sign(DC)
shr ebp, byte 1 ; ebp = dcscalar/2
quant_intra 2
sub eax, edi ; DC (+1)
xor ebp, edi ; sign(DC) dcscalar /2 (-1)
mov edi, [esp + 16 + 16] ; dcscalar
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar/2
mov ebp, [byte esp]
quant_intra 3
psubw mm5, mm4 ;C8
mov esi, [esp + 12] ; pop back the register value
mov edi, [esp + 4] ; pop back the register value
sar eax, 16
lea ebx, [byte eax + 1] ; workaround for eax < 0
cmovs eax, ebx ; conditionally move the corrected value
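; (the reciprocal multiply + sar rounds toward -infinity; adding 1 when the
; result is negative approximates the truncation toward zero of a plain
; integer division)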
mov [edx], ax ; coeff[0] = ax
mov ebx, [esp + 8] ; pop back the register value
add esp, byte 16 ; "quant_intra 0" pushed ebp, but we don't restore that one, just correct the stack offset by 16
psubw mm7, mm6 ;D8
movq [edx + 3 * 32 + 16], mm5 ;C9
movq [edx + 3 * 32 + 24], mm7 ;D9
xor eax, eax
ret
ALIGN 16
.q1loop
quant_intra1 0
mov ebp, [esp + 16 + 16] ; dcscalar
movsx eax, word [byte ecx] ; DC
quant_intra1 1
mov edi, eax
sar edi, 31 ; sign(DC)
shr ebp, byte 1 ; ebp = dcscalar /2
quant_intra1 2
sub eax, edi ; DC (+1)
xor ebp, edi ; sign(DC) dcscalar /2 (-1)
mov edi, [esp + 16 + 16] ; dcscalar
lea eax, [byte eax + ebp] ; DC + sign(DC) dcscalar /2
mov ebp, [byte esp]