📄 quantize_mpeg_xmm.asm

📁 wince下的xvidcore开发库,可用于MP4等视频播放开发
💻 ASM
📖 第 1 页 / 共 2 页
字号:
12 下一页
 ;/****************************************************************************
 ; *
 ; *  XVID MPEG-4 VIDEO CODEC
 ; *  - 3dne Quantization/Dequantization -
 ; *
 ; *  Copyright (C) 2002-2003 Peter Ross <pross@xvid.org>
 ; *                2002      Jaan Kalda
 ; *
 ; *  This program is free software ; you can redistribute it and/or modify
 ; *  it under the terms of the GNU General Public License as published by
 ; *  the Free Software Foundation ; either version 2 of the License, or
 ; *  (at your option) any later version.
 ; *
 ; *  This program is distributed in the hope that it will be useful,
 ; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
 ; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 ; *  GNU General Public License for more details.
 ; *
 ; *  You should have received a copy of the GNU General Public License
 ; *  along with this program ; if not, write to the Free Software
 ; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 ; *
 ; * $Id: quantize_mpeg_xmm.asm,v 1.1 2005/07/21 09:09:04 klschoef Exp $
 ; *
 ; ***************************************************************************/
 
 ; _3dne functions are compatible with iSSE, but are optimized specifically
 ; for K7 pipelines
 
 ; Build-time switch, defined unconditionally. Nothing in the part of the
 ; file visible here tests it.
 ; NOTE(review): presumably consumed by the dequant routines further down --
 ; confirm before removing.
 %define SATURATE
 
 ; Everything below is assembled as 32-bit code.
 BITS 32
 
 ; cglobal <symbol>
 ;   Exports <symbol>. When PREFIX is defined the exported name carries a
 ;   leading underscore, and the bare name is redefined to the underscored
 ;   one so that later uses of it resolve to the exported symbol.
 %macro cglobal 1
         %ifndef PREFIX
                 global %1
         %else
                 global _%1
                 %define %1 _%1
         %endif
 %endmacro
 
 ; cextern <symbol>
 ;   Imports <symbol>. When PREFIX is defined the imported name carries a
 ;   leading underscore, and the bare name is redefined to the underscored
 ;   one so that later uses of it resolve to the imported symbol.
 %macro cextern 1
         %ifndef PREFIX
                 extern %1
         %else
                 extern _%1
                 %define %1 _%1
         %endif
 %endmacro
 
 ;=============================================================================
 ; Local data
 ;=============================================================================
 
 ; Constant tables live in .rodata. The FORMAT_COFF build declares the
 ; section without an alignment attribute; every other build requests
 ; 16-byte section alignment.
 %ifndef FORMAT_COFF
 SECTION .rodata data align=16
 %else
 SECTION .rodata data
 %endif
 
 ALIGN 8
 mmzero:                         ; eight zero bytes
         times 2 dd 0
 mmx_one:                        ; four words, each equal to 1
         dw 1, 1, 1, 1
 
 ;-----------------------------------------------------------------------------
 ; divide by 2Q table
 ;
 ; 31 quadword entries, one per i = 1..31. Each entry repeats the 16-bit
 ; value (1 << 15) / i + 1 four times, so taking the high word of an
 ; unsigned 16x16 multiply (pmulhuw) by an entry approximates x / (2*i).
 ; quant_mpeg_intra_xmm reads [mmx_divs + quant*8 - 8] in its .loop path,
 ; which is reached when quant is neither 1 nor greater than 19.
 ; The preprocessor symbol `i` is file-wide and is left at 32 afterwards.
 ;-----------------------------------------------------------------------------
 
 ALIGN 16
 mmx_divs:               ;i>2
 %assign i 1
 %rep 31
         times 4 dw  ((1 << 15) / i + 1)
         %assign i i+1
 %endrep
 
 ; Reciprocal table for "divide by quant": 32 quadword entries, one per
 ; quant = 1..32, each repeating a 16-bit value four times. The entry for
 ; quant = 1 is the placeholder 65535; entries for quant = 2..32 hold
 ; (1 << 16) / quant + 1. quant_mpeg_intra_xmm reads [mmx_div + quant*8 - 8]
 ; in its .lloop path (quant > 19) and follows the pmulhuw with a shift right
 ; by one, giving a division by 2*quant.
 ; Note: `%assign quant` creates a file-wide preprocessor symbol named
 ; `quant`, left at 33 after the loop.
 ALIGN 16
 mmx_div:                ;quant>2
         times 4 dw 65535 ; the div by 2 formula will overflow for the case
                          ; quant=1 but we don't care much because quant=1
                          ; is handled by a different piece of code that
                          ; doesn't use this table.
 %assign quant 2
 %rep 31
         times 4 dw  ((1 << 16) / quant + 1)
         %assign quant quant+1
 %endrep
 
 ; FIXX <n>: emits one word holding (1 << 16) / n + 1.
 ; Not invoked anywhere in the part of the file visible here.
 %macro FIXX 1
 dw (1 << 16) / (%1) + 1
 %endmacro
 
 ; Padding instructions used between real instructions; the digit in each
 ; name is the encoded length in bytes. None of them changes a register
 ; value, but nop3 and nop6 are ADDs and therefore do rewrite the flags.
 %define nop4    db      08Dh, 074h, 026h,0         ; bytes of: lea esi, [esi + 0] (SIB form, disp8)
 %define nop3    add     esp, byte 0                ; esp unchanged, flags written
 %define nop2    mov     esp, esp
 %define nop7    db      08dh, 02ch, 02dh,0,0,0,0   ; bytes of: lea ebp, [ebp*1 + 0] (SIB form, disp32)
 %define nop6    add     ebp, dword 0               ; ebp unchanged, flags written
 
 ;-----------------------------------------------------------------------------
 ; quantd table
 ;
 ; 31 quadword entries, one per i = 1..31. Each entry repeats the 16-bit
 ; value (VM18P*i + VM18Q/2) / VM18Q four times, i.e. (3*i + 2) / 4 with
 ; integer division. quant_mpeg_intra_xmm reads [quantd + quant*8 - 8] and
 ; adds it to the level before the final division.
 ; The preprocessor symbol `i` is file-wide and is left at 32 afterwards.
 ;-----------------------------------------------------------------------------
 
 %define VM18P   3
 %define VM18Q   4
 
 ALIGN 16
 quantd:
 %assign i 1
 %rep 31
         times 4 dw  (((VM18P*i) + (VM18Q/2)) / VM18Q)
         %assign i i+1
 %endrep
 
 ;-----------------------------------------------------------------------------
 ; multiply by quant table
 ;
 ; 31 quadword entries, one per i = 1..31, each repeating the 16-bit value i
 ; four times (the entries hold i itself, not 2*i).
 ; Not referenced in the part of the file visible here.
 ; NOTE(review): presumably used by the dequant routines -- confirm.
 ;-----------------------------------------------------------------------------
 
 ALIGN 16
 mmx_mul_quant:
 %assign i 1
 %rep 31
         times 4 dw  i
         %assign i i+1
 %endrep
 
 ;-----------------------------------------------------------------------------
 ; saturation limits
 ;
 ; Five quadword constants, each repeating one 16-bit value four times.
 ; None of them is referenced in the part of the file visible here.
 ; NOTE(review): the names suggest clamping to [-2048, 2047] in the dequant
 ; routines -- confirm against their code.
 ;-----------------------------------------------------------------------------
 
 ALIGN 16
 mmx_32767_minus_2047:
         times 4 dw (32767-2047)         ; 30720
 mmx_32768_minus_2048:
         times 4 dw (32768-2048)         ; 30720
 mmx_2047:
         times 4 dw 2047
 mmx_minus_2048:
         times 4 dw (-2048)
 zero:
         times 4 dw 0
 
 ; Reciprocal table for scalar division: 256 dwords. Entry 0 is 0; entry i
 ; (i = 1..255) is (1 << 17) / i + 1. quant_mpeg_intra_xmm indexes it with
 ; dcscalar, multiplies, and shifts right by 17 to divide data[0] without
 ; an idiv. There is no ALIGN directive in front of this label.
 ; The preprocessor symbol `i` is file-wide and is left at 256 afterwards.
 int_div:
 dd 0
 %assign i 1
 %rep 255
         dd  (1 << 17) / ( i) + 1
         %assign i i+1
 %endrep
 
 ;=============================================================================
 ; Code
 ;=============================================================================
 
 SECTION .text
 
 ; Exported entry points. cglobal adds a leading underscore to each name when
 ; PREFIX is defined. The two dequant_* routines are not defined in the part
 ; of the file visible here.
 cglobal quant_mpeg_intra_xmm
 cglobal quant_mpeg_inter_xmm
 cglobal dequant_mpeg_intra_3dne
 cglobal dequant_mpeg_inter_3dne
 
 ;-----------------------------------------------------------------------------
 ;
 ; uint32_t quant_mpeg_intra_xmm(int16_t * coeff,
 ;                               const int16_t * const data,
 ;                               const uint32_t quant,
 ;                               const uint32_t dcscalar,
 ;                               const uint16_t *mpeg_matrices);
 ;
 ; Quantizes one block of 64 16-bit coefficients (intra, MPEG matrices).
 ;
 ; In (32-bit stack arguments, offsets relative to esp on entry):
 ;   [esp+ 4] coeff          destination, 64 words are written
 ;   [esp+ 8] data           source, 64 words are read
 ;   [esp+12] quant          selects the code path and the quantd/divider entry
 ;   [esp+16] dcscalar       divisor for data[0]; used as index into int_div
 ;   [esp+20] mpeg_matrices  four 128-byte tables are read at byte offsets
 ;                           0, 128, 256 and 384 from this pointer
 ; Out:   eax = 0
 ; Saved: esi, edi and ebx are pushed on entry and reloaded before returning.
 ; Uses:  ecx, edx, mm0-mm7 and the flags. No emms is executed here.
 ;
 ; Per coefficient (eight words per pass, eight passes), with tabN being the
 ; table at byte offset N of mpeg_matrices:
 ;   v   = (|data| << 4) + tab128
 ;   q   = hi16(v * tab384)                    first approximation
 ;   r   = v - q * tab0                        mismatch
 ;   q  += quantd[quant] + hi16(r * tab256)    correction
 ;   q   = q / (2*quant)                       path dependent, see below
 ;   coeff = q with the sign of data restored
 ;
 ; Paths:  quant == 1 -> .q1loop  (shift right by one)
 ;         quant  > 19 -> .lloop  (mmx_div reciprocal, then shift right by one)
 ;         otherwise   -> .loop   (mmx_divs reciprocal)
 ; Afterwards .done overwrites coeff[0] with data[0] divided by dcscalar,
 ; with half of dcscalar added (sign-matched) before the division.
 ;
 ; NOTE(review): neither quant nor dcscalar is range-checked here; the table
 ; sizes suggest 1..31 and 0..255 respectively -- confirm against callers.
 ; The `mov esp, esp`, nop2, nop4 and nop6 lines are padding only.
 ;
 ;-----------------------------------------------------------------------------
 
 ALIGN 16
 quant_mpeg_intra_xmm:
   mov eax, [esp  + 8]       ; eax = data
   mov ecx, [esp  + 12]      ; ecx = quant
   mov edx, [esp  + 4]       ; edx = coeff
   push esi                  ; esi/edi/ebx are reloaded from the stack at .done
   push edi
   push ebx
   nop
   mov edi, [esp + 12 + 20]              ; edi = mpeg_matrices (+12 skips the three pushes)
   mov esi, -14              ; pass index: -14, -12, ..., 0 (8*esi+112 = 0, 16, ..., 112)
   pxor mm0, mm0             ; mm0 and mm3 must be zero at the top of every pass
   pxor mm3, mm3
   cmp ecx, byte 1
   je near .q1loop           ; quant == 1
   cmp ecx, byte 19
   jg near .lloop            ; quant > 19 (signed compare)
   nop6                      ; padding in front of the aligned loop
 
 ALIGN 16
 .loop
   movq mm1, [eax + 8*esi+112]   ; mm1 = data words 0..3 of this pass
   psubw mm0, mm1                ; mm0 = -mm1 (mm0 was zero)
   movq mm4, [eax + 8*esi + 120] ; mm4 = data words 4..7 of this pass
   psubw mm3, mm4                ; mm3 = -mm4 (mm3 was zero)
   pmaxsw mm0, mm1               ; mm0 = max(-src, src) = |src|
   pmaxsw mm3,mm4
   nop2
   psraw mm1, 15     ; mm1/mm4 = sign mask of src (0 or -1 per word)
   psraw mm4, 15
   psllw mm0, 4      ;level << 4 ;
   psllw mm3, 4
   paddw mm0, [edi + 128 + 8*esi+112]    ; v = level + table at offset 128
   paddw mm3, [edi + 128 + 8*esi+120]
   movq mm5, [edi + 384 + 8*esi+112]     ; table at offset 384
   movq mm7, [edi + 384 + 8*esi+120]
   pmulhuw mm5, mm0                      ; q = hi16(v * tab384): first approximation
   pmulhuw mm7, mm3
   mov esp, esp
   movq mm2, [edi + 8*esi+112]           ; table at offset 0
   movq mm6, [edi + 8*esi+120]
   pmullw mm2, mm5                       ; q * tab0 (low 16 bits)
   pmullw mm6, mm7
   psubw mm0, mm2                        ; r = v - q * tab0 (mismatch)
   psubw mm3, mm6
   nop4
   movq mm2, [quantd + ecx * 8 - 8]      ; mm2 = quantd entry for this quant
   movq mm6, [mmx_divs + ecx * 8 - 8]    ; mm6 = (1 << 15) / quant + 1
   paddw mm5, mm2                        ; q += quantd
   paddw mm7, mm2
   mov esp, esp
   pmulhuw mm0, [edi + 256 + 8*esi+112]  ; hi16(r * tab256): correction
   pmulhuw mm3, [edi + 256 + 8*esi+120]
   paddw mm5, mm0                        ; q += correction
   paddw mm7, mm3
   pxor mm0, mm0                         ; re-zero mm0/mm3 for the next pass
   pxor mm3, mm3
   pmulhuw mm5, mm6      ; mm5 = hi16(q * mmx_divs[quant]) ~ q / (2*quant)
   pmulhuw mm7, mm6      ; same for the upper four words (0<quant<32)
   pxor mm5, mm1         ; restore the sign: (q ^ mask) - mask
   pxor mm7, mm4         ;
   psubw mm5, mm1        ; undisplace
   psubw mm7, mm4        ;
   movq [edx + 8*esi+112], mm5           ; store eight quantized words
   movq [edx + 8*esi +120], mm7
   add esi, byte 2
   jng near .loop        ; repeat while esi <= 0
 
 .done
 ; calculate  data[0] // (int32_t)dcscalar)
 ; On entry here eax still holds data and the three saved registers are on
 ; the stack, so the arguments are at [esp + 12 + n].
   mov esi, [esp + 12 + 16]  ; esi = dcscalar
   movsx ecx, word [eax]     ; ecx = data[0], sign-extended
   mov edi, ecx
   mov edx, [esp + 12 + 16]  ; edx = dcscalar
   shr edx, 1            ; edx = dcscalar / 2
   sar edi, 31           ; edi = 0 or -1 (sign of data[0]); cdq is vectorpath
   xor edx, edi          ; edx = dcscalar/2, or -(dcscalar/2) - 1 when data[0] < 0
   sub ecx, edi          ; adds 1 when data[0] < 0, cancelling the -1 above
   add ecx, edx          ; ecx = data[0] + dcscalar/2, or data[0] - dcscalar/2 when negative
   mov edx, [dword esp + 12 + 4]   ; edx = coeff
   mov esi, [int_div+4*esi]        ; esi = int_div[dcscalar] = (1 << 17) / dcscalar + 1
   imul ecx, esi
   sar ecx, 17           ; ecx = product >> 17 (arithmetic); the sign flag is used below
   lea ebx, [byte ecx + 1]         ; ebx = ecx + 1; lea does not modify the flags
   cmovs ecx, ebx        ; result negative: take ecx + 1 to offset the downward rounding of sar
   ; idiv    cx          ; ecx = edi:ecx / dcscalar
 
   mov ebx, [esp]        ; reload the saved registers (pushed as esi, edi, ebx)
   mov edi, [esp+4]
   mov esi, [esp+8]
   add esp, byte 12      ; drop the three saved slots
   mov [edx], cx     ; coeff[0] = cx
 
   xor eax, eax          ; return 0
   ret
 
 ; quant == 1: same computation as .loop, but the final division by 2*quant
 ; is a shift right by one.
 ALIGN 16
 .q1loop
   movq mm1, [eax + 8*esi+112]               ; mm1 = data words 0..3 of this pass
   psubw mm0, mm1                            ;-mm1
   movq mm4, [eax + 8*esi+120]               ; mm4 = data words 4..7 of this pass
   psubw mm3, mm4                            ;-mm4
   pmaxsw mm0, mm1                           ;|src|
   pmaxsw mm3, mm4
   nop2
   psraw mm1, 15                             ;sign src
   psraw mm4, 15
   psllw mm0, 4                              ; level << 4
   psllw mm3, 4
   paddw mm0, [edi + 128 + 8*esi+112]    ;mm0 is to be divided
   paddw mm3, [edi + 128 + 8*esi+120]    ;intra1 contains fix for division by 1
   movq mm5, [edi + 384 + 8*esi+112] ;with rounding down
   movq mm7, [edi + 384 + 8*esi+120]
   pmulhuw mm5, mm0
   pmulhuw mm7, mm3      ;mm7: first approx of division
   mov esp, esp
   movq mm2, [edi + 8*esi+112]
   movq mm6, [edi + 8*esi+120]      ; table at offset 0, upper four words
   pmullw mm2, mm5       ;test value <= original
   pmullw mm6, mm7
   psubw mm0, mm2        ;mismatch
   psubw mm3, mm6
   nop4
   movq mm2, [quantd + ecx * 8 - 8]
   paddw mm5, mm2        ;first approx with quantd
   paddw mm7, mm2
   mov esp, esp
   pmulhuw mm0, [edi + 256 + 8*esi+112]   ;correction
   pmulhuw mm3, [edi + 256 + 8*esi+120]
   paddw mm5, mm0        ;final result with quantd
   paddw mm7, mm3
   pxor mm0, mm0         ; re-zero mm0/mm3 for the next pass
   pxor mm3, mm3
   mov esp, esp
   psrlw mm5, 1          ;  (level + quantd) /2  (quant = 1)
   psrlw mm7, 1
   pxor mm5, mm1         ; restore the sign: (q ^ mask) - mask
   pxor mm7, mm4         ;
   psubw mm5, mm1        ; undisplace
   psubw mm7, mm4        ;
   movq [edx + 8*esi+112], mm5
   movq [edx + 8*esi +120], mm7
   add esi, byte 2
   jng near .q1loop      ; repeat while esi <= 0
   jmp near .done
 
 ; quant > 19: same computation as .loop, but the final division uses the
 ; mmx_div reciprocal ((1 << 16) / quant + 1) followed by a shift right by one.
 ALIGN 8
 .lloop
   movq mm1, [eax + 8*esi+112]       ; mm1 = data words 0..3 of this pass
   psubw mm0, mm1        ;-mm1
   movq mm4, [eax + 8*esi+120]       ; mm4 = data words 4..7 of this pass
   psubw mm3, mm4        ;-mm4
   pmaxsw mm0, mm1       ;|src|
   pmaxsw mm3, mm4
   nop2
   psraw mm1, 15         ;sign src
   psraw mm4, 15
   psllw mm0, 4          ; level << 4
   psllw mm3, 4          ;
   paddw mm0, [edi + 128 + 8*esi+112] ;mm0 is to be divided intra1 contains fix for division by 1
   paddw mm3, [edi + 128 + 8*esi+120]
   movq mm5, [edi + 384 + 8*esi+112]
   movq mm7, [edi + 384 + 8*esi+120]
   pmulhuw mm5, mm0
   pmulhuw mm7, mm3      ;mm7: first approx of division
   mov esp, esp
   movq mm2, [edi + 8*esi+112]
   movq mm6, [edi + 8*esi+120]
   pmullw mm2, mm5       ;test value <= original
   pmullw mm6, mm7
   psubw mm0, mm2        ;mismatch
   psubw mm3, mm6
   nop4
   movq mm2, [quantd + ecx * 8 - 8]
   movq mm6, [mmx_div + ecx * 8 - 8] ; mm6 = (1 << 16) / quant + 1 (this path runs for quant > 19)
   paddw mm5, mm2        ;first approx with quantd
   paddw mm7, mm2
   mov esp, esp
   pmulhuw mm0, [edi + 256 + 8*esi+112] ;correction
   pmulhuw mm3, [edi + 256 + 8*esi+120]
   paddw mm5, mm0        ;final result with quantd
   paddw mm7, mm3
   pxor mm0, mm0         ; re-zero mm0/mm3 for the next pass
   pxor mm3, mm3
   mov esp, esp
   pmulhuw mm5, mm6      ; mm5 = hi16(q * mmx_div[quant]) ~ q / quant
   pmulhuw mm7, mm6      ;  (level + quantd) / quant (0<quant<32)
   psrlw mm5, 1          ; (level + quantd) / (2*quant)
   psrlw mm7, 1
   pxor mm5, mm1         ; restore the sign: (q ^ mask) - mask
   pxor mm7, mm4         ;
   psubw mm5, mm1        ; undisplace
   psubw mm7, mm4        ;
   movq [edx + 8*esi+112], mm5
   movq [edx + 8*esi +120], mm7
   add esi,byte 2
   jng near .lloop       ; repeat while esi <= 0
   jmp near .done
 
 ;-----------------------------------------------------------------------------
 ;
 ; uint32_t quant_mpeg_inter_xmm(int16_t * coeff,
 ;                               const int16_t const * data,
 ;                               const uint32_t quant,
 ;                               const uint16_t *mpeg_matrices);
 ;
 ;-----------------------------------------------------------------------------
 
 ALIGN 16
 quant_mpeg_inter_xmm:
   mov eax, [esp  + 8]       ; data
   mov ecx, [esp  + 12]      ; quant
   mov edx, [esp  + 4]       ; coeff
   push esi
   push edi
   push ebx
   nop
   mov edi, [esp + 12 + 16]
   mov esi, -14
   mov ebx, esp
   sub esp, byte 24
   lea ebx, [esp+8]
   and ebx, byte -8 ;ALIGN 8
   pxor mm0, mm0
   pxor mm3, mm3
   movq [byte ebx],mm0
   db 0Fh, 7Fh, 44h, 23h, 8 ;movq [ebx+8],mm0
   cmp ecx, byte 1
   je near .q1loop
   cmp ecx, byte 19
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -