quantize_h263_mmx.asm

来自「从FFMPEG转换而来的H264解码程序,VC下编译..」· 汇编 代码 · 共 1,042 行 · 第 1/2 页

ASM
1,042
字号
;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  - MPEG4 Quantization H263 implementation / MMX optimized -
; *
; *  Copyright(C) 2001-2003 Peter Ross <pross@xvid.org>
; *               2002-2003 Pascal Massimino <skal@planet-d.net>
; *               2004      Jean-Marc Bastide <jmtest@voila.fr>
; *
; *  This program is free software ; you can redistribute it and/or modify
; *  it under the terms of the GNU General Public License as published by
; *  the Free Software Foundation ; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY ; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program ; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; * $Id: quantize_h263_mmx.asm,v 1.7 2004/08/29 10:02:38 edgomez Exp $
; *
; ****************************************************************************/

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

BITS 32

%macro cglobal 1
	%ifdef PREFIX
		%ifdef MARK_FUNCS
			global _%1:function %1.endfunc-%1
			%define %1 _%1:function %1.endfunc-%1
		%else
			global _%1
			%define %1 _%1
		%endif
	%else
		%ifdef MARK_FUNCS
			global %1:function %1.endfunc-%1
		%else
			global %1
		%endif
	%endif
%endmacro

;=============================================================================
; Read only Local data
;=============================================================================

%ifdef FORMAT_COFF
SECTION .rodata
%else
SECTION .rodata align=16
%endif

ALIGN 16
plus_one:
	times 8 dw 1

;-----------------------------------------------------------------------------
;
; quant table
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_quant:
%assign quant 0
%rep 32
	times 4 dw quant
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; subtract by Q/2 table
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_sub:
%assign quant 1
%rep 31
	times 4 dw  quant / 2
	%assign quant quant+1
%endrep

;-----------------------------------------------------------------------------
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated seperately
; (3dnow2 provides _pmulhuw_ which wont cause overflow)
;
;-----------------------------------------------------------------------------

ALIGN 16
mmx_div:
%assign quant 1
%rep 31
	times 4 dw  (1<<16) / (quant*2) + 1
	%assign quant quant+1
%endrep

;=============================================================================
; Code
;=============================================================================

SECTION .text

cglobal quant_h263_intra_mmx
cglobal quant_h263_intra_sse2
cglobal quant_h263_inter_mmx
cglobal quant_h263_inter_sse2
cglobal dequant_h263_intra_mmx
cglobal dequant_h263_intra_xmm
cglobal dequant_h263_intra_sse2
cglobal dequant_h263_inter_mmx
cglobal dequant_h263_inter_xmm
cglobal dequant_h263_inter_sse2

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_mmx(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint32_t dcscalar,
;                               const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_intra_mmx:

  push esi

  mov esi, [esp + 4 + 8]     ; data
  mov ecx,[esp + 4 + 16]     ; dcscalar
  movsx eax, word [esi]      ; data[0]
   
  sar ecx,1                  ; dcscalar /2
  mov edx,eax
  sar edx,31                 ; sgn(data[0])
  xor ecx,edx                ; *sgn(data[0])
  sub eax,edx
  add eax,ecx                ; + (dcscalar/2)*sgn(data[0])

  mov ecx, [esp + 4 + 12]    ; quant
  cdq 
  idiv dword [esp + 4 + 16]  ; dcscalar
  cmp ecx, 1
  mov edx, [esp + 4 + 4]     ; coeff
  je .low
 
  movq mm7, [mmx_div+ecx * 8 - 8]
  mov ecx,4

.loop
  movq mm0, [esi]           ; data      
  pxor mm4,mm4
  movq mm1, [esi + 8]
  pcmpgtw mm4,mm0           ; (data<0)
  pxor mm5,mm5
  pmulhw mm0,mm7            ; /(2*quant)
  pcmpgtw mm5,mm1
  movq mm2, [esi+16] 
  psubw mm0,mm4             ;  +(data<0)
  pmulhw mm1,mm7
  pxor mm4,mm4
  movq mm3,[esi+24]
  pcmpgtw mm4,mm2
  psubw mm1,mm5 
  pmulhw mm2,mm7 
  pxor mm5,mm5
  pcmpgtw mm5,mm3
  pmulhw mm3,mm7
  psubw mm2,mm4
  psubw mm3,mm5  
  movq [edx], mm0
  lea esi, [esi+32]
  movq [edx + 8], mm1
  movq [edx + 16], mm2
  movq [edx + 24], mm3
   
  dec ecx
  lea edx, [edx+32]
  jne .loop
  jmp .end
   
.low
  movd mm7,ecx
  mov ecx,4
.loop_low  
  movq mm0, [esi]           
  pxor mm4,mm4
  movq mm1, [esi + 8]
  pcmpgtw mm4,mm0
  pxor mm5,mm5
  psubw mm0,mm4
  pcmpgtw mm5,mm1
  psraw mm0,mm7
  psubw mm1,mm5 
  movq mm2,[esi+16]
  pxor mm4,mm4
  psraw mm1,mm7
  pcmpgtw mm4,mm2
  pxor mm5,mm5
  psubw mm2,mm4
  movq mm3,[esi+24]
  pcmpgtw mm5,mm3
  psraw mm2,mm7
  psubw mm3,mm5
  movq [edx], mm0
  psraw mm3,mm7
  movq [edx + 8], mm1
  movq [edx+16],mm2
  lea esi, [esi+32]
  movq [edx+24],mm3
  
  dec ecx
  lea edx, [edx+32]
  jne .loop_low
  
.end
  mov edx, [esp + 4 + 4]     ; coeff
  mov [edx],ax  
  xor eax,eax                ; return 0

  pop esi
  ret
.endfunc
 

;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_intra_sse2(int16_t * coeff,
;                                const int16_t const * data,
;                                const uint32_t quant,
;                                const uint32_t dcscalar,
;                                const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_intra_sse2:

  push esi

  mov esi, [esp + 4 + 8]     ; data
 
  movsx eax, word [esi]      ; data[0]
 
  mov ecx,[esp + 4 + 16]     ; dcscalar
  mov edx,eax
  sar ecx,1
  add eax,ecx
  sub edx,ecx
  cmovl eax,edx              ; +/- dcscalar/2
  mov ecx, [esp + 4 + 12]    ; quant
  cdq 
  idiv dword [esp + 4 + 16]  ; dcscalar
  cmp ecx, 1
  mov edx, [esp + 4 + 4]     ; coeff
  movq xmm7, [mmx_div+ecx * 8 - 8]
  je .low
  
  mov ecx,2
  movlhps xmm7,xmm7

.loop
  movdqa xmm0, [esi]           
  pxor xmm4,xmm4
  movdqa xmm1, [esi + 16]
  pcmpgtw xmm4,xmm0 
  pxor xmm5,xmm5
  pmulhw xmm0,xmm7
  pcmpgtw xmm5,xmm1
  movdqa xmm2, [esi+32] 
  psubw xmm0,xmm4
  pmulhw xmm1,xmm7
  pxor xmm4,xmm4
  movdqa xmm3,[esi+48]
  pcmpgtw xmm4,xmm2
  psubw xmm1,xmm5 
  pmulhw xmm2,xmm7 
  pxor xmm5,xmm5
  pcmpgtw xmm5,xmm3
  pmulhw xmm3,xmm7
  psubw xmm2,xmm4
  psubw xmm3,xmm5  
  movdqa [edx], xmm0
  lea esi, [esi+64]
  movdqa [edx + 16], xmm1
  movdqa [edx + 32], xmm2
  movdqa [edx + 48], xmm3
   
  dec ecx
  lea edx, [edx+64]
  jne .loop
  jmp .end
   
.low
  movd xmm7,ecx
  mov ecx,2
.loop_low  
  movdqa xmm0, [esi]           
  pxor xmm4,xmm4
  movdqa xmm1, [esi + 16]
  pcmpgtw xmm4,xmm0
  pxor xmm5,xmm5
  psubw xmm0,xmm4
  pcmpgtw xmm5,xmm1
  psraw xmm0,xmm7
  psubw xmm1,xmm5 
  movdqa xmm2,[esi+32]
  pxor xmm4,xmm4
  psraw xmm1,xmm7
  pcmpgtw xmm4,xmm2
  pxor xmm5,xmm5
  psubw xmm2,xmm4
  movdqa xmm3,[esi+48]
  pcmpgtw xmm5,xmm3
  psraw xmm2,xmm7
  psubw xmm3,xmm5
  movdqa [edx], xmm0
  psraw xmm3,xmm7
  movdqa [edx+16], xmm1
  movdqa [edx+32],xmm2
  lea esi, [esi+64]
  movdqa [edx+48],xmm3
  
  dec ecx
  lea edx, [edx+64]
  jne .loop_low
  
.end
  mov edx, [esp + 4 + 4]     ; coeff
  mov [edx],ax  
  xor eax,eax                ; return 0

  pop esi
  ret
.endfunc
 
;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_mmx(int16_t * coeff,
;                               const int16_t const * data,
;                               const uint32_t quant,
;                               const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------
  
ALIGN 16
quant_h263_inter_mmx:

  push ecx
  push esi
  push edi

  mov edi, [esp + 12 + 4]           ; coeff
  mov esi, [esp + 12 + 8]           ; data
  mov eax, [esp + 12 + 12]          ; quant

  xor ecx, ecx

  pxor mm5, mm5                     ; sum
  movq mm6, [mmx_sub + eax * 8 - 8] ; sub

  cmp al, 1
  jz .q1loop

  movq mm7, [mmx_div + eax * 8 - 8] ; divider

ALIGN 8
.loop
  movq mm0, [esi + 8*ecx]           ; mm0 = [1st]
  movq mm3, [esi + 8*ecx + 8]
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4                     ;
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3                  ;
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4                    ;
  psubusw mm0, mm6                  ; mm0 -= sub (unsigned, dont go < 0)
  psubusw mm3, mm6                  ;
  pmulhw mm0, mm7                   ; mm0 = (mm0 / 2Q) >> 16
  pmulhw mm3, mm7                   ;
  paddw mm5, mm0                    ; sum += mm0
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  paddw mm5, mm3                    ;
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4
  movq [edi + 8*ecx], mm0
  movq [edi + 8*ecx + 8], mm3

  add ecx, 2
  cmp ecx, 16
  jnz .loop

.done
  pmaddwd mm5, [plus_one]
  movq mm0, mm5
  psrlq mm5, 32
  paddd mm0, mm5

  movd eax, mm0     ; return sum
  pop edi
  pop esi
  pop ecx

  ret

ALIGN 8
.q1loop
  movq mm0, [esi + 8*ecx]           ; mm0 = [1st]
  movq mm3, [esi + 8*ecx+ 8]        ;
  pxor mm1, mm1                     ; mm1 = 0
  pxor mm4, mm4                     ;
  pcmpgtw mm1, mm0                  ; mm1 = (0 > mm0)
  pcmpgtw mm4, mm3                  ;
  pxor mm0, mm1                     ; mm0 = |mm0|
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; displace
  psubw mm3, mm4                    ;
  psubusw mm0, mm6                  ; mm0 -= sub (unsigned, dont go < 0)
  psubusw mm3, mm6                  ;
  psrlw mm0, 1                      ; mm0 >>= 1   (/2)
  psrlw mm3, 1                      ;
  paddw mm5, mm0                    ; sum += mm0
  pxor mm0, mm1                     ; mm0 *= sign(mm0)
  paddw mm5, mm3                    ;
  pxor mm3, mm4                     ;
  psubw mm0, mm1                    ; undisplace
  psubw mm3, mm4
  movq [edi + 8*ecx], mm0
  movq [edi + 8*ecx + 8], mm3

  add ecx, 2
  cmp ecx, 16
  jnz .q1loop

  jmp .done
.endfunc



;-----------------------------------------------------------------------------
;
; uint32_t quant_h263_inter_sse2(int16_t * coeff,
;                                const int16_t const * data,
;                                const uint32_t quant,
;                                const uint16_t *mpeg_matrices);
;
;-----------------------------------------------------------------------------

ALIGN 16
quant_h263_inter_sse2:

  push esi
  push edi

  mov edi, [esp + 8 + 4]      ; coeff
  mov esi, [esp + 8 + 8]      ; data
  mov eax, [esp + 8 + 12]     ; quant

  xor ecx, ecx

  pxor xmm5, xmm5                           ; sum

  movq mm0, [mmx_sub + eax*8 - 8]           ; sub
  movq2dq xmm6, mm0                         ; load into low 8 bytes
  movlhps xmm6, xmm6                        ; duplicate into high 8 bytes

  cmp al, 1
  jz near .qes2_q1loop

.qes2_not1
  movq mm0, [mmx_div + eax*8 - 8]           ; divider
  movq2dq xmm7, mm0
  movlhps xmm7, xmm7

ALIGN 16
.qes2_loop
  movdqa xmm0, [esi + ecx*8]                ; xmm0 = [1st]
  movdqa xmm3, [esi + ecx*8 + 16]           ; xmm3 = [2nd]
  pxor xmm1, xmm1
  pxor xmm4, xmm4
  pcmpgtw xmm1, xmm0
  pcmpgtw xmm4, xmm3
  pxor xmm0, xmm1
  pxor xmm3, xmm4
  psubw xmm0, xmm1
  psubw xmm3, xmm4
  psubusw xmm0, xmm6
  psubusw xmm3, xmm6
  pmulhw xmm0, xmm7
  pmulhw xmm3, xmm7
  paddw xmm5, xmm0
  pxor xmm0, xmm1
  paddw xmm5, xmm3
  pxor xmm3, xmm4
  psubw xmm0, xmm1
  psubw xmm3, xmm4
  movdqa [edi + ecx*8], xmm0
  movdqa [edi + ecx*8 + 16], xmm3

  add ecx, 4
  cmp ecx, 16
  jnz .qes2_loop

.qes2_done

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?