📄 quantize_mmx.asm

📁 视频压缩标准MPEG4的视频参考代码xvid9.1
💻 ASM
📖 第 1 页 / 共 2 页
字号:
12 下一页
;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  mmx optimized quantization/dequantization
; *
; *  Copyright(C) 2002 Peter Ross <pross@xvid.org>
; *  Copyright(C) 2002 Michael Militzer <michael@xvid.org>
; *  Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; *  Under section 8 of the GNU General Public License, the copyright
; *  holders of XVID explicitly forbid distribution in the following
; *  countries:
; *
; *    - Japan
; *    - United States of America
; *
; *  Linking XviD statically or dynamically with other modules is making a
; *  combined work based on XviD.  Thus, the terms and conditions of the
; *  GNU General Public License cover the whole combination.
; *
; *  As a special exception, the copyright holders of XviD give you
; *  permission to link XviD with independent modules that communicate with
; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; *  license terms of these independent modules, and to copy and distribute
; *  the resulting combined work under terms of your choice, provided that
; *  every copy of the combined work is accompanied by a complete copy of
; *  the source code of XviD (the version of XviD used to produce the
; *  combined work), being distributed under the terms of the GNU General
; *  Public License plus this exception.  An independent module is a module
; *  which is not derived from or based on XviD.
; *
; *  Note that people who make modified versions of XviD are not obligated
; *  to grant this special exception for their modified versions; it is
; *  their choice whether to do so.  The GNU General Public License gives
; *  permission to release a modified version without this exception; this
; *  exception also makes it possible to release a modified version which
; *  carries forward this exception.
; *
; * $Id: quantize_mmx.asm,v 1.7 2002/11/17 00:41:20 edgomez Exp $
; *
; *************************************************************************/

; Syntax/target: NASM, 32-bit x86 ("bits 32" below).  Every routine in this
; listing reads its arguments from the stack at [esp + ...].
; None of the routines shown here executes EMMS.

; enable dequant saturate [-2048,2047], test purposes only.
%define SATURATE

; data/text alignment
%define ALIGN 8

bits 32

section .data

; cglobal <name>
; Exports <name>.  When PREFIX is defined the exported symbol gets a leading
; underscore and <name> is redefined so later uses resolve to the prefixed
; symbol.
%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

align 16
plus_one:	times 8 dw 1

; All four tables below hold one row of 4 words per quantizer Q = 1..31, so
; row Q starts at <table> + Q*8 - 8.  The rows are generated with %rep
; instead of 31 hand-written macro invocations; the emitted data is the same.

;===========================================================================
;
; subtract by Q/2 table
;
;===========================================================================

%macro MMX_SUB  1
times 4 dw %1 / 2
%endmacro

align 16
mmx_sub:
%assign QIDX 1
%rep 31
		MMX_SUB QIDX
%assign QIDX QIDX + 1
%endrep

;===========================================================================
;
; divide by 2Q table
;
; use a shift of 16 to take full advantage of _pmulhw_
; for q=1, _pmulhw_ will overflow so it is treated separately
; (3dnow2 provides _pmulhuw_ which won't cause overflow)
;
;===========================================================================

%macro MMX_DIV  1
times 4 dw  (1 << 16) / (%1 * 2) + 1
%endmacro

align 16
mmx_div:
%assign QIDX 1
%rep 31
		MMX_DIV QIDX
%assign QIDX QIDX + 1
%endrep

;===========================================================================
;
; add by (odd(Q) ? Q : Q - 1) table
;
;===========================================================================

%macro MMX_ADD  1
%if %1 % 2 != 0
times 4 dw %1
%else
times 4 dw %1 - 1
%endif
%endmacro

align 16
mmx_add:
%assign QIDX 1
%rep 31
		MMX_ADD QIDX
%assign QIDX QIDX + 1
%endrep

;===========================================================================
;
; multiply by 2Q table
;
;===========================================================================

%macro MMX_MUL  1
times 4 dw %1 * 2
%endmacro

align 16
mmx_mul:
%assign QIDX 1
%rep 31
		MMX_MUL QIDX
%assign QIDX QIDX + 1
%endrep
%undef QIDX

;===========================================================================
;
; saturation limits
;
;===========================================================================

align 16
sse2_2047:	times 8 dw 2047

align 16
mmx_2047:	times 4 dw 2047

align 8
mmx_32768_minus_2048:	times 4 dw (32768-2048)
mmx_32767_minus_2047:	times 4 dw (32767-2047)

section .text


;===========================================================================
;
; void quant_intra_mmx(int16_t * coeff,
;					const int16_t * const data,
;					const uint32_t quant,
;					const uint32_t dcscalar);
;
; Quantizes 64 signed 16-bit values: coeff[i] = sign(data[i]) * (|data[i]| / 2Q)
; using a 16.16 reciprocal multiply (or a plain shift when quant == 1), then
; overwrites coeff[0] with data[0] divided by dcscalar, rounded to nearest.
;
; Registers: edi = coeff, esi = data, ecx = index in 8-byte units (0..16),
;            mm7 = reciprocal row from mmx_div (quant != 1 path only).
; Preserves: ecx, esi, edi (pushed/popped).
; Clobbers:  eax, edx, flags, mm0, mm1, mm3, mm4 (and mm7 when quant != 1).
; Notes:     only the low byte of quant is compared against 1, and the
;            tables hold rows for quant = 1..31.
;
;===========================================================================

align ALIGN
cglobal quant_intra_mmx
quant_intra_mmx:

		push	ecx
		push	esi
		push	edi

		mov	edi, [esp + 12 + 4]		; coeff
		mov	esi, [esp + 12 + 8]		; data
		mov	eax, [esp + 12 + 12]		; quant

		xor	ecx, ecx
		cmp	al, 1
		jz	.q1loop				; quant == 1: reciprocal would overflow pmulhw

		movq	mm7, [mmx_div + eax * 8 - 8]	; mm7 = 4 x (65536 / 2Q + 1)

align ALIGN
.loop:
		movq	mm0, [esi + 8*ecx]		; mm0 = [1st]
		movq	mm3, [esi + 8*ecx + 8]		; mm3 = [2nd]
		pxor	mm1, mm1			; mm1 = 0
		pxor	mm4, mm4			;
		pcmpgtw	mm1, mm0			; mm1 = (0 > mm0) -> sign mask
		pcmpgtw	mm4, mm3			;
		pxor	mm0, mm1			; mm0 = |mm0| (xor with mask ...
		pxor	mm3, mm4			;
		psubw	mm0, mm1			; ... then subtract mask)
		psubw	mm3, mm4			;
		pmulhw	mm0, mm7			; mm0 = (mm0 * reciprocal) >> 16  ~= mm0 / 2Q
		pmulhw	mm3, mm7			;
		pxor	mm0, mm1			; mm0 *= sign(mm0)
		pxor	mm3, mm4			;
		psubw	mm0, mm1			; undisplace
		psubw	mm3, mm4			;
		movq	[edi + 8*ecx], mm0
		movq	[edi + 8*ecx + 8], mm3

		add	ecx, 2				; 2 x 8 bytes handled per iteration
		cmp	ecx, 16				; 16 x 8 bytes = 64 words
		jnz	.loop

.done:
		; calculate data[0] // (int32_t)dcscalar, rounded to nearest
		mov	ecx, [esp + 12 + 16]		; dcscalar
		mov	edx, ecx
		movsx	eax, word [esi]			; data[0]
		shr	edx, 1				; edx = dcscalar / 2
		test	eax, eax
		jg	.gtzero
		sub	eax, edx			; data[0] <= 0: bias towards -inf
		jmp	short .mul
.gtzero:
		add	eax, edx			; data[0] > 0: bias towards +inf
.mul:
		cdq					; expand eax -> edx:eax
		idiv	ecx				; eax = edx:eax / dcscalar

		mov	[edi], ax			; coeff[0] = ax

		pop	edi
		pop	esi
		pop	ecx

		ret

align ALIGN
.q1loop:
		movq	mm0, [esi + 8*ecx]		; mm0 = [1st]
		movq	mm3, [esi + 8*ecx + 8]		; mm3 = [2nd]
		pxor	mm1, mm1			; mm1 = 0
		pxor	mm4, mm4			;
		pcmpgtw	mm1, mm0			; mm1 = (0 > mm0) -> sign mask
		pcmpgtw	mm4, mm3			;
		pxor	mm0, mm1			; mm0 = |mm0|
		pxor	mm3, mm4			;
		psubw	mm0, mm1			; displace
		psubw	mm3, mm4			;
		psrlw	mm0, 1				; mm0 >>= 1   (/ 2Q with Q == 1)
		psrlw	mm3, 1				;
		pxor	mm0, mm1			; mm0 *= sign(mm0)
		pxor	mm3, mm4			;
		psubw	mm0, mm1			; undisplace
		psubw	mm3, mm4			;
		movq	[edi + 8*ecx], mm0
		movq	[edi + 8*ecx + 8], mm3

		add	ecx, 2
		cmp	ecx, 16
		jnz	.q1loop
		jmp	short .done



;===========================================================================
;
; void quant_intra_sse2(int16_t * coeff,
;					const int16_t * const data,
;					const uint32_t quant,
;					const uint32_t dcscalar);
;
; Same computation as quant_intra_mmx, 16 words per iteration in XMM
; registers.
;
; Registers: edi = coeff, esi = data, ecx = index in 8-byte units (0..16),
;            xmm7 = reciprocal row broadcast to all 8 words (quant != 1 path).
; Preserves: esi, edi (pushed/popped).
; Clobbers:  eax, ecx, edx, flags, xmm0, xmm1, xmm3, xmm4 (and xmm7 when
;            quant != 1).
; Alignment: coeff and data are accessed with movdqa, which faults on
;            addresses that are not 16-byte aligned.
; Notes:     the reciprocal is loaded straight into xmm7; no MMX register is
;            written, so this routine leaves the MMX/x87 state untouched.
;
;===========================================================================

align ALIGN
cglobal quant_intra_sse2
quant_intra_sse2:

		push	esi
		push	edi

		mov	edi, [esp + 8 + 4]		; coeff
		mov	esi, [esp + 8 + 8]		; data
		mov	eax, [esp + 8 + 12]		; quant

		xor	ecx, ecx
		cmp	al, 1
		jz	near .qas2_q1loop		; quant == 1: reciprocal would overflow pmulhw

.qas2_not1:
		movq	xmm7, [mmx_div + eax*8 - 8]	; low qword  = 4 x (65536 / 2Q + 1)
		punpcklqdq	xmm7, xmm7		; high qword = low qword

align 16
.qas2_loop:
		movdqa	xmm0, [esi + ecx*8]		; xmm0 = [1st]
		movdqa	xmm3, [esi + ecx*8 + 16]	; xmm3 = [2nd]
		pxor	xmm1, xmm1
		pxor	xmm4, xmm4
		pcmpgtw	xmm1, xmm0			; sign masks
		pcmpgtw	xmm4, xmm3
		pxor	xmm0, xmm1			; |x| = (x ^ mask) - mask
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		pmulhw	xmm0, xmm7			; (|x| * reciprocal) >> 16  ~= |x| / 2Q
		pmulhw	xmm3, xmm7
		pxor	xmm0, xmm1			; restore sign
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		movdqa	[edi + ecx*8], xmm0
		movdqa	[edi + ecx*8 + 16], xmm3

		add	ecx, 4				; 4 x 8 bytes handled per iteration
		cmp	ecx, 16
		jnz	.qas2_loop

.qas2_done:
		; coeff[0] = data[0] / dcscalar, rounded to nearest
		mov	ecx, [esp + 8 + 16]		; dcscalar
		mov	edx, ecx
		movsx	eax, word [esi]			; data[0]
		shr	edx, 1				; edx = dcscalar / 2
		test	eax, eax
		jg	.qas2_gtzero
		sub	eax, edx
		jmp	short .qas2_mul
.qas2_gtzero:
		add	eax, edx
.qas2_mul:
		cdq					; expand eax -> edx:eax
		idiv	ecx				; eax = edx:eax / dcscalar

		mov	[edi], ax			; coeff[0] = ax

		pop	edi
		pop	esi

		ret

align 16
.qas2_q1loop:
		movdqa	xmm0, [esi + ecx*8]		; xmm0 = [1st]
		movdqa	xmm3, [esi + ecx*8 + 16]	; xmm3 = [2nd]
		pxor	xmm1, xmm1
		pxor	xmm4, xmm4
		pcmpgtw	xmm1, xmm0			; sign masks
		pcmpgtw	xmm4, xmm3
		pxor	xmm0, xmm1			; |x| = (x ^ mask) - mask
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		psrlw	xmm0, 1				; / 2Q with Q == 1
		psrlw	xmm3, 1
		pxor	xmm0, xmm1			; restore sign
		pxor	xmm3, xmm4
		psubw	xmm0, xmm1
		psubw	xmm3, xmm4
		movdqa	[edi + ecx*8], xmm0
		movdqa	[edi + ecx*8 + 16], xmm3

		add	ecx, 4
		cmp	ecx, 16
		jnz	.qas2_q1loop
		jmp	near .qas2_done



;===========================================================================
;
; uint32_t quant_inter_mmx(int16_t * coeff,
;					const int16_t * const data,
;					const uint32_t quant);
;
; Registers: edi = coeff, esi = data, ecx = index in 8-byte units,
;            mm5 = running sum of quantized magnitudes,
;            mm6 = Q/2 row from mmx_sub, mm7 = reciprocal row from mmx_div.
;
;===========================================================================

align ALIGN
cglobal quant_inter_mmx
quant_inter_mmx:

		push	ecx
		push	esi
		push	edi

		mov	edi, [esp + 12 + 4]		; coeff
		mov	esi, [esp + 12 + 8]		; data
		mov	eax, [esp + 12 + 12]		; quant

		xor	ecx, ecx

		pxor	mm5, mm5			; sum
		movq	mm6, [mmx_sub + eax * 8 - 8]	; sub

		cmp	al, 1
		jz	.q1loop

		movq	mm7, [mmx_div + eax * 8 - 8]	; divider

align ALIGN
.loop:
		movq	mm0, [esi + 8*ecx]		; mm0 = [1st]
		movq	mm3, [esi + 8*ecx + 8]		; mm3 = [2nd]
		pxor	mm1, mm1			; mm1 = 0
		pxor	mm4, mm4			;
		pcmpgtw	mm1, mm0			; mm1 = (0 > mm0) -> sign mask
		pcmpgtw	mm4, mm3			;
		pxor	mm0, mm1			; mm0 = |mm0|
		pxor	mm3, mm4			;
		psubw	mm0, mm1			; displace
		psubw	mm3, mm4			;
		psubusw	mm0, mm6			; mm0 -= sub (unsigned, don't go < 0)
		psubusw	mm3, mm6			;
		pmulhw	mm0, mm7			; mm0 = (mm0 * reciprocal) >> 16  ~= mm0 / 2Q
		pmulhw	mm3, mm7			;
		paddw	mm5, mm0			; sum += mm0
		pxor	mm0, mm1			; mm0 *= sign(mm0)
		paddw	mm5, mm3			;
		pxor	mm3, mm4			;
		psubw	mm0, mm1			; undisplace
		psubw	mm3, mm4
		movq	[edi + 8*ecx], mm0
		movq	[edi + 8*ecx + 8], mm3

		add	ecx, 2
		cmp	ecx, 16
		jnz	.loop

.done:
		pmaddwd	mm5, [plus_one]			; 4 word sums -> 2 dword sums
		movq	mm0, mm5
		psrlq	mm5, 32
		paddd	mm0, mm5			; low dword of mm0 = total
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -