📄 yv12_to_yuyv_mmx.asm

;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  mmx yuv planar to yuyv/uyvy conversion
; *
; *  Copyright(C) 2002 Peter Ross <pross@xvid.org>
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; *  Under section 8 of the GNU General Public License, the copyright
; *  holders of XVID explicitly forbid distribution in the following
; *  countries:
; *
; *    - Japan
; *    - United States of America
; *
; *  Linking XviD statically or dynamically with other modules is making a
; *  combined work based on XviD.  Thus, the terms and conditions of the
; *  GNU General Public License cover the whole combination.
; *
; *  As a special exception, the copyright holders of XviD give you
; *  permission to link XviD with independent modules that communicate with
; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; *  license terms of these independent modules, and to copy and distribute
; *  the resulting combined work under terms of your choice, provided that
; *  every copy of the combined work is accompanied by a complete copy of
; *  the source code of XviD (the version of XviD used to produce the
; *  combined work), being distributed under the terms of the GNU General
; *  Public License plus this exception.  An independent module is a module
; *  which is not derived from or based on XviD.
; *
; *  Note that people who make modified versions of XviD are not obligated
; *  to grant this special exception for their modified versions; it is
; *  their choice whether to do so.  The GNU General Public License gives
; *  permission to release a modified version without this exception; this
; *  exception also makes it possible to release a modified version which
; *  carries forward this exception.
; *
; * $Id: yv12_to_yuyv_mmx.asm,v 1.3 2002/11/17 00:20:30 edgomez Exp $
; *
; ****************************************************************************/

bits 32

section .data

%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

align 16

section .text

;===========================================================================
;
;	void yv12_to_yuyv_mmx(
;				uint8_t * dst,
;				int dst_stride,
;				uint8_t * y_src,
;				uint8_t * u_src,
;				uint8_t * v_src,
;				int y_stride,
;				int uv_stride,
;				int width,
;				int height);
;
;	width must be multiple of 8
;	~10% faster than plain c
;
;===========================================================================

align 16
cglobal yv12_to_yuyv_mmx
yv12_to_yuyv_mmx

		push ebx
		push ecx
		push esi
		push edi
		push ebp					; STACK BASE = 20

		; global constants

		mov ebx, [esp + 20 + 32]	; width
		mov eax, [esp + 20 + 8]		; dst_stride
		sub eax, ebx
		add eax, eax				; eax = 2*(dst_stride - width)
		push eax					; [esp + 4] = dst_dif
									; STACK BASE = 24

		shr ebx, 3					; ebx = width / 8
		mov edi, [esp + 24 + 4]		; dst

		; --------- flip -------------

		mov ebp, [esp + 24 + 36]	; height
		test ebp, ebp
		jl .flip

		mov esi, [esp + 24 + 12]	; y_src
		mov ecx, [esp + 24 + 16]	; u_src
		mov edx, [esp + 24 + 20]	; v_src
		shr ebp, 1					; y = height / 2
		jmp short .yloop

.flip
		neg ebp						; height = -height

		mov eax, [esp + 24 + 24]	; y_stride
		lea edx, [ebp - 1]			; edx = height - 1
		mul edx
		mov esi, [esp + 24 + 12]	; y_src
		add esi, eax				; y_src += (height - 1) * y_stride

		shr ebp, 1					; y = height / 2

		mov eax, [esp + 24 + 28]	; uv_stride
		lea edx, [ebp - 1]			; edx = height/2 - 1
		mul edx
		mov ecx, [esp + 24 + 16]	; u_src
		mov edx, [esp + 24 + 20]	; v_src
		add ecx, eax				; u_src += (height/2 - 1) * uv_stride
		add edx, eax				; v_src += (height/2 - 1) * uv_stride

		neg dword [esp + 24 + 24]	; y_stride = -y_stride
		neg dword [esp + 24 + 28]	; uv_stride = -uv_stride

.yloop
		xor eax, eax				; x = 0
.xloop1
				movd mm0, [ecx+4*eax]		; [    |uuuu]
				movd mm1, [edx+4*eax]		; [    |vvvv]
				movq mm2, [esi+8*eax]		; [yyyy|yyyy]
				punpcklbw mm0, mm1			; [vuvu|vuvu]
				movq mm3, mm2
				punpcklbw mm2, mm0			; [vyuy|vyuy]
				punpckhbw mm3, mm0			; [vyuy|vyuy]
				movq [edi], mm2
				movq [edi+8], mm3
				inc eax
				add edi, 16
				cmp eax, ebx
				jb .xloop1

		add edi, [esp + 0]			; dst += dst_dif
		add esi, [esp + 24 + 24]	; y_src += y_stride

		xor eax, eax
.xloop2
				movd mm0, [ecx+4*eax]		; [    |uuuu]
				movd mm1, [edx+4*eax]		; [    |vvvv]
				movq mm2, [esi+8*eax]		; [yyyy|yyyy]
				punpcklbw mm0, mm1			; [vuvu|vuvu]
				movq mm3, mm2
				punpcklbw mm2, mm0			; [vyuy|vyuy]
				punpckhbw mm3, mm0			; [vyuy|vyuy]
				movq [edi], mm2
				movq [edi+8], mm3
				inc eax
				add edi, 16
				cmp eax, ebx
				jb .xloop2

		add edi, [esp + 0]			; dst += dst_dif
		add esi, [esp + 24 + 24]	; y_src += y_stride
		add ecx, [esp + 24 + 28]	; u_src += uv_stride
		add edx, [esp + 24 + 28]	; v_src += uv_stride

		dec ebp						; y--
		jnz near .yloop

		emms

		add esp, 4
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx
		ret

;===========================================================================
;
;	void yv12_to_uyvy_mmx(
;				uint8_t * dst,
;				int dst_stride,
;				uint8_t * y_src,
;				uint8_t * u_src,
;				uint8_t * v_src,
;				int y_stride,
;				int uv_stride,
;				int width,
;				int height);
;
;	width must be multiple of 8
;	~10% faster than plain c
;
;===========================================================================

align 16
cglobal yv12_to_uyvy_mmx
yv12_to_uyvy_mmx

		push ebx
		push ecx
		push esi
		push edi
		push ebp					; STACK BASE = 20

		; global constants

		mov ebx, [esp + 20 + 32]	; width
		mov eax, [esp + 20 + 8]		; dst_stride
		sub eax, ebx
		add eax, eax				; eax = 2*(dst_stride - width)
		push eax					; [esp + 4] = dst_dif
									; STACK BASE = 24

		shr ebx, 3					; ebx = width / 8
		mov edi, [esp + 24 + 4]		; dst

		; --------- flip -------------

		mov ebp, [esp + 24 + 36]	; height
		test ebp, ebp
		jl .flip

		mov esi, [esp + 24 + 12]	; y_src
		mov ecx, [esp + 24 + 16]	; u_src
		mov edx, [esp + 24 + 20]	; v_src
		shr ebp, 1					; y = height / 2
		jmp short .yloop

.flip
		neg ebp						; height = -height

		mov eax, [esp + 24 + 24]	; y_stride
		lea edx, [ebp - 1]			; edx = height - 1
		mul edx
		mov esi, [esp + 24 + 12]	; y_src
		add esi, eax				; y_src += (height - 1) * y_stride

		shr ebp, 1					; y = height / 2

		mov eax, [esp + 24 + 28]	; uv_stride
		lea edx, [ebp - 1]			; edx = height/2 - 1
		mul edx
		mov ecx, [esp + 24 + 16]	; u_src
		mov edx, [esp + 24 + 20]	; v_src
		add ecx, eax				; u_src += (height/2 - 1) * uv_stride
		add edx, eax				; v_src += (height/2 - 1) * uv_stride

		neg dword [esp + 24 + 24]	; y_stride = -y_stride
		neg dword [esp + 24 + 28]	; uv_stride = -uv_stride

.yloop
		xor eax, eax				; x = 0
.xloop1
				movd mm0, [ecx+4*eax]		; [    |uuuu]
				movd mm1, [edx+4*eax]		; [    |vvvv]
				movq mm2, [esi+8*eax]		; [yyyy|yyyy]
				punpcklbw mm0, mm1			; [vuvu|vuvu]
				movq mm1, mm0
				punpcklbw mm0, mm2			; [yvyu|yvyu]
				punpckhbw mm1, mm2			; [yvyu|yvyu]
				movq [edi], mm0
				movq [edi+8], mm1
				inc eax
				add edi, 16
				cmp eax, ebx
				jb .xloop1

		add edi, [esp + 0]			; dst += dst_dif
		add esi, [esp + 24 + 24]	; y_src += y_stride

		xor eax, eax
.xloop2
				movd mm0, [ecx+4*eax]		; [    |uuuu]
				movd mm1, [edx+4*eax]		; [    |vvvv]
				movq mm2, [esi+8*eax]		; [yyyy|yyyy]
				punpcklbw mm0, mm1			; [vuvu|vuvu]
				movq mm1, mm0
				punpcklbw mm0, mm2			; [yvyu|yvyu]
				punpckhbw mm1, mm2			; [yvyu|yvyu]
				movq [edi], mm0
				movq [edi+8], mm1
				inc eax
				add edi, 16
				cmp eax, ebx
				jb .xloop2

		add edi, [esp + 0]			; dst += dst_dif
		add esi, [esp + 24 + 24]	; y_src += y_stride
		add ecx, [esp + 24 + 28]	; u_src += uv_stride
		add edx, [esp + 24 + 28]	; v_src += uv_stride

		dec ebp						; y--
		jnz near .yloop

		emms

		add esp, 4
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx
		ret
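
For readers following the register comments, here is a minimal plain-C sketch of what the two routines compute. It is not XviD's own C fallback: the single function and its uyvy flag are illustrative stand-ins for the two separate entry points above. The flip handling (negative height reads the source bottom-up) and the convention that dst_stride is counted in pixels (the asm advances dst by 2*(dst_stride - width) bytes per row, i.e. 2 bytes per pixel) are read directly off the assembly.

/* Reference sketch of yv12_to_yuyv_mmx / yv12_to_uyvy_mmx.
 * uyvy = 0 packs Y0 U Y1 V; uyvy = 1 packs U Y0 V Y1. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static void yv12_to_yuyv_uyvy_c(uint8_t *dst, int dst_stride,
                                const uint8_t *y_src, const uint8_t *u_src,
                                const uint8_t *v_src, int y_stride,
                                int uv_stride, int width, int height,
                                int uyvy)
{
    if (height < 0) {            /* negative height = flipped output (.flip) */
        height = -height;
        y_src += (height - 1) * y_stride;
        u_src += (height / 2 - 1) * uv_stride;
        v_src += (height / 2 - 1) * uv_stride;
        y_stride = -y_stride;    /* walk the source bottom-up */
        uv_stride = -uv_stride;
    }
    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width / 2; x++) {
            /* one 4-byte macropixel: two luma samples share a U/V pair */
            uint8_t y0 = y_src[2 * x], y1 = y_src[2 * x + 1];
            uint8_t u = u_src[x], v = v_src[x];
            if (uyvy) {
                dst[4 * x + 0] = u;  dst[4 * x + 1] = y0;
                dst[4 * x + 2] = v;  dst[4 * x + 3] = y1;
            } else {
                dst[4 * x + 0] = y0; dst[4 * x + 1] = u;
                dst[4 * x + 2] = y1; dst[4 * x + 3] = v;
            }
        }
        dst += 2 * dst_stride;   /* dst_stride is in pixels, 2 bytes each */
        y_src += y_stride;
        if (y & 1) {             /* chroma advances every second luma row,
                                    matching the .yloop / .xloop2 pairing */
            u_src += uv_stride;
            v_src += uv_stride;
        }
    }
}

int main(void)
{
    /* 8x2 frame: one chroma row (width must be a multiple of 8) */
    uint8_t Y[16], U[4], V[4], out[32];
    for (int i = 0; i < 16; i++) Y[i] = (uint8_t)i;
    memset(U, 0xAA, sizeof(U));
    memset(V, 0xBB, sizeof(V));
    yv12_to_yuyv_uyvy_c(out, 8, Y, U, V, 8, 4, 8, 2, 0);
    printf("%02x %02x %02x %02x\n", out[0], out[1], out[2], out[3]);
    /* prints: 00 aa 01 bb -- Y0 U Y1 V */
    return 0;
}

The MMX versions gain their speed by producing 16 output bytes per inner-loop iteration: one punpcklbw interleaves four U with four V samples, and two further unpacks merge the result with eight luma samples, replacing sixteen scalar byte stores with two movq writes.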
