
📄 rgb_to_yv12_mmx.asm

📁 VC code for MPEG-4
💻 ASM
;/*****************************************************************************
; *
; *  XVID MPEG-4 VIDEO CODEC
; *  mmx rgb to yv12 conversion
; *
; *  Copyright(C) 2002 Peter Ross <pross@xvid.org>
; *
; *  This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; *  XviD is free software; you can redistribute it and/or modify it
; *  under the terms of the GNU General Public License as published by
; *  the Free Software Foundation; either version 2 of the License, or
; *  (at your option) any later version.
; *
; *  This program is distributed in the hope that it will be useful,
; *  but WITHOUT ANY WARRANTY; without even the implied warranty of
; *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
; *  GNU General Public License for more details.
; *
; *  You should have received a copy of the GNU General Public License
; *  along with this program; if not, write to the Free Software
; *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
; *
; *  Under section 8 of the GNU General Public License, the copyright
; *  holders of XVID explicitly forbid distribution in the following
; *  countries:
; *
; *    - Japan
; *    - United States of America
; *
; *  Linking XviD statically or dynamically with other modules is making a
; *  combined work based on XviD.  Thus, the terms and conditions of the
; *  GNU General Public License cover the whole combination.
; *
; *  As a special exception, the copyright holders of XviD give you
; *  permission to link XviD with independent modules that communicate with
; *  XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; *  license terms of these independent modules, and to copy and distribute
; *  the resulting combined work under terms of your choice, provided that
; *  every copy of the combined work is accompanied by a complete copy of
; *  the source code of XviD (the version of XviD used to produce the
; *  combined work), being distributed under the terms of the GNU General
; *  Public License plus this exception.  An independent module is a module
; *  which is not derived from or based on XviD.
; *
; *  Note that people who make modified versions of XviD are not obligated
; *  to grant this special exception for their modified versions; it is
; *  their choice whether to do so.  The GNU General Public License gives
; *  permission to release a modified version without this exception; this
; *  exception also makes it possible to release a modified version which
; *  carries forward this exception.
; *
; * $Id: rgb_to_yv12_mmx.asm,v 1.3 2002/11/17 00:20:30 edgomez Exp $
; *
; ****************************************************************************/

bits 32

section .data

%macro cglobal 1
	%ifdef PREFIX
		global _%1
		%define %1 _%1
	%else
		global %1
	%endif
%endmacro

align 16

;===========================================================================
; yuv constants
;===========================================================================

%define Y_R		0.257
%define Y_G		0.504
%define Y_B		0.098
%define Y_ADD	16

%define U_R		0.148
%define U_G		0.291
%define U_B		0.439
%define U_ADD	128

%define V_R		0.439
%define V_G		0.368
%define V_B		0.071
%define V_ADD	128

;===========================================================================
; multiplication matrices
;===========================================================================

; %define SCALEBITS 8

y_mul	dw	25		; FIX(Y_B)
	dw	129		; FIX(Y_G)
	dw	66		; FIX(Y_R)
	dw	0

u_mul	dw	112		; FIX(U_B)
	dw	-74		; FIX(U_G)
	dw	-38		; FIX(U_R)
	dw	0

v_mul	dw	-18		; FIX(V_B)
	dw	-94		; FIX(V_G)
	dw	112		; FIX(V_R)
	dw	0
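;---------------------------------------------------------------------------
; editor's note: the dw tables above are the floating-point coefficients
; from the yuv constants converted to 8-bit fixed point (hence the
; commented-out SCALEBITS 8), i.e. FIX(c) = round(c * 256):
;
;    FIX(Y_R) = round(0.257 * 256) =  66
;    FIX(Y_G) = round(0.504 * 256) = 129
;    FIX(Y_B) = round(0.098 * 256) =  25
;
; u_mul and v_mul hold the same scaled coefficients with their signs folded
; in (u = +b -g -r, v = +r -g -b), so a single pmaddwd against a pixel
; unpacked to [ 0|b |g |r ] words produces the weighted sum directly.
;---------------------------------------------------------------------------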
section .text

;===========================================================================
;
;	void rgb24_to_yv12_mmx(uint8_t * const y_out,
;					uint8_t * const u_out,
;					uint8_t * const v_out,
;					const uint8_t * const src,
;					const uint32_t width,
;					const uint32_t height,
;					const uint32_t stride);
;
; always flips
;
;===========================================================================

align 16
cglobal rgb24_to_yv12_mmx
rgb24_to_yv12_mmx

		push ebx
		push ecx
		push esi
		push edi
		push ebp			; STACK BASE = 20

		; global constants

		mov eax, [esp + 20 + 28]	; stride
		mov ecx, [esp + 20 + 20]	; width
		mov ebx, eax
		sub ebx, ecx
		shr ebx, 1			; ebx = (stride - width) / 2
		push ebx			; [esp + 20] = uv_dif
						; STACK BASE = 24

		add eax, eax
		sub eax, ecx			; eax = 2*stride - width
		push eax			; [esp + 16] = y_dif
						; STACK BASE = 28

		mov ebx, ecx
		shr ebx, 1
		push ebx			; [esp + 12] = width/2
						; STACK BASE = 32

		mov edx, ecx
		add ecx, edx
		add ecx, edx			; ecx = 3*width   (use 4 for rgb32)
		push ecx			; [esp + 8] = width3
						; STACK BASE = 36

		mov edx, ecx
		add edx, ecx
		add edx, ecx			; edx = 3*width3
		push edx			; [esp + 4] = src_dif
						; STACK BASE = 40

		mov esi, [esp + 40 + 16]	; src
		mov ebp, [esp + 40 + 24]	; ebp = height
		mov eax, ebp
		sub eax, 2
		mul ecx
		add esi, eax			; src += (height-2) * width3

		mov edi, [esp + 40 + 4]		; y_out
		mov ecx, [esp + 40 + 8]		; u_out
		mov edx, [esp + 40 + 12]	; v_out

		movq mm7, [y_mul]

		shr ebp, 1			; ebp = height / 2
		push ebp			; [esp + 0] = tmp
						; STACK BASE = 44

.yloop
		mov ebp, [esp + 12]		; ebp = width / 2

.xloop
			; y_out

			mov ebx, [esp + 8]	; ebx = width3

			pxor mm4, mm4
			pxor mm5, mm5
			movd mm0, [esi]		; src[0...]
			movd mm2, [esi+ebx]	; src[width3...]
			punpcklbw mm0, mm4	; [  |b |g |r ]
			punpcklbw mm2, mm5	; [  |b |g |r ]
			movq mm6, mm0		; = [  |b4|g4|r4]
			paddw mm6, mm2		; +[  |b4|g4|r4]
			pmaddwd mm0, mm7	; *= Y_MUL
			pmaddwd mm2, mm7	; *= Y_MUL
			movq mm4, mm0		; [r]
			movq mm5, mm2		; [r]
			psrlq mm4, 32		; +[g]
			psrlq mm5, 32		; +[g]
			paddd mm0, mm4		; +[b]
			paddd mm2, mm5		; +[b]

			pxor mm4, mm4
			pxor mm5, mm5
			movd mm1, [esi+3]	; src[4...]
			movd mm3, [esi+ebx+3]	; src[width3+4...]
			punpcklbw mm1, mm4	; [  |b |g |r ]
			punpcklbw mm3, mm5	; [  |b |g |r ]
			paddw mm6, mm1		; +[  |b4|g4|r4]
			paddw mm6, mm3		; +[  |b4|g4|r4]
			pmaddwd mm1, mm7	; *= Y_MUL
			pmaddwd mm3, mm7	; *= Y_MUL
			movq mm4, mm1		; [r]
			movq mm5, mm3		; [r]
			psrlq mm4, 32		; +[g]
			psrlq mm5, 32		; +[g]
			paddd mm1, mm4		; +[b]
			paddd mm3, mm5		; +[b]

			mov ebx, [esp + 44 + 28]	; stride

			movd eax, mm0
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx], al

			movd eax, mm1
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx + 1], al

			movd eax, mm2
			shr eax, 8
			add eax, Y_ADD
			mov [edi], al

			movd eax, mm3
			shr eax, 8
			add eax, Y_ADD
			mov [edi + 1], al

			; u_out, v_out

			movq mm0, mm6		; = [  |b4|g4|r4]
			pmaddwd mm6, [v_mul]	; *= V_MUL
			pmaddwd mm0, [u_mul]	; *= U_MUL

			movq mm1, mm0
			movq mm2, mm6
			psrlq mm1, 32
			psrlq mm2, 32
			paddd mm0, mm1
			paddd mm2, mm6

			movd eax, mm0
			shr eax, 10
			add eax, U_ADD
			mov [ecx], al

			movd eax, mm2
			shr eax, 10
			add eax, V_ADD
			mov [edx], al

			add esi, 2 * 3		; (use 4 for rgb32)
			add edi, 2
			inc ecx
			inc edx

			dec ebp
			jnz near .xloop

		sub esi, [esp + 4]		; src  -= src_dif
		add edi, [esp + 16]		; y_out += y_dif
		add ecx, [esp + 20]		; u_out += uv_dif
		add edx, [esp + 20]		; v_out += uv_dif

		dec dword [esp + 0]
		jnz near .yloop

		emms

		add esp, 24
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx
		ret
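;---------------------------------------------------------------------------
; editor's note: rgb32_to_yv12_mmx below uses the same 2x2-block scheme as
; rgb24_to_yv12_mmx: one y sample per pixel ((sum >> 8) + Y_ADD), while mm6
; accumulates b/g/r over all four pixels, so the u and v sums are shifted
; by 10 (SCALEBITS + 2) to also divide by the four-pixel count. the rgb32
; path differs only in addressing: each movq fetches two 4-byte pixels and
; punpckhbw unpacks the second one instead of issuing another movd load.
;---------------------------------------------------------------------------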
;===========================================================================
;
;	void rgb32_to_yv12_mmx(uint8_t * const y_out,
;					uint8_t * const u_out,
;					uint8_t * const v_out,
;					const uint8_t * const src,
;					const uint32_t width,
;					const uint32_t height,
;					const uint32_t stride);
;
; always flips
;
;===========================================================================

align 16
cglobal rgb32_to_yv12_mmx
rgb32_to_yv12_mmx

		push ebx
		push ecx
		push esi
		push edi
		push ebp			; STACK BASE = 20

		; global constants

		mov eax, [esp + 20 + 28]	; stride
		mov ecx, [esp + 20 + 20]	; width

		mov ebx, eax
		sub ebx, ecx
		shr ebx, 1			; ebx = (stride - width) / 2
		push ebx			; [esp + 20] = uv_dif
						; STACK BASE = 24

		add eax, eax
		sub eax, ecx			; eax = 2*stride - width
		push eax			; [esp + 16] = y_dif
						; STACK BASE = 28

		mov ebx, ecx
		shr ebx, 1
		push ebx			; [esp + 12] = width/2
						; STACK BASE = 32

		mov edx, ecx
		shl ecx, 2			; ecx = 4*width
		push ecx			; [esp + 8] = width4
						; STACK BASE = 36

		mov edx, ecx
		add edx, ecx
		add edx, ecx			; edx = 3*width4
		push edx			; [esp + 4] = src_dif
						; STACK BASE = 40

		mov esi, [esp + 40 + 16]	; src
		mov ebp, [esp + 40 + 24]	; ebp = height
		mov eax, ebp
		sub eax, 2
		mul ecx
		add esi, eax			; src += (height-2) * width4

		mov edi, [esp + 40 + 4]		; y_out
		mov ecx, [esp + 40 + 8]		; u_out
		mov edx, [esp + 40 + 12]	; v_out

		movq mm7, [y_mul]

		shr ebp, 1			; ebp = height / 2
		push ebp			; [esp + 0] = tmp
						; STACK BASE = 44

.yloop
		mov ebp, [esp + 12]		; ebp = width / 2

.xloop
			; y_out

			mov ebx, [esp + 8]	; ebx = width4

			pxor mm4, mm4

			movq mm0, [esi]		; src[4...       |0...     ]
			movq mm2, [esi+ebx]	; src[width4+4...|width4...]
			movq mm1, mm0
			movq mm3, mm2

			punpcklbw mm0, mm4	; [  |b |g |r ]
			punpcklbw mm2, mm4	; [  |b |g |r ]
			punpckhbw mm1, mm4	; [  |b |g |r ]
			punpckhbw mm3, mm4	; [  |b |g |r ]

			movq mm6, mm0		; = [  |b4|g4|r4]
			paddw mm6, mm2		; +[  |b4|g4|r4]

			pmaddwd mm0, mm7	; *= Y_MUL
			pmaddwd mm2, mm7	; *= Y_MUL

			movq mm4, mm0		; [r]
			movq mm5, mm2		; [r]
			psrlq mm4, 32		; +[g]
			psrlq mm5, 32		; +[g]
			paddd mm0, mm4		; +[b]
			paddd mm2, mm5		; +[b]

			paddw mm6, mm1		; +[  |b4|g4|r4]
			paddw mm6, mm3		; +[  |b4|g4|r4]

			pmaddwd mm1, mm7	; *= Y_MUL
			pmaddwd mm3, mm7	; *= Y_MUL

			movq mm4, mm1		; [r]
			movq mm5, mm3		; [r]
			psrlq mm4, 32		; +[g]
			psrlq mm5, 32		; +[g]
			paddd mm1, mm4		; +[b]
			paddd mm3, mm5		; +[b]

			mov ebx, [esp + 44 + 28]	; stride

			movd eax, mm0
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx], al

			movd eax, mm1
			shr eax, 8
			add eax, Y_ADD
			mov [edi + ebx + 1], al

			movd eax, mm2
			shr eax, 8
			add eax, Y_ADD
			mov [edi], al

			movd eax, mm3
			shr eax, 8
			add eax, Y_ADD
			mov [edi + 1], al

			; u_out, v_out

			movq mm0, mm6		; = [  |b4|g4|r4]
			pmaddwd mm6, [v_mul]	; *= V_MUL
			pmaddwd mm0, [u_mul]	; *= U_MUL

			movq mm1, mm0
			movq mm2, mm6
			psrlq mm1, 32
			psrlq mm2, 32
			paddd mm0, mm1
			paddd mm2, mm6

			movd eax, mm0
			shr eax, 10
			add eax, U_ADD
			mov [ecx], al

			movd eax, mm2
			shr eax, 10
			add eax, V_ADD
			mov [edx], al

			add esi, 2 * 4		; two rgb32 pixels
			add edi, 2
			inc ecx
			inc edx

			dec ebp
			jnz near .xloop

		sub esi, [esp + 4]		; src  -= src_dif
		add edi, [esp + 16]		; y_out += y_dif
		add ecx, [esp + 20]		; u_out += uv_dif
		add edx, [esp + 20]		; v_out += uv_dif

		dec dword [esp + 0]
		jnz near .yloop

		emms

		add esp, 24
		pop ebp
		pop edi
		pop esi
		pop ecx
		pop ebx
		ret
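For reference, here is a plain-C model of the same conversion. It is an editor's sketch, not part of the original file: the function name rgb24_to_yv12_c and the FIX macro are invented for illustration, BGR byte order in memory and even width/height are assumed (matching the asm's 2x2 blocks), an arithmetic right shift is assumed for the signed u/v sums, and, like the assembly ("always flips"), it reads the source bottom-up and does not saturate the results.

#include <stdint.h>

#define FIX(c) ((int32_t)((c) * 256.0 + 0.5))   /* SCALEBITS = 8 */

void rgb24_to_yv12_c(uint8_t *y_out, uint8_t *u_out, uint8_t *v_out,
                     const uint8_t *src, uint32_t width, uint32_t height,
                     uint32_t stride)
{
    for (uint32_t y = 0; y < height; y += 2) {
        /* the mmx code walks the source bottom-up, so output row y
         * comes from source row (height - 1 - y) */
        const uint8_t *row0 = src + (height - 1 - y) * 3 * width;
        const uint8_t *row1 = src + (height - 2 - y) * 3 * width;

        for (uint32_t x = 0; x < width; x += 2) {
            int32_t b = 0, g = 0, r = 0;    /* 2x2 sums for u/v */

            for (uint32_t i = 0; i < 2; i++) {
                const uint8_t *p0 = row0 + 3 * (x + i);   /* p[0]=b p[1]=g p[2]=r */
                const uint8_t *p1 = row1 + 3 * (x + i);

                /* one y sample per pixel: weighted sum >> SCALEBITS, + Y_ADD */
                y_out[y * stride + x + i] = (uint8_t)
                    (((p0[0] * FIX(0.098) + p0[1] * FIX(0.504) + p0[2] * FIX(0.257)) >> 8) + 16);
                y_out[(y + 1) * stride + x + i] = (uint8_t)
                    (((p1[0] * FIX(0.098) + p1[1] * FIX(0.504) + p1[2] * FIX(0.257)) >> 8) + 16);

                b += p0[0] + p1[0];
                g += p0[1] + p1[1];
                r += p0[2] + p1[2];
            }

            /* >> 10 is SCALEBITS + 2: fixed-point scale plus the 4-pixel sum */
            u_out[(y / 2) * (stride / 2) + x / 2] = (uint8_t)
                (((b * FIX(0.439) - g * FIX(0.291) - r * FIX(0.148)) >> 10) + 128);
            v_out[(y / 2) * (stride / 2) + x / 2] = (uint8_t)
                (((r * FIX(0.439) - g * FIX(0.368) - b * FIX(0.071)) >> 10) + 128);
        }
    }
}

The rgb32 variant differs only in pixel size: 4 bytes per source pixel (the fourth byte is ignored, as the fourth y_mul/u_mul/v_mul word is 0) and a source pitch of 4*width.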
