
📄 read_rgba_span_x86.s

📁 Mesa is an open-source implementation of the OpenGL specification - a system for rendering interactive 3D graphics.
/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__DJGPP__) && !defined(__MINGW32__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define	LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1	;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2	;\
	addl	$32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax          /* ARGB -> BGRA */ ; \
	rorl	$8, %eax      /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx)  /* ABGR -> R, G, B, A */ ; \

/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type	_generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18

.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax

.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:
	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33

.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)

	addl	$16, %ecx
	subl	$1, %eax

.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx

.L36:
	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()

.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */
	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type	_generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4

	andps	%xmm1, %xmm0
	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4
	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42

.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4

	andps	%xmm1, %xmm0
	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4
	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax

.L42:
	jne	.L43

	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4

	andps	%xmm1, %xmm0
	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4
	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx

.L47:
	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()

.L46:
	popl	%ebx
	popl	%esi
	ret
	.size	_generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2


#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f

/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L 0x00100001
#define PRESCALE_H 0x00000200
#define SCALE_L 0x40C620E8
#define SCALE_H 0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L 0x00200001
#define PRESCALE_H 0x00000800
#define SCALE_L 0x01040108
#define SCALE_H 0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif

#define ALPHA_L 0x00000000
#define ALPHA_H 0x00ff0000

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */
	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.hidden	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function
_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	sarl	$2, %ecx
	jle	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2

	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2

	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx

.L02:
	jne	.L03

	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzxw	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__DJGPP__) && !defined(__MINGW32__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif
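Taken together, the argument loads at the top of each routine (source pointer, then destination pointer, then pixel count) imply C-callable entry points along the following lines. This is only a sketch inferred from the stack offsets in the listing; the authoritative declarations ship in Mesa's read_rgba_span_x86.h header, and the exact parameter types there may differ.

/* Prototypes inferred from the stack offsets above; a sketch, not a
 * verbatim copy of Mesa's read_rgba_span_x86.h.  Each routine converts
 * `count` framebuffer pixels starting at `src` into RGBA bytes at `dst`.
 */
extern void _generic_read_RGBA_span_BGRA8888_REV_MMX(const unsigned char *src,
                                                     unsigned char *dst,
                                                     unsigned count);
extern void _generic_read_RGBA_span_BGRA8888_REV_SSE(const unsigned char *src,
                                                     unsigned char *dst,
                                                     unsigned count);
extern void _generic_read_RGBA_span_BGRA8888_REV_SSE2(const unsigned char *src,
                                                      unsigned char *dst,
                                                      unsigned count);
extern void _generic_read_RGBA_span_RGB565_MMX(const unsigned char *src,
                                               unsigned char *dst,
                                               unsigned count);

As the \warning comments in the listing state, the MMX and SSE variants leave the caller responsible for executing EMMS at the appropriate points before any following x87 floating-point code.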
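The DO_ONE_PIXEL()/DO_ONE_LAST_PIXEL() macros are the scalar heart of the BGRA8888_REV routines: load a pixel as 0xAARRGGBB, BSWAP it to 0xBBGGRRAA, rotate right by 8 to get 0xAABBGGRR, and store it so the bytes land in memory as R, G, B, A. A plain C rendering of that transform looks roughly like this (the helper names are hypothetical; __builtin_bswap32 is the GCC/Clang byte-swap intrinsic):

#include <stdint.h>

/* Scalar equivalent of DO_ONE_PIXEL(), word-at-a-time version. */
static void copy_one_pixel(const uint32_t *src, uint32_t *dst)
{
    uint32_t argb = *src;                        /* register: 0xAARRGGBB */
    uint32_t bgra = __builtin_bswap32(argb);     /* bswap: 0xBBGGRRAA    */
    uint32_t abgr = (bgra >> 8) | (bgra << 24);  /* rorl $8: 0xAABBGGRR  */
    *dst = abgr;                    /* little-endian bytes: R, G, B, A   */
}

/* The same permutation expressed byte by byte: swap bytes 0 and 2. */
static void copy_one_pixel_bytes(const unsigned char src[4], unsigned char dst[4])
{
    dst[0] = src[2];  /* R */
    dst[1] = src[1];  /* G */
    dst[2] = src[0];  /* B */
    dst[3] = src[3];  /* A */
}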
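The inner loops lament that PSHUFB did not yet exist when this file was written; the nine-instruction mask/shift/or dance per 64 or 128 bits is exactly a byte permutation. On SSSE3 hardware the same swizzle collapses into a single shuffle. The sketch below is illustrative only and is not part of Mesa; it uses the standard _mm_shuffle_epi8 intrinsic (compile with -mssse3 on GCC/Clang):

#include <tmmintrin.h>  /* SSSE3 intrinsics */

/* Convert four BGRA pixels (16 bytes) to RGBA with one PSHUFB.  Per
 * pixel the control swaps bytes 0 and 2 and keeps bytes 1 and 3, i.e.
 * B,G,R,A -> R,G,B,A.  Hypothetical helper, shown for comparison with
 * the pand/psllq/psrlq/por sequences in the listing above.
 */
static void bgra_to_rgba_4px_ssse3(const unsigned char *src, unsigned char *dst)
{
    const __m128i swizzle = _mm_set_epi8(15, 12, 13, 14,
                                         11,  8,  9, 10,
                                          7,  4,  5,  6,
                                          3,  0,  1,  2);
    __m128i px = _mm_loadu_si128((const __m128i *)src);
    _mm_storeu_si128((__m128i *)dst, _mm_shuffle_epi8(px, swizzle));
}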
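For the RGB565 routine, pshufw broadcasts one 16-bit pixel into all four words of an MMX register, and the MASK_565_* constants then isolate one field per word (0xf800 for red in word 0, 0x07e0 for green in word 1, 0x001f for blue in word 2, zero in the alpha word). The PRESCALE/SCALE multiplies are a fixed-point way of stretching those 5- and 6-bit fields to 8 bits. The unoptimized C sketch below illustrates the arithmetic those constants encode, without claiming to be bit-for-bit identical to the SCALE_ADJUST fixed-point path:

#include <stdint.h>

/* Expand one RGB565 pixel to RGBA8888 with rounded integer scaling.
 * Hypothetical helper: the 5-bit fields scale by 255/31, the 6-bit
 * green field by 255/63, and alpha is forced to 0xff as in the MMX code.
 */
static uint32_t rgb565_to_rgba8888(uint16_t pixel)
{
    uint32_t r5 = (pixel >> 11) & 0x1f;
    uint32_t g6 = (pixel >>  5) & 0x3f;
    uint32_t b5 =  pixel        & 0x1f;

    uint32_t r8 = (r5 * 255 + 15) / 31;
    uint32_t g8 = (g6 * 255 + 31) / 63;
    uint32_t b8 = (b5 * 255 + 15) / 31;

    /* Pack as R, G, B, A bytes in little-endian memory order. */
    return r8 | (g8 << 8) | (b8 << 16) | (0xffu << 24);
}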
