bcopy.s

来自「一个嵌入式操作系统(microwindows)的源代码」· S 代码 · 共 1,579 行 · 第 1/3 页
1,579 行
Hi,

The following code is the file support.s from the FreeBSD 2.6
distribution for i386.  I included the entire file so you can
pick and choose as you like and you can pick up the license.
There's a generic bcopy that does overlapping, uses rep movs
in the largest chunk possible, etc.  That might do the trick.
There are a few macros around but hopefully you can decipher
them.

Later,
FM
--
Frank W. Miller
Cornfed Systems Inc
www.cornfed.com
--
/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Syntax/ABI notes for this file:
 *   - AT&T (GAS) syntax, run through the C preprocessor.
 *   - 32-bit i386; every routine reads its arguments from the stack
 *     (first argument at 4(%esp) on entry).
 *   - ENTRY, ALIGN_TEXT, SUPERALIGN_TEXT and MEXITCOUNT are macros defined
 *     outside this file (presumably in <machine/asmacros.h> -- confirm).
 *     ENTRY(x) appears to define the symbol _x, judging by the vector
 *     initialisers below -- confirm against the macro definition.
 */

#include "npx.h"
#include "opt_cpu.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

#define KDSEL		0x10			/* kernel data selector */
#define IDXSHIFT	10

	.data

/*
 * Function-pointer vectors.  Each holds the address of the implementation
 * currently in use and is initialised to the generic version.  bcopy() and
 * ovbcopy() below dispatch through _bcopy_vector and _ovbcopy_vector.
 * _generic_copyin and _generic_copyout are not defined in this part of
 * the file.
 */
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
#if defined(I586_CPU) && NNPX > 0
/*
 * One-byte try-lock guarding kernel use of the FPU, padded to 4 bytes.
 * 0xfe means free.  `sarb $1' shifts bit 0 into CF: a free lock gives
 * CF=0 and becomes 0xff (held); a held lock gives CF=1 and stays 0xff.
 * It is released by storing 0xfe again.
 */
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

/*
 * generic_bzero(buf, len)
 *
 * Zero `len' bytes starting at `buf': len/4 dword stores followed by
 * len%4 byte stores.
 * In:      4(%esp) = buf, 8(%esp) = len (offsets on entry)
 * Clobbers %eax, %ecx, flags; %edi is saved and restored.
 * Leaves the direction flag clear.
 */
ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi		/* buf (offsets shifted by the push) */
	movl	12(%esp),%ecx		/* len */
	xorl	%eax,%eax
	shrl	$2,%ecx			/* number of whole dwords */
	cld
	rep
	stosl
	movl	12(%esp),%ecx		/* reload len for the byte tail */
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret

#if defined(I486_CPU)
/*
 * i486_bzero(buf, len)
 *
 * Zero `len' bytes at `buf' using unrolled dword stores in decreasing
 * chunk sizes (64, 16, 4 bytes) and a jump table for the last 0-3 bytes.
 * In:      4(%esp) = buf, 8(%esp) = len
 * Uses:    %edx = current destination, %ecx = bytes remaining, %eax = 0
 * Each chunk loop returns directly when the remaining count hits zero.
 */
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	/* %ecx is 0..3 here; dispatch on the number of bytes left */
	jmp	jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

#if defined(I586_CPU) && NNPX > 0
/*
 * i586_bzero(buf, len)
 *
 * Zero `len' bytes at `buf'.  Large requests are written 8 bytes at a
 * time from an FPU register holding 0.0; small requests, or requests made
 * while kernel_fpu_lock is already held, fall back to `rep stos'
 * (intreg_i586_bzero).
 * In:      4(%esp) = buf, 8(%esp) = len
 * Uses:    %edx = destination, %ecx = byte count, %ax = saved machine
 *          status word (smsw/lmsw) on the FPU path.
 * When _npxproc is non-zero the FPU state is saved with fnsave into a
 * 108-byte area on the stack and restored with frstor before returning.
 */
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock	/* try-lock; CF=1 means already held */
	jc	intreg_i586_bzero
	smsw	%ax			/* remember MSW (incl. TS) for lmsw below */
	clts
	subl	$108,%esp		/* room for the fnsave image */
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock	/* try-lock; CF=1 means already held */
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3
	frstor	0(%esp)			/* restore the state saved by fnsave */
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock	/* release the lock */
	ret

i586_bz3:
	fstpl	%st(0)			/* pop the 0.0 loaded by fldz */
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock	/* release the lock */
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx		/* len argument (offset shifted by the push) */
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && NNPX > 0 */

/* fillw(pat, base, cnt) */
/*
 * Store the low 16 bits of `pat' into `cnt' consecutive words starting at
 * `base' (rep stosw).
 * In:      4(%esp) = pat, 8(%esp) = base, 12(%esp) = cnt (in words)
 * Clobbers %eax, %ecx; %edi is saved and restored.
 */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

/*
 * bcopyb(src, dst, cnt)
 *
 * Copy `cnt' bytes from `src' to `dst' one byte at a time.  If
 * (unsigned)(dst - src) < cnt, the regions overlap with src below dst, so
 * the copy runs backwards from the last byte; otherwise it runs forwards.
 * In:      4(%esp) = src, 8(%esp) = dst, 12(%esp) = cnt
 * Clobbers %eax, %ecx; %esi and %edi are saved and restored.
 * The direction flag is clear on return on both paths.
 */
ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

/*
 * bcopy(src, dst, cnt)
 *
 * Tail-jumps to the implementation whose address is stored in
 * _bcopy_vector; the stack is left untouched, so the target sees the
 * caller's arguments.  MEXITCOUNT is defined outside this file
 * (presumably profiling bookkeeping -- confirm).
 */
ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector

/*
 * ovbcopy(src, dst, cnt)
 *
 * Same dispatch as bcopy(), but through _ovbcopy_vector.
 */
ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
/*
 * Copy `cnt' bytes from `src' to `dst', handling overlap.
 * Forward path:  cnt/4 dwords, then cnt%4 bytes.
 * Backward path (taken when (unsigned)(dst - src) < cnt): the cnt%4
 * trailing bytes first, then cnt/4 dwords, both with the direction flag
 * set; the flag is cleared again before returning.
 * In:      4(%esp) = src, 8(%esp) = dst, 12(%esp) = cnt
 * Clobbers %eax, %ecx; %esi and %edi are saved and restored.
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi				/* step back from last byte to start of last dword */
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

#if defined(I586_CPU) && NNPX > 0
/*
 * i586_bcopy(src, dst, cnt)
 *
 * Same argument layout and overlap test as generic_bcopy.
 * In:      4(%esp) = src, 8(%esp) = dst, 12(%esp) = cnt
 *
 * NOTE(review): only the beginning of this routine is present in this
 * listing.  The branch targets `1f' and `small_i586_bcopy', the rest of
 * large_i586_bcopy_loop and the matching #endif are not visible here.
 */
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock	/* try-lock; CF=1 means already held */
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	/* read one dword every 32 bytes; the loaded values are discarded */
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)
	fildq	48(%esi)
	fildq	56(%esi)
	/* NOTE(review): listing is truncated here; remainder not shown. */
bcopy.s - 源码说明

本页面展示了「一个嵌入式操作系统(microwindows)的源代码」中的 bcopy.s 源码文件，采用 S 编程语言编写，共 1,579 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与microwindows相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?