⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bcopy.s

📁 eCos操作系统源码
💻 S
📖 第 1 页 / 共 3 页
字号:
Hi,

The following code is the file support.s from the FreeBSD 2.6
distribution for i386.  I included the entire file so you can
pick and choose as you like and you can pick up the license.
There's a generic bcopy that does overlapping, uses rep movs
in the largest chunk possible, etc.  That might do the trick.
There's a few macros around but hopefully you can decipher
them.

Later,
FM

--
Frank W. Miller
Cornfed Systems Inc
www.cornfed.com


--
/*-
 * Copyright (c) 1993 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	$Id: bcopy.s,v 1.1.1.1 2001/06/21 06:32:39 greg Exp $
 */

#include "npx.h"
#include "opt_cpu.h"

#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>

#include "assym.s"

#define KDSEL		0x10			/* kernel data selector */
#define IDXSHIFT	10

	.data
	.globl	_bcopy_vector
_bcopy_vector:
	.long	_generic_bcopy
	.globl	_bzero
_bzero:
	.long	_generic_bzero
	.globl	_copyin_vector
_copyin_vector:
	.long	_generic_copyin
	.globl	_copyout_vector
_copyout_vector:
	.long	_generic_copyout
	.globl	_ovbcopy_vector
_ovbcopy_vector:
	.long	_generic_bcopy
#if defined(I586_CPU) && NNPX > 0
kernel_fpu_lock:
	.byte	0xfe
	.space	3
#endif

	.text

/*
 * bcopy family
 * void bzero(void *buf, u_int len)
 */

ENTRY(generic_bzero)
	pushl	%edi
	movl	8(%esp),%edi
	movl	12(%esp),%ecx
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	rep
	stosb
	popl	%edi
	ret

#if defined(I486_CPU)
ENTRY(i486_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx
	xorl	%eax,%eax
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b
	ret

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	jmp	jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif

#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bzero)
	movl	4(%esp),%edx
	movl	8(%esp),%ecx

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock
	jc	intreg_i586_bzero
	smsw	%ax
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3
	frstor	0(%esp)
	addl	$108,%esp
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

i586_bz3:
	fstpl	%st(0)
	lmsw	%ax
	movb	$0xfe,kernel_fpu_lock
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi
	xorl	%eax,%eax
	shrl	$2,%ecx
	cld
	rep
	stosl
	movl	12(%esp),%ecx
	andl	$3,%ecx
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && NNPX > 0 */

/* fillw(pat, base, cnt) */
ENTRY(fillw)
	pushl	%edi
	movl	8(%esp),%eax
	movl	12(%esp),%edi
	movl	16(%esp),%ecx
	cld
	rep
	stosw
	popl	%edi
	ret

ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx
	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	std
	rep
	movsb
	popl	%edi
	popl	%esi
	cld
	ret

ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector

ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector

/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld
	ret

#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi
	movl	16(%esp),%edi
	movl	20(%esp),%ecx

	movl	%edi,%eax
	subl	%esi,%eax
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	cmpl	$1024,%ecx
	jb	small_i586_bcopy

	sarb	$1,kernel_fpu_lock
	jc	small_i586_bcopy
	cmpl	$0,_npxproc
	je	i586_bc1
	smsw	%dx
	clts
	subl	$108,%esp
	fnsave	0(%esp)
	jmp	4f

i586_bc1:
	smsw	%dx
	clts
	fninit				/* XXX should avoid needing this */

	ALIGN_TEXT
4:
	pushl	%ecx
#define	DCACHE_SIZE	8192
	cmpl	$(DCACHE_SIZE-512)/2,%ecx
	jbe	2f
	movl	$(DCACHE_SIZE-512)/2,%ecx
2:
	subl	%ecx,0(%esp)
	cmpl	$256,%ecx
	jb	5f			/* XXX should prefetch if %ecx >= 32 */
	pushl	%esi
	pushl	%ecx
	ALIGN_TEXT
3:
	movl	0(%esi),%eax
	movl	32(%esi),%eax
	movl	64(%esi),%eax
	movl	96(%esi),%eax
	movl	128(%esi),%eax
	movl	160(%esi),%eax
	movl	192(%esi),%eax
	movl	224(%esi),%eax
	addl	$256,%esi
	subl	$256,%ecx
	cmpl	$256,%ecx
	jae	3b
	popl	%ecx
	popl	%esi
5:
	ALIGN_TEXT
large_i586_bcopy_loop:
	fildq	0(%esi)
	fildq	8(%esi)
	fildq	16(%esi)
	fildq	24(%esi)
	fildq	32(%esi)
	fildq	40(%esi)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -