⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 bcopy.s

📁 开放源码实时操作系统源码.
💻 S
📖 第 1 页 / 共 3 页
字号:
Hi,



The following code is the file support.s from the FreeBSD 2.2.6

distribution for i386.  I included the entire file so you can

pick and choose as you like and you can pick up the license.

There's a generic bcopy that does overlapping, uses rep movs

in the largest chunk possible, etc.  That might do the trick.

There are a few macros around but hopefully you can decipher

them.



Later,

FM



--

Frank W. Miller

Cornfed Systems Inc

www.cornfed.com





--

/*-

 * Copyright (c) 1993 The Regents of the University of California.

 * All rights reserved.

 *

 * Redistribution and use in source and binary forms, with or without

 * modification, are permitted provided that the following conditions

 * are met:

 * 1. Redistributions of source code must retain the above copyright

 *    notice, this list of conditions and the following disclaimer.

 * 2. Redistributions in binary form must reproduce the above copyright

 *    notice, this list of conditions and the following disclaimer in the

 *    documentation and/or other materials provided with the distribution.

 * 3. All advertising materials mentioning features or use of this software

 *    must display the following acknowledgement:

 *	This product includes software developed by the University of

 *	California, Berkeley and its contributors.

 * 4. Neither the name of the University nor the names of its contributors

 *    may be used to endorse or promote products derived from this software

 *    without specific prior written permission.

 *

 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND

 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE

 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE

 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL

 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS

 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)

 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY

 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF

 * SUCH DAMAGE.

 *

 *	$Id: bcopy.s,v 1.1.1.1 2001/06/21 06:32:39 greg Exp $

 */



#include "npx.h"

#include "opt_cpu.h"



#include <machine/asmacros.h>

#include <machine/cputypes.h>

#include <machine/pmap.h>

#include <machine/specialreg.h>



#include "assym.s"



#define KDSEL		0x10			/* kernel data selector */

#define IDXSHIFT	10



	.data

	.globl	_bcopy_vector

_bcopy_vector:

	.long	_generic_bcopy

	.globl	_bzero

_bzero:

	.long	_generic_bzero

	.globl	_copyin_vector

_copyin_vector:

	.long	_generic_copyin

	.globl	_copyout_vector

_copyout_vector:

	.long	_generic_copyout

	.globl	_ovbcopy_vector

_ovbcopy_vector:

	.long	_generic_bcopy

#if defined(I586_CPU) && NNPX > 0

kernel_fpu_lock:

	.byte	0xfe

	.space	3

#endif



	.text



/*

 * bcopy family

 * void bzero(void *buf, u_int len)

 */



/*
 * void generic_bzero(void *buf, u_int len)
 *
 * Zero `len' bytes at `buf': the bulk with 32-bit `rep stosl' stores,
 * then the final len&3 bytes with `rep stosb'.
 */
ENTRY(generic_bzero)
	pushl	%edi			/* %edi is callee-saved */
	movl	8(%esp),%edi		/* %edi = buf (args shifted by the push) */
	movl	12(%esp),%ecx		/* %ecx = len */
	xorl	%eax,%eax		/* value to store: 0 */
	shrl	$2,%ecx			/* whole 32-bit words first */
	cld				/* DF=0: store upwards */
	rep
	stosl
	movl	12(%esp),%ecx		/* reload len... */
	andl	$3,%ecx			/* ...for the 0-3 trailing bytes */
	rep
	stosb
	popl	%edi
	ret



#if defined(I486_CPU)
/*
 * void i486_bzero(void *buf, u_int len)
 *
 * i486-tuned bzero: stores in unrolled 64-byte chunks, then 16-byte,
 * then 4-byte chunks, and finishes the last 0-3 bytes via a jump table.
 * Uses only caller-saved registers, so nothing needs to be preserved.
 */
ENTRY(i486_bzero)
	movl	4(%esp),%edx		/* %edx = buf */
	movl	8(%esp),%ecx		/* %ecx = len (bytes remaining) */
	xorl	%eax,%eax		/* value to store: 0 */
/*
 * do 64 byte chunks first
 *
 * XXX this is probably over-unrolled at least for DX2's
 */
2:
	cmpl	$64,%ecx
	jb	3f			/* < 64 left: fall to 16-byte chunks */
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	movl	%eax,16(%edx)
	movl	%eax,20(%edx)
	movl	%eax,24(%edx)
	movl	%eax,28(%edx)
	movl	%eax,32(%edx)
	movl	%eax,36(%edx)
	movl	%eax,40(%edx)
	movl	%eax,44(%edx)
	movl	%eax,48(%edx)
	movl	%eax,52(%edx)
	movl	%eax,56(%edx)
	movl	%eax,60(%edx)
	addl	$64,%edx
	subl	$64,%ecx
	jnz	2b			/* more left: recheck chunk size at 2: */
	ret				/* count hit exactly zero */

/*
 * do 16 byte chunks
 */
	SUPERALIGN_TEXT
3:
	cmpl	$16,%ecx
	jb	4f			/* < 16 left: fall to 4-byte chunks */
	movl	%eax,(%edx)
	movl	%eax,4(%edx)
	movl	%eax,8(%edx)
	movl	%eax,12(%edx)
	addl	$16,%edx
	subl	$16,%ecx
	jnz	3b
	ret

/*
 * do 4 byte chunks
 */
	SUPERALIGN_TEXT
4:
	cmpl	$4,%ecx
	jb	5f			/* < 4 left: dispatch on the remainder */
	movl	%eax,(%edx)
	addl	$4,%edx
	subl	$4,%ecx
	jnz	4b
	ret

/*
 * do 1 byte chunks
 * a jump table seems to be faster than a loop or more range reductions
 *
 * XXX need a const section for non-text
 */
	.data
jtab:
	.long	do0
	.long	do1
	.long	do2
	.long	do3

	.text
	SUPERALIGN_TEXT
5:
	/*
	 * %ecx is 0-3 here (guaranteed by the `cmpl $4 / jb' above).
	 * NOTE: an indirect jump needs the `*' prefix in AT&T syntax;
	 * without it the assembler warns (or errors) and the intent is
	 * ambiguous.  The original read `jmp jtab(,%ecx,4)'.
	 */
	jmp	*jtab(,%ecx,4)

	SUPERALIGN_TEXT
do3:
	movw	%ax,(%edx)		/* 3 bytes: word + byte */
	movb	%al,2(%edx)
	ret

	SUPERALIGN_TEXT
do2:
	movw	%ax,(%edx)
	ret

	SUPERALIGN_TEXT
do1:
	movb	%al,(%edx)
	ret

	SUPERALIGN_TEXT
do0:
	ret
#endif



#if defined(I586_CPU) && NNPX > 0
/*
 * void i586_bzero(void *buf, u_int len)
 *
 * Pentium-tuned bzero.  Large buffers are zeroed 8 bytes at a time with
 * FPU stores of 0.0 (`fstl'); small buffers fall back to `rep stos' at
 * intreg_i586_bzero.  The FPU path must preserve any FPU state that
 * belongs to an application, plus CR0.TS (via smsw/lmsw).
 */
ENTRY(i586_bzero)
	movl	4(%esp),%edx		/* %edx = buf */
	movl	8(%esp),%ecx		/* %ecx = len */

	/*
	 * The FPU register method is twice as fast as the integer register
	 * method unless the target is in the L1 cache and we pre-allocate a
	 * cache line for it (then the integer register method is 4-5 times
	 * faster).  However, we never pre-allocate cache lines, since that
	 * would make the integer method 25% or more slower for the common
	 * case when the target isn't in either the L1 cache or the L2 cache.
	 * Thus we normally use the FPU register method unless the overhead
	 * would be too large.
	 */
	cmpl	$256,%ecx	/* empirical; clts, fninit, smsw cost a lot */
	jb	intreg_i586_bzero

	/*
	 * The FPU registers may belong to an application or to fastmove()
	 * or to another invocation of bcopy() or ourself in a higher level
	 * interrupt or trap handler.  Preserving the registers is
	 * complicated since we avoid it if possible at all levels.  We
	 * want to localize the complications even when that increases them.
	 * Here the extra work involves preserving CR0_TS in TS.
	 * `npxproc != NULL' is supposed to be the condition that all the
	 * FPU resources belong to an application, but npxproc and CR0_TS
	 * aren't set atomically enough for this condition to work in
	 * interrupt handlers.
	 *
	 * Case 1: FPU registers belong to the application: we must preserve
	 * the registers if we use them, so we only use the FPU register
	 * method if the target size is large enough to amortize the extra
	 * overhead for preserving them.  CR0_TS must be preserved although
	 * it is very likely to end up as set.
	 *
	 * Case 2: FPU registers belong to fastmove(): fastmove() currently
	 * makes the registers look like they belong to an application so
	 * that cpu_switch() and savectx() don't have to know about it, so
	 * this case reduces to case 1.
	 *
	 * Case 3: FPU registers belong to the kernel: don't use the FPU
	 * register method.  This case is unlikely, and supporting it would
	 * be more complicated and might take too much stack.
	 *
	 * Case 4: FPU registers don't belong to anyone: the FPU registers
	 * don't need to be preserved, so we always use the FPU register
	 * method.  CR0_TS must be preserved although it is very likely to
	 * always end up as clear.
	 */
	cmpl	$0,_npxproc
	je	i586_bz1		/* no owner: skip the fnsave */
	cmpl	$256+184,%ecx		/* empirical; not quite 2*108 more */
	jb	intreg_i586_bzero
	sarb	$1,kernel_fpu_lock	/* try-lock: CF set if already held */
	jc	intreg_i586_bzero
	smsw	%ax			/* save CR0 low word (incl. TS) in %ax */
	clts				/* clear TS so FPU insns don't trap */
	subl	$108,%esp		/* 108 bytes = FPU save-area size */
	fnsave	0(%esp)			/* preserve the owner's FPU state */
	jmp	i586_bz2

i586_bz1:
	sarb	$1,kernel_fpu_lock	/* try-lock: CF set if already held */
	jc	intreg_i586_bzero
	smsw	%ax			/* save CR0 low word (incl. TS) in %ax */
	clts
	fninit				/* XXX should avoid needing this */
i586_bz2:
	fldz				/* %st(0) = 0.0, the value we store */

	/*
	 * Align to an 8 byte boundary (misalignment in the main loop would
	 * cost a factor of >= 2).  Avoid jumps (at little cost if it is
	 * already aligned) by always zeroing 8 bytes and using the part up
	 * to the _next_ alignment position.
	 */
	fstl	0(%edx)			/* zero 8 bytes at the (maybe ragged) head */
	addl	%edx,%ecx		/* part of %ecx -= new_%edx - %edx */
	addl	$8,%edx
	andl	$~7,%edx		/* round %edx up to an 8-byte boundary */
	subl	%edx,%ecx

	/*
	 * Similarly align `len' to a multiple of 8.
	 */
	fstl	-8(%edx,%ecx)		/* zero the (maybe ragged) tail */
	decl	%ecx
	andl	$~7,%ecx

	/*
	 * This wouldn't be any faster if it were unrolled, since the loop
	 * control instructions are much faster than the fstl and/or done
	 * in parallel with it so their overhead is insignificant.
	 */
fpureg_i586_bzero_loop:
	fstl	0(%edx)			/* store 8 zero bytes (st(0) stays loaded) */
	addl	$8,%edx
	subl	$8,%ecx
	cmpl	$8,%ecx
	jae	fpureg_i586_bzero_loop

	cmpl	$0,_npxproc
	je	i586_bz3		/* no owner: nothing was fnsave'd */
	frstor	0(%esp)			/* restore the owner's FPU state */
	addl	$108,%esp
	lmsw	%ax			/* restore CR0.TS */
	movb	$0xfe,kernel_fpu_lock	/* release the lock */
	ret

i586_bz3:
	fstpl	%st(0)			/* pop our 0.0 off the FPU stack */
	lmsw	%ax			/* restore CR0.TS */
	movb	$0xfe,kernel_fpu_lock	/* release the lock */
	ret

intreg_i586_bzero:
	/*
	 * `rep stos' seems to be the best method in practice for small
	 * counts.  Fancy methods usually take too long to start up due
	 * to cache and BTB misses.
	 */
	pushl	%edi
	movl	%edx,%edi		/* %edi = buf (already loaded in %edx) */
	xorl	%eax,%eax		/* value to store: 0 */
	shrl	$2,%ecx			/* whole 32-bit words first */
	cld				/* DF=0: store upwards */
	rep
	stosl
	movl	12(%esp),%ecx		/* reload len (8 + 4 for the push) */
	andl	$3,%ecx			/* 0-3 trailing bytes */
	jne	1f
	popl	%edi
	ret

1:
	rep
	stosb
	popl	%edi
	ret
#endif /* I586_CPU && NNPX > 0 */



/*
 * void fillw(u_short pat, void *base, u_int cnt)
 *
 * Store `cnt' copies of the 16-bit pattern `pat' starting at `base'.
 */
ENTRY(fillw)
	pushl	%edi			/* %edi is callee-saved */
	movl	16(%esp),%ecx		/* %ecx = word count */
	movl	12(%esp),%edi		/* %edi = destination */
	movl	8(%esp),%eax		/* %ax = fill pattern */
	cld				/* DF=0: store upwards */
	rep
	stosw
	popl	%edi
	ret



/*
 * void bcopyb(const void *src, void *dst, size_t len)
 *
 * Overlap-safe byte-at-a-time copy.  Copies forwards normally; copies
 * backwards (DF set) when the regions overlap with src < dst.
 */
ENTRY(bcopyb)
bcopyb:
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi		/* %esi = src */
	movl	16(%esp),%edi		/* %edi = dst */
	movl	20(%esp),%ecx		/* %ecx = len */
	movl	%edi,%eax
	subl	%esi,%eax		/* %eax = dst - src (unsigned) */
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f
	cld					/* nope, copy forwards */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards. */
	addl	%ecx,%esi
	decl	%edi			/* point at the last byte of each */
	decl	%esi
	std				/* DF=1: movsb decrements */
	rep
	movsb
	popl	%edi
	popl	%esi
	cld				/* restore the default (forward) direction */
	ret



/*
 * void bcopy(const void *src, void *dst, size_t len)
 * Tail-jump through _bcopy_vector to the currently selected implementation.
 */
ENTRY(bcopy)
	MEXITCOUNT
	jmp	*_bcopy_vector



/*
 * void ovbcopy(const void *src, void *dst, size_t len)
 * Overlapping bcopy: tail-jump through _ovbcopy_vector (defaults to
 * _generic_bcopy, which handles overlap).
 */
ENTRY(ovbcopy)
	MEXITCOUNT
	jmp	*_ovbcopy_vector



/*
 * generic_bcopy(src, dst, cnt)
 *  ws@tools.de     (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
 *
 * Overlap-safe copy: forwards by 32-bit words (plus byte tail) in the
 * normal case; backwards (trailing bytes first, then words) when the
 * regions overlap with src < dst.
 */
ENTRY(generic_bcopy)
	pushl	%esi
	pushl	%edi
	movl	12(%esp),%esi		/* %esi = src */
	movl	16(%esp),%edi		/* %edi = dst */
	movl	20(%esp),%ecx		/* %ecx = cnt */

	movl	%edi,%eax
	subl	%esi,%eax		/* %eax = dst - src (unsigned) */
	cmpl	%ecx,%eax			/* overlapping && src < dst? */
	jb	1f

	shrl	$2,%ecx				/* copy by 32-bit words */
	cld					/* nope, copy forwards */
	rep
	movsl
	movl	20(%esp),%ecx		/* reload cnt... */
	andl	$3,%ecx				/* any bytes left? */
	rep
	movsb
	popl	%edi
	popl	%esi
	ret

	ALIGN_TEXT
1:
	addl	%ecx,%edi			/* copy backwards */
	addl	%ecx,%esi
	decl	%edi			/* point at the last byte of each */
	decl	%esi
	andl	$3,%ecx				/* any fractional bytes? */
	std					/* DF=1: string ops go downwards */
	rep
	movsb
	movl	20(%esp),%ecx			/* copy remainder by 32-bit words */
	shrl	$2,%ecx
	subl	$3,%esi			/* step back so movsl hits the word's low byte */
	subl	$3,%edi
	rep
	movsl
	popl	%edi
	popl	%esi
	cld					/* restore the default (forward) direction */
	ret



#if defined(I586_CPU) && NNPX > 0

ENTRY(i586_bcopy)

	pushl	%esi

	pushl	%edi

	movl	12(%esp),%esi

	movl	16(%esp),%edi

	movl	20(%esp),%ecx



	movl	%edi,%eax

	subl	%esi,%eax

	cmpl	%ecx,%eax			/* overlapping && src < dst? */

	jb	1f



	cmpl	$1024,%ecx

	jb	small_i586_bcopy



	sarb	$1,kernel_fpu_lock

	jc	small_i586_bcopy

	cmpl	$0,_npxproc

	je	i586_bc1

	smsw	%dx

	clts

	subl	$108,%esp

	fnsave	0(%esp)

	jmp	4f



i586_bc1:

	smsw	%dx

	clts

	fninit				/* XXX should avoid needing this */



	ALIGN_TEXT

4:

	pushl	%ecx

#define	DCACHE_SIZE	8192

	cmpl	$(DCACHE_SIZE-512)/2,%ecx

	jbe	2f

	movl	$(DCACHE_SIZE-512)/2,%ecx

2:

	subl	%ecx,0(%esp)

	cmpl	$256,%ecx

	jb	5f			/* XXX should prefetch if %ecx >= 32 */

	pushl	%esi

	pushl	%ecx

	ALIGN_TEXT

3:

	movl	0(%esi),%eax

	movl	32(%esi),%eax

	movl	64(%esi),%eax

	movl	96(%esi),%eax

	movl	128(%esi),%eax

	movl	160(%esi),%eax

	movl	192(%esi),%eax

	movl	224(%esi),%eax

	addl	$256,%esi

	subl	$256,%ecx

	cmpl	$256,%ecx

	jae	3b

	popl	%ecx

	popl	%esi

5:

	ALIGN_TEXT

large_i586_bcopy_loop:

	fildq	0(%esi)

	fildq	8(%esi)

	fildq	16(%esi)

	fildq	24(%esi)

	fildq	32(%esi)

	fildq	40(%esi)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -