bcopy.s
[web-page header control ("字号:" = "Font size:") removed — scrape residue, not part of the source]
Hi,
The following code is the file support.s from the FreeBSD 2.6
distribution for i386. I included the entire file so you can
pick and choose as you like and you can pick up the license.
There's a generic bcopy that does overlapping, uses rep movs
in the largest chunk possible, etc. That might do the trick.
There are a few macros around, but hopefully you can decipher
them.
Later,
FM
--
Frank W. Miller
Cornfed Systems Inc
www.cornfed.com
--
/*-
* Copyright (c) 1993 The Regents of the University of California.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the University of
* California, Berkeley and its contributors.
* 4. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: bcopy.s,v 1.1.1.1 2001/06/21 06:32:39 greg Exp $
*/
#include "npx.h"
#include "opt_cpu.h"
#include <machine/asmacros.h>
#include <machine/cputypes.h>
#include <machine/pmap.h>
#include <machine/specialreg.h>
#include "assym.s"
#define KDSEL 0x10 /* kernel data selector */
#define IDXSHIFT 10
.data
.globl _bcopy_vector
_bcopy_vector:
.long _generic_bcopy
.globl _bzero
_bzero:
.long _generic_bzero
.globl _copyin_vector
_copyin_vector:
.long _generic_copyin
.globl _copyout_vector
_copyout_vector:
.long _generic_copyout
.globl _ovbcopy_vector
_ovbcopy_vector:
.long _generic_bcopy
#if defined(I586_CPU) && NNPX > 0
kernel_fpu_lock:
.byte 0xfe
.space 3
#endif
.text
/*
* bcopy family
* void bzero(void *buf, u_int len)
*/
ENTRY(generic_bzero)
pushl %edi
movl 8(%esp),%edi
movl 12(%esp),%ecx
xorl %eax,%eax
shrl $2,%ecx
cld
rep
stosl
movl 12(%esp),%ecx
andl $3,%ecx
rep
stosb
popl %edi
ret
#if defined(I486_CPU)
ENTRY(i486_bzero)
movl 4(%esp),%edx
movl 8(%esp),%ecx
xorl %eax,%eax
/*
* do 64 byte chunks first
*
* XXX this is probably over-unrolled at least for DX2's
*/
2:
cmpl $64,%ecx
jb 3f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
movl %eax,16(%edx)
movl %eax,20(%edx)
movl %eax,24(%edx)
movl %eax,28(%edx)
movl %eax,32(%edx)
movl %eax,36(%edx)
movl %eax,40(%edx)
movl %eax,44(%edx)
movl %eax,48(%edx)
movl %eax,52(%edx)
movl %eax,56(%edx)
movl %eax,60(%edx)
addl $64,%edx
subl $64,%ecx
jnz 2b
ret
/*
* do 16 byte chunks
*/
SUPERALIGN_TEXT
3:
cmpl $16,%ecx
jb 4f
movl %eax,(%edx)
movl %eax,4(%edx)
movl %eax,8(%edx)
movl %eax,12(%edx)
addl $16,%edx
subl $16,%ecx
jnz 3b
ret
/*
* do 4 byte chunks
*/
SUPERALIGN_TEXT
4:
cmpl $4,%ecx
jb 5f
movl %eax,(%edx)
addl $4,%edx
subl $4,%ecx
jnz 4b
ret
/*
* do 1 byte chunks
* a jump table seems to be faster than a loop or more range reductions
*
* XXX need a const section for non-text
*/
.data
jtab:
.long do0
.long do1
.long do2
.long do3
.text
SUPERALIGN_TEXT
5:
jmp jtab(,%ecx,4)
SUPERALIGN_TEXT
do3:
movw %ax,(%edx)
movb %al,2(%edx)
ret
SUPERALIGN_TEXT
do2:
movw %ax,(%edx)
ret
SUPERALIGN_TEXT
do1:
movb %al,(%edx)
ret
SUPERALIGN_TEXT
do0:
ret
#endif
#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bzero)
movl 4(%esp),%edx
movl 8(%esp),%ecx
/*
* The FPU register method is twice as fast as the integer register
* method unless the target is in the L1 cache and we pre-allocate a
* cache line for it (then the integer register method is 4-5 times
* faster). However, we never pre-allocate cache lines, since that
* would make the integer method 25% or more slower for the common
* case when the target isn't in either the L1 cache or the L2 cache.
* Thus we normally use the FPU register method unless the overhead
* would be too large.
*/
cmpl $256,%ecx /* empirical; clts, fninit, smsw cost a lot */
jb intreg_i586_bzero
/*
* The FPU registers may belong to an application or to fastmove()
* or to another invocation of bcopy() or ourself in a higher level
* interrupt or trap handler. Preserving the registers is
* complicated since we avoid it if possible at all levels. We
* want to localize the complications even when that increases them.
* Here the extra work involves preserving CR0_TS in TS.
* `npxproc != NULL' is supposed to be the condition that all the
* FPU resources belong to an application, but npxproc and CR0_TS
* aren't set atomically enough for this condition to work in
* interrupt handlers.
*
* Case 1: FPU registers belong to the application: we must preserve
* the registers if we use them, so we only use the FPU register
* method if the target size is large enough to amortize the extra
* overhead for preserving them. CR0_TS must be preserved although
* it is very likely to end up as set.
*
* Case 2: FPU registers belong to fastmove(): fastmove() currently
* makes the registers look like they belong to an application so
* that cpu_switch() and savectx() don't have to know about it, so
* this case reduces to case 1.
*
* Case 3: FPU registers belong to the kernel: don't use the FPU
* register method. This case is unlikely, and supporting it would
* be more complicated and might take too much stack.
*
* Case 4: FPU registers don't belong to anyone: the FPU registers
* don't need to be preserved, so we always use the FPU register
* method. CR0_TS must be preserved although it is very likely to
* always end up as clear.
*/
cmpl $0,_npxproc
je i586_bz1
cmpl $256+184,%ecx /* empirical; not quite 2*108 more */
jb intreg_i586_bzero
sarb $1,kernel_fpu_lock
jc intreg_i586_bzero
smsw %ax
clts
subl $108,%esp
fnsave 0(%esp)
jmp i586_bz2
i586_bz1:
sarb $1,kernel_fpu_lock
jc intreg_i586_bzero
smsw %ax
clts
fninit /* XXX should avoid needing this */
i586_bz2:
fldz
/*
* Align to an 8 byte boundary (misalignment in the main loop would
* cost a factor of >= 2). Avoid jumps (at little cost if it is
* already aligned) by always zeroing 8 bytes and using the part up
* to the _next_ alignment position.
*/
fstl 0(%edx)
addl %edx,%ecx /* part of %ecx -= new_%edx - %edx */
addl $8,%edx
andl $~7,%edx
subl %edx,%ecx
/*
* Similarly align `len' to a multiple of 8.
*/
fstl -8(%edx,%ecx)
decl %ecx
andl $~7,%ecx
/*
* This wouldn't be any faster if it were unrolled, since the loop
* control instructions are much faster than the fstl and/or done
* in parallel with it so their overhead is insignificant.
*/
fpureg_i586_bzero_loop:
fstl 0(%edx)
addl $8,%edx
subl $8,%ecx
cmpl $8,%ecx
jae fpureg_i586_bzero_loop
cmpl $0,_npxproc
je i586_bz3
frstor 0(%esp)
addl $108,%esp
lmsw %ax
movb $0xfe,kernel_fpu_lock
ret
i586_bz3:
fstpl %st(0)
lmsw %ax
movb $0xfe,kernel_fpu_lock
ret
intreg_i586_bzero:
/*
* `rep stos' seems to be the best method in practice for small
* counts. Fancy methods usually take too long to start up due
* to cache and BTB misses.
*/
pushl %edi
movl %edx,%edi
xorl %eax,%eax
shrl $2,%ecx
cld
rep
stosl
movl 12(%esp),%ecx
andl $3,%ecx
jne 1f
popl %edi
ret
1:
rep
stosb
popl %edi
ret
#endif /* I586_CPU && NNPX > 0 */
/* fillw(pat, base, cnt) */
ENTRY(fillw)
pushl %edi
movl 8(%esp),%eax
movl 12(%esp),%edi
movl 16(%esp),%ecx
cld
rep
stosw
popl %edi
ret
ENTRY(bcopyb)
bcopyb:
pushl %esi
pushl %edi
movl 12(%esp),%esi
movl 16(%esp),%edi
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cld /* nope, copy forwards */
rep
movsb
popl %edi
popl %esi
ret
ALIGN_TEXT
1:
addl %ecx,%edi /* copy backwards. */
addl %ecx,%esi
decl %edi
decl %esi
std
rep
movsb
popl %edi
popl %esi
cld
ret
ENTRY(bcopy)
MEXITCOUNT
jmp *_bcopy_vector
ENTRY(ovbcopy)
MEXITCOUNT
jmp *_ovbcopy_vector
/*
* generic_bcopy(src, dst, cnt)
* ws@tools.de (Wolfgang Solfrank, TooLs GmbH) +49-228-985800
*/
ENTRY(generic_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
movl 16(%esp),%edi
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
shrl $2,%ecx /* copy by 32-bit words */
cld /* nope, copy forwards */
rep
movsl
movl 20(%esp),%ecx
andl $3,%ecx /* any bytes left? */
rep
movsb
popl %edi
popl %esi
ret
ALIGN_TEXT
1:
addl %ecx,%edi /* copy backwards */
addl %ecx,%esi
decl %edi
decl %esi
andl $3,%ecx /* any fractional bytes? */
std
rep
movsb
movl 20(%esp),%ecx /* copy remainder by 32-bit words */
shrl $2,%ecx
subl $3,%esi
subl $3,%edi
rep
movsl
popl %edi
popl %esi
cld
ret
#if defined(I586_CPU) && NNPX > 0
ENTRY(i586_bcopy)
pushl %esi
pushl %edi
movl 12(%esp),%esi
movl 16(%esp),%edi
movl 20(%esp),%ecx
movl %edi,%eax
subl %esi,%eax
cmpl %ecx,%eax /* overlapping && src < dst? */
jb 1f
cmpl $1024,%ecx
jb small_i586_bcopy
sarb $1,kernel_fpu_lock
jc small_i586_bcopy
cmpl $0,_npxproc
je i586_bc1
smsw %dx
clts
subl $108,%esp
fnsave 0(%esp)
jmp 4f
i586_bc1:
smsw %dx
clts
fninit /* XXX should avoid needing this */
ALIGN_TEXT
4:
pushl %ecx
#define DCACHE_SIZE 8192
cmpl $(DCACHE_SIZE-512)/2,%ecx
jbe 2f
movl $(DCACHE_SIZE-512)/2,%ecx
2:
subl %ecx,0(%esp)
cmpl $256,%ecx
jb 5f /* XXX should prefetch if %ecx >= 32 */
pushl %esi
pushl %ecx
ALIGN_TEXT
3:
movl 0(%esi),%eax
movl 32(%esi),%eax
movl 64(%esi),%eax
movl 96(%esi),%eax
movl 128(%esi),%eax
movl 160(%esi),%eax
movl 192(%esi),%eax
movl 224(%esi),%eax
addl $256,%esi
subl $256,%ecx
cmpl $256,%ecx
jae 3b
popl %ecx
popl %esi
5:
ALIGN_TEXT
large_i586_bcopy_loop:
fildq 0(%esi)
fildq 8(%esi)
fildq 16(%esi)
fildq 24(%esi)
fildq 32(%esi)
fildq 40(%esi)
[Web-page footer removed: online-editor keyboard-shortcut help (copy, search, full-screen, theme, font size) — scrape residue, not part of the source. NOTE: the i586_bcopy function above is truncated at this point; the remainder of the original file was lost in the scrape.]