/* @(#)usercopy.s	4.1	(ULTRIX)	7/2/90 */

/************************************************************************
 *									*
 *			Copyright (c) 1988 by				*
 *		Digital Equipment Corporation, Maynard, MA		*
 *			All rights reserved.				*
 *									*
 *   This software is furnished under a license and may be used and	*
 *   copied only in accordance with the terms of such license and	*
 *   with the inclusion of the above copyright notice.  This		*
 *   software or any other copies thereof may not be provided or	*
 *   otherwise made available to any other person.  No title to and	*
 *   ownership of the software is hereby transferred.			*
 *									*
 *   The information in this software is subject to change without	*
 *   notice and should not be construed as a commitment by Digital	*
 *   Equipment Corporation.						*
 *									*
 *   Digital assumes no responsibility for the use or reliability	*
 *   of its software on equipment which is not supplied by Digital.	*
 *									*
 ************************************************************************/

/* ------------------------------------------------------------------ */
/* | Copyright Unpublished, MIPS Computer Systems, Inc.  All Rights | */
/* | Reserved.  This software contains proprietary and confidential | */
/* | information of MIPS and its suppliers.  Use, disclosure or     | */
/* | reproduction is prohibited without the prior express written   | */
/* | consent of MIPS.                                               | */
/* ------------------------------------------------------------------ */

/*
 * Modification History:
 *
 * 13-Oct-89	gmm
 *	smp changes. Access nofault etc through cpudata
 *
 * 18-July-89	kong
 *	Rewrote the routine useracc. It now lives in machdep.c.
 *	Refer to Modification History in machdep.c for details
 *	of the changes.
 *
 * 10-July-89	burns
 *	Made the following cache routines cpu specific since DS5800's have
 *	additional requirements: clean_icache, clean_dcache, page_iflush and
 *	page_dflush.
 *
 * 16-Jan-1989	Kong
 *	Renamed flush_cache to kn01flush_cache. This routine
 *	will probably be moved to a file specific to kn01 (pmax)
 *	if the flush_cache routines can be tuned for performance.
 */

#include "../machine/param.h"
#include "../machine/cpu.h"
#include "../machine/asm.h"
#include "../machine/reg.h"
#include "../machine/regdef.h"
#include "../h/errno.h"
#include "assym.h"

/*
 * copypage(src_ppn, dst_ppn)
 *
 * Performance:
 *	Config	C/NC	Cycles/		Speed vs VAX
 *			Reads 4K Page
 *	08V11	NC	13,568		 1.89X
 *		C	 6,272		 4.08X
 *	08M44	NC	 6,528		 3.92X
 *		C	 2,432		10.53X
 */
LEAF(copypage)
XLEAF(copyseg)
	sll	a1,PGSHIFT		# page number to phys addr
	or	a1,K0BASE
	addu	a3,a0,NBPG		# source endpoint
1:	lw	v0,0(a0)
	lw	v1,4(a0)
	lw	t0,8(a0)
	lw	t1,12(a0)
	sw	v0,0(a1)
	sw	v1,4(a1)
	sw	t0,8(a1)
	sw	t1,12(a1)
	addu	a0,32
	lw	v0,-16(a0)
	lw	v1,-12(a0)
	lw	t0,-8(a0)
	lw	t1,-4(a0)
	sw	v0,16(a1)
	sw	v1,20(a1)
	sw	t0,24(a1)
	sw	t1,28(a1)
	addu	a1,32			# BDSLOT: incr dst address
	bne	a0,a3,1b
#ifdef EXTRA_CACHETRICKS
	/*
	 * The dcachecnt for the source page must be handled by the
	 * caller, since it's too much of a pain to do the vtop and
	 * pte issues here.
	 */
	subu	a1,32			# back to copied page
	srl	a1,PGSHIFT
	lw	v0,dcachemask
	and	a1,v0			# figure appropriate cache alias
	sll	a1,1
	lhu	v0,dcachecnt(a1)
	addu	v0,1
	sh	v0,dcachecnt(a1)
#endif EXTRA_CACHETRICKS
	j	ra
	END(copypage)
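/*
 * For reference, a C sketch (editorial, discarded by cpp) of the copy
 * copypage performs. It assumes, as the entry sequence above suggests,
 * that a1 arrives as a destination page frame number mapped through
 * k0seg while a0 is already a source address; K0BASE, PGSHIFT and NBPG
 * are the ../machine/param.h constants. The 8-words-per-pass shape
 * mirrors the unrolled loop above.
 */
#if 0
void
copypage_sketch(src, dst_ppn)
	unsigned long src, dst_ppn;
{
	register unsigned long *s = (unsigned long *) src;
	register unsigned long *d =
		(unsigned long *) (K0BASE | (dst_ppn << PGSHIFT));
	register unsigned long *e = s + NBPG / sizeof (unsigned long);

	while (s != e) {		/* 32 bytes (8 words) per pass */
		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
		s += 8;
		d += 8;
	}
}
#endif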
/*
 * clearseg(dst_ppn)
 *
 * Performance
 *	Config	Cycles/		Speed vs VAX
 *		4K Page
 *	08V11	6,144		1.09X
 *	08M44	1,229		5.46X	(could be made faster by unroll to 64)
 *					(done April '87 per djl)
 * since writes only occur at best 1 per two cycles (m500) and unroll
 * shouldn't help, in fact we probably don't want many instructions
 * so that it is easy to get into icache -- so changing back to two
 * sw's per loop (two cycles + two cycles for loop overhead) which
 * will keep the write buffers busy and not stall the cpu.
 */
LEAF(clearseg)
	sll	a0,PGSHIFT
	addu	a0,K0BASE		# reference via k0seg
	addu	t0,a0,NBPG-8		# dst on last pass of loop
1:	sw	zero,0(a0)
	sw	zero,4(a0)
	.set	noreorder
	bne	a0,t0,1b
	addu	a0,8			# BDSLOT: inc dst, NOTE after test
	.set	reorder
#ifdef EXTRA_CACHETRICKS
	subu	a0,8			# back to copied page
	srl	a0,PGSHIFT
	lw	v0,dcachemask
	and	a0,v0			# figure appropriate cache alias
	sll	a0,1
	lhu	v0,dcachecnt(a0)
	addu	v0,1
	sh	v0,dcachecnt(a0)
#endif EXTRA_CACHETRICKS
	j	ra
	END(clearseg)

#ifdef USE_IDLE
/*
 * clearseg1(dst_ppn, index)
 *	do a clear of one 128 byte chunk. called from idle.
 */
LEAF(clearseg1)
	sll	a0,PGSHIFT
	addu	a0,K0BASE		# reference via k0seg
	mul	a1,a1,128		# 128 * index = offset into page
	addu	a0,a0,a1		# start at page + offset
	addu	t0,a0,120		# dst is start + (128 - 8)
1:	sw	zero,0(a0)
	sw	zero,4(a0)
	.set	noreorder
	bne	a0,t0,1b
	addu	a0,8			# BDSLOT: inc dst, NOTE after test
	.set	reorder
#ifdef EXTRA_CACHETRICKS
	subu	a0,8			# sub 8 assures correct page
	srl	a0,PGSHIFT
	lw	v0,dcachemask
	and	a0,v0			# figure appropriate cache alias
	sll	a0,1
	lhu	v0,dcachecnt(a0)
	addu	v0,1
	sh	v0,dcachecnt(a0)
#endif EXTRA_CACHETRICKS
	j	ra
	END(clearseg1)
#endif USE_IDLE

/*
 * copyin(user_src, kernel_dst, bcount)
 */
COPYIOFRM=	(4*4)+4			# 4 arg saves plus ra
NESTED(copyin, COPYIOFRM, zero)
	subu	sp,COPYIOFRM
	sw	ra,COPYIOFRM-4(sp)
	bltz	a0,cerror
#ifdef ASSERTIONS
	lw	v0,u+PCB_CPUPTR
	lw	v0,CPU_NOFAULT(v0)
	beq	v0,zero,8f
	PANIC("recursive nofault")
8:
#endif ASSERTIONS
	.set	noreorder
	lw	ra,u+PCB_CPUPTR
	li	v0,NF_COPYIO		# LDSLOT
	sw	v0,CPU_NOFAULT(ra)
	jal	bcopy
	nop
	lw	ra,u+PCB_CPUPTR
	nop
	sw	zero,CPU_NOFAULT(ra)
	.set	reorder
	move	v0,zero
	lw	ra,COPYIOFRM-4(sp)
	addu	sp,COPYIOFRM
	j	ra
	END(copyin)

/*
 * copyout(kernel_src, user_dst, bcount)
 */
NESTED(copyout, COPYIOFRM, zero)
	subu	sp,COPYIOFRM
	sw	ra,COPYIOFRM-4(sp)
	bltz	a1,cerror
#ifdef ASSERTIONS
	lw	v0,u+PCB_CPUPTR
	lw	v0,CPU_NOFAULT(v0)
	beq	v0,zero,8f
	PANIC("recursive nofault")
8:
#endif ASSERTIONS
	.set	noreorder
	lw	ra,u+PCB_CPUPTR
	li	v0,NF_COPYIO		# LDSLOT
	sw	v0,CPU_NOFAULT(ra)
	jal	bcopy
	nop
	lw	ra,u+PCB_CPUPTR
	nop
	sw	zero,CPU_NOFAULT(ra)
	.set	reorder
	move	v0,zero
	lw	ra,COPYIOFRM-4(sp)
	addu	sp,COPYIOFRM
	j	ra
	END(copyout)

NESTED(cerror, COPYIOFRM, zero)
	li	v0,EFAULT
	lw	ra,COPYIOFRM-4(sp)
	addu	sp,COPYIOFRM
	j	ra
	END(cerror)
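/*
 * A C sketch (editorial, discarded by cpp) of the nofault protocol the
 * two routines above share. The cpudata layout and curcpu() accessor
 * are assumed for illustration, not taken from cpu.h; the trap handler
 * is likewise assumed to check CPU_NOFAULT and unwind a faulting copy
 * to cerror, which pops the frame and returns EFAULT.
 */
#if 0
struct cpudata { int cpu_nofault; };	/* illustrative layout only */
extern struct cpudata *curcpu();	/* assumed per-cpu accessor */
extern void bcopy();

int
copyin_sketch(usrc, kdst, n)
	char *usrc, *kdst;
	unsigned int n;
{
	register struct cpudata *cd = curcpu();

	if ((long) usrc < 0)		/* user addrs have bit 31 clear */
		return (EFAULT);
	cd->cpu_nofault = NF_COPYIO;	/* faults now resume at cerror */
	bcopy(usrc, kdst, n);
	cd->cpu_nofault = 0;
	return (0);
}
#endif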
/*
 * bcopy(src, dst, bcount)
 *
 * NOTE: the optimal copy here is somewhat different than for the user-level
 * equivalents (bcopy in 4.2, memcpy in V), because:
 * 1) it frequently acts on uncached data, especially since copying from
 *    (uncached) disk buffers into user pgms is a high runner.
 *    This means one must be careful with lwl/lwr/lb - don't expect cache help.
 * 2) the distribution of usage is very different: there are a large number
 *    of bcopies for small, aligned structures (like for ioctl, for example),
 *    a reasonable number of randomly-sized copies for user I/O, and many
 *    bcopies of large (page-size) blocks for stdio; the latter must be
 *    well-tuned, hence the use of 32-byte loops.
 * 3) this is much more frequently-used code inside the kernel than outside
 *
 * Overall copy-loop speeds, by amount of loop-unrolling. Assumptions:
 * a) low icache miss rate (this code gets used a bunch)
 * b) large transfers, especially, will be word-alignable.
 * c) Copying speeds (steady state, 0% I-cache-miss, 100% D-cache Miss):
 * d) 100% D-Cache Miss (but cacheable, so that lwl/lwr/lb work well)
 *	Config	Bytes/	Cycles/	Speed (VAX/780 = 1)
 *		Loop	Word
 *	08V11	1	35	0.71X	(8MHz, BUS, 1-Deep WB, 1-way ILV)
 *		4	15	1.67X
 *		8/16	13.5	1.85X
 *		32/up	13.25	1.89X
 *	08M44	1	26	0.96X	(8MHz, MEM, 4-Deep WB, 4-way ILV)
 *		4	9	2.78X
 *		8	7.5	3.33X
 *		16	6.75	3.70X
 *		32	6.375	3.92X	(diminishing returns thereafter)
 *
 * MINCOPY is the minimum number of bytes for which it is worthwhile to try
 * to align the copy into word transactions. Calculations below are for
 * 8 bytes. Estimating MINCOPY (C = Cacheable, NC = Noncacheable):
 * Assumes 100% D-cache miss on first reference, then 0% (100%) for C (NC):
 * (Warning: these are gross numbers, and the code has changed slightly):
 *	Case			08V11		08M44
 *	MINCOPY			C	NC	C	NC
 *	9 (1 byte loop)		75	133	57	93
 *	8 (complex logic)
 *	  Aligned		51	51	40	40
 *	  Alignable,
 *	    worst (1+4+3)	69	96	53	80
 *	  Unalignable		66	93	60	72
 * MINCOPY should be lower for lower cache miss rates, lower cache miss
 * penalties, better alignment properties, or if src and dst alias in
 * cache. For this particular case, it seems very important to minimize the
 * number of lb/sb pairs: a) frequent non-cacheable references are used,
 * b) when i-cache miss rate approaches zero, even the 4-deep WB can't
 * put successive sb's together in any useful way, so few references are saved.
 * To summarize, even as low as 8 bytes, avoiding the single-byte loop seems
 * worthwhile; some assumptions are probably optimistic, so there is not quite
 * as much disadvantage. However, the optimal number is almost certainly in
 * the range 7-12.
 *
 *	a0	src addr
 *	a1	dst addr
 *	a2	length remaining
 */
#define	MINCOPY	8
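/*
 * The dispatch below in C (editorial sketch, discarded by cpp). Two
 * pointers can be word-aligned simultaneously only when their low
 * address bits agree, i.e. when ((src ^ dst) & (NBPW-1)) == 0; the
 * dst-aligned-only path, which the assembly handles with lwl/lwr,
 * degenerates to the byte loop here since C has no portable equivalent.
 */
#if 0
void
bcopy_sketch(src, dst, n)
	register char *src, *dst;
	register unsigned int n;
{
	if (n >= MINCOPY &&
	    (((long) src ^ (long) dst) & (NBPW - 1)) == 0) {
		while ((long) src & (NBPW - 1)) {	/* head bytes */
			*dst++ = *src++;
			n--;
		}
		while (n >= NBPW) {	/* the asm unrolls this 8 deep */
			*(unsigned long *) dst = *(unsigned long *) src;
			src += NBPW;
			dst += NBPW;
			n -= NBPW;
		}
	}
	while (n-- != 0)		/* byte tail, and short copies */
		*dst++ = *src++;
}
#endif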
LEAF(bcopy)
#ifdef ASSERTIONS
	bgeu	a0,a1,1f		# src >= dst, no overlap error
	addu	v0,a0,a2		# src endpoint + 1
	bgeu	a1,v0,1f		# dst >= src endpoint+1, no overlap err
	PANIC("bcopy overlap")
1:
#endif ASSERTIONS
	xor	v0,a0,a1		# bash src & dst for align chk; BDSLOT
	blt	a2,MINCOPY,bytecopy	# too short, just byte copy
	and	v0,NBPW-1		# low-order bits for align chk
	subu	v1,zero,a0		# -src; BDSLOT
	bne	v0,zero,unaligncopy	# src and dst not alignable

/*
 * src and dst can be simultaneously word aligned
 */
	and	v1,NBPW-1		# number of bytes til aligned
	subu	a2,v1			# bcount -= alignment
	beq	v1,zero,blkcopy		# already aligned
#ifdef MIPSEB
	lwl	v0,0(a0)		# copy unaligned portion
	swl	v0,0(a1)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)
	swr	v0,0(a1)
#endif
	addu	a0,v1			# src += alignment
	addu	a1,v1			# dst += alignment

/*
 * 32 byte block, aligned copy loop (for big reads/writes)
 */
blkcopy:
	and	a3,a2,~31		# total space in 32 byte chunks
	subu	a2,a3			# count after by-32 byte loop done
	beq	a3,zero,wordcopy	# less than 32 bytes to copy
	addu	a3,a0			# source endpoint
1:	lw	v0,0(a0)
	lw	v1,4(a0)
	lw	t0,8(a0)
	lw	t1,12(a0)
	sw	v0,0(a1)
	sw	v1,4(a1)
	sw	t0,8(a1)
	sw	t1,12(a1)
	addu	a0,32			# src += 32; here to ease loop end
	lw	v0,-16(a0)
	lw	v1,-12(a0)
	lw	t0,-8(a0)
	lw	t1,-4(a0)
	sw	v0,16(a1)
	sw	v1,20(a1)
	sw	t0,24(a1)
	sw	t1,28(a1)
	addu	a1,32			# dst += 32; fills BD slot
	bne	a0,a3,1b

/*
 * word copy loop
 */
wordcopy:
	and	a3,a2,~(NBPW-1)		# word chunks
	subu	a2,a3			# count after by word loop
	beq	a3,zero,bytecopy	# less than a word to copy
	addu	a3,a0			# source endpoint
1:	lw	v0,0(a0)
	addu	a0,NBPW
	sw	v0,0(a1)
	addu	a1,NBPW			# dst += 4; BD slot
	bne	a0,a3,1b
	b	bytecopy

/*
 * deal with simultaneously unalignable copy by aligning dst
 */
unaligncopy:
	subu	a3,zero,a1		# calc byte cnt to get dst aligned
	and	a3,NBPW-1		# alignment = 0..3
	subu	a2,a3			# bcount -= alignment
	beq	a3,zero,partaligncopy	# already aligned
#ifdef MIPSEB
	lwl	v0,0(a0)		# get whole word
	lwr	v0,3(a0)		# for sure
	swl	v0,0(a1)		# store left piece (1-3 bytes)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)		# get whole word
	lwl	v0,3(a0)		# for sure
	swr	v0,0(a1)		# store right piece (1-3 bytes)
#endif
	addu	a0,a3			# src += alignment (will fill LD slot)
	addu	a1,a3			# dst += alignment

/*
 * src unaligned, dst aligned loop
 * NOTE: if MINCOPY >= 7, will always do 1 loop iteration or more
 * if we get here at all
 */
partaligncopy:
	and	a3,a2,~(NBPW-1)		# space in word chunks
	subu	a2,a3			# count after by word loop
#if MINCOPY < 7
	beq	a3,zero,bytecopy	# less than a word to copy
#endif
	addu	a3,a0			# source endpoint
1:
#ifdef MIPSEB
	lwl	v0,0(a0)
	lwr	v0,3(a0)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)
	lwl	v0,3(a0)
#endif
	addu	a0,NBPW
	sw	v0,0(a1)
	addu	a1,NBPW
	bne	a0,a3,1b

/*
 * brute force byte copy loop, for bcount < MINCOPY + tail of unaligned dst
 * note that lwl, lwr, swr CANNOT be used for tail, since the lwr might
 * cross page boundary and give spurious address exception
 */
bytecopy:
	addu	a3,a2,a0		# source endpoint; BDSLOT
	ble	a2,zero,copydone	# nothing left to copy, or bad length
1:	lb	v0,0(a0)
	addu	a0,1
	sb	v0,0(a1)
	addu	a1,1			# BDSLOT: incr dst address
	bne	a0,a3,1b
copydone:
	j	ra
	END(bcopy)

/*
 * bzero(dst, bcount)
 *	Zero block of memory
 *
 * Calculating MINZERO, assuming 50% cache-miss on non-loop code:
 * Overhead =~ 18 instructions => 63 (81) cycles
 * Byte zero =~ 16 (24) cycles/word for 08M44 (08V11)
 * Word zero =~  3  (6) cycles/word for 08M44 (08V11)
 * If I-cache-miss nears 0, MINZERO ==> 4 bytes; otherwise, times are:
 * breakeven (MEM) = 63 / (16 - 3) =~ 5 words
 * breakeven (BUS) = 81 / (24 - 6) =~ 4.5 words
 * Since the overhead is pessimistic (worst-case alignment), and many calls
 * will be for well-aligned data, and since Word-zeroing at least leaves
 * the zero in the cache, we shade these values (18-20) down to 12
 */
#define	MINZERO	12

LEAF(bzero)
XLEAF(blkclr)
	subu	v1,zero,a0		# number of bytes til aligned
	blt	a1,MINZERO,bytezero
	and	v1,NBPW-1
	subu	a1,v1
	beq	v1,zero,blkzero		# already aligned
#ifdef MIPSEB
	swl	zero,0(a0)
#endif
#ifdef MIPSEL
	swr	zero,0(a0)
#endif
	addu	a0,v1

/*
 * zero 32 byte, aligned block
 */
blkzero:
	and	a3,a1,~31		# 32 byte chunks
	subu	a1,a3
	beq	a3,zero,wordzero
	addu	a3,a0			# dst endpoint
1:	sw	zero,0(a0)
	sw	zero,4(a0)
	sw	zero,8(a0)
	sw	zero,12(a0)
	addu	a0,32
	sw	zero,-16(a0)
	sw	zero,-12(a0)
	sw	zero,-8(a0)
	sw	zero,-4(a0)
	bne	a0,a3,1b

wordzero:
	and	a3,a1,~(NBPW-1)		# word chunks
	subu	a1,a3
	beq	a3,zero,bytezero
	addu	a3,a0			# dst endpoint
1:	addu	a0,NBPW
	sw	zero,-NBPW(a0)
	bne	a0,a3,1b

bytezero:
	ble	a1,zero,zerodone
	addu	a1,a0			# dst endpoint
1:	addu	a0,1
	sb	zero,-1(a0)
	bne	a0,a1,1b
zerodone:
	j	ra
	END(bzero)
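/*
 * The same skeleton for bzero, in C (editorial sketch, discarded by
 * cpp). The MINZERO figure above is the breakeven overhead divided by
 * the per-word saving, e.g. 63 / (16 - 3) =~ 5 words on the 08M44,
 * shaded down to 12 bytes as the comment explains.
 */
#if 0
void
bzero_sketch(dst, n)
	register char *dst;
	register unsigned int n;
{
	if (n >= MINZERO) {
		while ((long) dst & (NBPW - 1)) {	/* align head */
			*dst++ = 0;
			n--;
		}
		while (n >= NBPW) {	/* word loop; asm unrolls by 32 */
			*(unsigned long *) dst = 0;
			dst += NBPW;
			n -= NBPW;
		}
	}
	while (n-- != 0)				/* byte tail */
		*dst++ = 0;
}
#endif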
/*
 * bcmp(src, dst, bcount)
 *
 * MINCMP is the minimum number of bytes for which it is worthwhile to try
 * to align the cmp into word transactions.
 *
 * Calculating MINCMP:
 * Overhead  =~ 15 instructions => 90 cycles
 * Byte cmp  =~ 38 cycles/word
 * Word cmp  =~ 17 cycles/word
 * Breakeven =~ 16 bytes
 */
#define	MINCMP	16

LEAF(bcmp)
	xor	v0,a0,a1
	blt	a2,MINCMP,bytecmp	# too short, just byte cmp
	and	v0,NBPW-1
	subu	t8,zero,a0		# number of bytes til aligned
	bne	v0,zero,unalgncmp	# src and dst not alignable

/*
 * src and dst can be simultaneously word aligned
 */
	and	t8,NBPW-1
	subu	a2,t8
	beq	t8,zero,wordcmp		# already aligned
	move	a1,a0			# The FIX
#ifdef MIPSEB
	lwl	v0,0(a0)		# cmp unaligned portion
	lwl	v1,0(a1)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)
	lwr	v1,0(a1)
#endif
	addu	a0,t8
	addu	a1,t8
	bne	v0,v1,cmpne

/*
 * word cmp loop
 */
wordcmp:
	and	a3,a2,~(NBPW-1)
	subu	a2,a3
	beq	a3,zero,bytecmp
	addu	a3,a0			# src1 endpoint
1:	lw	v0,0(a0)
	lw	v1,0(a1)
	addu	a0,NBPW			# 1st BDSLOT
	addu	a1,NBPW			# 2nd BDSLOT (asm doesn't move)
	bne	v0,v1,cmpne
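/*
 * A C sketch (editorial, discarded by cpp) of the bcmp strategy laid
 * out in the MINCMP comment: compare by words once both pointers are
 * alignable, by bytes otherwise and for the tail. Like bcmp, it
 * returns 0 on equality and nonzero on mismatch.
 */
#if 0
int
bcmp_sketch(s1, s2, n)
	register char *s1, *s2;
	register unsigned int n;
{
	if (n >= MINCMP &&
	    (((long) s1 ^ (long) s2) & (NBPW - 1)) == 0) {
		while ((long) s1 & (NBPW - 1)) {	/* head bytes */
			if (*s1++ != *s2++)
				return (1);
			n--;
		}
		while (n >= NBPW) {			/* word cmp loop */
			if (*(unsigned long *) s1 != *(unsigned long *) s2)
				return (1);
			s1 += NBPW;
			s2 += NBPW;
			n -= NBPW;
		}
	}
	while (n-- != 0)				/* byte tail */
		if (*s1++ != *s2++)
			return (1);
	return (0);
}
#endif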