/* @(#)usercopy.s	4.1	(ULTRIX)	7/2/90 */

/************************************************************************
 *									*
 *			Copyright (c) 1988 by				*
 *		Digital Equipment Corporation, Maynard, MA		*
 *			All rights reserved.				*
 *									*
 *   This software is furnished under a license and may be used and	*
 *   copied only in accordance with the terms of such license and	*
 *   with the inclusion of the above copyright notice.  This		*
 *   software or any other copies thereof may not be provided or	*
 *   otherwise made available to any other person.  No title to and	*
 *   ownership of the software is hereby transferred.			*
 *									*
 *   The information in this software is subject to change without	*
 *   notice and should not be construed as a commitment by Digital	*
 *   Equipment Corporation.						*
 *									*
 *   Digital assumes no responsibility for the use or reliability	*
 *   of its software on equipment which is not supplied by Digital.	*
 *									*
 ************************************************************************/

/* ------------------------------------------------------------------ */
/* | Copyright Unpublished, MIPS Computer Systems, Inc.  All Rights | */
/* | Reserved.  This software contains proprietary and confidential | */
/* | information of MIPS and its suppliers.  Use, disclosure or     | */
/* | reproduction is prohibited without the prior express written   | */
/* | consent of MIPS.                                               | */
/* ------------------------------------------------------------------ */

/*
 * Modification History:
 *
 * 13-Oct-89	gmm
 *	smp changes. Access nofault etc through cpudata
 *
 * 18-July-89	kong
 *	Rewrote the routine useracc. It now lives in machdep.c.
 *	Refer to Modification History in machdep.c for details
 *	of the changes.
 *
 * 10-July-89	burns
 *	Made the following cache routines cpu specific since DS5800's have
 *	additional requirements: clean_icache, clean_dcache, page_iflush and
 *	page_dflush.
 *
 * 16-Jan-1989	Kong
 *	Renamed flush_cache to kn01flush_cache. This routine
 *	will probably be moved to a file specific to kn01 (pmax)
 *	if the flush_cache routines can be tuned for performance.
 */

#include "../machine/param.h"
#include "../machine/cpu.h"
#include "../machine/asm.h"
#include "../machine/reg.h"
#include "../machine/regdef.h"
#include "../h/errno.h"
#include "assym.h"

/*
 * copypage(src_ppn, dst_ppn)
 *
 * Performance:
 *	Config	C/NC	Cycles/		Speed vs VAX
 *			Reads 4K Page
 *	08V11	NC	13,568		 1.89X
 *		C	 6,272		 4.08X
 *	08M44	NC	 6,528		 3.92X
 *		C	 2,432		10.53X
 */
LEAF(copypage)
XLEAF(copyseg)
	sll	a1,PGSHIFT		# page number to phys addr
	or	a1,K0BASE
	addu	a3,a0,NBPG		# source endpoint
1:	lw	v0,0(a0)
	lw	v1,4(a0)
	lw	t0,8(a0)
	lw	t1,12(a0)
	sw	v0,0(a1)
	sw	v1,4(a1)
	sw	t0,8(a1)
	sw	t1,12(a1)
	addu	a0,32
	lw	v0,-16(a0)
	lw	v1,-12(a0)
	lw	t0,-8(a0)
	lw	t1,-4(a0)
	sw	v0,16(a1)
	sw	v1,20(a1)
	sw	t0,24(a1)
	sw	t1,28(a1)
	addu	a1,32			# BDSLOT: incr dst address
	bne	a0,a3,1b
#ifdef EXTRA_CACHETRICKS
	/*
	 * The dcachecnt for the source page must be handled by the
	 * caller, since it's too much of a pain to do the vtop and
	 * pte issues here.
	 */
	subu	a1,32			# back to copied page
	srl	a1,PGSHIFT
	lw	v0,dcachemask
	and	a1,v0			# figure appropriate cache alias
	sll	a1,1
	lhu	v0,dcachecnt(a1)
	addu	v0,1
	sh	v0,dcachecnt(a1)
#endif EXTRA_CACHETRICKS
	j	ra
	END(copypage)
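/*
 * For reference, a C sketch (editorial, discarded by cpp) of the copy
 * copypage performs. It assumes, as the entry sequence above suggests,
 * that a1 arrives as a destination page frame number mapped through
 * k0seg while a0 is already a source address; K0BASE, PGSHIFT and NBPG
 * are the ../machine/param.h constants. The 8-words-per-pass shape
 * mirrors the unrolled loop above.
 */
#if 0
void
copypage_sketch(src, dst_ppn)
	unsigned long src, dst_ppn;
{
	register unsigned long *s = (unsigned long *) src;
	register unsigned long *d =
		(unsigned long *) (K0BASE | (dst_ppn << PGSHIFT));
	register unsigned long *e = s + NBPG / sizeof (unsigned long);

	while (s != e) {		/* 32 bytes (8 words) per pass */
		d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
		d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
		s += 8;
		d += 8;
	}
}
#endif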
/*
 * clearseg(dst_ppn)
 *
 * Performance
 *	Config	Cycles/		Speed vs VAX
 *		4K Page
 *	08V11	6,144		1.09X
 *	08M44	1,229		5.46X	(could be made faster by unroll to 64)
 *					(done April '87 per djl)
 * since writes only occur at best 1 per two cycles (m500) and unroll
 * shouldn't help, in fact we probably don't want many instructions
 * so that it is easy to get into icache -- so changing back to two
 * sw's per loop (two cycles + two cycles for loop overhead) which
 * will keep the write buffers busy and not stall the cpu.
 */
LEAF(clearseg)
	sll	a0,PGSHIFT
	addu	a0,K0BASE		# reference via k0seg
	addu	t0,a0,NBPG-8		# dst on last pass of loop
1:	sw	zero,0(a0)
	sw	zero,4(a0)
	.set	noreorder
	bne	a0,t0,1b
	addu	a0,8			# BDSLOT: inc dst, NOTE after test
	.set	reorder
#ifdef EXTRA_CACHETRICKS
	subu	a0,8			# back to copied page
	srl	a0,PGSHIFT
	lw	v0,dcachemask
	and	a0,v0			# figure appropriate cache alias
	sll	a0,1
	lhu	v0,dcachecnt(a0)
	addu	v0,1
	sh	v0,dcachecnt(a0)
#endif EXTRA_CACHETRICKS
	j	ra
	END(clearseg)

#ifdef USE_IDLE
/*
 * clearseg1(dst_ppn, index)
 *	do a clear of one 128 byte chunk. called from idle.
 */
LEAF(clearseg1)
	sll	a0,PGSHIFT
	addu	a0,K0BASE		# reference via k0seg
	mul	a1,a1,128		# 128 * index = offset into page
	addu	a0,a0,a1		# start at page + offset
	addu	t0,a0,120		# dst is start + (128 - 8)
1:	sw	zero,0(a0)
	sw	zero,4(a0)
	.set	noreorder
	bne	a0,t0,1b
	addu	a0,8			# BDSLOT: inc dst, NOTE after test
	.set	reorder
#ifdef EXTRA_CACHETRICKS
	subu	a0,8			# sub 8 assures correct page
	srl	a0,PGSHIFT
	lw	v0,dcachemask
	and	a0,v0			# figure appropriate cache alias
	sll	a0,1
	lhu	v0,dcachecnt(a0)
	addu	v0,1
	sh	v0,dcachecnt(a0)
#endif EXTRA_CACHETRICKS
	j	ra
	END(clearseg1)
#endif USE_IDLE

/*
 * copyin(user_src, kernel_dst, bcount)
 */
COPYIOFRM=	(4*4)+4			# 4 arg saves plus ra
NESTED(copyin, COPYIOFRM, zero)
	subu	sp,COPYIOFRM
	sw	ra,COPYIOFRM-4(sp)
	bltz	a0,cerror
#ifdef ASSERTIONS
	lw	v0,u+PCB_CPUPTR
	lw	v0,CPU_NOFAULT(v0)
	beq	v0,zero,8f
	PANIC("recursive nofault")
8:
#endif ASSERTIONS
	.set	noreorder
	lw	ra,u+PCB_CPUPTR
	li	v0,NF_COPYIO		# LDSLOT
	sw	v0,CPU_NOFAULT(ra)
	jal	bcopy
	nop
	lw	ra,u+PCB_CPUPTR
	nop
	sw	zero,CPU_NOFAULT(ra)
	.set	reorder
	move	v0,zero
	lw	ra,COPYIOFRM-4(sp)
	addu	sp,COPYIOFRM
	j	ra
	END(copyin)

/*
 * copyout(kernel_src, user_dst, bcount)
 */
NESTED(copyout, COPYIOFRM, zero)
	subu	sp,COPYIOFRM
	sw	ra,COPYIOFRM-4(sp)
	bltz	a1,cerror
#ifdef ASSERTIONS
	lw	v0,u+PCB_CPUPTR
	lw	v0,CPU_NOFAULT(v0)
	beq	v0,zero,8f
	PANIC("recursive nofault")
8:
#endif ASSERTIONS
	.set	noreorder
	lw	ra,u+PCB_CPUPTR
	li	v0,NF_COPYIO		# LDSLOT
	sw	v0,CPU_NOFAULT(ra)
	jal	bcopy
	nop
	lw	ra,u+PCB_CPUPTR
	nop
	sw	zero,CPU_NOFAULT(ra)
	.set	reorder
	move	v0,zero
	lw	ra,COPYIOFRM-4(sp)
	addu	sp,COPYIOFRM
	j	ra
	END(copyout)

NESTED(cerror, COPYIOFRM, zero)
	li	v0,EFAULT
	lw	ra,COPYIOFRM-4(sp)
	addu	sp,COPYIOFRM
	j	ra
	END(cerror)
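/*
 * A C sketch (editorial, discarded by cpp) of the nofault protocol the
 * two routines above share. The cpudata layout and curcpu() accessor
 * are assumed for illustration, not taken from cpu.h; the trap handler
 * is likewise assumed to check CPU_NOFAULT and unwind a faulting copy
 * to cerror, which pops the frame and returns EFAULT.
 */
#if 0
struct cpudata { int cpu_nofault; };	/* illustrative layout only */
extern struct cpudata *curcpu();	/* assumed per-cpu accessor */
extern void bcopy();

int
copyin_sketch(usrc, kdst, n)
	char *usrc, *kdst;
	unsigned int n;
{
	register struct cpudata *cd = curcpu();

	if ((long) usrc < 0)		/* user addrs have bit 31 clear */
		return (EFAULT);
	cd->cpu_nofault = NF_COPYIO;	/* faults now resume at cerror */
	bcopy(usrc, kdst, n);
	cd->cpu_nofault = 0;
	return (0);
}
#endif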
/*
 * bcopy(src, dst, bcount)
 *
 * NOTE: the optimal copy here is somewhat different than for the user-level
 * equivalents (bcopy in 4.2, memcpy in V), because:
 * 1) it frequently acts on uncached data, especially since copying from
 *    (uncached) disk buffers into user pgms is a high runner.
 *    This means one must be careful with lwl/lwr/lb - don't expect cache help.
 * 2) the distribution of usage is very different: there are a large number
 *    of bcopies for small, aligned structures (like for ioctl, for example),
 *    a reasonable number of randomly-sized copies for user I/O, and many
 *    bcopies of large (page-size) blocks for stdio; the latter must be
 *    well-tuned, hence the use of 32-byte loops.
 * 3) this is much more frequently-used code inside the kernel than outside
 *
 * Overall copy-loop speeds, by amount of loop-unrolling. Assumptions:
 * a) low icache miss rate (this code gets used a bunch)
 * b) large transfers, especially, will be word-alignable.
 * c) Copying speeds (steady state, 0% I-cache-miss, 100% D-cache Miss):
 * d) 100% D-Cache Miss (but cacheable, so that lwl/lwr/lb work well)
 *	Config	Bytes/	Cycles/	Speed (VAX/780 = 1)
 *		Loop	Word
 *	08V11	1	35	0.71X	(8MHz, BUS, 1-Deep WB, 1-way ILV)
 *		4	15	1.67X
 *		8/16	13.5	1.85X
 *		32/up	13.25	1.89X
 *	08M44	1	26	0.96X	(8MHz, MEM, 4-Deep WB, 4-way ILV)
 *		4	9	2.78X
 *		8	7.5	3.33X
 *		16	6.75	3.70X
 *		32	6.375	3.92X	(diminishing returns thereafter)
 *
 * MINCOPY is the minimum number of bytes for which it is worthwhile to try
 * to align the copy into word transactions. Calculations below are for
 * 8 bytes. Estimating MINCOPY (C = Cacheable, NC = Noncacheable):
 * Assumes 100% D-cache miss on first reference, then 0% (100%) for C (NC):
 * (Warning: these are gross numbers, and the code has changed slightly):
 *	Case			08V11		08M44
 *	MINCOPY			C	NC	C	NC
 *	9 (1 byte loop)		75	133	57	93
 *	8 (complex logic)
 *	  Aligned		51	51	40	40
 *	  Alignable,
 *	    worst (1+4+3)	69	96	53	80
 *	  Unalignable		66	93	60	72
 * MINCOPY should be lower for lower cache miss rates, lower cache miss
 * penalties, better alignment properties, or if src and dst alias in
 * cache. For this particular case, it seems very important to minimize the
 * number of lb/sb pairs: a) frequent non-cacheable references are used,
 * b) when i-cache miss rate approaches zero, even the 4-deep WB can't
 * put successive sb's together in any useful way, so few references are saved.
 * To summarize, even as low as 8 bytes, avoiding the single-byte loop seems
 * worthwhile; some assumptions are probably optimistic, so there is not quite
 * as much disadvantage. However, the optimal number is almost certainly in
 * the range 7-12.
 *
 *	a0	src addr
 *	a1	dst addr
 *	a2	length remaining
 */
#define	MINCOPY	8
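/*
 * The dispatch below in C (editorial sketch, discarded by cpp). Two
 * pointers can be word-aligned simultaneously only when their low
 * address bits agree, i.e. when ((src ^ dst) & (NBPW-1)) == 0; the
 * dst-aligned-only path, which the assembly handles with lwl/lwr,
 * degenerates to the byte loop here since C has no portable equivalent.
 */
#if 0
void
bcopy_sketch(src, dst, n)
	register char *src, *dst;
	register unsigned int n;
{
	if (n >= MINCOPY &&
	    (((long) src ^ (long) dst) & (NBPW - 1)) == 0) {
		while ((long) src & (NBPW - 1)) {	/* head bytes */
			*dst++ = *src++;
			n--;
		}
		while (n >= NBPW) {	/* the asm unrolls this 8 deep */
			*(unsigned long *) dst = *(unsigned long *) src;
			src += NBPW;
			dst += NBPW;
			n -= NBPW;
		}
	}
	while (n-- != 0)		/* byte tail, and short copies */
		*dst++ = *src++;
}
#endif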
LEAF(bcopy)
#ifdef ASSERTIONS
	bgeu	a0,a1,1f		# src >= dst, no overlap error
	addu	v0,a0,a2		# src endpoint + 1
	bgeu	a1,v0,1f		# dst >= src endpoint+1, no overlap err
	PANIC("bcopy overlap")
1:
#endif ASSERTIONS
	xor	v0,a0,a1		# bash src & dst for align chk; BDSLOT
	blt	a2,MINCOPY,bytecopy	# too short, just byte copy
	and	v0,NBPW-1		# low-order bits for align chk
	subu	v1,zero,a0		# -src; BDSLOT
	bne	v0,zero,unaligncopy	# src and dst not alignable

/*
 * src and dst can be simultaneously word aligned
 */
	and	v1,NBPW-1		# number of bytes til aligned
	subu	a2,v1			# bcount -= alignment
	beq	v1,zero,blkcopy		# already aligned
#ifdef MIPSEB
	lwl	v0,0(a0)		# copy unaligned portion
	swl	v0,0(a1)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)
	swr	v0,0(a1)
#endif
	addu	a0,v1			# src += alignment
	addu	a1,v1			# dst += alignment

/*
 * 32 byte block, aligned copy loop (for big reads/writes)
 */
blkcopy:
	and	a3,a2,~31		# total space in 32 byte chunks
	subu	a2,a3			# count after by-32 byte loop done
	beq	a3,zero,wordcopy	# less than 32 bytes to copy
	addu	a3,a0			# source endpoint
1:	lw	v0,0(a0)
	lw	v1,4(a0)
	lw	t0,8(a0)
	lw	t1,12(a0)
	sw	v0,0(a1)
	sw	v1,4(a1)
	sw	t0,8(a1)
	sw	t1,12(a1)
	addu	a0,32			# src += 32; here to ease loop end
	lw	v0,-16(a0)
	lw	v1,-12(a0)
	lw	t0,-8(a0)
	lw	t1,-4(a0)
	sw	v0,16(a1)
	sw	v1,20(a1)
	sw	t0,24(a1)
	sw	t1,28(a1)
	addu	a1,32			# dst += 32; fills BD slot
	bne	a0,a3,1b

/*
 * word copy loop
 */
wordcopy:
	and	a3,a2,~(NBPW-1)		# word chunks
	subu	a2,a3			# count after by word loop
	beq	a3,zero,bytecopy	# less than a word to copy
	addu	a3,a0			# source endpoint
1:	lw	v0,0(a0)
	addu	a0,NBPW
	sw	v0,0(a1)
	addu	a1,NBPW			# dst += 4; BD slot
	bne	a0,a3,1b
	b	bytecopy

/*
 * deal with simultaneously unalignable copy by aligning dst
 */
unaligncopy:
	subu	a3,zero,a1		# calc byte cnt to get dst aligned
	and	a3,NBPW-1		# alignment = 0..3
	subu	a2,a3			# bcount -= alignment
	beq	a3,zero,partaligncopy	# already aligned
#ifdef MIPSEB
	lwl	v0,0(a0)		# get whole word
	lwr	v0,3(a0)		# for sure
	swl	v0,0(a1)		# store left piece (1-3 bytes)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)		# get whole word
	lwl	v0,3(a0)		# for sure
	swr	v0,0(a1)		# store right piece (1-3 bytes)
#endif
	addu	a0,a3			# src += alignment (will fill LD slot)
	addu	a1,a3			# dst += alignment

/*
 * src unaligned, dst aligned loop
 * NOTE: if MINCOPY >= 7, will always do 1 loop iteration or more
 * if we get here at all
 */
partaligncopy:
	and	a3,a2,~(NBPW-1)		# space in word chunks
	subu	a2,a3			# count after by word loop
#if MINCOPY < 7
	beq	a3,zero,bytecopy	# less than a word to copy
#endif
	addu	a3,a0			# source endpoint
1:
#ifdef MIPSEB
	lwl	v0,0(a0)
	lwr	v0,3(a0)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)
	lwl	v0,3(a0)
#endif
	addu	a0,NBPW
	sw	v0,0(a1)
	addu	a1,NBPW
	bne	a0,a3,1b

/*
 * brute force byte copy loop, for bcount < MINCOPY + tail of unaligned dst
 * note that lwl, lwr, swr CANNOT be used for tail, since the lwr might
 * cross page boundary and give spurious address exception
 */
bytecopy:
	addu	a3,a2,a0		# source endpoint; BDSLOT
	ble	a2,zero,copydone	# nothing left to copy, or bad length
1:	lb	v0,0(a0)
	addu	a0,1
	sb	v0,0(a1)
	addu	a1,1			# BDSLOT: incr dst address
	bne	a0,a3,1b
copydone:
	j	ra
	END(bcopy)

/*
 * bzero(dst, bcount)
 *	Zero block of memory
 *
 * Calculating MINZERO, assuming 50% cache-miss on non-loop code:
 * Overhead =~ 18 instructions => 63 (81) cycles
 * Byte zero =~ 16 (24) cycles/word for 08M44 (08V11)
 * Word zero =~  3  (6) cycles/word for 08M44 (08V11)
 * If I-cache-miss nears 0, MINZERO ==> 4 bytes; otherwise, times are:
 * breakeven (MEM) = 63 / (16 - 3) =~ 5 words
 * breakeven (BUS) = 81 / (24 - 6) =~ 4.5 words
 * Since the overhead is pessimistic (worst-case alignment), and many calls
 * will be for well-aligned data, and since Word-zeroing at least leaves
 * the zero in the cache, we shade these values (18-20) down to 12
 */
#define	MINZERO	12

LEAF(bzero)
XLEAF(blkclr)
	subu	v1,zero,a0		# number of bytes til aligned
	blt	a1,MINZERO,bytezero
	and	v1,NBPW-1
	subu	a1,v1
	beq	v1,zero,blkzero		# already aligned
#ifdef MIPSEB
	swl	zero,0(a0)
#endif
#ifdef MIPSEL
	swr	zero,0(a0)
#endif
	addu	a0,v1

/*
 * zero 32 byte, aligned block
 */
blkzero:
	and	a3,a1,~31		# 32 byte chunks
	subu	a1,a3
	beq	a3,zero,wordzero
	addu	a3,a0			# dst endpoint
1:	sw	zero,0(a0)
	sw	zero,4(a0)
	sw	zero,8(a0)
	sw	zero,12(a0)
	addu	a0,32
	sw	zero,-16(a0)
	sw	zero,-12(a0)
	sw	zero,-8(a0)
	sw	zero,-4(a0)
	bne	a0,a3,1b

wordzero:
	and	a3,a1,~(NBPW-1)		# word chunks
	subu	a1,a3
	beq	a3,zero,bytezero
	addu	a3,a0			# dst endpoint
1:	addu	a0,NBPW
	sw	zero,-NBPW(a0)
	bne	a0,a3,1b

bytezero:
	ble	a1,zero,zerodone
	addu	a1,a0			# dst endpoint
1:	addu	a0,1
	sb	zero,-1(a0)
	bne	a0,a1,1b
zerodone:
	j	ra
	END(bzero)
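/*
 * The same skeleton for bzero, in C (editorial sketch, discarded by
 * cpp). The MINZERO figure above is the breakeven overhead divided by
 * the per-word saving, e.g. 63 / (16 - 3) =~ 5 words on the 08M44,
 * shaded down to 12 bytes as the comment explains.
 */
#if 0
void
bzero_sketch(dst, n)
	register char *dst;
	register unsigned int n;
{
	if (n >= MINZERO) {
		while ((long) dst & (NBPW - 1)) {	/* align head */
			*dst++ = 0;
			n--;
		}
		while (n >= NBPW) {	/* word loop; asm unrolls by 32 */
			*(unsigned long *) dst = 0;
			dst += NBPW;
			n -= NBPW;
		}
	}
	while (n-- != 0)				/* byte tail */
		*dst++ = 0;
}
#endif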
/*
 * bcmp(src, dst, bcount)
 *
 * MINCMP is the minimum number of bytes for which it is worthwhile to try
 * to align the cmp into word transactions.
 *
 * Calculating MINCMP:
 * Overhead  =~ 15 instructions => 90 cycles
 * Byte cmp  =~ 38 cycles/word
 * Word cmp  =~ 17 cycles/word
 * Breakeven =~ 16 bytes
 */
#define	MINCMP	16

LEAF(bcmp)
	xor	v0,a0,a1
	blt	a2,MINCMP,bytecmp	# too short, just byte cmp
	and	v0,NBPW-1
	subu	t8,zero,a0		# number of bytes til aligned
	bne	v0,zero,unalgncmp	# src and dst not alignable

/*
 * src and dst can be simultaneously word aligned
 */
	and	t8,NBPW-1
	subu	a2,t8
	beq	t8,zero,wordcmp		# already aligned
	move	a1,a0			# The FIX
#ifdef MIPSEB
	lwl	v0,0(a0)		# cmp unaligned portion
	lwl	v1,0(a1)
#endif
#ifdef MIPSEL
	lwr	v0,0(a0)
	lwr	v1,0(a1)
#endif
	addu	a0,t8
	addu	a1,t8
	bne	v0,v1,cmpne

/*
 * word cmp loop
 */
wordcmp:
	and	a3,a2,~(NBPW-1)
	subu	a2,a3
	beq	a3,zero,bytecmp
	addu	a3,a0			# src1 endpoint
1:	lw	v0,0(a0)
	lw	v1,0(a1)
	addu	a0,NBPW			# 1st BDSLOT
	addu	a1,NBPW			# 2nd BDSLOT (asm doesn't move)
	bne	v0,v1,cmpne
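/*
 * A C sketch (editorial, discarded by cpp) of the bcmp strategy laid
 * out in the MINCMP comment: compare by words once both pointers are
 * alignable, by bytes otherwise and for the tail. Like bcmp, it
 * returns 0 on equality and nonzero on mismatch.
 */
#if 0
int
bcmp_sketch(s1, s2, n)
	register char *s1, *s2;
	register unsigned int n;
{
	if (n >= MINCMP &&
	    (((long) s1 ^ (long) s2) & (NBPW - 1)) == 0) {
		while ((long) s1 & (NBPW - 1)) {	/* head bytes */
			if (*s1++ != *s2++)
				return (1);
			n--;
		}
		while (n >= NBPW) {			/* word cmp loop */
			if (*(unsigned long *) s1 != *(unsigned long *) s2)
				return (1);
			s1 += NBPW;
			s2 += NBPW;
			n -= NBPW;
		}
	}
	while (n-- != 0)				/* byte tail */
		if (*s1++ != *s2++)
			return (1);
	return (0);
}
#endif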