⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 memcpy_mck.s

📁 linux-2.6.15.6
💻 S
📖 第 1 页 / 共 2 页
字号:
	// NOTE(review): this listing begins part-way through __copy_user (see
	// END(__copy_user) at the bottom).  The entry point, the register
	// aliases (src0/src1, dst0/dst1, saved_in0..2, curlen, tmp, t1..t7, ...),
	// the label "1:" referenced as 1b just below, and the EX()/EK()
	// exception-table wrappers are defined before this point and are not
	// visible here.
	;;
	// Finish computing the entry address inside .jump_table and branch to it:
	//   r29 = r29 + (.jump_table - 1b - (.jmp1 - .jump_table)) + (r30 << LOOP_SIZE)
	// cmp.eq r0,r0 is always true, so p16=1 / p17=0 primes the rotating
	// predicates used by the COPYU loop; ar.ec=2 is its epilogue count.
	add	r29=.jump_table - 1b - (.jmp1-.jump_table), r29
	shl	r28=r30, LOOP_SIZE	// jmp_table thread
	mov	ar.ec=2		// loop setup
	;;
	add	r29=r29,r28		// jmp_table thread
	cmp.eq	p16,p17=r0,r0
	;;
	mov	b6=r29			// jmp_table thread
	;;
	br.cond.sptk.few b6

// for 8-15 byte case
// We will skip the loop, but need to replicate the side effect
// that the loop produces.
// Under p6 this builds r21 = (r37 >> r25) | (r27 << (64 - r25)), r25 = r30 * 8.
.noloop:
EX(.ex_handler, (p6)	ld8	r37=[src1],8)
	add	src0=8,src0
(p6)	shl	r25=r30,3
	;;
EX(.ex_handler, (p6)	ld8	r27=[src1])
(p6)	shr.u	r28=r37,r25
(p6)	sub	r26=64,r25
	;;
(p6)	shl	r27=r27,r26
	;;
(p6)	or	r21=r28,r27

.unaligned_src_tail:
/* check if we have more than blocksize to copy, if so go back */
	cmp.gt	p8,p0=saved_in2,blocksize
	;;
(p8)	add	dst0=saved_in0,blocksize
(p8)	add	src0=saved_in1,blocksize
(p8)	sub	in2=saved_in2,blocksize
(p8)	br.dpnt	.4k_block
	;;

/* we have up to 15 byte to copy in the tail.
 * part of work is already done in the jump table code
 * we are at the following state.
 * src side:
 *
 *   xxxxxx xx                   <----- r21 has xxxxxxxx already
 * -------- -------- --------
 * 0        8        16
 *          ^
 *          |
 *          src1
 *
 * dst
 * -------- -------- --------
 * ^
 * |
 * dst1
 */
EX(.ex_handler, (p6)	st8	[dst1]=r21,8)	// more than 8 byte to copy
(p6)	add	curlen=-8,curlen	// update length
	mov	ar.pfs=saved_pfs
	;;
	mov	ar.lc=saved_lc
	mov	pr=saved_pr,-1
	mov	in2=curlen	// remaining length
	mov	dst0=dst1	// dest pointer
	add	src0=src1,r30	// forward by src alignment
	;;

// 7 byte or smaller.
// Copies in2 bytes one at a time through two interleaved pointer pairs
// (src0/dst0 and src1/dst1, each advancing by 2) and returns as soon as
// the byte count given by in2 has been handled.
.memcpy_short:
	cmp.le	p8,p9   = 1,in2
	cmp.le	p10,p11 = 2,in2
	cmp.le	p12,p13 = 3,in2
	cmp.le	p14,p15 = 4,in2
	add	src1=1,src0	// second src pointer
	add	dst1=1,dst0	// second dest pointer
	;;

EX(.ex_handler_short, (p8)	ld1	t1=[src0],2)
EK(.ex_handler_short, (p10)	ld1	t2=[src1],2)
(p9)	br.ret.dpnt rp		// 0 byte copy
	;;

EX(.ex_handler_short, (p8)	st1	[dst0]=t1,2)
EK(.ex_handler_short, (p10)	st1	[dst1]=t2,2)
(p11)	br.ret.dpnt rp		// 1 byte copy

EX(.ex_handler_short, (p12)	ld1	t3=[src0],2)
EK(.ex_handler_short, (p14)	ld1	t4=[src1],2)
(p13)	br.ret.dpnt rp		// 2 byte copy
	;;

	cmp.le	p6,p7   = 5,in2
	cmp.le	p8,p9   = 6,in2
	cmp.le	p10,p11 = 7,in2

EX(.ex_handler_short, (p12)	st1	[dst0]=t3,2)
EK(.ex_handler_short, (p14)	st1	[dst1]=t4,2)
(p15)	br.ret.dpnt rp		// 3 byte copy
	;;

EX(.ex_handler_short, (p6)	ld1	t5=[src0],2)
EK(.ex_handler_short, (p8)	ld1	t6=[src1],2)
(p7)	br.ret.dpnt rp		// 4 byte copy
	;;

EX(.ex_handler_short, (p6)	st1	[dst0]=t5,2)
EK(.ex_handler_short, (p8)	st1	[dst1]=t6,2)
(p9)	br.ret.dptk rp		// 5 byte copy

EX(.ex_handler_short, (p10)	ld1	t7=[src0],2)
(p11)	br.ret.dptk rp		// 6 byte copy
	;;

EX(.ex_handler_short, (p10)	st1	[dst0]=t7,2)
	br.ret.dptk rp		// done all cases


/* Align dest to nearest 8-byte boundary. We know we have at
 * least 7 bytes to copy, enough to crawl to 8-byte boundary.
 * Actual number of byte to crawl depend on the dest alignment.
 * 7 byte or less is taken care at .memcpy_short
 * src0 - source even index
 * src1 - source  odd index
 * dst0 - dest even index
 * dst1 - dest  odd index
 * r30  - distance to 8-byte boundary
 *
 * On exit in2 has been reduced by r30 and dst0/src0 are reset to
 * in0 + r30 / in1 + r30; control continues at .aligned_src when
 * r28 == r29 and at .unaligned_src otherwise.
 * NOTE(review): p6, r28 and r29 are set up before the visible text.
 */

.align_dest:
	add	src1=1,in1	// source odd index
	cmp.le	p7,p0 = 2,r30	// for .align_dest
	cmp.le	p8,p0 = 3,r30	// for .align_dest
EX(.ex_handler_short, (p6)	ld1	t1=[src0],2)
	cmp.le	p9,p0 = 4,r30	// for .align_dest
	cmp.le	p10,p0 = 5,r30
	;;
EX(.ex_handler_short, (p7)	ld1	t2=[src1],2)
EK(.ex_handler_short, (p8)	ld1	t3=[src0],2)
	cmp.le	p11,p0 = 6,r30
EX(.ex_handler_short, (p6)	st1	[dst0] = t1,2)
	cmp.le	p12,p0 = 7,r30
	;;
EX(.ex_handler_short, (p9)	ld1	t4=[src1],2)
EK(.ex_handler_short, (p10)	ld1	t5=[src0],2)
EX(.ex_handler_short, (p7)	st1	[dst1] = t2,2)
EK(.ex_handler_short, (p8)	st1	[dst0] = t3,2)
	;;
EX(.ex_handler_short, (p11)	ld1	t6=[src1],2)
EK(.ex_handler_short, (p12)	ld1	t7=[src0],2)
	cmp.eq	p6,p7=r28,r29
EX(.ex_handler_short, (p9)	st1	[dst1] = t4,2)
EK(.ex_handler_short, (p10)	st1	[dst0] = t5,2)
	sub	in2=in2,r30
	;;
EX(.ex_handler_short, (p11)	st1	[dst1] = t6,2)
EK(.ex_handler_short, (p12)	st1	[dst0] = t7)
	add	dst0=in0,r30	// setup arguments
	add	src0=in1,r30
(p6)	br.cond.dptk .aligned_src
(p7)	br.cond.dpnt .unaligned_src
	;;

/* main loop body in jump table format
 *
 * COPYU(shift) expands to one br.ctop loop that loads 8 bytes through each
 * of src0 and src1, merges neighbouring words with shrp by "shift" bits and
 * stores 8 bytes through each of dst0 and dst1.  After the loop it prepares
 * r21 for the tail and branches to .unaligned_src_tail.  The table below
 * instantiates it for shifts of 8, 16, ... 56 bits; the dispatch code
 * selects an entry by adding (r30 << LOOP_SIZE) to the table address.
 * NOTE(review): this presumably requires every expansion to occupy exactly
 * (1 << LOOP_SIZE) bytes -- confirm against the LOOP_SIZE definition.
 */
#define COPYU(shift)									\
1:											\
EX(.ex_handler,  (p16)	ld8	r32=[src0],8);		/* 1 */				\
EK(.ex_handler,  (p16)	ld8	r36=[src1],8);						\
		 (p17)	shrp	r35=r33,r34,shift;;	/* 1 */				\
EX(.ex_handler,  (p6)	ld8	r22=[src1]);	/* common, prime for tail section */	\
		 nop.m	0;								\
		 (p16)	shrp	r38=r36,r37,shift;					\
EX(.ex_handler,  (p17)	st8	[dst0]=r35,8);		/* 1 */				\
EK(.ex_handler,  (p17)	st8	[dst1]=r39,8);						\
		 br.ctop.dptk.few 1b;;							\
		 (p7)	add	src1=-8,src1;	/* back out for <8 byte case */		\
		 shrp	r21=r22,r38,shift;	/* speculative work */			\
		 br.sptk.few .unaligned_src_tail /* branch out of jump table */		\
		 ;;
	TEXT_ALIGN(32)
.jump_table:
	COPYU(8)	// unaligned cases
.jmp1:
	COPYU(16)
	COPYU(24)
	COPYU(32)
	COPYU(40)
	COPYU(48)
	COPYU(56)

#undef A
#undef B
#undef C
#undef D

/*
 * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
 * instruction failed in the bundle.  The exception algorithm is that we
 * first figure out the faulting address, then detect if there is any
 * progress made on the copy, if so, redo the copy from last known copied
 * location up to the faulting address (exclusive). In the copy_from_user
 * case, remaining byte in kernel buffer will be zeroed.
 *
 * Take copy_from_user as an example, in the code there are multiple loads
 * in a bundle and those multiple loads could span over two pages, the
 * faulting address is calculated as page_round_down(max(src0, src1)).
 * This is based on knowledge that if we can access one byte in a page, we
 * can access any byte in that page.
 *
 * predicate used in the exception handler:
 * p6-p7: direction
 * p10-p11: src faulting addr calculation
 * p12-p13: dst faulting addr calculation
 */

#define A	r19
#define B	r20
#define C	r21
#define D	r22
#define F	r28

#define memset_arg0	r32
#define memset_arg2	r33

#define saved_retval	loc0
#define saved_rtlink	loc1
#define saved_pfs_stack	loc2

// Entry stubs: each one adjusts or reloads the working pointers and then
// joins the common path at .ex_handler.
.ex_hndlr_s:
	add	src0=8,src0
	br.sptk .ex_handler
	;;
.ex_hndlr_d:
	add	dst0=8,dst0
	br.sptk .ex_handler
	;;
.ex_hndlr_lcpy_1:
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
	cmp.gtu	p10,p11=src_pre_mem,saved_in1
	cmp.gtu	p12,p13=dst_pre_mem,saved_in0
	;;
(p10)	add	src0=8,saved_in1
(p11)	mov	src0=saved_in1
(p12)	add	dst0=8,saved_in0
(p13)	mov	dst0=saved_in0
	br.sptk	.ex_handler
.ex_handler_lcpy:
	// in line_copy block, the preload addresses should always be ahead
	// of the other two src/dst pointers.  Furthermore, src1/dst1 should
	// always be ahead of src0/dst0.
	mov	src1=src_pre_mem
	mov	dst1=dst_pre_mem
.ex_handler:
	mov	pr=saved_pr,-1		// first restore pr, lc, and pfs
	mov	ar.lc=saved_lc
	mov	ar.pfs=saved_pfs
	;;
.ex_handler_short: // faults in the sections using this entry did not change pr, lc, pfs
	cmp.ltu	p6,p7=saved_in0, saved_in1	// get the copy direction
	cmp.ltu	p10,p11=src0,src1
	cmp.ltu	p12,p13=dst0,dst1
	// NOTE(review): f6 is presumably set by the memcpy entry point to mark
	// a plain memcpy call -- that code is not visible here.
	fcmp.eq	p8,p0=f6,f0		// is it memcpy?
	mov	tmp = dst0
	;;
(p11)	mov	src1 = src0		// pick the larger of the two
(p13)	mov	dst0 = dst1		// make dst0 the smaller one
(p13)	mov	dst1 = tmp		// and dst1 the larger one
	;;
(p6)	dep	F = r0,dst1,0,PAGE_SHIFT // usr dst round down to page boundary
(p7)	dep	F = r0,src1,0,PAGE_SHIFT // usr src round down to page boundary
	;;
(p6)	cmp.le	p14,p0=dst0,saved_in0	// no progress has been made on store
(p7)	cmp.le	p14,p0=src0,saved_in1	// no progress has been made on load
	mov	retval=saved_in2
(p8)	ld1	tmp=[src1]		// force an oops for memcpy call
(p8)	st1	[dst1]=r0		// force an oops for memcpy call
(p14)	br.ret.sptk.many rp

/*
 * The remaining byte to copy is calculated as:
 *
 * A =	(faulting_addr - orig_src)	-> len to faulting ld address
 *	or
 * 	(faulting_addr - orig_dst)	-> len to faulting st address
 * B =	(cur_dst - orig_dst)		-> len copied so far
 * C =	A - B				-> len need to be copied
 * D =	orig_len - A			-> len need to be zeroed
 */
(p6)	sub	A = F, saved_in0
(p7)	sub	A = F, saved_in1
	clrrrb
	;;
	alloc	saved_pfs_stack=ar.pfs,3,3,3,0
	cmp.lt	p8,p0=A,r0
	sub	B = dst0, saved_in0	// how many byte copied so far
	;;
(p8)	mov	A = 0;			// A shouldn't be negative, cap it
	;;
	sub	C = A, B
	sub	D = saved_in2, A
	;;
	cmp.gt	p8,p0=C,r0		// anything (C > 0) left to copy?
	add	memset_arg0=saved_in0, A
(p6)	mov	memset_arg2=0		// copy_to_user should not call memset
(p7)	mov	memset_arg2=D		// copy_from_user need to have kbuf zeroed
	mov	r8=0
	mov	saved_retval = D
	mov	saved_rtlink = b0

	add	out0=saved_in0, B
	add	out1=saved_in1, B
	mov	out2=C
(p8)	br.call.sptk.few b0=__copy_user	// recursive call
	;;

	add	saved_retval=saved_retval,r8	// above might return non-zero value
	cmp.gt	p8,p0=memset_arg2,r0	// anything (n > 0) to zero?
	mov	out0=memset_arg0	// *s
	mov	out1=r0			// c
	mov	out2=memset_arg2	// n
(p8)	br.call.sptk.few b0=memset
	;;

	mov	retval=saved_retval
	mov	ar.pfs=saved_pfs_stack
	mov	b0=saved_rtlink
	br.ret.sptk.many rp

/* end of McKinley specific optimization */
END(__copy_user)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -