ng2memcpy.s

来自「linux 内核源代码」· S 代码 · 共 521 行

S
521
字号
/* NG2memcpy.S: Niagara-2 optimized memcpy.
 *
 * Copyright (C) 2007 David S. Miller (davem@davemloft.net)
 *
 * Syntax:  GNU as for SPARC V9, run through the C preprocessor.
 *
 * Entry:   %o0 = dst, %o1 = src, %o2 = len
 * Return:  %o0 = EX_RETVAL(original dst)
 *
 * Register roles inside FUNC_NAME:
 *   %o3           original dst, kept untouched until the final return.
 *   GLOBAL_SPARE  scratch: first (dst | src [| len]) for the alignment
 *                 tests, later (dst - src) so that a store can be
 *                 addressed as [%o1 + GLOBAL_SPARE], and a shift count
 *                 in the misaligned-source loop.
 *   %o4, %o5, %g1, %g2, %g3   temporaries.
 *
 * The return value is deliberately NOT kept in GLOBAL_SPARE.  In the
 * __KERNEL__ build GLOBAL_SPARE is %g7, and %g7 is listed among the
 * registers clobbered by VISEntryHalf (see the comment at that call
 * below), so a value parked there would not survive the block copy
 * path.
 *
 * A file that includes this one may pre-define EX_LD, EX_ST, EX_RETVAL,
 * LOAD, LOAD_BLK, STORE, STORE_BLK, STORE_INIT, STORE_ASI, FUNC_NAME,
 * PREAMBLE and XCC; the defaults below are used otherwise.
 */

#ifdef __KERNEL__
#include <asm/visasm.h>
#include <asm/asi.h>
#define GLOBAL_SPARE	%g7
#else
#define ASI_PNF 0x82
#define ASI_BLK_P 0xf0
#define ASI_BLK_INIT_QUAD_LDD_P 0xe2
#define FPRS_FEF  0x04
#ifdef MEMCPY_DEBUG
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs; \
		     clr %g1; clr %g2; clr %g3; subcc %g0, %g0, %g0;
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#else
#define VISEntryHalf rd %fprs, %o5; wr %g0, FPRS_FEF, %fprs
#define VISExitHalf and %o5, FPRS_FEF, %o5; wr %o5, 0x0, %fprs
#endif
#define GLOBAL_SPARE	%g5
#endif

#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI	ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI	0x80		/* ASI_P */
#endif
#endif

/* EX_LD/EX_ST wrap every load/store and EX_RETVAL wraps the returned
 * value; by default they expand to their argument unchanged.
 */
#ifndef EX_LD
#define EX_LD(x)	x
#endif

#ifndef EX_ST
#define EX_ST(x)	x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)	x
#endif

#ifndef LOAD
#define LOAD(type,addr,dest)	type [addr], dest
#endif

#ifndef LOAD_BLK
#define LOAD_BLK(addr,dest)	ldda [addr] ASI_BLK_P, dest
#endif

#ifndef STORE
#ifndef MEMCPY_DEBUG
#define STORE(type,src,addr)	type src, [addr]
#else
#define STORE(type,src,addr)	type##a src, [addr] 0x80
#endif
#endif

#ifndef STORE_BLK
#define STORE_BLK(src,addr)	stda src, [addr] ASI_BLK_P
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)	stxa src, [addr] STORE_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME	NG2memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#ifndef XCC
#define XCC xcc
#endif

/* FREG_FROB: run faligndata over nine consecutive source doublewords
 * (x0..x8), leaving eight result doublewords in %f0..%f14.
 */
#define FREG_FROB(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
	faligndata	%x0, %x1, %f0; \
	faligndata	%x1, %x2, %f2; \
	faligndata	%x2, %x3, %f4; \
	faligndata	%x3, %x4, %f6; \
	faligndata	%x4, %x5, %f8; \
	faligndata	%x5, %x6, %f10; \
	faligndata	%x6, %x7, %f12; \
	faligndata	%x7, %x8, %f14;

/* FREG_MOVE_n: copy n doubleword registers down to %f0.. so they become
 * the leading inputs of the next FREG_FROB.
 */
#define FREG_MOVE_1(x0) \
	fmovd		%x0, %f0;
#define FREG_MOVE_2(x0, x1) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2;
#define FREG_MOVE_3(x0, x1, x2) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4;
#define FREG_MOVE_4(x0, x1, x2, x3) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6;
#define FREG_MOVE_5(x0, x1, x2, x3, x4) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8;
#define FREG_MOVE_6(x0, x1, x2, x3, x4, x5) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10;
#define FREG_MOVE_7(x0, x1, x2, x3, x4, x5, x6) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10; \
	fmovd		%x6, %f12;
#define FREG_MOVE_8(x0, x1, x2, x3, x4, x5, x6, x7) \
	fmovd		%x0, %f0; \
	fmovd		%x1, %f2; \
	fmovd		%x2, %f4; \
	fmovd		%x3, %f6; \
	fmovd		%x4, %f8; \
	fmovd		%x5, %f10; \
	fmovd		%x6, %f12; \
	fmovd		%x7, %f14;

/* FREG_LOAD_n: load n consecutive doublewords starting at base. */
#define FREG_LOAD_1(base, x0) \
	EX_LD(LOAD(ldd, base + 0x00, %x0))
#define FREG_LOAD_2(base, x0, x1) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1));
#define FREG_LOAD_3(base, x0, x1, x2) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2));
#define FREG_LOAD_4(base, x0, x1, x2, x3) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3));
#define FREG_LOAD_5(base, x0, x1, x2, x3, x4) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
	EX_LD(LOAD(ldd, base + 0x20, %x4));
#define FREG_LOAD_6(base, x0, x1, x2, x3, x4, x5) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
	EX_LD(LOAD(ldd, base + 0x20, %x4)); \
	EX_LD(LOAD(ldd, base + 0x28, %x5));
#define FREG_LOAD_7(base, x0, x1, x2, x3, x4, x5, x6) \
	EX_LD(LOAD(ldd, base + 0x00, %x0)); \
	EX_LD(LOAD(ldd, base + 0x08, %x1)); \
	EX_LD(LOAD(ldd, base + 0x10, %x2)); \
	EX_LD(LOAD(ldd, base + 0x18, %x3)); \
	EX_LD(LOAD(ldd, base + 0x20, %x4)); \
	EX_LD(LOAD(ldd, base + 0x28, %x5)); \
	EX_LD(LOAD(ldd, base + 0x30, %x6));

	.register	%g2,#scratch
	.register	%g3,#scratch

	.text
	.align		64

	.globl	FUNC_NAME
	.type	FUNC_NAME,#function
FUNC_NAME:	/* %o0=dst, %o1=src, %o2=len */
	/* Trap (software trap 5) if any bit of len at or above bit 31
	 * is set.
	 */
	srlx		%o2, 31, %g2
	cmp		%g2, 0
	tne		%xcc, 5
	PREAMBLE
	/* Remember the original dst for the return value.  %o3 is not
	 * touched again until the retl delay slots at 85: and 90:.
	 */
	mov		%o0, %o3
	cmp		%o2, 0
	be,pn		%XCC, 85f
	 or		%o0, %o1, GLOBAL_SPARE
	cmp		%o2, 16
	/* Annulled delay slot: len is folded into the alignment mask
	 * only when the branch to 80: is taken.
	 */
	blu,a,pn	%XCC, 80f
	 or		GLOBAL_SPARE, %o2, GLOBAL_SPARE

	/* 2 blocks (128 bytes) is the minimum we can do the block
	 * copy with.  We need to ensure that we'll iterate at least
	 * once in the block copy loop.  At worst we'll need to align
	 * the destination to a 64-byte boundary which can chew up
	 * to (64 - 1) bytes from the length before we perform the
	 * block copy loop.
	 *
	 * However, the cut-off point, performance wise, is around
	 * 4 64-byte blocks.
	 */
	cmp		%o2, (4 * 64)
	blu,pt		%XCC, 75f
	 andcc		GLOBAL_SPARE, 0x7, %g0

	/* %o0:	dst
	 * %o1:	src
	 * %o2:	len  (known to be >= 4 * 64 because of the compare above)
	 * %o3:	original dst (return value), must stay untouched
	 *
	 * The block copy loops can use %o4, %g2, %g3 as
	 * temporaries while copying the data.  %o5 must
	 * be preserved between VISEntryHalf and VISExitHalf
	 */

	LOAD(prefetch, %o1 + 0x000, #one_read)
	LOAD(prefetch, %o1 + 0x040, #one_read)
	LOAD(prefetch, %o1 + 0x080, #one_read)

	/* Align destination on 64-byte boundary.  */
	andcc		%o0, (64 - 1), %o4
	be,pt		%XCC, 2f
	 sub		%o4, 64, %o4
	sub		%g0, %o4, %o4	! bytes to align dst
	sub		%o2, %o4, %o2
1:	subcc		%o4, 1, %o4
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o0))
	add		%o1, 1, %o1
	bne,pt		%XCC, 1b
	 add		%o0, 1, %o0

2:
	/* Clobbers o5/g1/g2/g3/g7/icc/xcc.  We must preserve
	 * o5 from here until we hit VISExitHalf.
	 *
	 * GLOBAL_SPARE may be %g7, so nothing live is held in it
	 * across this point; it is recomputed at 195: below.
	 */
	VISEntryHalf

	alignaddr	%o1, %g0, %g0

	add		%o1, (64 - 1), %o4
	andn		%o4, (64 - 1), %o4	! %o4 = src rounded up to 64
	andn		%o2, (64 - 1), %g1	! %g1 = bytes for block loop
	sub		%o2, %g1, %o2		! %o2 = trailing bytes

	and		%o1, (64 - 1), %g2	! %g2 = low bits of src
	add		%o1, %g1, %o1
	sub		%o0, %o4, %g3		! store address = %o4 + %g3

	/* Dispatch on the low 6 bits of src, in steps of 8 bytes. */
	brz,pt		%g2, 190f
	 cmp		%g2, 32
	blu,a		5f
	 cmp		%g2, 16
	cmp		%g2, 48
	blu,a		4f
	 cmp		%g2, 40
	cmp		%g2, 56
	blu		170f
	 nop
	ba,a,pt		%xcc, 180f

4:	/* 32 <= low bits < 48 */
	blu		150f
	 nop
	ba,a,pt		%xcc, 160f
5:	/* 0 < low bits < 32 */
	blu,a		6f
	 cmp		%g2, 8
	cmp		%g2, 24
	blu		130f
	 nop
	ba,a,pt		%xcc, 140f
6:	/* 0 < low bits < 16 */
	bgeu		120f
	 nop
	/* fall through for 0 < low bits < 8 */
110:	sub		%o4, 64, %g2
	EX_LD(LOAD_BLK(%g2, %f0))
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f14, f16)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_8(f16, f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* 8 <= low bits < 16 */
120:	sub		%o4, 56, %g2
	FREG_LOAD_7(%g2, f0, f2, f4, f6, f8, f10, f12)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f10, f12, f16, f18)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_7(f18, f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* 16 <= low bits < 24 */
130:	sub		%o4, 48, %g2
	FREG_LOAD_6(%g2, f0, f2, f4, f6, f8, f10)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f10, f16, f18, f20)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_6(f20, f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* 24 <= low bits < 32 */
140:	sub		%o4, 40, %g2
	FREG_LOAD_5(%g2, f0, f2, f4, f6, f8)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f8, f16, f18, f20, f22)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_5(f22, f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* 32 <= low bits < 40 */
150:	sub		%o4, 32, %g2
	FREG_LOAD_4(%g2, f0, f2, f4, f6)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f6, f16, f18, f20, f22, f24)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_4(f24, f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* 40 <= low bits < 48 */
160:	sub		%o4, 24, %g2
	FREG_LOAD_3(%g2, f0, f2, f4)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f4, f16, f18, f20, f22, f24, f26)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_3(f26, f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* 48 <= low bits < 56 */
170:	sub		%o4, 16, %g2
	FREG_LOAD_2(%g2, f0, f2)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f2, f16, f18, f20, f22, f24, f26, f28)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_2(f28, f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* 56 <= low bits < 64 */
180:	sub		%o4, 8, %g2
	FREG_LOAD_1(%g2, f0)
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	EX_LD(LOAD_BLK(%o4, %f16))
	FREG_FROB(f0, f16, f18, f20, f22, f24, f26, f28, f30)
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	FREG_MOVE_1(f30)
	subcc		%g1, 64, %g1
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)
	ba,pt		%xcc, 195f
	 nop

	/* low bits == 0: src is 64-byte aligned, no faligndata needed */
190:
1:	EX_ST(STORE_INIT(%g0, %o4 + %g3))
	subcc		%g1, 64, %g1
	EX_LD(LOAD_BLK(%o4, %f0))
	EX_ST(STORE_BLK(%f0, %o4 + %g3))
	add		%o4, 64, %o4
	bne,pt		%xcc, 1b
	 LOAD(prefetch, %o4 + 64, #one_read)

195:
	add		%o4, %g3, %o0
	membar		#Sync

	VISExitHalf

	/* %o2 contains any final bytes still needed to be copied
	 * over. If anything is left, we copy it one byte at a time.
	 */
	brz,pt		%o2, 85f
	 sub		%o0, %o1, GLOBAL_SPARE
	ba,a,pt		%XCC, 90f

	.align		64
75: /* 16 <= len < 4 * 64 */
	/* Condition codes come from (dst | src) & 7 in the delay slot
	 * of the branch that got us here.
	 */
	bne,pn		%XCC, 75f
	 sub		%o0, %o1, GLOBAL_SPARE

72:
	/* Copy 16 bytes per iteration; stores go to src + (dst - src). */
	andn		%o2, 0xf, %o4
	and		%o2, 0xf, %o2
1:	subcc		%o4, 0x10, %o4
	EX_LD(LOAD(ldx, %o1, %o5))
	add		%o1, 0x08, %o1
	EX_LD(LOAD(ldx, %o1, %g1))
	sub		%o1, 0x08, %o1
	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))
	add		%o1, 0x8, %o1
	EX_ST(STORE(stx, %g1, %o1 + GLOBAL_SPARE))
	bgu,pt		%XCC, 1b
	 add		%o1, 0x8, %o1
73:	andcc		%o2, 0x8, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x8, %o2
	EX_LD(LOAD(ldx, %o1, %o5))
	EX_ST(STORE(stx, %o5, %o1 + GLOBAL_SPARE))
	add		%o1, 0x8, %o1
1:	andcc		%o2, 0x4, %g0
	be,pt		%XCC, 1f
	 nop
	sub		%o2, 0x4, %o2
	EX_LD(LOAD(lduw, %o1, %o5))
	EX_ST(STORE(stw, %o5, %o1 + GLOBAL_SPARE))
	add		%o1, 0x4, %o1
1:	cmp		%o2, 0
	be,pt		%XCC, 85f
	 nop
	ba,pt		%xcc, 90f
	 nop

75:
	/* Byte-copy until dst is 8-byte aligned. */
	andcc		%o0, 0x7, %g1
	sub		%g1, 0x8, %g1
	be,pn		%icc, 2f
	 sub		%g0, %g1, %g1
	sub		%o2, %g1, %o2

1:	subcc		%g1, 1, %g1
	EX_LD(LOAD(ldub, %o1, %o5))
	EX_ST(STORE(stb, %o5, %o1 + GLOBAL_SPARE))
	bgu,pt		%icc, 1b
	 add		%o1, 1, %o1

2:	add		%o1, GLOBAL_SPARE, %o0
	andcc		%o1, 0x7, %g1
	bne,pt		%icc, 8f
	 sll		%g1, 3, %g1		! %g1 = src misalignment in bits

	cmp		%o2, 16
	bgeu,pt		%icc, 72b
	 nop
	ba,a,pt		%xcc, 73b

8:
	/* src is not 8-byte aligned: read aligned doublewords and
	 * merge neighbours with a shift pair.  GLOBAL_SPARE holds the
	 * right-shift count (64 - %g1) for the duration of the loop.
	 */
	mov		64, GLOBAL_SPARE
	andn		%o1, 0x7, %o1
	EX_LD(LOAD(ldx, %o1, %g2))
	sub		GLOBAL_SPARE, %g1, GLOBAL_SPARE
	andn		%o2, 0x7, %o4
	sllx		%g2, %g1, %g2
1:	add		%o1, 0x8, %o1
	EX_LD(LOAD(ldx, %o1, %g3))
	subcc		%o4, 0x8, %o4
	srlx		%g3, GLOBAL_SPARE, %o5
	or		%o5, %g2, %o5
	EX_ST(STORE(stx, %o5, %o0))
	add		%o0, 0x8, %o0
	bgu,pt		%icc, 1b
	 sllx		%g3, %g1, %g2

	srl		%g1, 3, %g1
	andcc		%o2, 0x7, %o2
	be,pn		%icc, 85f
	 add		%o1, %g1, %o1
	ba,pt		%xcc, 90f
	 sub		%o0, %o1, GLOBAL_SPARE

	.align		64
80: /* 0 < len < 16 */
	/* GLOBAL_SPARE = dst | src | len here; use 4-byte copies only
	 * when all three are multiples of 4.
	 */
	andcc		GLOBAL_SPARE, 0x3, %g0
	bne,pn		%XCC, 90f
	 sub		%o0, %o1, GLOBAL_SPARE

1:
	subcc		%o2, 4, %o2
	EX_LD(LOAD(lduw, %o1, %g1))
	EX_ST(STORE(stw, %g1, %o1 + GLOBAL_SPARE))
	bgu,pt		%XCC, 1b
	 add		%o1, 4, %o1

85:	retl
	 mov		EX_RETVAL(%o3), %o0

	.align		32
90:
	/* Byte-at-a-time tail copy; GLOBAL_SPARE = dst - src. */
	subcc		%o2, 1, %o2
	EX_LD(LOAD(ldub, %o1, %g1))
	EX_ST(STORE(stb, %g1, %o1 + GLOBAL_SPARE))
	bgu,pt		%XCC, 90b
	 add		%o1, 1, %o1
	retl
	 mov		EX_RETVAL(%o3), %o0

	.size		FUNC_NAME, .-FUNC_NAME

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?