memcpy.c
/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs. This is followed by loops that copy 32 or 16 bytes at
 * a time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using the shift-and-write method, or
 * in a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is able
 * to beat it by 30-40% for aligned copies because of the loop unrolling, but
 * in some cases the glibc version is still slightly faster. This lends more
 * credibility to the claim that gcc can generate very good code as long as
 * we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

#ifdef __KERNEL__
#include <linux/config.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label) do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space()	(segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space()	(0)

#define MERGE(w0, sh_1, w1, sh_2) ({					\
	unsigned int _r;						\
	asm volatile (							\
		"mtsar %3\n"						\
		"shrpw %1, %2, %%sar, %0\n"				\
		: "=r"(_r)						\
		: "r"(w0), "r"(w1), "r"(sh_2)				\
	);								\
	_r;								\
})
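/* How MERGE works: mtsar loads sh_2 into the shift-amount register, and
 * shrpw shifts the 64-bit pair w0:w1 right by that amount, returning the
 * low word.  The result is (w0 << sh_1) | (w1 >> sh_2): one aligned
 * destination word assembled from two adjacent, misaligned source words.
 * For example, with the source one byte past a word boundary (sh_1 = 8,
 * sh_2 = 24), the low three bytes of w0 and the high byte of w1 form the
 * output word.
 */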
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...)						\
	do {								\
		printk(KERN_DEBUG "%s:%d:%s ",				\
		       __FILE__, __LINE__, __FUNCTION__);		\
		printk(KERN_DEBUG fmt, ##args);				\
	} while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#ifndef __LP64__
#define EXC_WORD ".word"
#else
#define EXC_WORD ".dword"
#endif

/* Each accessor below emits a __ex_table entry pairing the address of the
 * faulting instruction (label 1) with a fixup label (_e), so a fault in
 * the middle of the copy is redirected to the matching exception code. */
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e)	def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e)	def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e)	def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e)	def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e)	def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e)	def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)		\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

#ifdef CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

extern inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4
 * words per loop. This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
					unsigned long len, unsigned long o_dst,
					unsigned long o_src, unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the memory operation
	 * aligned srcp to make it aligned for copy. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;
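	/* Worked example: if src % 4 == 3, then sh_1 = 24 and sh_2 = 8,
	 * so each MERGE below combines the last byte of one source word
	 * with the first three bytes of the next. */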
	/* Make src aligned by rounding it down. */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;

		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;

		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;

		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op.  */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);
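		/* The do4..do1 labels and the rotation through a0..a3
		 * implement glibc's dest-aligned word copy: each full pass
		 * loads four source words and stores four merged words,
		 * with the switch above pre-adjusting src, dst and len so
		 * the loop always runs in whole 16-byte steps. */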