memcpy.c
/*
 * Optimized memory copy routines.
 *
 * Copyright (C) 2004 Randolph Chung <tausq@debian.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Portions derived from the GNU C Library
 * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc.
 *
 * Several strategies are tried to get the best performance for various
 * conditions. In the optimal case, we copy 64 bytes in an unrolled loop
 * using fp regs. This is followed by loops that copy 32 or 16 bytes at
 * a time using general registers. Unaligned copies are handled either by
 * aligning the destination and then using the shift-and-write method, or
 * in a few cases by falling back to a byte-at-a-time copy.
 *
 * I chose to implement this in C because it is easier to maintain and debug,
 * and in my experiments it appears that the C code generated by gcc (3.3/3.4
 * at the time of writing) is fairly optimal. Unfortunately some of the
 * semantics of the copy routine (exception handling) are difficult to express
 * in C, so we have to play some tricks to get it to work.
 *
 * All the loads and stores are done via explicit asm() code in order to use
 * the right space registers.
 *
 * Testing with various alignments and buffer sizes shows that this code is
 * often >10x faster than a simple byte-at-a-time copy, even for strangely
 * aligned operands. It is interesting to note that the glibc version of
 * memcpy (written in C) is actually quite fast already. This routine is able
 * to beat it by 30-40% for aligned copies because of the loop unrolling, but
 * in some cases the glibc version is still slightly faster. This lends more
 * credibility to the claim that gcc can generate very good code as long as
 * we are careful.
 *
 * TODO:
 * - cache prefetching needs more experimentation to get optimal settings
 * - try not to use the post-increment address modifiers; they create
 *   additional interlocks
 * - replace byte-copy loops with stbys sequences
 */

#ifdef __KERNEL__
#include <linux/config.h>
#include <linux/module.h>
#include <linux/compiler.h>
#include <asm/uaccess.h>
#define s_space "%%sr1"
#define d_space "%%sr2"
#else
#include "memcpy.h"
#define s_space "%%sr0"
#define d_space "%%sr0"
#define pa_memcpy new2_copy
#endif

DECLARE_PER_CPU(struct exception_data, exception_data);

#define preserve_branch(label) do {					\
	volatile int dummy;						\
	/* The following branch is never taken, it's just here to */	\
	/* prevent gcc from optimizing away our exception code. */	\
	if (unlikely(dummy != dummy))					\
		goto label;						\
} while (0)

#define get_user_space()	(segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3))
#define get_kernel_space()	(0)

#define MERGE(w0, sh_1, w1, sh_2) ({					\
	unsigned int _r;						\
	asm volatile (							\
		"mtsar %3\n"						\
		"shrpw %1, %2, %%sar, %0\n"				\
		: "=r"(_r)						\
		: "r"(w0), "r"(w1), "r"(sh_2)				\
	);								\
	_r;								\
})
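/* How MERGE works: mtsar loads sh_2 into the shift-amount register, and
 * shrpw shifts the 64-bit pair w0:w1 right by that amount, returning the
 * low word.  The result is (w0 << sh_1) | (w1 >> sh_2): one aligned
 * destination word assembled from two adjacent, misaligned source words.
 * For example, with the source one byte past a word boundary (sh_1 = 8,
 * sh_2 = 24), the low three bytes of w0 and the high byte of w1 form the
 * output word.
 */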
#define THRESHOLD	16

#ifdef DEBUG_MEMCPY
#define DPRINTF(fmt, args...)						\
	do {								\
		printk(KERN_DEBUG "%s:%d:%s ",				\
		       __FILE__, __LINE__, __FUNCTION__);		\
		printk(KERN_DEBUG fmt, ##args);				\
	} while (0)
#else
#define DPRINTF(fmt, args...)
#endif

#ifndef __LP64__
#define EXC_WORD ".word"
#else
#define EXC_WORD ".dword"
#endif

/* Each accessor below emits a __ex_table entry pairing the address of the
 * faulting instruction (label 1) with a fixup label (_e), so a fault in
 * the middle of the copy is redirected to the matching exception code. */
#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t), "+r"(_a)				\
	:						\
	: "r8")

#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: "+r"(_a)					\
	: _tt(_t)					\
	: "r8")

#define ldbma(_s, _a, _t, _e)	def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e)
#define stbma(_s, _t, _a, _e)	def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e)
#define ldwma(_s, _a, _t, _e)	def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e)
#define stwma(_s, _t, _a, _e)	def_store_ai_insn(stw,4,"r",_s,_a,_t,_e)
#define flddma(_s, _a, _t, _e)	def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e)
#define fstdma(_s, _t, _a, _e)	def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e)

#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e)		\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " " #_o "(" _s ",%1), %0\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	: _tt(_t)					\
	: "r"(_a)					\
	: "r8")

#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e)	\
	__asm__ __volatile__ (				\
	"1:\t" #_insn " %0, " #_o "(" _s ",%1)\n"	\
	"\t.section __ex_table,\"aw\"\n"		\
	"\t" EXC_WORD "\t1b\n"				\
	"\t" EXC_WORD "\t" #_e "\n"			\
	"\t.previous\n"					\
	:						\
	: _tt(_t), "r"(_a)				\
	: "r8")

#define ldw(_s,_o,_a,_t,_e)	def_load_insn(ldw,"=r",_s,_o,_a,_t,_e)
#define stw(_s,_t,_o,_a,_e)	def_store_insn(stw,"r",_s,_t,_o,_a,_e)

#ifdef CONFIG_PREFETCH
extern inline void prefetch_src(const void *addr)
{
	__asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr));
}

extern inline void prefetch_dst(const void *addr)
{
	__asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr));
}
#else
#define prefetch_src(addr)
#define prefetch_dst(addr)
#endif

/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4
 * words per loop. This code is derived from glibc.
 */
static inline unsigned long copy_dstaligned(unsigned long dst, unsigned long src,
					unsigned long len, unsigned long o_dst,
					unsigned long o_src, unsigned long o_len)
{
	/* gcc complains that a2 and a3 may be uninitialized, but actually
	 * they cannot be.  Initialize a2/a3 to shut gcc up.
	 */
	register unsigned int a0, a1, a2 = 0, a3 = 0;
	int sh_1, sh_2;
	struct exception_data *d;

	/* prefetch_src((const void *)src); */

	/* Calculate how to shift a word read at the memory operation
	 * aligned srcp to make it aligned for copy. */
	sh_1 = 8 * (src % sizeof(unsigned int));
	sh_2 = 8 * sizeof(unsigned int) - sh_1;
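	/* Worked example: if src % 4 == 3, then sh_1 = 24 and sh_2 = 8,
	 * so each MERGE below combines the last byte of one source word
	 * with the first three bytes of the next. */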
	/* Make src aligned by rounding it down. */
	src &= -sizeof(unsigned int);

	switch (len % 4)
	{
		case 2:
			/* a1 = ((unsigned int *) src)[0];
			   a2 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a1, cda_ldw_exc);
			ldw(s_space, 4, src, a2, cda_ldw_exc);
			src -= 1 * sizeof(unsigned int);
			dst -= 3 * sizeof(unsigned int);
			len += 2;
			goto do1;

		case 3:
			/* a0 = ((unsigned int *) src)[0];
			   a1 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a0, cda_ldw_exc);
			ldw(s_space, 4, src, a1, cda_ldw_exc);
			src -= 0 * sizeof(unsigned int);
			dst -= 2 * sizeof(unsigned int);
			len += 1;
			goto do2;

		case 0:
			if (len == 0)
				return 0;
			/* a3 = ((unsigned int *) src)[0];
			   a0 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a3, cda_ldw_exc);
			ldw(s_space, 4, src, a0, cda_ldw_exc);
			src -= -1 * sizeof(unsigned int);
			dst -= 1 * sizeof(unsigned int);
			len += 0;
			goto do3;

		case 1:
			/* a2 = ((unsigned int *) src)[0];
			   a3 = ((unsigned int *) src)[1]; */
			ldw(s_space, 0, src, a2, cda_ldw_exc);
			ldw(s_space, 4, src, a3, cda_ldw_exc);
			src -= -2 * sizeof(unsigned int);
			dst -= 0 * sizeof(unsigned int);
			len -= 1;
			if (len == 0)
				goto do0;
			goto do4;			/* No-op.  */
	}

	do
	{
		/* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */
do4:
		/* a0 = ((unsigned int *) src)[0]; */
		ldw(s_space, 0, src, a0, cda_ldw_exc);
		/* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */
		stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc);
do3:
		/* a1 = ((unsigned int *) src)[1]; */
		ldw(s_space, 4, src, a1, cda_ldw_exc);
		/* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */
		stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc);
do2:
		/* a2 = ((unsigned int *) src)[2]; */
		ldw(s_space, 8, src, a2, cda_ldw_exc);
		/* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */
		stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc);
do1:
		/* a3 = ((unsigned int *) src)[3]; */
		ldw(s_space, 12, src, a3, cda_ldw_exc);
		/* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */
		stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc);
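		/* The do4..do1 labels and the rotation through a0..a3
		 * implement glibc's dest-aligned word copy: each full pass
		 * loads four source words and stores four merged words,
		 * with the switch above pre-adjusting src, dst and len so
		 * the loop always runs in whole 16-byte steps. */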