📄 memcpy.s
字号:
/* Copy SIZE bytes from SRC to DEST.
   For UltraSPARC.
   Copyright (C) 1996, 97, 98, 99, 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by David S. Miller (davem@caip.rutgers.edu) and
		  Jakub Jelinek (jakub@redhat.com).

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#include <asm/asi.h>

/* Default configuration, used when the includer has not predefined XCC:
   define the USE_BPR flag, tell the assembler that %g2, %g3 and %g6 are
   used as scratch registers, and make XCC expand to xcc.  */
#ifndef XCC
#define USE_BPR
	.register	%g2, #scratch
	.register	%g3, #scratch
	.register	%g6, #scratch
#define XCC	xcc
#endif

/* Not referenced by the macros below.  */
#define FPRS_FEF	4

/* Macros for the VIS block-copy code.

   In every macro below the arguments are register names written
   WITHOUT the leading '%'; the macro bodies add it.  Each macro expands
   to ';'-separated instructions, so several invocations may share one
   source line.  */

/* Emit eight faligndata instructions that combine each adjacent pair of
   the nine registers f1..f9 into %f48, %f50, ..., %f62.  */
#define FREG_FROB(f1, f2, f3, f4, f5, f6, f7, f8, f9)	\
	faligndata	%f1, %f2, %f48;			\
	faligndata	%f2, %f3, %f50;			\
	faligndata	%f3, %f4, %f52;			\
	faligndata	%f4, %f5, %f54;			\
	faligndata	%f5, %f6, %f56;			\
	faligndata	%f6, %f7, %f58;			\
	faligndata	%f7, %f8, %f60;			\
	faligndata	%f8, %f9, %f62;

/* One step of the main loop: load through %asi from [src] into fdest,
   advance src and dest by 0x40, subtract 0x40 from len and branch to
   jmptgt when len reaches zero.  The stda of fsrc to [dest - 0x40]
   directly follows the branch, i.e. it sits in the branch delay slot.  */
#define MAIN_LOOP_CHUNK(src, dest, fdest, fsrc, len, jmptgt)	\
	ldda		[%src] %asi, %fdest;		\
	add		%src, 0x40, %src;		\
	add		%dest, 0x40, %dest;		\
	subcc		%len, 0x40, %len;		\
	be,pn		%xcc, jmptgt;			\
	stda		%fsrc, [%dest - 0x40] %asi;

/* MAIN_LOOP_CHUNK specialisations: all store from %f48; they load into
   %f0, %f16 and %f32 respectively.  */
#define LOOP_CHUNK1(src, dest, len, branch_dest)	\
	MAIN_LOOP_CHUNK(src, dest, f0, f48, len, branch_dest)
#define LOOP_CHUNK2(src, dest, len, branch_dest)	\
	MAIN_LOOP_CHUNK(src, dest, f16, f48, len, branch_dest)
#define LOOP_CHUNK3(src, dest, len, branch_dest)	\
	MAIN_LOOP_CHUNK(src, dest, f32, f48, len, branch_dest)

/* Store fsrc through %asi to [dest] and advance dest by 0x40.  */
#define STORE_SYNC(dest, fsrc)				\
	stda		%fsrc, [%dest] %asi;		\
	add		%dest, 0x40, %dest;

/* Same as STORE_SYNC, then branch to target.  The macro ends on the
   branch, so whatever instruction follows the invocation occupies the
   branch delay slot.  */
#define STORE_JUMP(dest, fsrc, target)			\
	stda		%fsrc, [%dest] %asi;		\
	add		%dest, 0x40, %dest;		\
	ba,pt		%xcc, target;

/* Fifteen nop instructions.  */
#define VISLOOP_PAD nop; nop; nop; nop; \
		    nop; nop; nop; nop; \
		    nop; nop; nop; nop; \
		    nop; nop; nop;

/* Subtract 8 from left; if the result is negative branch to local label
   205 (defined outside this part of the file).  The faligndata of
   f0/f1 into %f48 is in the branch delay slot.  On fall-through, store
   %f48 to [dest] and advance dest by 8.  */
#define FINISH_VISCHUNK(dest, f0, f1, left)		\
	subcc		%left, 8, %left;		\
	bl,pn		%xcc, 205f;			\
	faligndata	%f0, %f1, %f48;			\
	std		%f48, [%dest];			\
	add		%dest, 8, %dest;

/* Subtract 8 from left; if the result is negative branch to 205f, with
   "fsrc1 f0 -> f1" in the delay slot; otherwise take an annulled
   unconditional branch to local label 204 (204 and 205 are defined
   outside this part of the file).  */
#define UNEVEN_VISCHUNK(dest, f0, f1, left)		\
	subcc		%left, 8, %left;		\
	bl,pn		%xcc, 205f;			\
	fsrc1		%f0, %f1;			\
	ba,a,pt		%xcc, 204f;

	/* Macros for non-VIS memcpy code. */

/* Copy 32 bytes from [src + offset] to [dst + offset]: four 64-bit
   loads, each written back as two 32-bit stores (low word at +4, high
   word -- obtained with srlx 32 -- at +0).  Overwrites t0-t3.  */
#define MOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3)	\
	ldx		[%src + offset + 0x00], %t0;	\
	ldx		[%src + offset + 0x08], %t1;	\
	ldx		[%src + offset + 0x10], %t2;	\
	ldx		[%src + offset + 0x18], %t3;	\
	stw		%t0, [%dst + offset + 0x04];	\
	srlx		%t0, 32, %t0;			\
	stw		%t0, [%dst + offset + 0x00];	\
	stw		%t1, [%dst + offset + 0x0c];	\
	srlx		%t1, 32, %t1;			\
	stw		%t1, [%dst + offset + 0x08];	\
	stw		%t2, [%dst + offset + 0x14];	\
	srlx		%t2, 32, %t2;			\
	stw		%t2, [%dst + offset + 0x10];	\
	stw		%t3, [%dst + offset + 0x1c];	\
	srlx		%t3, 32, %t3;			\
	stw		%t3, [%dst + offset + 0x18];

/* Copy 64 bytes from [src + offset] to [dst + offset] with 64-bit loads
   and 64-bit stores, 32 bytes at a time.  Overwrites t0-t3.  */
#define MOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3)	\
	ldx		[%src + offset + 0x00], %t0;	\
	ldx		[%src + offset + 0x08], %t1;	\
	ldx		[%src + offset + 0x10], %t2;	\
	ldx		[%src + offset + 0x18], %t3;	\
	stx		%t0, [%dst + offset + 0x00];	\
	stx		%t1, [%dst + offset + 0x08];	\
	stx		%t2, [%dst + offset + 0x10];	\
	stx		%t3, [%dst + offset + 0x18];	\
	ldx		[%src + offset + 0x20], %t0;	\
	ldx		[%src + offset + 0x28], %t1;	\
	ldx		[%src + offset + 0x30], %t2;	\
	ldx		[%src + offset + 0x38], %t3;	\
	stx		%t0, [%dst + offset + 0x20];	\
	stx		%t1, [%dst + offset + 0x28];	\
	stx		%t2, [%dst + offset + 0x30];	\
	stx		%t3, [%dst + offset + 0x38];

/* Copy the 16 bytes at [src - offset - 0x10] to [dst - offset - 0x10]:
   two 64-bit loads, four 32-bit stores.  t2/t3 receive the high words;
   all of t0-t3 are overwritten.  */
#define MOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3)	\
	ldx		[%src - offset - 0x10], %t0;	\
	ldx		[%src - offset - 0x08], %t1;	\
	stw		%t0, [%dst - offset - 0x0c];	\
	srlx		%t0, 32, %t2;			\
	stw		%t2, [%dst - offset - 0x10];	\
	stw		%t1, [%dst - offset - 0x04];	\
	srlx		%t1, 32, %t3;			\
	stw		%t3, [%dst - offset - 0x08];

/* Copy the 16 bytes at [src - offset - 0x10] to [dst - offset - 0x10]
   with two 64-bit loads and two 64-bit stores.  Overwrites t0, t1.  */
#define MOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1)	\
	ldx		[%src - offset - 0x10], %t0;	\
	ldx		[%src - offset - 0x08], %t1;	\
	stx		%t0, [%dst - offset - 0x10];	\
	stx		%t1, [%dst - offset - 0x08];

	/* Macros for non-VIS memmove code. */

/* Copy the 32 bytes at [src - offset - 0x20] to [dst - offset - 0x20]:
   four 64-bit loads, eight 32-bit stores.  Overwrites t0-t3.  */
#define RMOVE_BIGCHUNK(src, dst, offset, t0, t1, t2, t3)	\
	ldx		[%src - offset - 0x20], %t0;	\
	ldx		[%src - offset - 0x18], %t1;	\
	ldx		[%src - offset - 0x10], %t2;	\
	ldx		[%src - offset - 0x08], %t3;	\
	stw		%t0, [%dst - offset - 0x1c];	\
	srlx		%t0, 32, %t0;			\
	stw		%t0, [%dst - offset - 0x20];	\
	stw		%t1, [%dst - offset - 0x14];	\
	srlx		%t1, 32, %t1;			\
	stw		%t1, [%dst - offset - 0x18];	\
	stw		%t2, [%dst - offset - 0x0c];	\
	srlx		%t2, 32, %t2;			\
	stw		%t2, [%dst - offset - 0x10];	\
	stw		%t3, [%dst - offset - 0x04];	\
	srlx		%t3, 32, %t3;			\
	stw		%t3, [%dst - offset - 0x08];

/* Copy the 64 bytes at [src - offset - 0x40] to [dst - offset - 0x40]
   with 64-bit loads and stores; the upper 32 bytes are moved first.
   Overwrites t0-t3.  */
#define RMOVE_BIGALIGNCHUNK(src, dst, offset, t0, t1, t2, t3)	\
	ldx		[%src - offset - 0x20], %t0;	\
	ldx		[%src - offset - 0x18], %t1;	\
	ldx		[%src - offset - 0x10], %t2;	\
	ldx		[%src - offset - 0x08], %t3;	\
	stx		%t0, [%dst - offset - 0x20];	\
	stx		%t1, [%dst - offset - 0x18];	\
	stx		%t2, [%dst - offset - 0x10];	\
	stx		%t3, [%dst - offset - 0x08];	\
	ldx		[%src - offset - 0x40], %t0;	\
	ldx		[%src - offset - 0x38], %t1;	\
	ldx		[%src - offset - 0x30], %t2;	\
	ldx		[%src - offset - 0x28], %t3;	\
	stx		%t0, [%dst - offset - 0x40];	\
	stx		%t1, [%dst - offset - 0x38];	\
	stx		%t2, [%dst - offset - 0x30];	\
	stx		%t3, [%dst - offset - 0x28];

/* Copy the 16 bytes at [src + offset] to [dst + offset]: two 64-bit
   loads, four 32-bit stores.  t2/t3 receive the high words; all of
   t0-t3 are overwritten.  */
#define RMOVE_LASTCHUNK(src, dst, offset, t0, t1, t2, t3)	\
	ldx		[%src + offset + 0x00], %t0;	\
	ldx		[%src + offset + 0x08], %t1;	\
	stw		%t0, [%dst + offset + 0x04];	\
	srlx		%t0, 32, %t2;			\
	stw		%t2, [%dst + offset + 0x00];	\
	stw		%t1, [%dst + offset + 0x0c];	\
	srlx		%t1, 32, %t3;			\
	stw		%t3, [%dst + offset + 0x08];

/* Copy the 16 bytes at [src + offset] to [dst + offset] with two 64-bit
   loads and two 64-bit stores.  Overwrites t0, t1.  */
#define RMOVE_LASTALIGNCHUNK(src, dst, offset, t0, t1)	\
	ldx		[%src + offset + 0x00], %t0;	\
	ldx		[%src + offset + 0x08], %t1;	\
	stx		%t0, [%dst + offset + 0x00];	\
	stx		%t1, [%dst + offset + 0x08];

/* Code section: the bcopy entry point, followed by the first part of
   the VIS block-copy path (local labels 200-203 and loop variants 300
   and 310).

   NOTE(review): local labels 204, 205, 210, 220, 313, 400, 402, 416,
   418 and 432 are referenced here but defined elsewhere in the file;
   what is said about them below is an assumption to confirm there.  */
	.text
	.align		32

/* bcopy
   In:   %o0, %o1 = the two pointers, %o2 = byte count (presumably the
	 traditional bcopy (src, dst, len) order -- confirm against the
	 prototype).
   The pointers are exchanged, so afterwards %o0 holds the incoming %o1
   and %o1 holds the incoming %o0.
   Flow: %o4 = incoming %o1 - incoming %o0.  If %o4 >= %o2 (unsigned)
	 branch to 210f.  Otherwise, if the count is non-zero, branch to
	 220f with %o0 advanced by the count (the add is in the delay
	 slot).  A zero count returns to the caller.
   NOTE(review): 210 looks like the forward-copy path and 220 like the
   overlapping/backward path -- confirm.
   Writes %g3, %o4 and the integer condition codes.  */
ENTRY(bcopy)
	sub		%o1, %o0, %o4			/* IEU0 Group */
	mov		%o0, %g3			/* IEU1 */
	cmp		%o4, %o2			/* IEU1 Group */
	mov		%o1, %o0			/* IEU0 */
	bgeu,pt		%XCC, 210f			/* CTI */
	 mov		%g3, %o1			/* IEU0 Group */
#ifndef USE_BPR
	/* Shift by zero: clears the upper 32 bits of the count.  */
	srl		%o2, 0, %o2			/* IEU1 */
#endif
	brnz,pn		%o2, 220f			/* CTI Group */
	 add		%o0, %o2, %o0			/* IEU0 */
	retl
	 nop
END(bcopy)

/* 200: byte-copy prologue.  Reached from code outside this section with
   the condition codes and %g2 already set up (presumably %g2 is the
   destination's offset within a doubleword -- TODO confirm at the
   branch site).  If the incoming condition is "equal" the prologue is
   skipped (201f); the delay slot computes %g5 = %o0 & 0x38 either way.
   Otherwise %g2 becomes 8 - %g2, %o2 is reduced by that amount, and
   %g2 bytes are copied from [%o1] to [%o0]: one byte first when %o0 is
   odd, then two bytes per iteration until %g2 reaches zero.  */
	.align		32
200:	be,pt		%xcc, 201f			/* CTI */
	 andcc		%o0, 0x38, %g5			/* IEU1 Group */
	mov		8, %g1				/* IEU0 */
	sub		%g1, %g2, %g2			/* IEU0 Group */
	andcc		%o0, 1, %g0			/* IEU1 */
	be,pt		%icc, 2f			/* CTI */
	 sub		%o2, %g2, %o2			/* IEU0 Group */
1:	ldub		[%o1], %o5			/* Load Group */
	add		%o1, 1, %o1			/* IEU0 */
	add		%o0, 1, %o0			/* IEU1 */
	subcc		%g2, 1, %g2			/* IEU1 Group */
	be,pn		%xcc, 3f			/* CTI */
	 stb		%o5, [%o0 - 1]			/* Store */
2:	ldub		[%o1], %o5			/* Load Group */
	add		%o0, 2, %o0			/* IEU0 */
	ldub		[%o1 + 1], %g3			/* Load Group */
	subcc		%g2, 2, %g2			/* IEU1 Group */
	stb		%o5, [%o0 - 2]			/* Store */
	add		%o1, 2, %o1			/* IEU0 */
	bne,pt		%xcc, 2b			/* CTI Group */
	 stb		%g3, [%o0 - 1]			/* Store */
3:	andcc		%o0, 0x38, %g5			/* IEU1 Group */

/* 201: doubleword prologue.  With %g5 = %o0 & 0x38, go straight to 202
   when it is zero.  Otherwise copy 64 - %g5 bytes, 8 per step, and
   reduce %o2 by the same amount.  alignaddr puts the source address
   rounded down to a doubleword in %g1 and sets up the alignment that
   faligndata uses; every step loads the next source doubleword, merges
   it with the previous one via faligndata and stores the result at
   [%o0].  The loop body is unrolled twice so that %f4 and %f6 swap the
   "previous"/"next" roles.  */
201:	be,pt		%icc, 202f			/* CTI */
	 mov		64, %g1				/* IEU0 */
	fmovd		%f0, %f2			/* FPU */
	sub		%g1, %g5, %g5			/* IEU0 Group */
	alignaddr	%o1, %g0, %g1			/* GRU Group */
	ldd		[%g1], %f4			/* Load Group */
	sub		%o2, %g5, %o2			/* IEU0 */
1:	ldd		[%g1 + 0x8], %f6		/* Load Group */
	add		%g1, 0x8, %g1			/* IEU0 Group */
	subcc		%g5, 8, %g5			/* IEU1 */
	faligndata	%f4, %f6, %f0			/* GRU Group */
	std		%f0, [%o0]			/* Store */
	add		%o1, 8, %o1			/* IEU0 Group */
	be,pn		%xcc, 202f			/* CTI */
	 add		%o0, 8, %o0			/* IEU1 */
	ldd		[%g1 + 0x8], %f4		/* Load Group */
	add		%g1, 8, %g1			/* IEU0 */
	subcc		%g5, 8, %g5			/* IEU1 */
	faligndata	%f6, %f4, %f0			/* GRU Group */
	std		%f0, [%o0]			/* Store */
	add		%o1, 8, %o1			/* IEU0 */
	bne,pt		%xcc, 1b			/* CTI Group */
	 add		%o0, 8, %o0			/* IEU0 */

/* 202: set up the 64-byte block loop.  With "src" and "len" denoting
   %o1 and %o2 on arrival at this label:
     %asi = ASI_BLK_P (used by the ldda/stda accesses in the loop macros)
     %g2  = (src >> 3) & 7, the loop-variant index used at 203
     %g6  = (len - 0x40) & ~0x3f, and after the other values have been
	    derived from it, reduced by a further 0x80
     %g3  = ((len - %g6) & ~7) - 0x10   (using %g6 before the 0x80 step)
     %o2  = len - %g6 - %g3             (likewise)
     %g1  = src + %g6 + %g3             (likewise)
     %o1  = src & ~0x3f
   "alignaddr %g1, %g0, %g0" discards its address result; only its
   alignment side effect for later faligndata instructions is kept.
   Three 64-byte blocks at [%o1], [%o1 + 0x40] and [%o1 + 0x80] are
   preloaded into %f0, %f16 and %f32.  */
202:	membar		#LoadStore | #StoreStore | #StoreLoad	/* LSU Group */
	wr		%g0, ASI_BLK_P, %asi		/* LSU Group */
	subcc		%o2, 0x40, %g6			/* IEU1 Group */
	mov		%o1, %g1			/* IEU0 */
	andncc		%g6, (0x40 - 1), %g6		/* IEU1 Group */
	srl		%g1, 3, %g2			/* IEU0 */
	sub		%o2, %g6, %g3			/* IEU0 Group */
	andn		%o1, (0x40 - 1), %o1		/* IEU1 */
	and		%g2, 7, %g2			/* IEU0 Group */
	andncc		%g3, 0x7, %g3			/* IEU1 */
	fmovd		%f0, %f2			/* FPU */
	sub		%g3, 0x10, %g3			/* IEU0 Group */
	sub		%o2, %g6, %o2			/* IEU1 */
	alignaddr	%g1, %g0, %g0			/* GRU Group */
	add		%g1, %g6, %g1			/* IEU0 Group */
	subcc		%o2, %g3, %o2			/* IEU1 */
	ldda		[%o1 + 0x00] %asi, %f0		/* LSU Group */
	add		%g1, %g3, %g1			/* IEU0 */
	ldda		[%o1 + 0x40] %asi, %f16		/* LSU Group */
	sub		%g6, 0x80, %g6			/* IEU0 */
	ldda		[%o1 + 0x80] %asi, %f32		/* LSU Group */
							/* Clk1 Group 8-( */
							/* Clk2 Group 8-( */
							/* Clk3 Group 8-( */
							/* Clk4 Group 8-( */

/* 203: computed jump into the loop variants.  %g5 = address of this
   instruction plus %lo(300f - 203b), intended to be the address of
   label 300; %g2 << 9 adds 512 bytes per variant index.  The delay
   slot advances %o1 by 0xc0, the size of the three preloaded blocks.  */
203:	rd		%pc, %g5			/* PDU Group 8-( */
	addcc		%g5, %lo(300f - 203b), %g5	/* IEU1 Group */
	sll		%g2, 9, %g2			/* IEU0 */
	jmpl		%g5 + %g2, %g0			/* CTI Group brk forced*/
	 addcc		%o1, 0xc0, %o1			/* IEU1 Group */

/* Loop variants.  Layout of one variant (label 3x0):
     - main loop, unrolled three times: FREG_FROB realigns 64 bytes into
       %f48-%f62, then LOOP_CHUNKn loads the next source block, stores
       the realigned block and leaves through 3x1/3x2/3x3 when %g6
       reaches zero.  The closing "b 3x0b+4" re-enters the loop at its
       second instruction; the first faligndata of the loop is repeated
       in the branch delay slot instead.
     - three exit tails (3x1, 3x2, 3x3): each realigns and stores the
       two blocks still held in registers, then jumps to a 4xx label;
       the membar #Sync after STORE_JUMP is in that jump's delay slot.
   Variant 300 expands to 44 + 3 * 23 = 113 instructions and
   VISLOOP_PAD adds 15, i.e. 128 instructions.  At 4 bytes per SPARC
   instruction that is the 512-byte stride assumed by "sll %g2, 9"
   above, so the instruction count of a variant must not change.
   Variant 310 uses the same scheme with every register shifted by one
   doubleword (%f2 instead of %f0, and so on).  The file continues
   beyond tail 312; the rest of variant 310 and the later variants are
   not part of this section.  */
	.align		512		/* OK, here comes the fun part... */
300:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)	LOOP_CHUNK1(o1, o0, g6, 301f)
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)	LOOP_CHUNK2(o1, o0, g6, 302f)
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)	LOOP_CHUNK3(o1, o0, g6, 303f)
	b,pt		%xcc, 300b+4; faligndata %f0, %f2, %f48
301:	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)	STORE_SYNC(o0, f48) membar #Sync
	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)	STORE_JUMP(o0, f48, 400f) membar #Sync
302:	FREG_FROB(f32,f34,f36,f38,f40,f42,f44,f46,f0)	STORE_SYNC(o0, f48) membar #Sync
	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)	STORE_JUMP(o0, f48, 416f) membar #Sync
303:	FREG_FROB(f0, f2, f4, f6, f8, f10,f12,f14,f16)	STORE_SYNC(o0, f48) membar #Sync
	FREG_FROB(f16,f18,f20,f22,f24,f26,f28,f30,f32)	STORE_JUMP(o0, f48, 432f) membar #Sync
	VISLOOP_PAD
310:	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)	LOOP_CHUNK1(o1, o0, g6, 311f)
	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)	LOOP_CHUNK2(o1, o0, g6, 312f)
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)	LOOP_CHUNK3(o1, o0, g6, 313f)
	b,pt		%xcc, 310b+4; faligndata %f2, %f4, %f48
311:	FREG_FROB(f18,f20,f22,f24,f26,f28,f30,f32,f34)	STORE_SYNC(o0, f48) membar #Sync
	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)	STORE_JUMP(o0, f48, 402f) membar #Sync
312:	FREG_FROB(f34,f36,f38,f40,f42,f44,f46,f0, f2)	STORE_SYNC(o0, f48) membar #Sync
	FREG_FROB(f2, f4, f6, f8, f10,f12,f14,f16,f18)	STORE_JUMP(o0, f48, 418f) membar #Sync
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -