// memcpy_mck.S — Itanium 2 (McKinley) optimized memcpy/__copy_user
/* * Itanium 2-optimized version of memcpy and copy_user function * * Inputs: * in0: destination address * in1: source address * in2: number of bytes to copy * Output: * for memcpy: return dest * for copy_user: return 0 if success, * or number of byte NOT copied if error occurred. * * Copyright (C) 2002 Intel Corp. * Copyright (C) 2002 Ken Chen <kenneth.w.chen@intel.com> */#include <asm/asmmacro.h>#include <asm/page.h>#define EK(y...) EX(y)/* McKinley specific optimization */#define retval r8#define saved_pfs r31#define saved_lc r10#define saved_pr r11#define saved_in0 r14#define saved_in1 r15#define saved_in2 r16#define src0 r2#define src1 r3#define dst0 r17#define dst1 r18#define cnt r9/* r19-r30 are temp for each code section */#define PREFETCH_DIST 8#define src_pre_mem r19#define dst_pre_mem r20#define src_pre_l2 r21#define dst_pre_l2 r22#define t1 r23#define t2 r24#define t3 r25#define t4 r26#define t5 t1 // alias!#define t6 t2 // alias!#define t7 t3 // alias!#define n8 r27#define t9 t5 // alias!#define t10 t4 // alias!#define t11 t7 // alias!#define t12 t6 // alias!#define t14 t10 // alias!#define t13 r28#define t15 r29#define tmp r30/* defines for long_copy block */#define A 0#define B (PREFETCH_DIST)#define C (B + PREFETCH_DIST)#define D (C + 1)#define N (D + 1)#define Nrot ((N + 7) & ~7)/* alias */#define in0 r32#define in1 r33#define in2 r34GLOBAL_ENTRY(memcpy) and r28=0x7,in0 and r29=0x7,in1 mov f6=f0 mov retval=in0 br.cond.sptk .common_code ;;END(memcpy)GLOBAL_ENTRY(__copy_user) .prologue// check dest alignment and r28=0x7,in0 and r29=0x7,in1 mov f6=f1 mov saved_in0=in0 // save dest pointer mov saved_in1=in1 // save src pointer mov retval=r0 // initialize return value ;;.common_code: cmp.gt p15,p0=8,in2 // check for small size cmp.ne p13,p0=0,r28 // check dest alignment cmp.ne p14,p0=0,r29 // check src alignment add src0=0,in1 sub r30=8,r28 // for .align_dest mov saved_in2=in2 // save len ;; add dst0=0,in0 add dst1=1,in0 // dest odd index cmp.le p6,p0 = 
1,r30 // for .align_dest(p15) br.cond.dpnt .memcpy_short(p13) br.cond.dpnt .align_dest(p14) br.cond.dpnt .unaligned_src ;;// both dest and src are aligned on 8-byte boundary.aligned_src: .save ar.pfs, saved_pfs alloc saved_pfs=ar.pfs,3,Nrot-3,0,Nrot .save pr, saved_pr mov saved_pr=pr shr.u cnt=in2,7 // this much cache line ;; cmp.lt p6,p0=2*PREFETCH_DIST,cnt cmp.lt p7,p8=1,cnt .save ar.lc, saved_lc mov saved_lc=ar.lc .body add cnt=-1,cnt add src_pre_mem=0,in1 // prefetch src pointer add dst_pre_mem=0,in0 // prefetch dest pointer ;;(p7) mov ar.lc=cnt // prefetch count(p8) mov ar.lc=r0(p6) br.cond.dpnt .long_copy ;;.prefetch: lfetch.fault [src_pre_mem], 128 lfetch.fault.excl [dst_pre_mem], 128 br.cloop.dptk.few .prefetch ;;.medium_copy: and tmp=31,in2 // copy length after iteration shr.u r29=in2,5 // number of 32-byte iteration add dst1=8,dst0 // 2nd dest pointer ;; add cnt=-1,r29 // ctop iteration adjustment cmp.eq p10,p0=r29,r0 // do we really need to loop? add src1=8,src0 // 2nd src pointer cmp.le p6,p0=8,tmp ;; cmp.le p7,p0=16,tmp mov ar.lc=cnt // loop setup cmp.eq p16,p17 = r0,r0 mov ar.ec=2(p10) br.dpnt.few .aligned_src_tail ;; TEXT_ALIGN(32)1:EX(.ex_handler, (p16) ld8 r34=[src0],16)EK(.ex_handler, (p16) ld8 r38=[src1],16)EX(.ex_handler, (p17) st8 [dst0]=r33,16)EK(.ex_handler, (p17) st8 [dst1]=r37,16) ;;EX(.ex_handler, (p16) ld8 r32=[src0],16)EK(.ex_handler, (p16) ld8 r36=[src1],16)EX(.ex_handler, (p16) st8 [dst0]=r34,16)EK(.ex_handler, (p16) st8 [dst1]=r38,16) br.ctop.dptk.few 1b ;;.aligned_src_tail:EX(.ex_handler, (p6) ld8 t1=[src0]) mov ar.lc=saved_lc mov ar.pfs=saved_pfsEX(.ex_hndlr_s, (p7) ld8 t2=[src1],8) cmp.le p8,p0=24,tmp and r21=-8,tmp ;;EX(.ex_hndlr_s, (p8) ld8 t3=[src1])EX(.ex_handler, (p6) st8 [dst0]=t1) // store byte 1 and in2=7,tmp // remaining lengthEX(.ex_hndlr_d, (p7) st8 [dst1]=t2,8) // store byte 2 add src0=src0,r21 // setting up src pointer add dst0=dst0,r21 // setting up dest pointer ;;EX(.ex_handler, (p8) st8 [dst1]=t3) // store byte 3 
mov pr=saved_pr,-1 br.dptk.many .memcpy_short ;;/* code taken from copy_page_mck */.long_copy: .rotr v[2*PREFETCH_DIST] .rotp p[N] mov src_pre_mem = src0 mov pr.rot = 0x10000 mov ar.ec = 1 // special unrolled loop mov dst_pre_mem = dst0 add src_pre_l2 = 8*8, src0 add dst_pre_l2 = 8*8, dst0 ;; add src0 = 8, src_pre_mem // first t1 src mov ar.lc = 2*PREFETCH_DIST - 1 shr.u cnt=in2,7 // number of lines add src1 = 3*8, src_pre_mem // first t3 src add dst0 = 8, dst_pre_mem // first t1 dst add dst1 = 3*8, dst_pre_mem // first t3 dst ;; and tmp=127,in2 // remaining bytes after this block add cnt = -(2*PREFETCH_DIST) - 1, cnt // same as .line_copy loop, but with all predicated-off instructions removed:.prefetch_loop:EX(.ex_hndlr_lcpy_1, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0EK(.ex_hndlr_lcpy_1, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 br.ctop.sptk .prefetch_loop ;; cmp.eq p16, p0 = r0, r0 // reset p16 to 1 mov ar.lc = cnt mov ar.ec = N // # of stages in pipeline ;;.line_copy:EX(.ex_handler, (p[D]) ld8 t2 = [src0], 3*8) // M0EK(.ex_handler, (p[D]) ld8 t4 = [src1], 3*8) // M1EX(.ex_handler_lcpy, (p[B]) st8 [dst_pre_mem] = v[B], 128) // M2 prefetch dst from memoryEK(.ex_handler_lcpy, (p[D]) st8 [dst_pre_l2] = n8, 128) // M3 prefetch dst from L2 ;;EX(.ex_handler_lcpy, (p[A]) ld8 v[A] = [src_pre_mem], 128) // M0 prefetch src from memoryEK(.ex_handler_lcpy, (p[C]) ld8 n8 = [src_pre_l2], 128) // M1 prefetch src from L2EX(.ex_handler, (p[D]) st8 [dst0] = t1, 8) // M2EK(.ex_handler, (p[D]) st8 [dst1] = t3, 8) // M3 ;;EX(.ex_handler, (p[D]) ld8 t5 = [src0], 8)EK(.ex_handler, (p[D]) ld8 t7 = [src1], 3*8)EX(.ex_handler, (p[D]) st8 [dst0] = t2, 3*8)EK(.ex_handler, (p[D]) st8 [dst1] = t4, 3*8) ;;EX(.ex_handler, (p[D]) ld8 t6 = [src0], 3*8)EK(.ex_handler, (p[D]) ld8 t10 = [src1], 8)EX(.ex_handler, (p[D]) st8 [dst0] = t5, 8)EK(.ex_handler, (p[D]) st8 [dst1] = t7, 3*8) ;;EX(.ex_handler, (p[D]) ld8 t9 = [src0], 3*8)EK(.ex_handler, (p[D]) ld8 t11 = [src1], 3*8)EX(.ex_handler, (p[D]) 
st8 [dst0] = t6, 3*8)EK(.ex_handler, (p[D]) st8 [dst1] = t10, 8) ;;EX(.ex_handler, (p[D]) ld8 t12 = [src0], 8)EK(.ex_handler, (p[D]) ld8 t14 = [src1], 8)EX(.ex_handler, (p[D]) st8 [dst0] = t9, 3*8)EK(.ex_handler, (p[D]) st8 [dst1] = t11, 3*8) ;;EX(.ex_handler, (p[D]) ld8 t13 = [src0], 4*8)EK(.ex_handler, (p[D]) ld8 t15 = [src1], 4*8)EX(.ex_handler, (p[D]) st8 [dst0] = t12, 8)EK(.ex_handler, (p[D]) st8 [dst1] = t14, 8) ;;EX(.ex_handler, (p[C]) ld8 t1 = [src0], 8)EK(.ex_handler, (p[C]) ld8 t3 = [src1], 8)EX(.ex_handler, (p[D]) st8 [dst0] = t13, 4*8)EK(.ex_handler, (p[D]) st8 [dst1] = t15, 4*8) br.ctop.sptk .line_copy ;; add dst0=-8,dst0 add src0=-8,src0 mov in2=tmp .restore sp br.sptk.many .medium_copy ;;#define BLOCK_SIZE 128*32#define blocksize r23#define curlen r24// dest is on 8-byte boundary, src is not. We need to do// ld8-ld8, shrp, then st8. Max 8 byte copy per cycle..unaligned_src: .prologue .save ar.pfs, saved_pfs alloc saved_pfs=ar.pfs,3,5,0,8 .save ar.lc, saved_lc mov saved_lc=ar.lc .save pr, saved_pr mov saved_pr=pr .body.4k_block: mov saved_in0=dst0 // need to save all input arguments mov saved_in2=in2 mov blocksize=BLOCK_SIZE ;; cmp.lt p6,p7=blocksize,in2 mov saved_in1=src0 ;;(p6) mov in2=blocksize ;; shr.u r21=in2,7 // this much cache line shr.u r22=in2,4 // number of 16-byte iteration and curlen=15,in2 // copy length after iteration and r30=7,src0 // source alignment ;; cmp.lt p7,p8=1,r21 add cnt=-1,r21 ;; add src_pre_mem=0,src0 // prefetch src pointer add dst_pre_mem=0,dst0 // prefetch dest pointer and src0=-8,src0 // 1st src pointer(p7) mov ar.lc = cnt(p8) mov ar.lc = r0 ;; TEXT_ALIGN(32)1: lfetch.fault [src_pre_mem], 128 lfetch.fault.excl [dst_pre_mem], 128 br.cloop.dptk.few 1b ;; shladd dst1=r22,3,dst0 // 2nd dest pointer shladd src1=r22,3,src0 // 2nd src pointer cmp.eq p8,p9=r22,r0 // do we really need to loop? cmp.le p6,p7=8,curlen; // have at least 8 byte remaining? 
add cnt=-1,r22 // ctop iteration adjustment ;;EX(.ex_handler, (p9) ld8 r33=[src0],8) // loop primerEK(.ex_handler, (p9) ld8 r37=[src1],8)(p8) br.dpnt.few .noloop ;;// The jump address is calculated based on src alignment. The COPYU// macro below need to confine its size to power of two, so an entry// can be caulated using shl instead of an expensive multiply. The// size is then hard coded by the following #define to match the// actual size. This make it somewhat tedious when COPYU macro gets// changed and this need to be adjusted to match.#define LOOP_SIZE 61: mov r29=ip // jmp_table thread
// (removed: non-source residue — code-hosting page UI text describing
// keyboard shortcuts; it was never part of memcpy_mck.S)