📄 memcpy_mck.s
字号:
/*
 * NOTE(review): this chunk starts part-way through a section whose beginning
 * is not visible here.  The "1b" below refers to a local label "1:" defined
 * earlier in the file, and LOOP_SIZE, EX(), EK(), TEXT_ALIGN() and the
 * register aliases (src0, dst0, saved_in0, curlen, t1..t7, ...) are defined
 * outside this view.
 *
 * The statements below compute a target address inside .jump_table from r30
 * and branch to it through b6.
 */
        ;;
        add     r29=.jump_table - 1b - (.jmp1-.jump_table), r29
        shl     r28=r30, LOOP_SIZE                      // jmp_table thread
        mov     ar.ec=2                                 // loop setup
        ;;
        add     r29=r29,r28                             // jmp_table thread
        cmp.eq  p16,p17=r0,r0
        ;;
        mov     b6=r29                                  // jmp_table thread
        ;;
        br.cond.sptk.few b6

// for 8-15 byte case
// We will skip the loop, but need to replicate the side effect
// that the loop produces.
.noloop:
EX(.ex_handler, (p6)    ld8     r37=[src1],8)
        add     src0=8,src0
(p6)    shl     r25=r30,3
        ;;
EX(.ex_handler, (p6)    ld8     r27=[src1])
(p6)    shr.u   r28=r37,r25
(p6)    sub     r26=64,r25
        ;;
(p6)    shl     r27=r27,r26
        ;;
(p6)    or      r21=r28,r27

.unaligned_src_tail:
/* check if we have more than blocksize to copy, if so go back */
        cmp.gt  p8,p0=saved_in2,blocksize
        ;;
(p8)    add     dst0=saved_in0,blocksize
(p8)    add     src0=saved_in1,blocksize
(p8)    sub     in2=saved_in2,blocksize
(p8)    br.dpnt .4k_block
        ;;

/* we have up to 15 byte to copy in the tail.
 * part of work is already done in the jump table code
 * we are at the following state.
 * src side:
 *
 *   xxxxxx xx                   <----- r21 has xxxxxxxx already
 * -------- -------- --------
 * 0        8        16
 *          ^
 *          |
 *          src1
 *
 * dst
 * -------- -------- --------
 * ^
 * |
 * dst1
 */
EX(.ex_handler, (p6)    st8     [dst1]=r21,8)           // more than 8 byte to copy
(p6)    add     curlen=-8,curlen                        // update length
        mov     ar.pfs=saved_pfs
        ;;
        mov     ar.lc=saved_lc
        mov     pr=saved_pr,-1
        mov     in2=curlen                              // remaining length
        mov     dst0=dst1                               // dest pointer
        add     src0=src1,r30                           // forward by src alignment
        ;;

// 7 byte or smaller.
// src0/dst0 and src1/dst1 start one byte apart and each access advances its
// pointer by 2, so the two pointer pairs handle alternating bytes.
.memcpy_short:
        cmp.le  p8,p9   = 1,in2
        cmp.le  p10,p11 = 2,in2
        cmp.le  p12,p13 = 3,in2
        cmp.le  p14,p15 = 4,in2
        add     src1=1,src0                             // second src pointer
        add     dst1=1,dst0                             // second dest pointer
        ;;

EX(.ex_handler_short, (p8)      ld1     t1=[src0],2)
EK(.ex_handler_short, (p10)     ld1     t2=[src1],2)
(p9)    br.ret.dpnt rp                                  // 0 byte copy
        ;;

EX(.ex_handler_short, (p8)      st1     [dst0]=t1,2)
EK(.ex_handler_short, (p10)     st1     [dst1]=t2,2)
(p11)   br.ret.dpnt rp                                  // 1 byte copy

EX(.ex_handler_short, (p12)     ld1     t3=[src0],2)
EK(.ex_handler_short, (p14)     ld1     t4=[src1],2)
(p13)   br.ret.dpnt rp                                  // 2 byte copy
        ;;

        cmp.le  p6,p7   = 5,in2
        cmp.le  p8,p9   = 6,in2
        cmp.le  p10,p11 = 7,in2

EX(.ex_handler_short, (p12)     st1     [dst0]=t3,2)
EK(.ex_handler_short, (p14)     st1     [dst1]=t4,2)
(p15)   br.ret.dpnt rp                                  // 3 byte copy
        ;;

EX(.ex_handler_short, (p6)      ld1     t5=[src0],2)
EK(.ex_handler_short, (p8)      ld1     t6=[src1],2)
(p7)    br.ret.dpnt rp                                  // 4 byte copy
        ;;

EX(.ex_handler_short, (p6)      st1     [dst0]=t5,2)
EK(.ex_handler_short, (p8)      st1     [dst1]=t6,2)
(p9)    br.ret.dptk rp                                  // 5 byte copy

EX(.ex_handler_short, (p10)     ld1     t7=[src0],2)
(p11)   br.ret.dptk rp                                  // 6 byte copy
        ;;

EX(.ex_handler_short, (p10)     st1     [dst0]=t7,2)
        br.ret.dptk rp                                  // done all cases


/* Align dest to nearest 8-byte boundary. We know we have at
 * least 7 bytes to copy, enough to crawl to 8-byte boundary.
 * Actual number of byte to crawl depend on the dest alignment.
 * 7 byte or less is taken care at .memcpy_short
 *
 * src0 - source even index
 * src1 - source odd index
 * dst0 - dest even index
 * dst1 - dest odd index
 * r30  - distance to 8-byte boundary
 */
.align_dest:
        add     src1=1,in1                              // source odd index
        cmp.le  p7,p0 = 2,r30                           // for .align_dest
        cmp.le  p8,p0 = 3,r30                           // for .align_dest
EX(.ex_handler_short, (p6)      ld1     t1=[src0],2)
        cmp.le  p9,p0 = 4,r30                           // for .align_dest
        cmp.le  p10,p0 = 5,r30
        ;;
EX(.ex_handler_short, (p7)      ld1     t2=[src1],2)
EK(.ex_handler_short, (p8)      ld1     t3=[src0],2)
        cmp.le  p11,p0 = 6,r30
EX(.ex_handler_short, (p6)      st1     [dst0] = t1,2)
        cmp.le  p12,p0 = 7,r30
        ;;
EX(.ex_handler_short, (p9)      ld1     t4=[src1],2)
EK(.ex_handler_short, (p10)     ld1     t5=[src0],2)
EX(.ex_handler_short, (p7)      st1     [dst1] = t2,2)
EK(.ex_handler_short, (p8)      st1     [dst0] = t3,2)
        ;;
EX(.ex_handler_short, (p11)     ld1     t6=[src1],2)
EK(.ex_handler_short, (p12)     ld1     t7=[src0],2)
        cmp.eq  p6,p7=r28,r29
EX(.ex_handler_short, (p9)      st1     [dst1] = t4,2)
EK(.ex_handler_short, (p10)     st1     [dst0] = t5,2)
        sub     in2=in2,r30
        ;;
EX(.ex_handler_short, (p11)     st1     [dst1] = t6,2)
EK(.ex_handler_short, (p12)     st1     [dst0] = t7)
        add     dst0=in0,r30                            // setup arguments
        add     src0=in1,r30
(p6)    br.cond.dptk .aligned_src
(p7)    br.cond.dpnt .unaligned_src
        ;;

/* main loop body in jump table format */
#define COPYU(shift)                                                                    \
1:                                                                                      \
EX(.ex_handler,  (p16)  ld8     r32=[src0],8);          /* 1 */                         \
EK(.ex_handler,  (p16)  ld8     r36=[src1],8);                                          \
                 (p17)  shrp    r35=r33,r34,shift;;     /* 1 */                         \
EX(.ex_handler,  (p6)   ld8     r22=[src1]);    /* common, prime for tail section */    \
                 nop.m  0;                                                              \
                 (p16)  shrp    r38=r36,r37,shift;                                      \
EX(.ex_handler,  (p17)  st8     [dst0]=r35,8);          /* 1 */                         \
EK(.ex_handler,  (p17)  st8     [dst1]=r39,8);                                          \
                 br.ctop.dptk.few 1b;;                                                  \
                 (p7)   add     src1=-8,src1;   /* back out for <8 byte case */         \
                 shrp   r21=r22,r38,shift;      /* speculative work */                  \
                 br.sptk.few .unaligned_src_tail /* branch out of jump table */         \
                 ;;

        TEXT_ALIGN(32)
.jump_table:
        COPYU(8)        // unaligned cases
.jmp1:
        COPYU(16)
        COPYU(24)
        COPYU(32)
        COPYU(40)
        COPYU(48)
        COPYU(56)

#undef A
#undef B
#undef C
#undef D

/*
 * Due to lack of local tag support in gcc 2.x assembler, it is not clear which
 * instruction failed in the bundle.  The exception algorithm is that we
 * first figure out the faulting address, then detect if there is any
 * progress made on the copy, if so, redo the copy from last known copied
 * location up to the faulting address (exclusive). In the copy_from_user
 * case, remaining byte in kernel buffer will be zeroed.
 *
 * Take copy_from_user as an example, in the code there are multiple loads
 * in a bundle and those multiple loads could span over two pages, the
 * faulting address is calculated as page_round_down(max(src0, src1)).
 * This is based on knowledge that if we can access one byte in a page, we
 * can access any byte in that page.
 *
 * predicate used in the exception handler:
 * p6-p7: direction
 * p10-p11: src faulting addr calculation
 * p12-p13: dst faulting addr calculation
 */

#define A       r19
#define B       r20
#define C       r21
#define D       r22
#define F       r28

#define memset_arg0     r32
#define memset_arg2     r33

#define saved_retval    loc0
#define saved_rtlink    loc1
#define saved_pfs_stack loc2

.ex_hndlr_s:
        add     src0=8,src0
        br.sptk .ex_handler
        ;;
.ex_hndlr_d:
        add     dst0=8,dst0
        br.sptk .ex_handler
        ;;
.ex_hndlr_lcpy_1:
        mov     src1=src_pre_mem
        mov     dst1=dst_pre_mem
        cmp.gtu p10,p11=src_pre_mem,saved_in1
        cmp.gtu p12,p13=dst_pre_mem,saved_in0
        ;;
(p10)   add     src0=8,saved_in1
(p11)   mov     src0=saved_in1
(p12)   add     dst0=8,saved_in0
(p13)   mov     dst0=saved_in0
        br.sptk .ex_handler
.ex_handler_lcpy:
        // in line_copy block, the preload addresses should always ahead
        // of the other two src/dst pointers.  Furthermore, src1/dst1 should
        // always ahead of src0/dst0.
        mov     src1=src_pre_mem
        mov     dst1=dst_pre_mem
.ex_handler:
        mov     pr=saved_pr,-1                          // first restore pr, lc, and pfs
        mov     ar.lc=saved_lc
        mov     ar.pfs=saved_pfs
        ;;
.ex_handler_short: // fault occurred in these sections didn't change pr, lc, pfs
        cmp.ltu p6,p7=saved_in0, saved_in1              // get the copy direction
        cmp.ltu p10,p11=src0,src1
        cmp.ltu p12,p13=dst0,dst1
        fcmp.eq p8,p0=f6,f0                             // is it memcpy?
        mov     tmp = dst0
        ;;
(p11)   mov     src1 = src0                             // pick the larger of the two
(p13)   mov     dst0 = dst1                             // make dst0 the smaller one
(p13)   mov     dst1 = tmp                              // and dst1 the larger one
        ;;
(p6)    dep     F = r0,dst1,0,PAGE_SHIFT                // usr dst round down to page boundary
(p7)    dep     F = r0,src1,0,PAGE_SHIFT                // usr src round down to page boundary
        ;;
(p6)    cmp.le  p14,p0=dst0,saved_in0                   // no progress has been made on store
(p7)    cmp.le  p14,p0=src0,saved_in1                   // no progress has been made on load
        mov     retval=saved_in2
(p8)    ld1     tmp=[src1]                              // force an oops for memcpy call
(p8)    st1     [dst1]=r0                               // force an oops for memcpy call
(p14)   br.ret.sptk.many rp

/*
 * The remaining byte to copy is calculated as:
 *
 * A =  (faulting_addr - orig_src)      -> len to faulting ld address
 *      or
 *      (faulting_addr - orig_dst)      -> len to faulting st address
 * B =  (cur_dst - orig_dst)            -> len copied so far
 * C =  A - B                           -> len need to be copied
 * D =  orig_len - A                    -> len need to be zeroed
 */
(p6)    sub     A = F, saved_in0
(p7)    sub     A = F, saved_in1
        clrrrb
        ;;
        alloc   saved_pfs_stack=ar.pfs,3,3,3,0
        cmp.lt  p8,p0=A,r0
        sub     B = dst0, saved_in0                     // how many byte copied so far
        ;;
(p8)    mov     A = 0;                                  // A shouldn't be negative, cap it
        ;;
        sub     C = A, B
        sub     D = saved_in2, A
        ;;
        cmp.gt  p8,p0=C,r0                              // at least 1 byte left to copy?
        add     memset_arg0=saved_in0, A
(p6)    mov     memset_arg2=0                           // copy_to_user should not call memset
(p7)    mov     memset_arg2=D                           // copy_from_user need to have kbuf zeroed
        mov     r8=0
        mov     saved_retval = D
        mov     saved_rtlink = b0

        add     out0=saved_in0, B
        add     out1=saved_in1, B
        mov     out2=C
(p8)    br.call.sptk.few b0=__copy_user                 // recursive call
        ;;

        add     saved_retval=saved_retval,r8            // above might return non-zero value
        cmp.gt  p8,p0=memset_arg2,r0                    // at least 1 byte to zero?
        mov     out0=memset_arg0                        // *s
        mov     out1=r0                                 // c
        mov     out2=memset_arg2                        // n
(p8)    br.call.sptk.few b0=memset
        ;;

        mov     retval=saved_retval
        mov     ar.pfs=saved_pfs_stack
        mov     b0=saved_rtlink
        br.ret.sptk.many rp

/* end of McKinley specific optimization */
END(__copy_user)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -