📄 copy_user.s
/*
 *
 * Optimized version of the copy_user() routine.
 * It is used to copy data across the kernel/user boundary.
 *
 * The source and destination are always on opposite sides of
 * the boundary. When reading from user space we must catch
 * faults on loads. When writing to user space we must catch
 * errors on stores. Note that because of the nature of the copy
 * we don't need to worry about overlapping regions.
 *
 *
 * Inputs:
 *	in0	address of source buffer
 *	in1	address of destination buffer
 *	in2	number of bytes to copy
 *
 * Outputs:
 *	ret0	0 in case of success. The number of bytes NOT copied in
 *		case of error.
 *
 * Copyright (C) 2000 Hewlett-Packard Co
 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
 *
 * Fixme:
 *	- handle the case where we have more than 16 bytes and the
 *	  alignments are different.
 *	- more benchmarking
 *	- fix extraneous stop bit introduced by the EX() macro.
 */
#include <asm/asmmacro.h>

// The label comes first because our store instruction contains a comma
// and confuses the preprocessor otherwise
//
#undef DEBUG
#ifdef DEBUG
#define EX(y,x...)				\
99:	x
#else
#define EX(y,x...)				\
	.section __ex_table,"a";		\
	data4 @gprel(99f);			\
	data4 y-99f;				\
	.previous;				\
99:	x
#endif

//
// Tuneable parameters
//
#define COPY_BREAK	16	// we do byte copy below (must be >=16)
#define PIPE_DEPTH	4	// pipe depth

#define EPI		p[PIPE_DEPTH-1]	// PASTE(p,16+PIPE_DEPTH-1)

//
// arguments
//
#define dst		in0
#define src		in1
#define len		in2

//
// local registers
//
#define t1		r2	// rshift in bytes
#define t2		r3	// lshift in bytes
#define rshift		r14	// right shift in bits
#define lshift		r15	// left shift in bits
#define word1		r16
#define word2		r17
#define cnt		r18
#define len2		r19
#define saved_lc	r20
#define saved_pr	r21
#define tmp		r22
#define val		r23
#define src1		r24
#define dst1		r25
#define src2		r26
#define dst2		r27
#define len1		r28
#define enddst		r29
#define endsrc		r30
#define saved_pfs	r31

	.text
	.psr	abi64
	.psr	lsb

GLOBAL_ENTRY(__copy_user)
	UNW(.prologue)
	UNW(.save ar.pfs, saved_pfs)
	alloc saved_pfs=ar.pfs,3,((2*PIPE_DEPTH+7)&~7),0,((2*PIPE_DEPTH+7)&~7)

	.rotr val1[PIPE_DEPTH],val2[PIPE_DEPTH]
	.rotp p[PIPE_DEPTH]

	adds len2=-1,len	// br.ctop is repeat/until
	mov ret0=r0
	;;			// RAW of cfm when len=0
	cmp.eq p8,p0=r0,len	// check for zero length
	UNW(.save ar.lc, saved_lc)
	mov saved_lc=ar.lc	// preserve ar.lc (slow)
(p8)	br.ret.spnt.few rp	// empty memcpy()
	;;
	add enddst=dst,len	// first byte after end of destination
	add endsrc=src,len	// first byte after end of source
	UNW(.save pr, saved_pr)
	mov saved_pr=pr		// preserve predicates

	UNW(.body)

	mov dst1=dst		// copy because of rotation
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false

	mov src1=src		// copy because of rotation
	mov ar.lc=len2		// initialize lc for small count
	cmp.lt p10,p7=COPY_BREAK,len	// if len > COPY_BREAK then long copy

	xor tmp=src,dst		// same alignment test prepare
(p10)	br.cond.dptk.few long_copy_user
	;;			// RAW pr.rot/p16 ?
	//
	// Now we do the byte by byte loop with software pipeline
	//
	// p7 is necessarily false by now
1:
	EX(failure_in_pipe1,(p16) ld1 val1[0]=[src1],1)
	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 1b
	;;
	mov ar.lc=saved_lc
	mov pr=saved_pr,0xffffffffffff0000
	mov ar.pfs=saved_pfs		// restore ar.ec
	br.ret.sptk.few rp		// end of short memcpy

	//
	// Not 8-byte aligned
	//
diff_align_copy_user:
	// At this point we know we have more than 16 bytes to copy
	// and also that src and dest do _not_ have the same alignment.
	and src2=0x7,src1				// src offset
	and dst2=0x7,dst1				// dst offset
	;;
	// The basic idea is that we copy byte-by-byte at the head so
	// that we can reach 8-byte alignment for both src1 and dst1.
	// Then copy the body using software pipelined 8-byte copy,
	// shifting the two back-to-back words right and left, then copy
	// the tail by copying byte-by-byte.
	//
	// Fault handling. If the byte-by-byte at the head fails on the
	// load, then restart and finish the pipeline by copying zeros
	// to the dst1. Then copy zeros for the rest of dst1.
	// If the 8-byte software pipeline fails on the load, do the same as
	// failure_in3 does. If the byte-by-byte at the tail fails, it is
	// handled simply by failure_in_pipe1.
	//
	// The case p14 represents that the source has more bytes in the
	// first word (by the shifted part), whereas the p15 case needs to
	// copy some bytes from the 2nd word of the source that holds the
	// tail of the 1st word of the destination.
	//
	//
	// Optimization. If dst1 is 8-byte aligned (not a rare case), we don't
	// need to copy the head to dst1 before starting the 8-byte copy
	// software pipeline. We know src1 is not 8-byte aligned in this case.
	//
	cmp.eq p14,p15=r0,dst2
(p15)	br.cond.spnt.few 1f
	;;
	sub t1=8,src2
	mov t2=src2
	;;
	shl rshift=t2,3
	sub len1=len,t1					// set len1
	;;
	sub lshift=64,rshift
	;;
	br.cond.spnt.few word_copy_user
	;;
1:
	cmp.leu	p14,p15=src2,dst2
	sub t1=dst2,src2
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub word1=8,src2				// (8 - src offset)
(p15)	sub t1=r0,t1					// absolute value
(p15)	sub word1=8,dst2				// (8 - dst offset)
	;;
	// For the case p14, we don't need to copy the shifted part to
	// the 1st word of destination.
	sub t2=8,t1
(p14)	sub word1=word1,t1
	;;
	sub len1=len,word1				// resulting len
(p15)	shl rshift=t1,3					// in bits
(p14)	shl rshift=t2,3
	;;
(p14)	sub len1=len1,t1
	adds cnt=-1,word1
	;;
	sub lshift=64,rshift
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false
	mov ar.lc=cnt
	;;
2:
	EX(failure_in_pipe2,(p16) ld1 val1[0]=[src1],1)
	;;
	EX(failure_out,(EPI) st1 [dst1]=val1[PIPE_DEPTH-1],1)
	br.ctop.dptk.few 2b
	;;
	clrrrb
	;;
word_copy_user:
	cmp.gtu p9,p0=16,len1
(p9)	br.cond.spnt.few 4f		// if (16 > len1) skip 8-byte copy
	;;
	shr.u cnt=len1,3		// number of 64-bit words
	;;
	adds cnt=-1,cnt
	;;
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t2
(p15)	sub src1=src1,t1
	//
	// Now both src1 and dst1 point to an 8-byte aligned address. And
	// we have more than 8 bytes to copy.
	//
	mov ar.lc=cnt
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false
	;;
3:
	//
	// The pipeline consists of 3 stages:
	//	1 (p16):	Load a word from src1
	//	2 (EPI_1):	Shift right pair, saving to tmp
	//	3 (EPI):	Store tmp to dst1
	//
	// To make it simple, use at least 2 (p16) loops to set up val1[n]
	// because we need 2 back-to-back val1[] to get tmp.
	// Note that this implies EPI_1 must be p18 or greater.
	//

#define EPI_1		p[PIPE_DEPTH-2]
#define SWITCH(pred, shift)	cmp.eq pred,p0=shift,rshift
#define CASE(pred, shift)	\
(pred)	br.cond.spnt.few copy_user_bit##shift
#define BODY(rshift)						\
copy_user_bit##rshift:						\
1:								\
	EX(failure_out,(EPI) st8 [dst1]=tmp,8);			\
(EPI_1)	shrp tmp=val1[PIPE_DEPTH-3],val1[PIPE_DEPTH-2],rshift;	\
	EX(failure_in2,(p16) ld8 val1[0]=[src1],8);		\
	br.ctop.dptk.few 1b;					\
	;;							\
	br.cond.spnt.few .diff_align_do_tail

	//
	// Since the instruction 'shrp' requires the shift count to be a
	// fixed (immediate) value, we need to provide the 7 cases below.
	//
	SWITCH(p6, 8)
	SWITCH(p7, 16)
	SWITCH(p8, 24)
	SWITCH(p9, 32)
	SWITCH(p10, 40)
	SWITCH(p11, 48)
	SWITCH(p12, 56)
	;;
	CASE(p6, 8)
	CASE(p7, 16)
	CASE(p8, 24)
	CASE(p9, 32)
	CASE(p10, 40)
	CASE(p11, 48)
	CASE(p12, 56)
	;;
	BODY(8)
	BODY(16)
	BODY(24)
	BODY(32)
	BODY(40)
	BODY(48)
	BODY(56)
	;;
.diff_align_do_tail:
	.pred.rel "mutex", p14, p15
(p14)	sub src1=src1,t1
(p14)	adds dst1=-8,dst1
(p15)	sub dst1=dst1,t1
	;;
4:
	// Tail correction.
	//
	// The problem with this pipelined loop is that the last word is not
	// loaded and thus part of the last word written is not correct.
	// To fix that, we simply copy the tail byte by byte.
	sub len1=endsrc,src1,1
	clrrrb
	;;
	mov ar.ec=PIPE_DEPTH
	mov pr.rot=1<<16	// p16=true, all others are false
	mov ar.lc=len1
	;;
5:
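The listing above breaks off after label 5:. For readability, here is a very rough C model of the copy strategy the routine implements: a plain byte loop for short copies, a head byte copy to align the destination, an 8-byte body that merges pairs of aligned source words with a fixed per-alignment shift (what the shrp/BODY() cases do), and a byte-by-byte tail correction. This is an illustrative sketch only; copy_user_model() and its locals are hypothetical names, and it deliberately omits the fault handling and software pipelining that are the whole point of the assembly version.

/*
 * Illustrative sketch only: a rough C model of the copy strategy used by
 * __copy_user above. It omits fault handling and software pipelining and
 * only mirrors the "different alignment" path shown in this file.
 * copy_user_model() and its variable names are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define COPY_BREAK	16	/* below this, plain byte copy (loop "1:") */

static void copy_user_model(unsigned char *dst, const unsigned char *src,
			    size_t len)
{
	if (len <= COPY_BREAK) {		/* short copy: byte loop */
		while (len--)
			*dst++ = *src++;
		return;
	}

	/* Head (loop "2:"): byte copy until dst is 8-byte aligned. */
	while (((uintptr_t)dst & 7) && len) {
		*dst++ = *src++;
		len--;
	}

	/*
	 * Body (label "3:" and the BODY() cases): read aligned 8-byte
	 * words on the source side and merge two back-to-back words with
	 * a fixed per-alignment shift, as shrp does. The same-alignment
	 * case (long_copy_user, not shown in this excerpt) would use a
	 * straight 8-byte copy instead.
	 */
	if (((uintptr_t)src & 7) != 0) {
		unsigned int rshift = ((uintptr_t)src & 7) * 8;
		const uint64_t *ws =
			(const uint64_t *)((uintptr_t)src & ~(uintptr_t)7);
		uint64_t lo = *ws++;

		while (len >= 16) {		/* little-endian merge */
			uint64_t hi = *ws++;
			uint64_t w = (lo >> rshift) | (hi << (64 - rshift));

			memcpy(dst, &w, 8);
			lo = hi;
			dst += 8;
			src += 8;
			len -= 8;
		}
	}

	/* Tail correction (label "4:"): copy what is left byte by byte. */
	while (len--)
		*dst++ = *src++;
}

On a fault, the real routine returns in ret0 the number of bytes it could not copy, as described in the header comment; this model makes no attempt to reproduce that behaviour.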