📄 memcpy.s
字号:
// $Header: /home/harrison/c/tcgmsg/ipcv4.0/RCS/memcpy.s,v 1.1 91/12/06 17:26:46 harrison Exp Locker: harrison $//// RJH// C entry is same as standard library routine memcpy//// char *Memcpy (s1, s2, n)// char *s1, *s2;// int n;//// Memcpy() copies n characters from memory area s2 to s1. It// returns s1. // Standard library routine achieves about 3.8 Mbyte/s.// This does 38.2 Mbyte/s for 8 byte aligned input and output// 21.8 Mbyte/s for 4 ....// 6.3 Mbyte/s for unaligned data// The theoretical peak on the FX2800 is 80/2=40Mb/s for data// in the shared cache.//// FORTRAN entry is//// subroutine memcpy(a, b, n)// .text .globl _Memcpy // Fortran name .globl _memcpy_ // C name .align 16//// FORTRAN entry ... r18 is passed by reference ... load it in//_memcpy_: ld.l 0(r18), r18//// C entry//_Memcpy: mov r16, r19 // save r19 in return register adds -1, r0, r20 // store -1 in r20// or r19, r17, r22 // or addresses together and 7, r22, r0 // bc aligned8 // skip to 8 byte aligned code and 3, r22, r0 // bc aligned4 // skip to 4 byte aligned code br aligned1 // skip to 1 byte aligned code nop//// code for eight byte alignment ... four way unrolled doubles (32 bytes)// 38.2 Mbyte/s = full speed if input is cachable//aligned8: shr 5, r18, r21 // r21 = r18/32 shl 5, r21, r22 subs r18, r22, r18 // r18 = remainder adds -1, r21, r21// bla does 0,...,r21-1 bc aligned4 // skip if r21 < 1 adds -8, r19, r19 // prepare for autoinc bla r20, r21, loop8a adds -8, r17, r17 // prepare for autoincloop8a: fld.d 8(r17)++, f8 // get 8 bytes fld.d 8(r17)++, f10 // get 8 bytes fld.d 8(r17)++, f12 // get 8 bytes fld.d 8(r17)++, f14 // get 8 bytes fst.d f8, 8(r19)++ // store 8 bytes fst.d f10, 8(r19)++ // store 8 bytes fst.d f12, 8(r19)++ // store 8 bytes bla r20, r21, loop8a// decrement and branch fst.d f14, 8(r19)++ // store 8 bytes// adds 8, r19, r19 adds 8, r17, r17 // undo autoinc offsets and fall thru//// code for 4 byte aligned ... 4 way unrolled integer copy (16 bytes)// 21.8 Mbytes/s = about half speed if input is cachable//aligned4: shr 4, r18, r21 // r21 = r18/16 shl 4, r21, r22 subs r18, r22, r18 // r18 = remainder adds -1, r21, r21// bla does 0,...,r21-1 bc aligned1 // skip if r21 < 1 bla r20, r21, loop4a noploop4a: ld.l 0(r17), r22 // get 4 bytes ld.l 4(r17), r23 // get 4 bytes ld.l 8(r17), r24 // get 4 bytes ld.l 12(r17), r25 // get 4 bytes adds 16, r17, r17 // increment address st.l r22, 0(r19) // store 4 bytes st.l r23, 4(r19) // store 4 bytes st.l r24, 8(r19) // store 4 bytes st.l r25, 12(r19) // store 4 bytes bla r20, r21, loop4a// decrement and branch adds 16, r19, r19 // increment address in delay slot//// 2 byte aligned ... slower than single bytes ... deleted//// code for general alignment ... 4 way unrolled byte copy// 6.3 Mbytes/s if input is cachable//aligned1: shr 2, r18, r21 // r21 = r18/4 shl 2, r21, r22 subs r18, r22, r18 // r18 = remainder adds -1, r21, r21// bla does 0,...,r21-1 bc done1a // skip if r21 < 1 bla r20, r21, loop1a noploop1a: ld.b 0(r17), r22 // get byte ld.b 1(r17), r23 // get byte ld.b 2(r17), r24 // get byte ld.b 3(r17), r25 // get byte adds 4, r17, r17 // increment address st.b r22, 0(r19) // store byte st.b r23, 1(r19) // store byte st.b r24, 2(r19) // store byte st.b r25, 3(r19) // store byte bla r20, r21, loop1a adds 4, r19, r19 // increment address in delay slot//// tidy up loop for single byte copy//done1a: adds -1, r18, r18 // bla does 0,...,r18-1 bc done // skip if r18<1 bla r20, r18, loop1b noploop1b: ld.b 0(r17), r22 // get byte adds 1, r17, r17 // increment address st.b r22, 0(r19) // store byte bla r20, r18, loop1b // decrement and branch adds 1, r19, r19 // increment address in delay slot//done: bri r1 nop
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -