⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 memcpy.s

📁 spice中支持多层次元件模型仿真的可单独运行的插件源码
💻 S
字号:
// $Header: /home/harrison/c/tcgmsg/ipcv4.0/RCS/memcpy.s,v 1.1 91/12/06 17:26:46 harrison Exp Locker: harrison $
//
//     RJH
//
//     C entry is the same as the standard library routine memcpy:
//
//         char *Memcpy (s1, s2, n)
//         char *s1, *s2;
//         int n;
//
//     Memcpy() copies n characters from memory area s2 to s1.  It
//     returns s1.
//
//     FORTRAN entry is
//
//         subroutine memcpy(a, b, n)
//
//     which differs only in that n is passed by reference.
//
//     Performance (figures from the original author):
//       Standard library routine achieves about 3.8 Mbyte/s.
//       This does 38.2 Mbyte/s for 8 byte aligned input and output
//                 21.8 Mbyte/s for 4 byte aligned input and output
//                  6.3 Mbyte/s for unaligned data
//       The theoretical peak on the FX2800 is 80/2=40Mb/s for data
//       in the shared cache.
//
//     Register usage:
//       r1        return address (used by the final bri)
//       r16       s1 on entry; never written, so it still holds s1
//                 (the return value) on exit
//       r17       s2 on entry; source pointer, advanced as data is read
//       r18       n on entry (address of n for the FORTRAN entry);
//                 afterwards the number of bytes still to be copied
//       r19       working destination pointer (copy of r16)
//       r20       constant -1, the per-iteration increment given to bla
//       r21       iteration counter for the unrolled loops
//       r22-r25   scratch / data in flight
//       f8, f10,
//       f12, f14  data in flight in the 8 byte loop
//
//     Strategy: the alignment of (s1 | s2) picks the widest loop that
//     both pointers allow.  Each stage copies as many whole chunks as
//     it can and falls through to the next narrower stage with the
//     remainder in r18:
//         aligned8 (32 bytes/iter) -> aligned4 (16 bytes/iter)
//                  -> aligned1 (4 bytes/iter) -> done1a (1 byte/iter)
//
//     Data is copied in ascending address order and there is no
//     handling of overlapping areas.
//     NOTE(review): the chunk counts are formed with shr, so a negative
//     n is presumably treated as a very large count -- confirm that
//     callers never pass n < 0.
//
//     Layout convention: the instruction in a branch delay slot is
//     indented two extra spaces.
//
	.text
	.globl		_Memcpy		// C name
	.globl		_memcpy_	// Fortran name
	.align		16
//
//	FORTRAN entry ... n (r18) is passed by reference ... load the
//	value and fall through into the C entry
//
_memcpy_:
	ld.l	0(r18), r18		// r18 = n
//
//	C entry
//
_Memcpy:
	mov	r16, r19		// r19 = working dest; r16 stays = s1 for return
	adds	-1, r0, r20		// r20 = -1 (loop increment for bla)
//
//	Choose the copy loop from the combined alignment of both pointers
//
	or	r19, r17, r22		// or addresses together
	and	7, r22, r0		// low 3 bits all zero?
	bc	aligned8		// skip to 8 byte aligned code
	and	3, r22, r0		// low 2 bits all zero?
	bc	aligned4		// skip to 4 byte aligned code
	br	aligned1		// skip to 1 byte aligned code
	  nop
//
//	code for eight byte alignment ... four way unrolled doubles (32 bytes)
//	38.2 Mbyte/s = full speed if input is cachable
//
aligned8:
	shr	5, r18, r21		// r21 = r18/32
	shl	5, r21, r22		// r22 = bytes this loop will copy
	subs	r18, r22, r18		// r18 = remainder
	adds	-1, r21, r21		// bla does 0,...,r21-1
	bc	aligned4		// skip if r21 < 1
	adds	-8, r19, r19		// prepare for autoinc
	bla	r20, r21, loop8a	// prime the loop condition
	  adds	-8, r17, r17		// prepare for autoinc
loop8a:
	fld.d	8(r17)++, f8		// get 8 bytes
	fld.d	8(r17)++, f10		// get 8 bytes
	fld.d	8(r17)++, f12		// get 8 bytes
	fld.d	8(r17)++, f14		// get 8 bytes
	fst.d	f8, 8(r19)++		// store 8 bytes
	fst.d	f10, 8(r19)++		// store 8 bytes
	fst.d	f12, 8(r19)++		// store 8 bytes
	bla	r20, r21, loop8a	// decrement and branch
	  fst.d	f14, 8(r19)++		// store 8 bytes
//
	adds	8, r19, r19
	adds	8, r17, r17		// undo autoinc offsets and fall thru
//
//	code for 4 byte aligned ... 4 way unrolled integer copy (16 bytes)
//	21.8 Mbytes/s = about half speed if input is cachable
//
aligned4:
	shr	4, r18, r21		// r21 = r18/16
	shl	4, r21, r22		// r22 = bytes this loop will copy
	subs	r18, r22, r18		// r18 = remainder
	adds	-1, r21, r21		// bla does 0,...,r21-1
	bc	aligned1		// skip if r21 < 1
	bla	r20, r21, loop4a	// prime the loop condition
	  nop
loop4a:
	ld.l	0(r17), r22		// get 4 bytes
	ld.l	4(r17), r23		// get 4 bytes
	ld.l	8(r17), r24		// get 4 bytes
	ld.l	12(r17), r25		// get 4 bytes
	adds	16, r17, r17		// increment address
	st.l	r22, 0(r19)		// store 4 bytes
	st.l	r23, 4(r19)		// store 4 bytes
	st.l	r24, 8(r19)		// store 4 bytes
	st.l	r25, 12(r19)		// store 4 bytes
	bla	r20, r21, loop4a	// decrement and branch
	  adds	16, r19, r19		// increment address in delay slot
//
//	2 byte aligned ... slower than single bytes ... deleted
//
//	code for general alignment ... 4 way unrolled byte copy
//	6.3 Mbytes/s if input is cachable
//
aligned1:
	shr	2, r18, r21		// r21 = r18/4
	shl	2, r21, r22		// r22 = bytes this loop will copy
	subs	r18, r22, r18		// r18 = remainder
	adds	-1, r21, r21		// bla does 0,...,r21-1
	bc	done1a			// skip if r21 < 1
	bla	r20, r21, loop1a	// prime the loop condition
	  nop
loop1a:
	ld.b	0(r17), r22		// get byte
	ld.b	1(r17), r23		// get byte
	ld.b	2(r17), r24		// get byte
	ld.b	3(r17), r25		// get byte
	adds	4, r17, r17		// increment address
	st.b	r22, 0(r19)		// store byte
	st.b	r23, 1(r19)		// store byte
	st.b	r24, 2(r19)		// store byte
	st.b	r25, 3(r19)		// store byte
	bla	r20, r21, loop1a	// decrement and branch
	  adds	4, r19, r19		// increment address in delay slot
//
//	tidy up loop for single byte copy
//
done1a:
	adds	-1, r18, r18		// bla does 0,...,r18-1
	bc	done			// skip if r18 < 1
	bla	r20, r18, loop1b	// prime the loop condition
	  nop
loop1b:
	ld.b	0(r17), r22		// get byte
	adds	1, r17, r17		// increment address
	st.b	r22, 0(r19)		// store byte
	bla	r20, r18, loop1b	// decrement and branch
	  adds	1, r19, r19		// increment address in delay slot
//
//	return to caller; r16 still holds s1
//
done:
	bri	r1
	  nop

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -