⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 memcpy-sh4.s

📁 linux-2.6.15.6
💻 S
📖 第 1 页 / 共 2 页
字号:
/*
 * "memcpy" implementation of SuperH
 *
 * Copyright (C) 1999  Niibe Yutaka
 * Copyright (c) 2002  STMicroelectronics Ltd
 *   Modified from memcpy.S and micro-optimised for SH4
 *   Stuart Menefy (stuart.menefy@st.com)
 *
 */
#include <linux/linkage.h>
#include <linux/config.h>

/*
 * void *memcpy(void *dst, const void *src, size_t n);
 *
 * It is assumed that there is no overlap between src and dst.
 * If there is an overlap, then the results are undefined.
 *
 * Register usage, as established by the entry code at ENTRY(memcpy):
 *   r4 = dst (left unmodified, used as the lower bound of the copy)
 *   r5 = src on entry, then rewritten to an offset so that r0+r5
 *        addresses the source byte matching destination address r0
 *   r6 = n
 *   r0 = dst + n; every store is a pre-decrement store through r0,
 *        so the copy runs from the end of the buffers downwards
 *   r1, r2, r3, r7 = scratch
 *
 * Instructions indented by one extra space sit in a branch delay slot.
 * The letter diagrams (GHIJ KLMN OPQR) show how source long words are
 * recombined into destination long words for each relative alignment.
 */

	!
	!	GHIJ KLMN OPQR -->  ...G HIJK LMNO PQR.
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase1:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-1,r5		!  79 EX
	mov	r4,r2		!   5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	!  21 LS (2 cycles latency)
	add	#-4,r5		!  50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll16	r3		! 103 EX

	mov	r1,r6		!   5 MT (latency=0)
	shll8	r3		! 102 EX		! Oxxx

	shlr8	r6		! 106 EX		! xNML
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! ONML
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! KLMN
	mov	r7,r3		!   5 MT (latency=0)	! OPQR

	cmp/hi	r2,r0		!  57 MT
	shlr16	r3		! 107 EX

	shlr8	r3		! 106 EX		! xxxO
	mov	r1,r6		!   5 MT (latency=0)

	shll8	r6		! 102 EX		! LMNx
	mov	r1,r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! LMNO
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#endif
	! Finally, copy a byte at once, if necessary

	add	#4,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

	!
	!	GHIJ KLMN OPQR -->  .GHI JKLM NOPQ R...
	!

	! Size is 16 or greater, and may have trailing bytes

	.balign	32
.Lcase3:
	! Read a long word and write a long word at once
	! At the start of each iteration, r7 contains last long load
	add	#-3,r5		! 79 EX
	mov	r4,r2		!  5 MT (0 cycles latency)

	mov.l	@(r0,r5),r7	! 21 LS (2 cycles latency)
	add	#-4,r5		! 50 EX

	add	#7,r2		!  79 EX
	!
#ifdef CONFIG_CPU_LITTLE_ENDIAN
	! 6 cycles, 4 bytes per iteration
3:	mov.l	@(r0,r5),r1	!  21 LS (latency=2)	! NMLK
	mov	r7, r3		!   5 MT (latency=0)	! RQPO

	cmp/hi	r2,r0		!  57 MT
	shll8	r3		! 102 EX		! QPOx

	mov	r1,r6		!   5 MT (latency=0)
	shlr16	r6		! 107 EX

	shlr8	r6		! 106 EX		! xxxN
	mov	r1, r7		!   5 MT (latency=0)

	or	r6,r3		!  82 EX		! QPON
	bt/s	3b		! 109 BR

	 mov.l	r3,@-r0		!  30 LS
#else
	! The previously loaded long word is carried in r7: the code
	! above this loop loads the first one into r7, so the loop has
	! to consume r7 and refill r7.  Reading it from r1 instead
	! would use a stale r1 on the first pass and store a corrupt
	! long word.
3:	mov	r7,r3		! OPQR
	shlr8	r3		! xOPQ
	mov.l	@(r0,r5),r7	! KLMN
	mov	r7,r6
	shll16	r6
	shll8	r6		! Nxxx
	or	r6,r3		! NOPQ
	cmp/hi	r2,r0
	bt/s	3b
	 mov.l	r3,@-r0
#endif

	! Finally, copy a byte at once, if necessary

	add	#6,r5		!  50 EX
	cmp/eq	r4,r0		!  54 MT

	add	#-6,r2		!  50 EX
	bt	9f		! 109 BR

8:	cmp/hi	r2,r0		!  57 MT
	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

	bt/s	8b		! 109 BR

	 mov.b	r1,@-r0		!  29 LS

9:	rts
	 nop

ENTRY(memcpy)

	! Calculate the invariants which will be used in the remainder
	! of the code:
	!
	!      r4   -->  [ ...  ] DST             [ ...  ] SRC
	!	         [ ...  ]                 [ ...  ]
	!	           :                        :
	!      r0   -->  [ ...  ]       r0+r5 --> [ ...  ]
	!
	!

	! Short circuit the common case of src, dst and len being 32 bit aligned
	! and test for zero length move

	mov	r6, r0		!   5 MT (0 cycle latency)
	or	r4, r0		!  82 EX

	or	r5, r0		!  82 EX
	tst	r6, r6		!  86 MT

	! The tst in the delay slot below sets T for the later
	! "bt/s .Lcase00"; the mov/add/mov in between leave T alone.
	bt/s	99f		! 111 BR		(zero len)
	 tst	#3, r0		!  87 MT

	mov	r4, r0		!   5 MT (0 cycle latency)
	add	r6, r0		!  49 EX

	mov	#16, r1		!   6 EX
	bt/s	.Lcase00	! 111 BR		(aligned)

	 sub	r4, r5		!  75 EX

	! Arguments are not nicely long word aligned or zero len.
	! Check for small copies, and if so do a simple byte at a time copy.
	!
	! Deciding on an exact value of 'small' is not easy, as the point at which
	! using the optimised routines become worthwhile varies (these are the
	! cycle counts for different sizes using byte-at-a-time vs. optimised):
	!	size	byte-at-time	long	word	byte
	!	16	42		39-40	46-50	50-55
	!	24	58		43-44	54-58	62-67
	!	36	82		49-50	66-70	80-85
	! However the penalty for getting it 'wrong' is much higher for long word
	! aligned data (and this is more common), so use a value of 16.

	cmp/gt	r6,r1		!  56 MT

	add	#-1,r5		!  50 EX
	bf/s	6f		! 108 BR		(not small)

	 mov	r5, r3		!   5 MT (latency=0)
	! shlr halves the byte count and leaves the old bit 0 in T,
	! so the bf/s below skips the single odd byte when n is even.
	shlr	r6		! 104 EX

	mov.b	@(r0,r5),r1	!  20 LS (latency=2)
	bf/s	4f		! 111 BR

	 add	#-1,r3		!  50 EX
	tst	r6, r6		!  86 MT

	bt/s	98f		! 110 BR
	 mov.b	r1,@-r0		!  29 LS

	! 4 cycles, 2 bytes per iteration
3:	mov.b	@(r0,r5),r1	!  20 LS (latency=2)

4:	mov.b	@(r0,r3),r2	!  20 LS (latency=2)
	dt	r6		!  67 EX

	mov.b	r1,@-r0		!  29 LS
	bf/s	3b		! 111 BR

	 mov.b	r2,@-r0		!  29 LS
98:
	rts
	 nop

	! Zero length: return dst (r4) in r0 without copying anything.
99:	rts
	 mov	r4, r0

	! Size is not small, so its worthwhile looking for optimisations.
	! First align destination to a long word boundary.
	!
	! r5 = normal value -1

6:	tst	#3, r0		!  87 MT
	mov	#3, r3		!   6 EX

	bt/s	2f		! 111 BR
	 and	r0,r3		!  78 EX

	! 3 cycles, 1 byte per iteration
1:	dt	r3		!  67 EX
	mov.b	@(r0,r5),r1	!  19 LS (latency=2)

	add	#-1, r6		!  79 EX
	bf/s	1b		! 109 BR

	 mov.b	r1,@-r0		!  28 LS

2:	add	#1, r5		!  79 EX

	! Now select the appropriate bulk transfer code based on relative
	! alignment of src and dst.
	! r0 is borrowed for the tst-immediate instructions, so the
	! destination pointer is parked in r3 and restored in the delay
	! slot of each dispatch branch.

	mov	r0, r3		!   5 MT (latency=0)

	mov	r5, r0		!   5 MT (latency=0)
	tst	#1, r0		!  87 MT

	bf/s	1f		! 111 BR
	 mov	#64, r7		!   6 EX

	! bit 0 clear

	cmp/ge	r7, r6		!  55 MT

	bt/s	2f		! 111 BR
	 tst	#2, r0		!  87 MT

	! small
	bt/s	.Lcase0
	 mov	r3, r0

	bra	.Lcase2
	 nop

	! big
2:	bt/s	.Lcase0b
	 mov	r3, r0

	bra	.Lcase2b
	 nop

	! bit 0 set
1:	tst	#2, r0		! 87 MT

	bt/s	.Lcase1
	 mov	r3, r0

	bra	.Lcase3
	 nop

	!
	!	GHIJ KLMN OPQR -->  GHIJ KLMN OPQR
	!

	! src, dst and size are all long word aligned
	! size is non-zero

	.balign	32
.Lcase00:
	mov	#64, r1		!   6 EX
	mov	r5, r3		!   5 MT (latency=0)

	cmp/gt	r6, r1		!  56 MT
	add	#-4, r5		!  50 EX

	bf	.Lcase00b	! 108 BR		(big loop)
	! r6 becomes the number of long word pairs; T receives the bit
	! shifted out by the final shlr (set when the long word count
	! is odd).
	shlr2	r6		! 105 EX

	shlr	r6		! 104 EX
	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

	bf/s	4f		! 111 BR
	 add	#-8, r3		!  50 EX

	tst	r6, r6		!  86 MT
	bt/s	5f		! 110 BR

	 mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	dt	r6		!  67 EX

	mov.l	r1, @-r0	!  30 LS
	bf/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

5:	rts
	 nop

	! Size is 16 or greater and less than 64, but may have trailing bytes

	.balign	32
.Lcase0:
	add	#-4, r5		!  50 EX
	mov	r4, r7		!   5 MT (latency=0)

	mov.l	@(r0, r5), r1	!  21 LS (latency=2)
	mov	#4, r2		!   6 EX

	add	#11, r7		!  50 EX
	tst	r2, r6		!  86 MT

	mov	r5, r3		!   5 MT (latency=0)
	bt/s	4f		! 111 BR

	 add	#-4, r3		!  50 EX
	mov.l	r1,@-r0		!  30 LS

	! 4 cycles, 2 long words per iteration
3:	mov.l	@(r0, r5), r1	!  21 LS (latency=2)

4:	mov.l	@(r0, r3), r2	!  21 LS (latency=2)
	cmp/hi	r7, r0

	mov.l	r1, @-r0	!  30 LS
	bt/s	3b		! 109 BR

	 mov.l	r2, @-r0	!  30 LS

	! Copy the final 0-3 bytes

	add	#3,r5		!  50 EX

	cmp/eq	r0, r4		!  54 MT
	add	#-10, r7	!  50 EX

	bt	9f		! 110 BR

	! 3 cycles, 1 byte per iteration
1:	mov.b	@(r0,r5),r1	!  19 LS
	cmp/hi	r7,r0		!  57 MT

	bt/s	1b		! 111 BR
	 mov.b	r1,@-r0		!  28 LS

9:	rts
	 nop

	! Size is at least 64 bytes, so will be going round the big loop at least once.
	!
	!   r2 = rounded up r4
	!   r3 = rounded down r0

	.balign	32

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -