
memcmp.s

glibc 2.9, the latest version of the C library
Page 1 of 2
/* Optimized memcmp implementation for PowerPC64.
   Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */

	.machine power4
EALIGN (BP_SYM(memcmp), 4, 0)
	CALL_MCOUNT 3

#define rTMP	r0
#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
/* Note:  The bounded pointer support in this code is broken.  This code
   was inherited from PPC32 and that support was never completed.
   Current PPC gcc does not support -fbounds-check or -fbounded-pointers.  */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	rTMP, rSTR2, rSTR1
	cmpldi	cr6, rN, 0
	cmpldi	cr1, rN, 12
	clrldi.	rTMP, rTMP, 61
	clrldi	rBITDIF, rSTR1, 61
	cmpldi	cr5, rBITDIF, 0
	beq-	cr6, L(zeroLength)
	dcbt	0,rSTR1
	dcbt	0,rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	std	rWORD8,-8(r1)
	cfi_offset(rWORD8,-8)
	std	rWORD7,-16(r1)
	cfi_offset(rWORD7,-16)
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   3 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already double word
   aligned and can perform the DWaligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet DW).  So we can force the string addresses to the next lower DW
   boundary and special case this first DW using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (DWaligned) compare loop, starting at the second double word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first DW.  This ensures that the loop count is
   correct and the first DW (shifted) is in the expected register pair.  */
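/* Illustrative C sketch (editor's addition, not part of the original
   file): how the same-alignment setup just described works.  Both
   strings share the same low 3 address bits, so rounding each address
   down to a doubleword boundary pulls in the same number of garbage
   bytes; on big-endian those precede the true start and occupy the
   most significant positions, so a left shift discards them for both
   operands identically.  The helper name is hypothetical.

   #include <stdint.h>

   static int
   first_dw_cmp (const char *s1, const char *s2)   // assumes (s1 & 7) == (s2 & 7) != 0
   {
     unsigned lead = (uintptr_t) s1 & 7;                            // rBITDIF
     uint64_t w1 = *(const uint64_t *) ((uintptr_t) s1 & ~(uintptr_t) 7);
     uint64_t w2 = *(const uint64_t *) ((uintptr_t) s2 & ~(uintptr_t) 7);
     w1 <<= lead * 8;                               // sld rWORD5, rWORD1, r11
     w2 <<= lead * 8;                               // sld rWORD6, rWORD2, r11
     if (w1 == w2)
       return 0;
     return w1 > w2 ? 1 : -1;                       // cmpld, then sign selection
   }
*/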
	.align 4
L(samealignment):
	clrrdi	rSTR1, rSTR1, 3
	clrrdi	rSTR2, rSTR2, 3
	beq	cr5, L(DWaligned)
	add	rN, rN, rBITDIF
	sldi	r11, rBITDIF, 3
	srdi	rTMP, rN, 5	/* Divide by 32 */
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dPs4)
	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 8 */
	.align 3
L(dsP1):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 16 */
	.align 4
L(dPs2):
	sld	rWORD5, rWORD1, r11
	sld	rWORD6, rWORD2, r11
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 24 */
	.align 4
L(dPs3):
	sld	rWORD3, rWORD1, r11
	sld	rWORD4, rWORD2, r11
	cmpld	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 32, remainder is 0 */
	.align 4
L(dPs4):
	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
	sld	rWORD1, rWORD1, r11
	sld	rWORD2, rWORD2, r11
	cmpld	cr0, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are double word aligned and the
   compare length is at least 8 bytes.  */
	.align 4
L(DWaligned):
	andi.	rBITDIF, rN, 24	/* Get the DW remainder */
	srdi	rTMP, rN, 5	/* Divide by 32 */
	cmpldi	cr1, rBITDIF, 16
	cmpldi	cr7, rN, 32
	clrldi	rN, rN, 61
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 8 */
	.align 4
L(dP1):
	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP1e):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	bne	cr0, L(dLcr0)
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne	cr1, L(dLcr1)
	cmpld	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	.align 3
L(dP1x):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 16 */
	.align 4
L(dP2):
	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD5, 0(rSTR1)
	ld	rWORD6, 0(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	ld	rWORD7, 8(rSTR1)
	ld	rWORD8, 8(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
L(dP2e):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	ld	rWORD3, 24(rSTR1)
	ld	rWORD4, 24(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align 4
L(dP2x):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr5, rWORD3, rWORD4
	sldi.	r12, rN, 3
	bne	cr6, L(dLcr6)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	L(d00)
	li	rRTN, 0
	blr
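/* Illustrative C sketch (editor's addition, not part of the original
   file): the doubleword paths above all reduce to this idea.  On
   big-endian PowerPC the byte at the lowest address is the most
   significant byte of a doubleword, so one unsigned 64-bit compare
   (cmpld) orders 8 bytes exactly as memcmp requires.  A minimal
   sketch, assuming both pointers are 8-byte aligned and len is a
   multiple of 8; the function name is hypothetical.

   #include <stdint.h>
   #include <stddef.h>

   static int
   memcmp_dw_sketch (const void *a, const void *b, size_t len)
   {
     const uint64_t *w1 = a;                  // ld/ldu into rWORD1..rWORD8
     const uint64_t *w2 = b;
     for (size_t i = 0; i < len / 8; i++)
       if (w1[i] != w2[i])                    // cmpld cr0/cr1/cr5/cr6
         return w1[i] > w2[i] ? 1 : -1;       // li rRTN,1 / bgtlr / li rRTN,-1
     return 0;
   }

   The assembly unrolls this 4x and spreads the compares across four
   condition registers so loads, compares, and branches from different
   iterations can issue in the same dispatch group.
*/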
/* Remainder is 24 */
	.align 4
L(dP3):
	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD3, 0(rSTR1)
	ld	rWORD4, 0(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
L(dP3e):
	ld	rWORD5, 8(rSTR1)
	ld	rWORD6, 8(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	ld	rWORD7, 16(rSTR1)
	ld	rWORD8, 16(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	ld	rWORD1, 24(rSTR1)
	ld	rWORD2, 24(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align 4
L(dP3x):
	ld	rWORD1, 16(rSTR1)
	ld	rWORD2, 16(rSTR2)
	cmpld	cr5, rWORD1, rWORD2
	sldi.	r12, rN, 3
	bne	cr1, L(dLcr1)
	addi	rSTR1, rSTR1, 16
	addi	rSTR2, rSTR2, 16
	bne	cr6, L(dLcr6)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	bne	cr5, L(dLcr5)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 32, remainder is 0 */
	.align 4
L(dP4):
	mtctr   rTMP	/* Power4 wants mtctr 1st in dispatch group */
	ld	rWORD1, 0(rSTR1)
	ld	rWORD2, 0(rSTR2)
	cmpld	cr0, rWORD1, rWORD2
L(dP4e):
	ld	rWORD3, 8(rSTR1)
	ld	rWORD4, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	ld	rWORD5, 16(rSTR1)
	ld	rWORD6, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	ldu	rWORD7, 24(rSTR1)
	ldu	rWORD8, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align 4
L(dLoop):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	ld	rWORD3, 16(rSTR1)
	ld	rWORD4, 16(rSTR2)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	ld	rWORD5, 24(rSTR1)
	ld	rWORD6, 24(rSTR2)
	cmpld	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
L(dLoop3):
	ldu	rWORD7, 32(rSTR1)
	ldu	rWORD8, 32(rSTR2)
	bne-	cr1, L(dLcr1)
	cmpld	cr0, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmpld	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmpld	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmpld	cr5, rWORD7, rWORD8
L(d44):
	bne	cr0, L(dLcr0)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	sldi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	subfic	rN, r12, 64	/* Shift count is 64 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 7 bytes to compare.  Since
   we are aligned it is safe to load the whole double word, and use
   shift right double to eliminate bits beyond the compare length.  */
L(d00):
	ld	rWORD1, 8(rSTR1)
	ld	rWORD2, 8(rSTR2)
	srd	rWORD1, rWORD1, rN
	srd	rWORD2, rWORD2, rN
	cmpld	cr5, rWORD1, rWORD2
	bne	cr5, L(dLcr5x)
	li	rRTN, 0
	blr

	.align 4
L(dLcr0):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align 4
L(dLcr1):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align 4
L(dLcr6):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align 4
L(dLcr5):
	ld	rWORD8,-8(r1)
	ld	rWORD7,-16(r1)
L(dLcr5x):
	li	rRTN, 1
	bgtlr	cr5
	li	rRTN, -1
	blr
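/* Illustrative C sketch (editor's addition, not part of the original
   file): the L(d00) tail above.  With 1 to 7 bytes left and both
   strings doubleword aligned, loading a whole doubleword cannot
   fault; shifting right by 64 - 8*remaining (computed by the subfic)
   discards the bytes beyond the compare length, since on big-endian
   the valid bytes are the high-order ones.  The helper name is
   hypothetical.

   #include <stdint.h>

   static int
   tail_cmp_sketch (uint64_t w1, uint64_t w2, unsigned remaining)
   {
     unsigned shift = 64 - remaining * 8;  // subfic rN, r12, 64
     w1 >>= shift;                         // srd rWORD1, rWORD1, rN
     w2 >>= shift;                         // srd rWORD2, rWORD2, rN
     if (w1 == w2)
       return 0;                           // li rRTN, 0
     return w1 > w2 ? 1 : -1;              // L(dLcr5x) sign selection
   }
*/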
	.align 4
L(bytealigned):
	mtctr   rN	/* Power4 wants mtctr 1st in dispatch group */
	beq-	cr6, L(zeroLength)
/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands and
   branches based on compares are delayed until the next loop.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmpld	cr0, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmpld	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align 4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr0, L(bLcr0)
	cmpld	cr6, rWORD5, rWORD6
	bdz-	L(b3i)
	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)
	cmpld	cr0, rWORD1, rWORD2
	bdz-	L(b2i)
	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)
	cmpld	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)
/* We are speculatively loading bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In this
   case the loop will exit early (before all pending bytes are
   tested), and we must complete the pending operations
   before returning.  */
L(b1i):
	bne-	cr0, L(bLcr0)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align 4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr0, L(bLcr0)
	b	L(bx34)
	.align 4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align 4
L(bLcr0):
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr0, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr0, L(bx12)
L(bx34):
	sub	rRTN, rWORD3, rWORD4
	blr
L(b11):
L(bx12):
	sub	rRTN, rWORD1, rWORD2
	blr
	.align 4
L(zeroLengthReturn):
	ld	rWORD8,-8(r1)
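/* Illustrative C sketch (editor's addition, not part of the original
   file): what the software-pipelined L(bytealigned) loop computes.
   The assembly rotates three load/compare/branch streams across cr0,
   cr1 and cr6 so that no compare waits on a load issued in the same
   dispatch group, but the net effect is the classic byte loop; the
   function name is hypothetical.

   #include <stddef.h>

   static int
   memcmp_byte_sketch (const unsigned char *s1, const unsigned char *s2,
                       size_t n)
   {
     for (size_t i = 0; i < n; i++)        // mtctr rN / bdnz+ L(bLoop)
       if (s1[i] != s2[i])
         return s1[i] - s2[i];             // sub rRTN, rWORD1, rWORD2
     return 0;
   }
*/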
