
memcmp.s

glibc 2.9, the latest release of the C standard library
Language: assembly (S)
Page 1 of 2
/* Optimized memcmp implementation for PowerPC32.
   Copyright (C) 2003, 2006 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston MA
   02110-1301 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5])  */

	.machine power4
EALIGN (BP_SYM(memcmp), 4, 0)
	CALL_MCOUNT

#define rTMP	r0
#define rRTN	r3
#define rSTR1	r3	/* first string arg */
#define rSTR2	r4	/* second string arg */
#define rN	r5	/* max string length */
#define rWORD1	r6	/* current word in s1 */
#define rWORD2	r7	/* current word in s2 */
#define rWORD3	r8	/* next word in s1 */
#define rWORD4	r9	/* next word in s2 */
#define rWORD5	r10	/* next word in s1 */
#define rWORD6	r11	/* next word in s2 */
#define rBITDIF	r12	/* bits that differ in s1 & s2 words */
#define rWORD7	r30	/* next word in s1 */
#define rWORD8	r31	/* next word in s2 */

	xor	rTMP, rSTR2, rSTR1
	cmplwi	cr6, rN, 0
	cmplwi	cr1, rN, 12
	clrlwi.	rTMP, rTMP, 30
	clrlwi	rBITDIF, rSTR1, 30
	cmplwi	cr5, rBITDIF, 0
	beq-	cr6, L(zeroLength)
	dcbt	0,rSTR1
	dcbt	0,rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
   byte loop.  */
	blt	cr1, L(bytealigned)
	stwu	1,-64(1)
	cfi_adjust_cfa_offset(64)
	stw	r31,48(1)
	cfi_offset(31,(48-64))
	stw	r30,44(1)
	cfi_offset(30,(44-64))
	bne	L(unaligned)
/* At this point we know both strings have the same alignment and the
   compare length is at least 8 bytes.  rBITDIF contains the low order
   2 bits of rSTR1 and cr5 contains the result of the logical compare
   of rBITDIF to 0.  If rBITDIF == 0 then we are already word
   aligned and can perform the word aligned loop.

   Otherwise we know the two strings have the same alignment (but not
   yet word aligned).  So we force the string addresses to the next lower
   word boundary and special case this first word using shift left to
   eliminate bits preceding the first byte.  Since we want to join the
   normal (word aligned) compare loop, starting at the second word,
   we need to adjust the length (rN) and special case the loop
   versioning for the first word.  This ensures that the loop count is
   correct and the first word (shifted) is in the expected register pair.  */
	.align 4
L(samealignment):
	clrrwi	rSTR1, rSTR1, 2
	clrrwi	rSTR2, rSTR2, 2
	beq	cr5, L(Waligned)
	add	rN, rN, rBITDIF
	slwi	r11, rBITDIF, 3
	srwi	rTMP, rN, 4	 /* Divide by 16 */
	andi.	rBITDIF, rN, 12  /* Get the word remainder */
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
	cmplwi	cr1, rBITDIF, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dPs4)
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	bgt	cr1, L(dPs3)
	beq	cr1, L(dPs2)

/* Remainder is 4 */
	.align 3
L(dsP1):
	slw	rWORD5, rWORD1, r11
	slw	rWORD6, rWORD2, r11
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway.  */
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	b	L(dP1e)
/* Remainder is 8 */
	.align 4
L(dPs2):
	slw	rWORD5, rWORD1, r11
	slw	rWORD6, rWORD2, r11
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway.  */
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	b	L(dP2e)
/* Remainder is 12 */
	.align 4
L(dPs3):
	slw	rWORD3, rWORD1, r11
	slw	rWORD4, rWORD2, r11
	cmplw	cr1, rWORD3, rWORD4
	b	L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(dPs4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	slw	rWORD1, rWORD1, r11
	slw	rWORD2, rWORD2, r11
	cmplw	cr0, rWORD1, rWORD2
	b	L(dP4e)

/* At this point we know both strings are word aligned and the
   compare length is at least 8 bytes.  */
	.align 4
L(Waligned):
	andi.	rBITDIF, rN, 12  /* Get the word remainder */
	srwi	rTMP, rN, 4	 /* Divide by 16 */
	cmplwi	cr1, rBITDIF, 8
	cmplwi	cr7, rN, 16
	clrlwi	rN, rN, 30
	beq	L(dP4)
	bgt	cr1, L(dP3)
	beq	cr1, L(dP2)

/* Remainder is 4 */
	.align 4
L(dP1):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
   (8-15 byte compare), we want to use only volatile registers.  This
   means we can avoid restoring non-volatile registers since we did not
   change any on the early exit path.  The key here is the non-early
   exit path only cares about the condition code (cr5), not about which
   register pair was used.  */
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
	cmplw	cr5, rWORD5, rWORD6
	blt	cr7, L(dP1x)
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
L(dP1e):
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	bne	cr0, L(dLcr0)

	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
	bne	cr1, L(dLcr1)
	cmplw	cr5, rWORD7, rWORD8
	bdnz	L(dLoop)
	bne	cr6, L(dLcr6)
	lwz	r30,44(1)
	lwz	r31,48(1)
	.align 3
L(dP1x):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	lwz	1,0(1)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 8 */
	.align 4
L(dP2):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	lwz	rWORD5, 0(rSTR1)
	lwz	rWORD6, 0(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP2x)
	lwz	rWORD7, 4(rSTR1)
	lwz	rWORD8, 4(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
L(dP2e):
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	lwz	rWORD3, 12(rSTR1)
	lwz	rWORD4, 12(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	bne	cr6, L(dLcr6)
	bne	cr5, L(dLcr5)
	b	L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align 4
L(dP2x):
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
	cmplw	cr5, rWORD3, rWORD4
	slwi.	r12, rN, 3
	bne	cr6, L(dLcr6)
	addi	rSTR1, rSTR1, 4
	addi	rSTR2, rSTR2, 4
	bne	cr5, L(dLcr5)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	lwz	1,0(1)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Remainder is 12 */
	.align 4
L(dP3):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	lwz	rWORD3, 0(rSTR1)
	lwz	rWORD4, 0(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
L(dP3e):
	lwz	rWORD5, 4(rSTR1)
	lwz	rWORD6, 4(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	blt	cr7, L(dP3x)
	lwz	rWORD7, 8(rSTR1)
	lwz	rWORD8, 8(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	lwz	rWORD1, 12(rSTR1)
	lwz	rWORD2, 12(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr1, L(dLcr1)
	bne	cr6, L(dLcr6)
	b	L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
   only use volatile registers and avoid restoring non-volatile
   registers.  */
	.align 4
L(dP3x):
	lwz	rWORD1, 8(rSTR1)
	lwz	rWORD2, 8(rSTR2)
	cmplw	cr5, rWORD1, rWORD2
	slwi.	r12, rN, 3
	bne	cr1, L(dLcr1)
	addi	rSTR1, rSTR1, 8
	addi	rSTR2, rSTR2, 8
	bne	cr6, L(dLcr6)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	bne	cr5, L(dLcr5)
	lwz	1,0(1)
	bne	L(d00)
	li	rRTN, 0
	blr

/* Count is a multiple of 16, remainder is 0 */
	.align 4
L(dP4):
	mtctr	rTMP	/* Power4 wants mtctr 1st in dispatch group */
	lwz	rWORD1, 0(rSTR1)
	lwz	rWORD2, 0(rSTR2)
	cmplw	cr0, rWORD1, rWORD2
L(dP4e):
	lwz	rWORD3, 4(rSTR1)
	lwz	rWORD4, 4(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	lwz	rWORD5, 8(rSTR1)
	lwz	rWORD6, 8(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	lwzu	rWORD7, 12(rSTR1)
	lwzu	rWORD8, 12(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
	bne	cr1, L(dLcr1)
	bdz-	L(d24)		/* Adjust CTR as we start with +4 */
/* This is the primary loop */
	.align 4
L(dLoop):
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
L(dLoop1):
	lwz	rWORD3, 8(rSTR1)
	lwz	rWORD4, 8(rSTR2)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
L(dLoop2):
	lwz	rWORD5, 12(rSTR1)
	lwz	rWORD6, 12(rSTR2)
	cmplw	cr5, rWORD7, rWORD8
	bne	cr0, L(dLcr0)
L(dLoop3):
	lwzu	rWORD7, 16(rSTR1)
	lwzu	rWORD8, 16(rSTR2)
	bne-	cr1, L(dLcr1)
	cmplw	cr0, rWORD1, rWORD2
	bdnz+	L(dLoop)

L(dL4):
	cmplw	cr1, rWORD3, rWORD4
	bne	cr6, L(dLcr6)
	cmplw	cr6, rWORD5, rWORD6
	bne	cr5, L(dLcr5)
	cmplw	cr5, rWORD7, rWORD8
L(d44):
	bne	cr0, L(dLcr0)
L(d34):
	bne	cr1, L(dLcr1)
L(d24):
	bne	cr6, L(dLcr6)
L(d14):
	slwi.	r12, rN, 3
	bne	cr5, L(dLcr5)
L(d04):
	lwz	r30,44(1)
	lwz	r31,48(1)
	lwz	1,0(1)
	subfic	rN, r12, 32	/* Shift count is 32 - (rN * 8).  */
	beq	L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare.  Since
   we are aligned it is safe to load the whole word, and use
   shift right to eliminate bits beyond the compare length.  */
L(d00):
	lwz	rWORD1, 4(rSTR1)
	lwz	rWORD2, 4(rSTR2)
	srw	rWORD1, rWORD1, rN
	srw	rWORD2, rWORD2, rN
	cmplw	rWORD1,rWORD2
	li	rRTN,0
	beqlr
	li	rRTN,1
	bgtlr
	li	rRTN,-1
	blr

	.align 4
L(dLcr0):
	lwz	r30,44(1)
	lwz	r31,48(1)
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr0
	li	rRTN, -1
	blr
	.align 4
L(dLcr1):
	lwz	r30,44(1)
	lwz	r31,48(1)
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr1
	li	rRTN, -1
	blr
	.align 4
L(dLcr6):
	lwz	r30,44(1)
	lwz	r31,48(1)
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr6
	li	rRTN, -1
	blr
	.align 4
L(dLcr5):
	lwz	r30,44(1)
	lwz	r31,48(1)
L(dLcr5x):
	li	rRTN, 1
	lwz	1,0(1)
	bgtlr	cr5
	li	rRTN, -1
	blr

	.align 4
L(bytealigned):
	cfi_adjust_cfa_offset(-64)
	mtctr	rN	/* Power4 wants mtctr 1st in dispatch group */
/* We need to prime this loop.  This loop is swing modulo scheduled
   to avoid pipe delays.  The dependent instruction latencies (load to
   compare to conditional branch) are 2 to 3 cycles.  In this loop each
   dispatch group ends in a branch and takes 1 cycle.  Effectively
   the first iteration of the loop only serves to load operands, and
   branches based on compares are delayed until the next iteration.

   So we must precondition some registers and condition codes so that
   we don't exit the loop early on the first iteration.  */
	lbz	rWORD1, 0(rSTR1)
	lbz	rWORD2, 0(rSTR2)
	bdz-	L(b11)
	cmplw	cr0, rWORD1, rWORD2
	lbz	rWORD3, 1(rSTR1)
	lbz	rWORD4, 1(rSTR2)
	bdz-	L(b12)
	cmplw	cr1, rWORD3, rWORD4
	lbzu	rWORD5, 2(rSTR1)
	lbzu	rWORD6, 2(rSTR2)
	bdz-	L(b13)
	.align 4
L(bLoop):
	lbzu	rWORD1, 1(rSTR1)
	lbzu	rWORD2, 1(rSTR2)
	bne-	cr0, L(bLcr0)
	cmplw	cr6, rWORD5, rWORD6
	bdz-	L(b3i)

	lbzu	rWORD3, 1(rSTR1)
	lbzu	rWORD4, 1(rSTR2)
	bne-	cr1, L(bLcr1)
	cmplw	cr0, rWORD1, rWORD2
	bdz-	L(b2i)

	lbzu	rWORD5, 1(rSTR1)
	lbzu	rWORD6, 1(rSTR2)
	bne-	cr6, L(bLcr6)
	cmplw	cr1, rWORD3, rWORD4
	bdnz+	L(bLoop)

/* We speculatively load bytes before we have tested the previous
   bytes.  But we must avoid overrunning the length (in the ctr) to
   prevent these speculative loads from causing a segfault.  In that
   case the loop will exit early (before all pending bytes are
   tested), and we must complete the pending operations before
   returning.  */
L(b1i):
	bne-	cr0, L(bLcr0)
	bne-	cr1, L(bLcr1)
	b	L(bx56)
	.align 4
L(b2i):
	bne-	cr6, L(bLcr6)
	bne-	cr0, L(bLcr0)
	b	L(bx34)
	.align 4
L(b3i):
	bne-	cr1, L(bLcr1)
	bne-	cr6, L(bLcr6)
	b	L(bx12)
	.align 4
L(bLcr0):
	li	rRTN, 1
	bgtlr	cr0
	li	rRTN, -1
	blr
L(bLcr1):
	li	rRTN, 1
	bgtlr	cr1
	li	rRTN, -1
	blr
L(bLcr6):
	li	rRTN, 1
	bgtlr	cr6
	li	rRTN, -1
	blr

L(b13):
	bne-	cr0, L(bx12)
	bne-	cr1, L(bx34)
L(bx56):
	sub	rRTN, rWORD5, rWORD6
	blr
	nop
L(b12):
	bne-	cr0, L(bx12)
L(bx34):
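For readers who don't follow PowerPC assembly, here is a hedged C sketch of the overall strategy the listing implements: byte compares for short or mutually misaligned inputs, unsigned word compares once both pointers reach a word boundary. The names memcmp_sketch and load_be32 are illustrative, not glibc's; on big-endian PowerPC a raw word load already has the byte order that makes unsigned word comparison match lexicographic byte order, which this portable sketch emulates by assembling each word explicitly. The real assembly additionally unrolls the word loop four times, software-pipelines the loads across four condition registers, and handles the first partial word with a shift trick rather than the byte-at-a-time alignment loop shown here.

#include <stddef.h>
#include <stdint.h>

/* Illustrative helper, not glibc code: read four bytes as one
   big-endian 32-bit word, so that unsigned word comparison agrees
   with byte-by-byte memcmp order.  */
static uint32_t
load_be32 (const unsigned char *p)
{
  return ((uint32_t) p[0] << 24) | ((uint32_t) p[1] << 16)
         | ((uint32_t) p[2] << 8) | (uint32_t) p[3];
}

/* Sketch of the strategy above; returns the sign of the first
   differing byte, which is all memcmp guarantees.  */
int
memcmp_sketch (const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1;
  const unsigned char *p2 = s2;

  /* Word path only when the pointers share their low-order two
     address bits (the xor/clrlwi. test at entry) and n >= 8.  */
  if (n >= 8 && ((((uintptr_t) p1 ^ (uintptr_t) p2) & 3) == 0))
    {
      while (((uintptr_t) p1 & 3) != 0)  /* reach a word boundary */
        {
          if (*p1 != *p2)
            return *p1 < *p2 ? -1 : 1;
          p1++, p2++, n--;
        }
      while (n >= 4)                     /* the main word loop */
        {
          uint32_t w1 = load_be32 (p1);
          uint32_t w2 = load_be32 (p2);
          if (w1 != w2)
            return w1 < w2 ? -1 : 1;
          p1 += 4, p2 += 4, n -= 4;
        }
    }

  /* Byte loop: short inputs, mismatched alignment, or the 1-3 byte
     remainder the assembly handles at L(d00) with a shift.  */
  while (n-- != 0)
    {
      if (*p1 != *p2)
        return *p1 < *p2 ? -1 : 1;
      p1++, p2++;
    }
  return 0;
}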

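The L(samealignment) path is the subtle part: when both buffers share a nonzero offset within a word, the code rounds the pointers down to a word boundary (clrrwi) and shifts the first pair of loaded words left by r11 = 8 * offset bits, so the stray bytes that precede the buffers fall out of the comparison while the loop itself stays word aligned. Below is a hedged C rendering of just that masking step; the function name and shape are mine, and it assumes big-endian words as in the sketch above. The 1-3 byte tail at L(d00) is the mirror image of the same idea: load the whole aligned word (safe, since an aligned load cannot cross a page boundary) and shift right by 32 - 8 * remainder bits, matching `subfic rN, r12, 32'.

#include <stdint.h>

/* Hypothetical illustration, not glibc code.  w1 and w2 are the
   aligned big-endian words containing the starts of the two buffers;
   offset (1..3) is the buffers' shared byte offset within those
   words.  Shifting left by 8*offset discards the leading bytes that
   belong to neither buffer, mirroring `slwi r11, rBITDIF, 3' and the
   `slw' pairs at L(dsP1)/L(dPs2)/L(dPs3)/L(dPs4) above.  */
static int
first_word_cmp (uint32_t w1, uint32_t w2, unsigned int offset)
{
  w1 <<= 8 * offset;
  w2 <<= 8 * offset;
  if (w1 == w2)
    return 0;               /* the covered 4 - offset bytes match */
  return w1 < w2 ? -1 : 1;
}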