memset.s
/* Optimized memset implementation for PowerPC.
   Copyright (C) 1997, 1999, 2000, 2003, 2007 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307 USA.  */

#include <sysdep.h>
#include <bp-sym.h>
#include <bp-asm.h>

/* __ptr_t [r3] memset (__ptr_t s [r3], int c [r4], size_t n [r5]);
   Returns 's'.

   The memset is done in four sizes: byte (8 bits), word (32 bits),
   32-byte blocks (256 bits) and __cache_line_size (128, 256, 1024 bits).
   There is a special case for setting whole cache lines to 0, which
   takes advantage of the dcbz instruction.  */

        .section ".text"
EALIGN (BP_SYM (memset), 5, 1)

#define rTMP    r0
#define rRTN    r3      /* initial value of 1st argument */
#if __BOUNDED_POINTERS__
# define rMEMP0 r4      /* original value of 1st arg */
# define rCHR   r5      /* char to set in each byte */
# define rLEN   r6      /* length of region to set */
# define rMEMP  r10     /* address at which we are storing */
#else
# define rMEMP0 r3      /* original value of 1st arg */
# define rCHR   r4      /* char to set in each byte */
# define rLEN   r5      /* length of region to set */
# define rMEMP  r6      /* address at which we are storing */
#endif
#define rALIGN  r7      /* number of bytes we are setting now (when aligning) */
#define rMEMP2  r8

#define rPOS32  r7      /* constant +32 for clearing with dcbz */
#define rNEG64  r8      /* constant -64 for clearing with dcbz */
#define rNEG32  r9      /* constant -32 for clearing with dcbz */

#define rGOT    r9      /* Address of the Global Offset Table.  */
#define rCLS    r8      /* Cache line size obtained from static.  */
#define rCLM    r9      /* Cache line size mask to check for cache alignment.  */

#if __BOUNDED_POINTERS__
        cmplwi  cr1, rRTN, 0
        CHECK_BOUNDS_BOTH_WIDE (rMEMP0, rTMP, rTMP2, rLEN)
        beq     cr1, L(b0)
        STORE_RETURN_VALUE (rMEMP0)
        STORE_RETURN_BOUNDS (rTMP, rTMP2)
L(b0):
#endif

/* take care of case for size <= 4  */
        cmplwi  cr1, rLEN, 4
        andi.   rALIGN, rMEMP0, 3
        mr      rMEMP, rMEMP0
        ble-    cr1, L(small)

/* align to word boundary  */
        cmplwi  cr5, rLEN, 31
        rlwimi  rCHR, rCHR, 8, 16, 23
        beq+    L(aligned)      /* 8th instruction from .align */
        mtcrf   0x01, rMEMP0
        subfic  rALIGN, rALIGN, 4
        add     rMEMP, rMEMP, rALIGN
        sub     rLEN, rLEN, rALIGN
        bf+     31, L(g0)
        stb     rCHR, 0(rMEMP0)
        bt      30, L(aligned)
L(g0):  sth     rCHR, -2(rMEMP) /* 16th instruction from .align */

/* take care of case for size < 31 */
L(aligned):
        mtcrf   0x01, rLEN
        rlwimi  rCHR, rCHR, 16, 0, 15
        ble     cr5, L(medium)

/* align to cache line boundary...  */
        andi.   rALIGN, rMEMP, 0x1C
        subfic  rALIGN, rALIGN, 0x20
        beq     L(caligned)
        mtcrf   0x01, rALIGN
        add     rMEMP, rMEMP, rALIGN
        sub     rLEN, rLEN, rALIGN
        cmplwi  cr1, rALIGN, 0x10
        mr      rMEMP2, rMEMP
        bf      28, L(a1)
        stw     rCHR, -4(rMEMP2)
        stwu    rCHR, -8(rMEMP2)
L(a1):  blt     cr1, L(a2)
        stw     rCHR, -4(rMEMP2)        /* 32nd instruction from .align */
        stw     rCHR, -8(rMEMP2)
        stw     rCHR, -12(rMEMP2)
        stwu    rCHR, -16(rMEMP2)
L(a2):  bf      29, L(caligned)
        stw     rCHR, -4(rMEMP2)
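/* The two rlwimi inserts above have replicated the fill byte into all
   four bytes of rCHR, so each stw stores four copies of c; e.g.
   c = 0xAB first becomes 0x0000ABAB, then 0xABABABAB.  */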
/* now aligned to a cache line.  */
L(caligned):
        cmplwi  cr1, rCHR, 0
        clrrwi. rALIGN, rLEN, 5
        mtcrf   0x01, rLEN      /* 40th instruction from .align */

/* Check if we can use the special case for clearing memory using dcbz.
   This requires that we know the correct cache line size for this
   processor.  Getting the __cache_line_size may require establishing GOT
   addressability, so branch out of line to set this up.  */
        beq     cr1, L(checklinesize)

/* Store blocks of 32-bytes (256-bits) starting on a 32-byte boundary.
   Can't assume that rCHR is zero or that the cache line size is either
   32-bytes or even known.  */
L(nondcbz):
        srwi    rTMP, rALIGN, 5
        mtctr   rTMP
        beq     L(medium)       /* we may not actually get to do a full line */
        clrlwi. rLEN, rLEN, 27
        add     rMEMP, rMEMP, rALIGN
        li      rNEG64, -0x40
        bdz     L(cloopdone)    /* 48th instruction from .align */

/* We can't use dcbz here as we don't know the cache line size.  We can
   use "data cache block touch for store", which is safe.  */
L(c3):  dcbtst  rNEG64, rMEMP
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        stw     rCHR, -12(rMEMP)
        stw     rCHR, -16(rMEMP)
        nop                     /* let 601 fetch last 4 instructions of loop */
        stw     rCHR, -20(rMEMP)
        stw     rCHR, -24(rMEMP)        /* 56th instruction from .align */
        nop                     /* let 601 fetch first 8 instructions of loop */
        stw     rCHR, -28(rMEMP)
        stwu    rCHR, -32(rMEMP)
        bdnz    L(c3)
L(cloopdone):
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        stw     rCHR, -12(rMEMP)
        stw     rCHR, -16(rMEMP)        /* 64th instruction from .align */
        stw     rCHR, -20(rMEMP)
        cmplwi  cr1, rLEN, 16
        stw     rCHR, -24(rMEMP)
        stw     rCHR, -28(rMEMP)
        stwu    rCHR, -32(rMEMP)
        beqlr
        add     rMEMP, rMEMP, rALIGN
        b       L(medium_tail2) /* 72nd instruction from .align */

        .align 5
        nop
/* Clear cache lines of memory in 128-byte chunks.  This code is
   optimized for processors with 32-byte cache lines.  It is further
   optimized for the 601 processor, which requires some care in how the
   code is aligned in the i-cache.  */
L(zloopstart):
        clrlwi  rLEN, rLEN, 27
        mtcrf   0x02, rALIGN
        srwi.   rTMP, rALIGN, 7
        mtctr   rTMP
        li      rPOS32, 0x20
        li      rNEG64, -0x40
        cmplwi  cr1, rLEN, 16   /* 8 */
        bf      26, L(z0)
        dcbz    0, rMEMP
        addi    rMEMP, rMEMP, 0x20
L(z0):  li      rNEG32, -0x20
        bf      25, L(z1)
        dcbz    0, rMEMP
        dcbz    rPOS32, rMEMP
        addi    rMEMP, rMEMP, 0x40      /* 16 */
L(z1):  cmplwi  cr5, rLEN, 0
        beq     L(medium)
L(zloop):
        dcbz    0, rMEMP
        dcbz    rPOS32, rMEMP
        addi    rMEMP, rMEMP, 0x80
        dcbz    rNEG64, rMEMP
        dcbz    rNEG32, rMEMP
        bdnz    L(zloop)
        beqlr   cr5
        b       L(medium_tail2)

        .align 5
L(small):
/* Memset of 4 bytes or less.  */
        cmplwi  cr5, rLEN, 1
        cmplwi  cr1, rLEN, 3
        bltlr   cr5
        stb     rCHR, 0(rMEMP)
        beqlr   cr5
        nop
        stb     rCHR, 1(rMEMP)
        bltlr   cr1
        stb     rCHR, 2(rMEMP)
        beqlr   cr1
        nop
        stb     rCHR, 3(rMEMP)
        blr

/* Memset of 0-31 bytes.  */
        .align 5
L(medium):
        cmplwi  cr1, rLEN, 16
L(medium_tail2):
        add     rMEMP, rMEMP, rLEN
L(medium_tail):
        bt-     31, L(medium_31t)
        bt-     30, L(medium_30t)
L(medium_30f):
        bt-     29, L(medium_29t)
L(medium_29f):
        bge-    cr1, L(medium_27t)
        bflr-   28
        stw     rCHR, -4(rMEMP) /* 8th instruction from .align */
        stw     rCHR, -8(rMEMP)
        blr
L(medium_31t):
        stbu    rCHR, -1(rMEMP)
        bf-     30, L(medium_30f)
L(medium_30t):
        sthu    rCHR, -2(rMEMP)
        bf-     29, L(medium_29f)
L(medium_29t):
        stwu    rCHR, -4(rMEMP)
        blt-    cr1, L(medium_27f)      /* 16th instruction from .align */
L(medium_27t):
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        stw     rCHR, -12(rMEMP)
        stwu    rCHR, -16(rMEMP)
L(medium_27f):
        bflr-   28
L(medium_28t):
        stw     rCHR, -4(rMEMP)
        stw     rCHR, -8(rMEMP)
        blr

L(checklinesize):
#ifdef SHARED
        mflr    rTMP
/* If the remaining length is less than 32 bytes then don't bother getting
   the cache line size.  */
        beq     L(medium)
/* Establishes GOT addressability so we can load __cache_line_size from
   static.  This value was set from the aux vector during startup.  */
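/* With HAVE_ASM_PPC_REL16 a bcl/mflr pair materializes the current
   address so __cache_line_size can be addressed pc-relative; otherwise
   the classic _GLOBAL_OFFSET_TABLE_ sequence loads its address from the
   GOT.  */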
# ifdef HAVE_ASM_PPC_REL16
        bcl     20,31,1f
1:      mflr    rGOT
        addis   rGOT,rGOT,__cache_line_size-1b@ha
        lwz     rCLS,__cache_line_size-1b@l(rGOT)
# else
        bl      _GLOBAL_OFFSET_TABLE_@local-4
        mflr    rGOT
        lwz     rGOT,__cache_line_size@got(rGOT)
        lwz     rCLS,0(rGOT)
# endif
        mtlr    rTMP
#else
/* Load __cache_line_size from static.  This value was set from the
   aux vector during startup.  */
        lis     rCLS,__cache_line_size@ha
/* If the remaining length is less than 32 bytes then don't bother getting
   the cache line size.  */
        beq     L(medium)
        lwz     rCLS,__cache_line_size@l(rCLS)
#endif

/* If the cache line size was not set then go to L(nondcbz), which is
   safe for any cache line size.  */
        cmplwi  cr1,rCLS,0
        beq     cr1,L(nondcbz)

/* If the cache line size is 32 bytes then go to L(zloopstart), which is
   coded specifically for 32-byte lines (and the 601).  */
        cmplwi  cr1,rCLS,32
        beq     cr1,L(zloopstart)

/* Now we know the cache line size and it is not 32 bytes.  However we
   may not yet be aligned to the cache line and may have a partial line
   to fill.  Touch it first to fetch the cache line.  */
        dcbtst  0,rMEMP
        addi    rCLM,rCLS,-1
L(getCacheAligned):
        cmplwi  cr1,rLEN,32
        and.    rTMP,rCLM,rMEMP
        blt     cr1,L(handletail32)
        beq     L(cacheAligned)
/* We are not aligned to the start of a cache line yet.  Store 32 bytes
   of data and test again.  */
        addi    rMEMP,rMEMP,32
        addi    rLEN,rLEN,-32
        stw     rCHR,-32(rMEMP)
        stw     rCHR,-28(rMEMP)
        stw     rCHR,-24(rMEMP)
        stw     rCHR,-20(rMEMP)
        stw     rCHR,-16(rMEMP)
        stw     rCHR,-12(rMEMP)
        stw     rCHR,-8(rMEMP)
        stw     rCHR,-4(rMEMP)
        b       L(getCacheAligned)

/* Now we are aligned to the cache line and can use dcbz.  */
L(cacheAligned):
        cmplw   cr1,rLEN,rCLS
        blt     cr1,L(handletail32)
        dcbz    0,rMEMP
        subf    rLEN,rCLS,rLEN
        add     rMEMP,rMEMP,rCLS
        b       L(cacheAligned)

/* We are here because: the cache line size was set, it was not 32 bytes,
   and the remainder (rLEN) is now less than the actual cache line size.
   Set up the preconditions for L(nondcbz) and go there to store the
   remaining bytes.  */
L(handletail32):
        clrrwi. rALIGN, rLEN, 5
        b       L(nondcbz)

END (BP_SYM (memset))
libc_hidden_builtin_def (memset)
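/* For reference, a rough C sketch of the byte/word/32-byte tiers above.
   This is an illustration only, not part of the original file: it models
   the dcbz cache-line fast path and the __cache_line_size probe as
   ordinary word stores, and the function name is hypothetical.

   #include <stddef.h>
   #include <stdint.h>

   void *memset_sketch (void *s, int c, size_t n)
   {
     unsigned char *p = s;
     uint32_t w = (unsigned char) c;
     w |= w << 8;    // splat byte into halfword (first rlwimi)
     w |= w << 16;   // splat halfword into word (second rlwimi)

     while (n > 0 && ((uintptr_t) p & 3) != 0)   // align to a word boundary
       { *p++ = (unsigned char) c; n--; }
     while (n >= 32)                             // 32-byte blocks, cf. L(nondcbz)
       {
         uint32_t *q = (uint32_t *) p;
         q[0] = w; q[1] = w; q[2] = w; q[3] = w;
         q[4] = w; q[5] = w; q[6] = w; q[7] = w;
         p += 32; n -= 32;
       }
     while (n >= 4)                              // word tail, cf. L(medium)
       { *(uint32_t *) p = w; p += 4; n -= 4; }
     while (n > 0)                               // byte tail
       { *p++ = (unsigned char) c; n--; }
     return s;
   }
*/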