📄 eficorezeromem.s
字号:
//++
// Copyright (c) 1999 - 2002 Intel Corporation. All rights reserved
// This software and associated documentation (if any) is furnished
// under a license and may only be used or copied in accordance
// with the terms of the license. Except as permitted by such
// license, no part of this software or documentation may be
// reproduced, stored in a retrieval system, or transmitted in any
// form or by any means without the express written consent of
// Intel Corporation.
//
//
// Module Name:
//
// EfiCoreZeroMem.s
//
// Abstract:
//
//
// This is a hand-optimized zero-memory routine.
// It will always produce the optimal storage size sequence,
// according to address alignment and number of bytes to fill.
// e.g. zeroing 12 bytes from an address that is aligned on a
// mod 5 boundary will produce the following storage sizes:
// 1 byte -> 2 bytes -> 8 bytes -> 1 byte
//
// The estimated number of clock cycles for this routine is
// 15 for sizes from 0..15
// (size/16) + 14 for sizes 16 and above
// Add to this the penalty for 1 mispredicted branch
// and any penalties caused by the memory interface for cacheline
// fills and spills when zeroing large amounts of memory.
//
//
// Revision History:
//
//--
.file "EfiCoreZeroMem.s"
// EXPORTS
.global EfiCoreZeroMem
.text
//
// VOID
// EfiCoreZeroMem (
// IN VOID *Buffer,
// IN UINTN Length
// )
//
//
//Routine Description:
//
// Set Buffer to 0 for Length bytes.
//
//Arguments:
//
// Buffer - Memory to set.
//
// Length - Number of bytes to set
//
//
//Returns:
//
// None
//
//
.proc EfiCoreZeroMem
EfiCoreZeroMem:
//
// Leaf routine: it makes no calls, so the ar.pfs value returned by alloc
// is never needed again.  The copy is parked in scratch register r2;
// r13 must not be used for this because the Itanium software conventions
// reserve it as the thread pointer.
//
// In:       in0 = Buffer, in1 = Length
// Clobbers: r2, r16-r26, p6-p15
// Preserve: ar.lc is saved in saved_lc and restored before returning
//
        alloc           r2 = ar.pfs, 2, 0, 0, 0     // 2 in, 0 local, 0 out, 0 rot

//
// Register roles
//
addr     = in0          // running store pointer
size     = in1          // bytes left, later the 16-byte iteration count
align    = r16          // bytes to store before addr is 16-byte aligned
size1    = r17          // size >> 1
size2    = r18          // size >> 2
size3    = r19          // size >> 3
size4    = r20          // size >> 4
mask     = r21          // limits the alignment stores to less than size
mask2    = r22          // scratch for building mask
addr8    = r23          // second store pointer of the main loop (addr + 8)
pr_align = r24          // align bits positioned for p6..p9
pr_rest  = r25          // tail-size bits positioned for p12..p15
saved_lc = r26          // caller's ar.lc

// (1) Calculate bitfield with alignment requirements:
//       bit 0: Needs alignment on a 2 byte boundary  (1 byte store)
//       bit 1: Needs alignment on a 4 byte boundary  (2 byte store)
//       bit 2: Needs alignment on a 8 byte boundary  (4 byte store)
//       bit 3: Needs alignment on a 16 byte boundary (8 byte store)
//
//     This bitfield is generated as 0x10 - (addr & 0xf)
//
// (2) Calculate alignment qualifiers to allow only
//     alignment operations that don't exceed the
//     size of the zero region.
//     We do this by generating a mask that has all
//     bits below the position of the most significant
//     "1" bit in "size" set.
//     An and operation with this mask will disable
//     all alignment operations exceeding "size".
//     For sizes of 16 and above the mask is simply 15.
//
// (3) Merge the result of (1) and (2) together
//     and copy it into p6..p9
//
// (4) Prepare address and count values for the
//     bzero main loop
//
// (5) Setup p12..p15 for the epilog

        and             align = 0xf, addr           // (1)
        // The shifts below are signed, which is harmless: their result is
        // only used when size < 16 (p7), where the sign bit is clear.
        shr             size1 = size, 1             // (2)
        shr             size2 = size, 2             // (2)
        shr             size3 = size, 3             // (2)
        shr             size4 = size, 4             // (2)
        ;;
        sub             align = 0x10, align         // (1) 1..16; 16 = already aligned
        or              mask = size1, size2         // (2)
        or              mask2 = size3, size4        // (2)
        cmp.leu         p6, p7 = 16, size           // (2) p6: size >= 16, p7: size < 16
        ;;
(p6)    add             mask = 15, r0               // (2) large region: allow every alignment store
(p7)    or              mask = mask, mask2          // (2) small region: only bits below the msb of size
        ;;
        and             align = align, mask         // (3) also turns the "16" case into 0
        add             addr8 = 15, addr            // (4)
        brp.sptk.imp    .loop, .loop
        ;;
        dep.z           pr_align = align, 6, 4      // (3) align bits 0..3 -> bit positions 6..9
        sub             size = size, align          // size remaining after alignment
        and             addr8 = -16, addr8          // (4) addr rounded up to a 16 byte boundary
        ;;
        mov             pr = pr_align, 0x3c0        // (3) Set p6 .. p9
        dep.z           pr_rest = size, 12, 4       // (5) size bits 0..3 -> bit positions 12..15
        shr.u           size = size, 4              // (4) number of full 16 byte chunks
        ;;

// Store zeros for aligning the target address.
// p6..p9 contain the predicates for storing 1, 2, 4, 8 bytes

(p6)    st1             [addr] = r0, 1
        cmp.ne          p10, p11 = size, r0         // (4) p10: at least one 16 byte chunk (p11 is unused)
        ;;
(p7)    st2             [addr] = r0, 2
(p10)   adds            size = -1, size             // (5) br.cloop runs ar.lc + 1 iterations
        add             addr8 = 8, addr8            // (4) upper half of the first 16 byte chunk
        ;;
(p8)    st4             [addr] = r0, 4
        mov             saved_lc = ar.lc
        mov             ar.lc = size                // (5)
        ;;
(p9)    st8             [addr] = r0, 8
        mov             pr = pr_rest, 0xf000        // (5) Set p12..p15
        ;;

// Main Loop
// Writes 16 zero bytes per iteration.
// If we have less than 16 bytes left before entering the loop,
// p10 is FALSE and ar.lc is 0, so both stores are skipped and
// br.cloop falls through.

.loop:
(p10)   st8             [addr] = r0, 16
(p10)   st8             [addr8] = r0, 16
        br.cloop.sptk   .loop
        ;;

// Now store whatever number of zeros is left.
// The 4 lsbs of the remaining size have already
// been loaded into p12..p15, so here we go:

(p15)   st8             [addr] = r0, 8
        ;;
(p14)   st4             [addr] = r0, 4
        ;;
(p13)   st2             [addr] = r0, 2
        ;;
(p12)   st1             [addr] = r0, 1
        mov             ar.lc = saved_lc            // restore the caller's loop counter
        br.ret.sptk     b0
.endp EfiCoreZeroMem
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -