//++
// Copyright (c) 1999 - 2002 Intel Corporation. All rights reserved
// This software and associated documentation (if any) is furnished
// under a license and may only be used or copied in accordance
// with the terms of the license. Except as permitted by such
// license, no part of this software or documentation may be
// reproduced, stored in a retrieval system, or transmitted in any
// form or by any means without the express written consent of
// Intel Corporation.
//
//
// Module Name:
//
// EfiCoreCopyMem.s
//
// Abstract:
//
// This is a hand-optimized memory copy routine.
//
// Revision History:
//
//--
.file "EfiCoreCopyMem.s"
// EXPORTS
.global EfiCoreCopyMem
.type EfiCoreCopyMem, @function
.text
// VOID
// EfiCoreCopyMem (
// IN VOID *Destination,
// IN VOID *Source,
// IN UINTN Length
// )
//
// Routine Description:
//
// Copy Length bytes from Source to Destination.
//
// Arguments:
//
// Destination - Target of copy
//
// Source - Place to copy from
//
// Length - Number of bytes to copy
//
// Returns:
//
// None
//
//
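//
// For reference, the overall behavior matches this C sketch. This is
// an illustrative reading of the assembly below, not code from this
// module; the name CopyMemSketch is invented for the example.
//
//   VOID
//   CopyMemSketch (VOID *Destination, VOID *Source, UINTN Length)
//   {
//     UINT8 *Dst = (UINT8 *) Destination;
//     UINT8 *Src = (UINT8 *) Source;
//
//     if (Src < Dst && Dst < Src + Length) {
//       //
//       // Destructive overlap: copy backwards, starting at the end.
//       //
//       while (Length > 0) {
//         Length--;
//         Dst[Length] = Src[Length];
//       }
//     } else {
//       UINTN Index;
//
//       for (Index = 0; Index < Length; Index++) {
//         Dst[Index] = Src[Index];
//       }
//     }
//   }
//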
// EfiCoreCopyMem is effectively an alternate entry point for bcopy:
// it swaps the source and target arguments and falls through into bcopy.
EfiCoreCopyMem:
alloc r13 = ar.pfs, 3, 0, 0, 0 // 3 in, 0 local, 0 out, 0 rot
mov r14 = r32
mov r32 = r33
;;
mov r33 = r14
;;
// void bcopy(vm_offset_t src, vm_offset_t tgt, vm_size_t size)
//
// Copy "size" bytes from address "src" to address "tgt"
//
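// Note the argument order: bcopy takes (src, tgt, size) while
// EfiCoreCopyMem takes (Destination, Source, Length), which is why the
// entry code above swaps r32 and r33 before falling through here.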
bcopy:
alloc r13 = ar.pfs, 3, 0, 0, 0 // 3 in, 0 local, 0 out, 0 rot
src = in0 // NOTE: reused for rotating data3
tgt = in1 // NOTE: reused for rotating data2
size = in2 // NOTE: reused for rotating data1
src_top = r16
temp = r17
rest_size = r18
count = r19
tgt8 = r20
src8 = r21
data = r22
data8 = r23
saved_lc = r31
saved_pr = r2
mov saved_pr = pr
;;
cmp.ltu p6 = src, tgt
add src_top = src, size
or temp = src, tgt
and rest_size = 0x7, size
shr.u count = size, 4
mov r8 = r0
mov saved_lc = ar.lc
;;
(p6) cmp.gtu p6 = src_top, tgt
cmp.gtu p7 = 8, size
(p6) br.cond.dpnt .overlap
(p7) br.cond.dpnt .less_than_8_bytes
and temp = 0x7, temp
add count = -1, count
mov ar.ec = 2
;;
cmp.ne p8 = temp, r0
tbit.nz p9 = size, 3
(p8) br.cond.dpnt .not_8_byte_aligned
brp.sptk.imp .aligned_loop, .aligned_loop_bb
;;
//
// High Performance case: * source and target are both 8 byte aligned
// * size >= 8
// Main Loop moves 16 bytes / clock
//
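// The loop below is software-pipelined with rotating predicates: each
// br.ctop rotates the predicate registers, so a load issued under p16
// is paired with a store under p17 one iteration later. pr.rot = 0x10000
// primes p16 = 1 so the first pass issues only loads, and ar.ec = 2
// keeps the two-stage pipeline draining after ar.lc reaches zero so the
// final loads still get stored.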
cmp.eq p6 = -1, count
mov ar.lc = count
(p6) br.cond.spnt .after_aligned_loop
;;
add tgt8 = 8, tgt
add src8 = 8, src
mov pr.rot = 0x10000
;;
.aligned_loop:
(p17) st8 [tgt] = data, 16
(p16) ld8 data = [src], 16
.aligned_loop_bb:
(p17) st8 [tgt8] = data8, 16
(p16) ld8 data8 = [src8], 16
br.ctop.sptk .aligned_loop
;;
.after_aligned_loop:
(p9) ld8 data = [src], 8
;;
(p9) st8 [tgt] = data, 8
//
// Byte move: * no alignment restrictions on source or target
// * size < 8
// Main Loop moves 1 byte / clock
//
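// count is set to rest_size - 2 because the first load and the final
// store are peeled out of the loop: the loop body runs ar.lc + 1 =
// rest_size - 1 times, overlapping the store of byte N with the load
// of byte N + 1.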
.less_than_8_bytes:
cmp.eq p6 = rest_size, r0
add count = -2, rest_size
(p6) br.cond.sptk .return
brp.sptk.imp .rest_loop, .rest_loop
;;
ld1 data = [src], 1
cmp.eq p7 = -1, count
(p7) br.cond.spnt .after_rest_loop
mov ar.lc = count
;;
.rest_loop:
st1 [tgt] = data, 1
ld1 data = [src], 1
br.cloop.sptk .rest_loop
;;
.after_rest_loop:
st1 [tgt] = data, 1
br.cond.sptk .return
//
// General case: * no alignment restrictions on source or target
// * size >= 8
// Main Loop moves 8 bytes / clock
//
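// The technique: read the source with aligned 8-byte loads and
// funnel-shift each pair of adjacent words together before every
// aligned 8-byte store. As an illustrative C expression (shift counts
// in bits; prev_word / next_word are invented names for two
// consecutive aligned source words):
//
//   tgt_word = (prev_word >> rshift) | (next_word << lshift);
//
// where rshift = 8 * ((src_offset - tgt_offset) mod 8) and
// lshift = 64 - rshift, matching rshift and lshift as computed below.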
data3 = r32
data2 = r33
data1 = r34
last_loaded = r36
all_1 = r13
start_tgt = r14
aligned_src = r15
aligned_tgt = r16
src_offset = r17
sdata1 = r18
sdata2 = r19
rest_size = r20
rest_offset = r21
rshift = r22
lshift = r23
count = r24
start_data = r25
start_mask = r26
tgt_data = r27
tgt_mask = r28
end_data = r29
tgt_offset = r30
.not_8_byte_aligned:
alloc r13 = ar.pfs, 3, 5, 0, 8 // Make r32-r39 rotating
and src_offset = 0x7, src
and tgt_offset = 0x7, tgt
and aligned_src = -8, src
and aligned_tgt = -8, tgt
add rest_size = -8, size
;;
cmp.leu p6, p7 = tgt_offset, src_offset
sub rshift = src_offset, tgt_offset
add rest_size = tgt_offset, rest_size
;;
(p7) add rshift = 8, rshift
ld8 data1 = [aligned_src], 8
add all_1 = -1, r0
mov pr.rot = 0x10000 // Start with Stage 1
shr.u count = rest_size, 3
brp.sptk.imp .unaligned_loop, .unaligned_loop_bb
;;
(p6) ld8 data2 = [aligned_src], 8
(p7) mov data2 = data1
shl rshift = rshift, 3
mov start_tgt = aligned_tgt
add aligned_tgt = 8, aligned_tgt
cmp.eq p8 = count, r0
;;
sub lshift = 64, rshift
(p6) shr.u sdata1 = data1, rshift
shl tgt_offset = tgt_offset, 3
and rest_size = 0x7, rest_size
add count = -1, count
mov ar.ec = 4 // 4 Pipeline Stages
;;
(p6) shl sdata2 = data2, lshift
(p7) shl sdata1 = data1, lshift
cmp.ne p9 = rest_size, r0
shl rest_size = rest_size, 3
shl start_mask = all_1, tgt_offset
;;
(p6) or start_data = sdata1, sdata2
(p7) mov start_data = sdata1
mov ar.lc = count
cmp.ltu p10 = lshift, rest_size
(p8) mov last_loaded = data2
(p8) br.cond.spnt .after_unaligned_loop
;;
// Pipelined Loop
// Stage 1: Load new source word
// Stage 2: Shift new and previous source word
// Stage 3: Combine new and previous source word
// Stage 4: Write combined word to target
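// pr.rot = 0x10000 primes p16 and ar.ec = 4 matches the four stages,
// so each instruction below executes in its proper stage: loads under
// p16, shifts under p17, the merge under p18, and stores under p19.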
.unaligned_loop:
(p19) st8 [aligned_tgt] = tgt_data, 8 // Stage 4
(p18) or tgt_data = sdata1, sdata2 // Stage 3
(p17) shr.u sdata1 = data1, rshift // Stage 2
.unaligned_loop_bb:
(p16) ld8 data3 = [aligned_src], 8 // Stage 1
(p17) shl sdata2 = data2, lshift // Stage 2
br.ctop.sptk .unaligned_loop
.after_unaligned_loop:
;;
//
// Partial writes to the first and last target words: the existing
// target words are read back and merged under start_mask / tgt_mask
// so that bytes outside the copy range are preserved.
//
(p10) ld8 data1 = [aligned_src]
ld8 tgt_data = [start_tgt]
(p9) shr.u sdata2 = last_loaded, rshift
;;
and start_data = start_data, start_mask
andcm tgt_data = tgt_data, start_mask
(p10) shl sdata1 = data1, lshift
;;
(p10) or sdata2 = sdata1, sdata2
(p9) ld8 end_data = [aligned_tgt]
shl tgt_mask = all_1, rest_size
;;
(p9) andcm sdata2 = sdata2, tgt_mask
(p9) and end_data = end_data, tgt_mask
;;
(p9) or end_data = end_data, sdata2
or start_data = start_data, tgt_data
;;
st8 [start_tgt] = start_data
(p9) st8 [aligned_tgt] = end_data
br.cond.sptk .return
//
// Special case:
// The target region overlaps with the source region
// in a way that would cause us to destroy source data
// before copying it.
// To prevent this, we start at the end of the source region
// and copy backwards.
//
// Since this is not a very frequent case, no effort has been
// made to optimize for unaligned copies.
//
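// For example, with src = 0x1000, tgt = 0x1004 and size = 16, a
// forward copy would overwrite bytes 0x1004..0x100F before reading
// them; copying downwards from src + size avoids this.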
src_top = r16
temp = r17
rest_size = r18
count = r19
tgt8 = r20
src8 = r21
data = r22
data8 = r23
.overlap:
add src = src, size
add tgt = tgt, size
mov rest_size = size
cmp.gtu p7 = 8, size
(p7) br.cond.dpnt .copy_bytewise
brp.sptk.imp .dwordwise_loop, .dwordwise_loop_bb
;;
or temp = src, tgt
tbit.nz p9, p10 = size, 3
cmp.eq p6 = count, r0
add tgt8 = -16, tgt
add src8 = -16, src
;;
and temp = 0x7, temp
add count = -1, count
mov ar.ec = 2
mov pr.rot = 0x10000
;;
cmp.ne p8 = temp, r0
mov ar.lc = count
(p8) br.cond.dpnt .copy_bytewise
//
// High Performance case: * source and target are both 8 byte aligned
// * size >= 8
// Main Loop moves 16 bytes / clock
//
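// This mirrors the forward aligned loop above: src and tgt were
// advanced to the top of their regions, and both pipelined streams
// walk downwards with a -16 byte stride.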
add tgt = -8, tgt
add src = -8, src
(p6) br.cond.spnt .after_dwordwise_loop
;;
.dwordwise_loop:
(p17) st8 [tgt] = data, -16
(p16) ld8 data = [src], -16
.dwordwise_loop_bb:
(p17) st8 [tgt8] = data8, -16
(p16) ld8 data8 = [src8], -16
br.ctop.sptk .dwordwise_loop
;;
.after_dwordwise_loop:
(p9) ld8 data = [src]
(p10) add src = 8, src
(p10) add tgt = 8, tgt
and rest_size = 0x7, size
;;
(p9) st8 [tgt] = data
//
// Byte move: * no alignment restrictions on source or target
// Main Loop moves 1 byte / clock
//
.copy_bytewise:
add count = -2, rest_size
cmp.eq p6 = rest_size, r0
(p6) br.cond.sptk .return
add src = -1, src
brp.sptk.imp .bytewise_loop, .bytewise_loop
;;
add tgt = -1, tgt
cmp.eq p7 = -1, count
ld1 data = [src], -1
mov ar.lc = count
(p7) br.cond.spnt .after_bytewise_loop
;;
.bytewise_loop:
st1 [tgt] = data, -1
ld1 data = [src], -1
br.cloop.sptk .bytewise_loop
;;
.after_bytewise_loop:
st1 [tgt] = data, -1
.return:
mov ar.lc = saved_lc
mov pr = saved_pr
br.ret.sptk b0