⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 fastcodefillchar.pas

📁 最快的Delphi快速处理源代码
💻 PAS
📖 第 1 页 / 共 2 页
字号:
   ret
   nop
   nop
   nop
   nop
   nop
 @CaseElse :
   //Need at least 32 bytes here. Max 16 for alignment and 16 for loop
   push    esi
   push    edi
   //Broadcast value
   mov     ch, cl
   movd    xmm0, ecx
   pshuflw xmm0, xmm0, 0
   pshufd  xmm0, xmm0, 0
   //Fill first 16 non aligned bytes
   movdqu  [eax],xmm0
   //StopP2 := P + Count;
   lea     ecx,[eax+edx]
   //16 byte Align
   mov     edi,eax
   and     edi,$F
   mov     esi,16
   sub     esi,edi
   add     eax,esi
   sub     edx,esi
   //I := 0;
   xor     esi,esi
   sub     edx,15
   cmp     edx,1048576
   ja      @Repeat4
 @Repeat1 :
   movdqa  [eax+esi],xmm0
   add     esi,16
   cmp     esi,edx
   jl      @Repeat1
   jmp     @Repeat4End
   nop
   nop
 @Repeat4 :
   movntdq [eax+esi],xmm0
   add     esi,16
   cmp     esi,edx
   jl      @Repeat4
 @Repeat4End :
   {movdq2q mm0,xmm0
   movntq  [ecx-16],mm0
   movntq  [ecx-8], mm0
   emms}
   //Fill the rest
   movdqu [ecx-16],xmm0
 @Exit1 :
   pop   edi
   pop   esi
 @Exit2 :
   ret
   nop
   nop
   nop
   nop
   nop
   nop
   nop
   nop
   nop
   nop
   nop
   nop

@Case1JmpTable:
 dd @CaseCount0
 dd @CaseCount1
 dd @CaseCount2
 dd @CaseCount3
 dd @CaseCount4
 dd @CaseCount5
 dd @CaseCount6
 dd @CaseCount7
 dd @CaseCount8
 dd @CaseCount9
 dd @CaseCount10
 dd @CaseCount11
 dd @CaseCount12
 dd @CaseCount13
 dd @CaseCount14
 dd @CaseCount15
 dd @CaseCount16
 dd @CaseCount17
 dd @CaseCount18
 dd @CaseCount19
 dd @CaseCount20
 dd @CaseCount21
 dd @CaseCount22
 dd @CaseCount23
 dd @CaseCount24
 dd @CaseCount25
 dd @CaseCount26
 dd @CaseCount27
 dd @CaseCount28
 dd @CaseCount29
 dd @CaseCount30
 dd @CaseCount31
end;

//Author:            John O'Harrow
//Optimized for:     Intel Pentium M Dothan
//Instructionset(s): IA32
//Original name:     FillCharJOH_FPU

{ Fills count bytes starting at Dest with the byte Value, using MMX stores for
  the bulk and a jump table of unrolled word stores for short runs and tails.

  Register usage as seen in the body (Delphi register calling convention):
    eax = address of Dest, edx = count, cl = Value.

  Strategy:
    - count in 0..39: jump straight through @JumpTable to @FillN, which falls
      through a chain of 16-bit stores covering exactly N bytes.
    - count > 39: store one unaligned qword at Dest, advance eax to the next
      8-byte boundary, write 32 bytes per loop iteration with movq, then use
      @JumpTable again for the 0..31 bytes that remain.
    - negative count: nothing is written. }
procedure FastcodeFillCharBlended(var Dest; count: Integer; Value: Char);
asm
  {Copy the fill character into ch}
  {After this cx holds Value in both bytes, so a 16-bit store writes two fill bytes}
  mov ch, cl
  {Big or small fill?}
  {The compare is unsigned (ja), so negative counts are also routed to @BigFill}
  cmp edx, 39
  ja @BigFill
  {Jump to the correct handler}
  jmp [edx * 4 + @JumpTable]
@BigFill:
  {Signed check: rejects the negative counts that the unsigned ja let through}
  cmp edx, 0
  jl @DoneFill
  {Get the values in mm0}
  {movd loads ecx; the two unpacks replicate its low word (cx) into all four words of mm0}
  movd mm0, ecx
  punpcklwd mm0, mm0
  punpckldq mm0, mm0
  {Store the first qword}
  {Unaligned store; it covers the 1..8 bytes skipped by the alignment step below}
  movq [eax], mm0
  {qword align eax}
  {edx becomes the end address, eax moves up to the next multiple of 8,
   then edx is turned back into the number of bytes from eax to the end}
  add edx, eax
  add eax, 8
  and eax, -8
  sub edx, eax
  {Fill 32 bytes}
@Fill32Loop:
  {Subtract 32 from edx so long}
  sub edx, 32
  {Fill 32 bytes}
  movq [eax], mm0
  movq [eax + 8], mm0
  movq [eax + 16], mm0
  movq [eax + 24], mm0
  add eax, 32
  {Loop while at least another 32 bytes remain; on exit edx is in 0..31}
  cmp edx, 32
  jae @Fill32Loop
  {Exit mmx state}
  emms
  {Do the rest of the bytes}
  {cx still holds the doubled fill byte, so the @FillN handlers can be reused for the tail}
  jmp [edx * 4 + @JumpTable]
@DoneFill:
  ret
  nop
  nop
  nop //align branch targets
{Even counts: entering at @FillN falls through N/2 word stores down to offset 0}
@Fill38:
  mov [eax + 36], cx
@Fill36:
  mov [eax + 34], cx
@Fill34:
  mov [eax + 32], cx
@Fill32:
  mov [eax + 30], cx
@Fill30:
  mov [eax + 28], cx
@Fill28:
  mov [eax + 26], cx
@Fill26:
  mov [eax + 24], cx
@Fill24:
  mov [eax + 22], cx
@Fill22:
  mov [eax + 20], cx
@Fill20:
  mov [eax + 18], cx
@Fill18:
  mov [eax + 16], cx
@Fill16:
  mov [eax + 14], cx
@Fill14:
  mov [eax + 12], cx
@Fill12:
  mov [eax + 10], cx
@Fill10:
  mov [eax + 8], cx
@Fill8:
  mov [eax + 6], cx
@Fill6:
  mov [eax + 4], cx
@Fill4:
  mov [eax + 2], cx
@Fill2:
  mov [eax], cx
  ret
{Odd counts: word stores at odd offsets cover bytes 1..N-1, then a single
 byte store at offset 0 completes the run}
@Fill39:
  mov [eax + 37], cx
@Fill37:
  mov [eax + 35], cx
@Fill35:
  mov [eax + 33], cx
@Fill33:
  mov [eax + 31], cx
@Fill31:
  mov [eax + 29], cx
@Fill29:
  mov [eax + 27], cx
@Fill27:
  mov [eax + 25], cx
@Fill25:
  mov [eax + 23], cx
@Fill23:
  mov [eax + 21], cx
@Fill21:
  mov [eax + 19], cx
@Fill19:
  mov [eax + 17], cx
@Fill17:
  mov [eax + 15], cx
@Fill15:
  mov [eax + 13], cx
@Fill13:
  mov [eax + 11], cx
@Fill11:
  mov [eax + 9], cx
@Fill9:
  mov [eax + 7], cx
@Fill7:
  mov [eax + 5], cx
@Fill5:
  mov [eax + 3], cx
@Fill3:
  mov [eax + 1], cx
@Fill1:
  mov [eax], cl
  ret
  nop //dword align jump table
{Handler addresses indexed by byte count 0..39; entry 0 simply returns}
@JumpTable:
  dd @DoneFill
  dd @Fill1, @Fill2, @Fill3, @Fill4, @Fill5, @Fill6, @Fill7, @Fill8, @Fill9
  dd @Fill10, @Fill11, @Fill12, @Fill13, @Fill14, @Fill15, @Fill16, @Fill17
  dd @Fill18, @Fill19, @Fill20, @Fill21, @Fill22, @Fill23, @Fill24, @Fill25
  dd @Fill26, @Fill27, @Fill28, @Fill29, @Fill30, @Fill31, @Fill32, @Fill33
  dd @Fill34, @Fill35, @Fill36, @Fill37, @Fill38, @Fill39
end;

//Author:            Chris Grant
//Optimized for:     Blended / Pascal
//Instructionset(s): IA32
//Original name:     FillCharCJGPas5

{ Fills count bytes starting at Dest with the byte Value using SSE2 stores.

  Register usage as seen in the body (Delphi register calling convention):
    eax = address of Dest, edx = count, cl = Value.

  Strategy:
    - count >= 32: write the first and last 16 bytes with unaligned stores,
      then fill the interior with 16-byte aligned stores. Runs with more than
      512 KB left after alignment use non-temporal stores (movntdq).
    - count in 1..31: write the last byte, then jump into an unrolled chain of
      word stores at the entry point that covers the remaining bytes.
    - count <= 0: nothing is written. }
procedure FastcodeFillCharP4N(var Dest; count: Integer; Value: Char);
asm {Size = 161 Bytes}
  cmp       edx, 32
  mov       ch, cl                {Copy Value into both Bytes of CX}
  jl        @@Small               {Signed compare: negative counts also go to @@Small}
  sub       edx, 16               {edx = offset of the last 16-byte block}
  movd      xmm0, ecx             {Broadcast the low word of ecx to all of xmm0}
  pshuflw   xmm0, xmm0, 0
  pshufd    xmm0, xmm0, 0
  movups    [eax], xmm0           {Fill First 16 Bytes}
  movups    [eax+edx], xmm0       {Fill Last 16 Bytes}
  mov       ecx, eax              {16-Byte Align Writes}
  and       ecx, 15
  sub       ecx, 16               {ecx = -(bytes up to the next 16-byte boundary), -16..-1}
  sub       eax, ecx              {eax = first aligned address above Dest}
  add       edx, ecx              {edx = bytes from aligned start to last block}
  add       eax, edx              {eax = start of the last 16-byte block}
  neg       edx                   {edx = negative index; eax+edx is the aligned start}
  cmp       edx, -512*1024
  jb        @@Large               {Unsigned: taken when more than 512K remains}
                                  {NOTE(review): also taken when edx = 0, where it}
                                  {performs one in-range non-temporal store - confirm intent}
@@Loop:
  movaps    [eax+edx], xmm0       {Fill 16 Bytes per Loop}
  add       edx, 16
  jl        @@Loop                {Index counts up towards zero}
  ret
@@Large:
  movntdq    [eax+edx], xmm0      {Fill 16 Bytes per Loop}
  add       edx, 16
  jl        @@Large
  ret
@@Small:
  test      edx, edx
  jle       @@Done                {Nothing to do for count <= 0}
  mov       [eax+edx-1], cl       {Fill Last Byte}
  and       edx, -2               {No. of Words to Fill}
  neg       edx
  {Computed entry into the store chain: skips 2*edx code bytes back from
   @@SmallFill+60. This assumes each word store below occupies 4 bytes of
   code (the final store plus the ret together making 4) - do not reorder or
   re-encode these instructions without re-checking the arithmetic.}
  lea       edx, [@@SmallFill + 60 + edx * 2]
  jmp       edx
  nop                             {Align Jump Destinations}
  nop
@@SmallFill:
  mov       [eax+28], cx
  mov       [eax+26], cx
  mov       [eax+24], cx
  mov       [eax+22], cx
  mov       [eax+20], cx
  mov       [eax+18], cx
  mov       [eax+16], cx
  mov       [eax+14], cx
  mov       [eax+12], cx
  mov       [eax+10], cx
  mov       [eax+ 8], cx
  mov       [eax+ 6], cx
  mov       [eax+ 4], cx
  mov       [eax+ 2], cx
  mov       [eax   ], cx
  ret {DO NOT REMOVE - This is for Alignment}
@@Done:
end;

//Author:            John O'Harrow
//Optimized for:     AMD Athlon XP
//Instructionset(s): IA32
//Original name:     FillCharJOH_SSE

{ Fills count bytes starting at Dest with the byte Value using SSE (single
  precision) register stores.

  Register usage as seen in the body (Delphi register calling convention):
    eax = address of Dest, edx = count, cl = Value.

  Strategy:
    - count >= 32: build a 4-byte fill pattern in memory at Dest, load and
      broadcast it into xmm0, write the first and last 16 bytes with
      unaligned stores, then fill the interior with 16-byte aligned stores.
      Runs with more than 512 KB left after alignment use movntps.
    - count in 1..31: write the last byte, then jump into an unrolled chain of
      word stores at the entry point that covers the remaining bytes.
    - count <= 0: nothing is written. }
procedure FastcodeFillCharXP(var Dest; count: Integer; Value: Char);
asm {Size = 161 Bytes}
  cmp       edx, 32
  mov       ch, cl                {Copy Value into both Bytes of CX}
  jl        @@Small               {Signed compare: negative counts also go to @@Small}
  sub       edx, 16               {edx = offset of the last 16-byte block}
  mov       [eax], cx             {Fill First 4 Bytes}
  mov       [eax+2], cx
  movss     xmm0, [eax]           {Set each byte of XMM0 to Value}
  shufps    xmm0, xmm0, 0         {Replicate the low dword into all four lanes}
  movups    [eax], xmm0           {Fill First 16 Bytes}
  movups    [eax+edx], xmm0       {Fill Last 16 Bytes}
  mov       ecx, eax              {16-Byte Align Writes}
  and       ecx, 15
  sub       ecx, 16               {ecx = -(bytes up to the next 16-byte boundary), -16..-1}
  sub       eax, ecx              {eax = first aligned address above Dest}
  add       edx, ecx              {edx = bytes from aligned start to last block}
  add       eax, edx              {eax = start of the last 16-byte block}
  neg       edx                   {edx = negative index; eax+edx is the aligned start}
  cmp       edx, -512*1024
  jb        @@Large               {Unsigned: taken when more than 512K remains}
                                  {NOTE(review): also taken when edx = 0, where it}
                                  {performs one in-range non-temporal store - confirm intent}
@@Loop:
  movaps    [eax+edx], xmm0       {Fill 16 Bytes per Loop}
  add       edx, 16
  jl        @@Loop                {Index counts up towards zero}
  ret
@@Large:
  movntps   [eax+edx], xmm0       {Fill 16 Bytes per Loop}
  add       edx, 16
  jl        @@Large
  ret
@@Small:
  test      edx, edx
  jle       @@Done                {Nothing to do for count <= 0}
  mov       [eax+edx-1], cl       {Fill Last Byte}
  and       edx, -2               {No. of Words to Fill}
  neg       edx
  {Computed entry into the store chain: skips 2*edx code bytes back from
   @@SmallFill+60. This assumes each word store below occupies 4 bytes of
   code (the final store plus the ret together making 4) - do not reorder or
   re-encode these instructions without re-checking the arithmetic.}
  lea       edx, [@@SmallFill + 60 + edx * 2]
  jmp       edx
  nop                             {Align Jump Destinations}
  nop
@@SmallFill:
  mov       [eax+28], cx
  mov       [eax+26], cx
  mov       [eax+24], cx
  mov       [eax+22], cx
  mov       [eax+20], cx
  mov       [eax+18], cx
  mov       [eax+16], cx
  mov       [eax+14], cx
  mov       [eax+12], cx
  mov       [eax+10], cx
  mov       [eax+ 8], cx
  mov       [eax+ 6], cx
  mov       [eax+ 4], cx
  mov       [eax+ 2], cx
  mov       [eax   ], cx
  ret {DO NOT REMOVE - This is for Alignment}
@@Done:
end;

//Author:            Pierre Le Riche
//Optimized for:     Blended
//Instructionset(s): IA32
//Original name:     FillCharPLRMMX1

{ Pure-Pascal byte fill: writes Count bytes of Value starting at Dest.
  Does nothing when Count <= 0.

  Strategy by size:
    1..11     one byte store per byte, entered through a fall-through chain.
    12..255   first and last dword written up front, then two dwords per
              iteration walking down from the end.
    256..447  same idea, four dwords per iteration.
    >= 448    first and last 16 bytes written up front, then four dwords per
              iteration at dword-aligned addresses walking up from the start.

  Pointers are cast through Integer, so this routine is written for 32-bit
  targets only. }
procedure FastcodeFillCharPascal(var Dest; count: Integer; Value: Char);
var
  I, J, K : Integer;
  P    : Pointer;
  Label P01, P02, P03, P04, P05, P06, P07, P08, P09, P10, P11;
begin
  if Count > 0 then
    begin
      P := @Dest;
      If Count >= 12 then
        begin
          // Replicate the fill byte into all four bytes of J.
          J := Byte(Value);
          J := J or (J shl  8);
          J := J or (J shl 16);

          // First and last dword are written unconditionally; the loops below
          // only need to cover whole dwords in between and may overlap these.
          PInteger(P)^ := J;
          PInteger(Integer(P) + Count - 4)^ := J;

          I := Count shr 2; // number of whole dwords

          if Count >= 256 then
            begin
              if count < 448 then
                begin
                  // Pre-fill dwords 1..3 so the loop may stop at any I in 0..3.
                  PIntegerArray(P)[1] := J;
                  PIntegerArray(P)[2] := J;
                  PIntegerArray(P)[3] := J;

                  repeat
                    Dec(I,4);
                    PIntegerArray(P)[I]   := J;
                    PIntegerArray(P)[I+1] := J;
                    PIntegerArray(P)[I+2] := J;
                    PIntegerArray(P)[I+3] := J;
                  until I < 4;
                end
              else
                begin
                  I := Count;
                  // K = -(bytes up to the next dword boundary), in -4..-1.
                  K := (Integer(P) and 3) - 4;
                  Dec(I, 16);
                  Dec(PByte(P), K);   // P := next dword-aligned address
                  Inc(I, K);          // I := bytes from aligned start to last block
                  Inc(PByte(P), I);   // P := start of the final 16 bytes
                  // Final 16 bytes (possibly unaligned).
                  PintegerArray(P)[0] := J;
                  PintegerArray(P)[1] := J;
                  PintegerArray(P)[2] := J;
                  PintegerArray(P)[3] := J;
                  // P - I starts at the aligned address and advances 16 bytes
                  // per pass as I counts down.
                  repeat
                    PintegerArray(Integer(P)-I)[0] := J;
                    PintegerArray(Integer(P)-I)[1] := J;
                    PintegerArray(Integer(P)-I)[2] := J;
                    PintegerArray(Integer(P)-I)[3] := J;
                    Dec(I, 16);
                  until I <= 0;
                end
             end
          else
            begin
              // Dword 0 is already written, so stopping at I = 0 or 1 is enough.
              repeat
                Dec(I,2);
                PIntegerArray(P)[I] := J;
                PIntegerArray(P)[I+1] := J;
              until I < 2;
            end
        end
      else
        begin
          // Count is 1..11 here (0 and negatives were rejected above, 12 and
          // up took the dword path), so every value has a matching arm and
          // control always enters the chain through a goto.
          case Count of
            1:  goto P01;
            2:  goto P02;
            3:  goto P03;
            4:  goto P04;
            5:  goto P05;
            6:  goto P06;
            7:  goto P07;
            8:  goto P08;
            9:  goto P09;
            10: goto P10;
            11: goto P11;
          end;
          // Fall-through chain: entering at Pnn stores bytes nn-1 down to 0.
          P11: PByteArray(P)[10] := Byte(Value);
          P10: PByteArray(P)[09] := Byte(Value);
          P09: PByteArray(P)[08] := Byte(Value);
          P08: PByteArray(P)[07] := Byte(Value);
          P07: PByteArray(P)[06] := Byte(Value);
          P06: PByteArray(P)[05] := Byte(Value);
          P05: PByteArray(P)[04] := Byte(Value);
          P04: PByteArray(P)[03] := Byte(Value);
          P03: PByteArray(P)[02] := Byte(Value);
          P02: PByteArray(P)[01] := Byte(Value);
          P01: PByteArray(P)[00] := Byte(Value);

        end
    end;
end;

{ Stub consisting of a single call to the RTL FillChar routine.
  The routine is qualified with System because that is the unit in which
  FillChar is implemented; SysUtils does not declare one.
  NOTE(review): presumably this stub exists so other code can locate the
  RTL routine's address through the call instruction - confirm against the
  code that references FillCharStub, which is not visible here. }
procedure FillCharStub;
asm
  call System.@FillChar;
end;

end.

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -