📄 gptextstream.pas

📁 OmniXML源码
💻 PAS
📖 第 1 页 / 共 3 页
字号:
12 3 下一页
{ $OmniXML: OmniXML/GpTextStream.pas,v 1.1.1.1 2004/04/17 11:16:33 mr Exp $ }

{$B-,H+,J+,Q-,T-,X+}

{$UNDEF D3PLUS}
{$UNDEF D4PLUS}
{$IFDEF VER100}{$DEFINE D3PLUS}{$ENDIF}
{$IFDEF VER120}{$DEFINE D3PLUS}{$DEFINE D4PLUS}{$ENDIF}
{$IFDEF VER130}{$DEFINE D3PLUS}{$DEFINE D4PLUS}{$ENDIF}
{$IFDEF VER140}{$DEFINE D3PLUS}{$DEFINE D4PLUS}{$DEFINE D6PLUS}{$ENDIF}
{$IFDEF VER150}{$DEFINE D3PLUS}{$DEFINE D4PLUS}{$DEFINE D6PLUS}{$DEFINE D7PLUS}{$ENDIF}

unit GpTextStream;

(*:Stream wrapper class that automatically converts another stream (containing
   text data) into a Unicode stream. Underlying stream can contain 8-bit text
   (in any codepage) or 16-bit text (in 16-bit or UTF8 encoding).
   @author Primoz Gabrijelcic
   @desc <pre>

This software is distributed under the BSD license.

Copyright (c) 2003, Primoz Gabrijelcic
All rights reserved.

Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
- Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.
- The name of the Primoz Gabrijelcic may not be used to endorse or promote
  products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

   Author           : Primoz Gabrijelcic
   Creation date    : 2001-07-17
   Last modification: 2003-05-16
   Version          : 1.02
   </pre>
*)(*
   History:
     1.02: 2003-05-16
       - Compatible with Delphi 7.

     1.01: 2002-04-24
       - Added TGpTSCreateFlag flag tscfCompressed to keep this enum in sync
         with GpTextFile.TCreateFlag.

     1.0b: 2001-12-15
       - Updated to compile with Delphi 6 (thanks to Artem Khassanov).

     1.0a: 2001-10-06
       - Fixed error in GpTextStream.Read that could cause exception to be
         raised unnecessary.
         
     1.0: 2001-07-17
       - Created from GpTextFile 3.0b (thanks to Miha Remec).
       - Fix UTF 8 decoding error in TGpTextStream.Read.
*)

interface

uses
  Windows,
  SysUtils,
  Classes,
  GpStreamWrapper;

// HelpContext values for all raised exceptions.
const
  //:Windows error.
  hcTFWindowsError            = 3001;
  //:Unknown Windows error.
  hcTFUnknownWindowsError     = 3002;
  //:Cannot append reversed Unicode stream - not supported.
  hcTFCannotAppendReversed    = 3003;
  //:Cannot write to reversed Unicode stream - not supported.
  hcTFCannotWriteReversed     = 3004;
  //:Cannot convert odd number of bytes.
  hcTFCannotConvertOdd        = 3005;

const
{$IFNDEF D3plus}
  CP_UTF8 = 65001;    // UTF-8 pseudo-codepage, defined in Windows.pas in Delphi 3 and newer.
{$ENDIF}

  CP_UNICODE =  1200; // Unicode pseudo-codepage,
  ISO_8859_1 = 28591; // Western Alphabet (ISO)
  ISO_8859_2 = 28592; // Central European Alphabet (ISO)
  ISO_8859_3 = 28593; // Latin 3 Alphabet (ISO)
  ISO_8859_4 = 28594; // Baltic Alphabet (ISO)
  ISO_8859_5 = 28595; // Cyrillic Alphabet (ISO)
  ISO_8859_6 = 28596; // Arabic Alphabet (ISO)
  ISO_8859_7 = 28597; // Greek Alphabet (ISO)
  ISO_8859_8 = 28598; // Hebrew Alphabet (ISO)

type
  {:Base exception class for exceptions created in TGpTextStream.
  }
  EGpTextStream = class(Exception);

  {:Text stream creation flags. Copied from GpTextFile.TCreateFlag. Must be kept
    in sync!
    @enum tscfUnicode          Create Unicode stream.
    @enum tscfReverseByteOrder Create unicode stream with reversed byte order
                               (Motorola format). Used only in Read access,
                               not valid in Write access.
    @enum tscfUse2028          Use standard /2028/ instead of /000D/000A/ for
                               line delimiter (MS Notepad and MS Word do not
                               understand $2028 delimiter). Applies to Unicode
                               streams only.
    @enum tscfUseLF            Use /LF/ instead of /CR/LF/ for line delimiter.
                               Applies to 8-bit streams only.
    @enum tscfWriteUTF8BOM     Write UTF-8 Byte Order Mark to the beginning of
                               stream.
    @enum tscfCompressed       Will try to set the "compressed" attribute (when
                               running on NT and file is on NTFS drive).
  }
  TGpTSCreateFlag = (tscfUnicode, tscfReverseByteOrder, tscfUse2028, tscfUseLF,
    tscfWriteUTF8BOM, tscfCompressed);

  {:Set of all creation flags.
  }
  TGpTSCreateFlags = set of TGpTSCreateFlag;

  {:Line delimiters.
    @enum tstsldCR       Carriage return (Mac style).
    @enum tstsldLF       Line feed (Unix style).
    @enum tstsldCRLF     Carriage return + Line feed (DOS style).
    @enum tstsldLFCR     Line feed + Carriage return (very unusual combination).
    @enum tstsld2028     /2028/ Unicode delimiter.
    @enum tstsld000D000A /000D/000A/ Windows-style Unicode delimiter.
  }
  TGpTSLineDelimiter = (tsldCR, tsldLF, tsldCRLF, tsldLFCR, tsld2028, tsld000D000A);

  {:Set of all line delimiters.
  }
  TGpTSLineDelimiters = set of TGpTSLineDelimiter;

  {:All possible ways to access TGpTextStream. Copied from GpHugeF. Must be kept
    in sync!
    @enum tstsaccRead      Read access.
    @enum tstsaccWrite     Write access.
    @enum tstsaccReadWrite Read and write access.
    @enum tstsaccAppend    Same as tsaccReadWrite, just that Position is set
                           immediatly after the end of stream.
  }
  TGpTSAccess = (tsaccRead, tsaccWrite, tsaccReadWrite, tsaccAppend);

  {:Unified 8/16-bit text stream access. All strings passed as Unicode,
    conversion to/from 8-bit is done automatically according to specified code
    page.
  }
  TGpTextStream = class(TGpStreamWrapper)
  private
    tsAccess      : TGpTSAccess;
    tsCodePage    : word;
    tsCreateFlags : TGpTSCreateFlags;
    tsLineDelims  : TGpTSLineDelimiters;
    tsReadlnBuf   : TMemoryStream;
    tsSmallBuf    : pointer;
    tsWindowsError: DWORD;
  protected
    function  AllocBuffer(size: integer): pointer; virtual;
    procedure FreeBuffer(var buffer: pointer); virtual;
    function  GetWindowsError: DWORD; virtual;
    procedure PrepareStream; virtual;
    procedure SetCodepage(cp: word); virtual;
    function  StreamName(param: string = ''): string; virtual;
    procedure Win32Check(condition: boolean; method: string);
  public
    constructor Create(
      dataStream: TStream; access: TGpTSAccess;
      createFlags: TGpTSCreateFlags {$IFDEF D4plus}= []{$ENDIF};
      codePage: word            {$IFDEF D4plus}= 0{$ENDIF}
      );
    destructor  Destroy; override;
    function  Is16bit: boolean;
    function  IsUnicode: boolean;
    function  Read(var buffer; count: longint): longint; override;
    function  Readln: WideString;
    function  Write(const buffer; count: longint): longint; override;
    function  Writeln(const ln: WideString{$IFDEF D4plus}= ''{$ENDIF}): boolean;
    function  WriteString(const ws: WideString): boolean;
    {:Accepted line delimiters (CR, LF or any combination).
    }
    property AcceptedDelimiters: TGpTSLineDelimiters read tsLineDelims
      write tsLineDelims;
    {:Code page used to convert 8-bit stream to Unicode and back. May be changed
      while stream is open (and even partially read). If set to 0, current
      default code page will be used.
    }
    property  Codepage: word read tsCodePage write SetCodepage;
    {:Stream size. Reintroduced to override GetSize (static in TStream) with
      faster version.
    }
    property  Size: {$IFDEF D7PLUS}int64{$ELSE}longint{$ENDIF D7PLUS} read GetSize write SetSize;
    {:Last Windows error code.
    }
    property  WindowsError: DWORD read GetWindowsError;
  end; { TGpTextStream }

implementation

uses
  SysConst;

const
  {:Header for 'normal' Unicode stream (Intel format).
  }
  CUnicodeNormal  : WideChar = WideChar($FEFF);

  {:Header for 'reversed' Unicode stream (Motorola format).
  }
  CUnicodeReversed: WideChar = WideChar($FFFE);

  {:First two bytes of UTF-8 BOM.
  }
  CUTF8BOM12: WideChar = WideChar($BBEF);

  {:Third byte of UTF-8 BOM.
  }
  CUTF8BOM3: Char = Char($BF);

  {:Size of preallocated buffer used for 8 to 16 to 8 bit conversions in
    TGpTextStream.
  }
  CtsSmallBufSize = 2048; // 1024 WideChars

{$IFDEF D3plus}
resourcestring
{$ELSE}
const
{$ENDIF}
  sCannotAppendReversedUnicodeStream = '%s:Cannot append reversed Unicode stream.';
  sCannotConvertOddNumberOfBytes     = '%s:Cannot convert odd number of bytes: %d';
  sCannotWriteReversedUnicodeStream  = '%s:Cannot write to reversed Unicode stream.';
  sStreamFailed                      = '%s failed. ';

{:Converts Ansi string to Unicode string using specified code page.
  @param   s        Ansi string.
  @param   codePage Code page to be used in conversion.
  @returns Converted wide string.
}
function StringToWideString(const s: AnsiString; codePage: word): WideString;
var
  l: integer;
begin
  if s = '' then
    Result := ''
  else begin
    l := MultiByteToWideChar(codePage, MB_PRECOMPOSED, PChar(@s[1]), -1, nil, 0);
    SetLength(Result, l-1);
    if l > 1 then
      MultiByteToWideChar(CodePage, MB_PRECOMPOSED, PChar(@s[1]), -1, PWideChar(@Result[1]), l-1);
  end;
end; { StringToWideString }

{:Converts Unicode string to Ansi string using specified code page.
  @param   ws       Unicode string.
  @param   codePage Code page to be used in conversion.
  @returns Converted ansi string.
}
function WideStringToString (const ws: WideString; codePage: Word): AnsiString;
var
  l: integer;
begin
  if ws = '' then
    Result := ''
  else begin
    l := WideCharToMultiByte(codePage,
           WC_COMPOSITECHECK or WC_DISCARDNS or WC_SEPCHARS or WC_DEFAULTCHAR,
           @ws[1], -1, nil, 0, nil, nil);
    SetLength(Result, l-1);
    if l > 1 then
      WideCharToMultiByte(codePage,
        WC_COMPOSITECHECK or WC_DISCARDNS or WC_SEPCHARS or WC_DEFAULTCHAR,
        @ws[1], -1, @Result[1], l-1, nil, nil);
  end;
end; { WideStringToString }

{:Convers buffer of WideChars into UTF-8 encoded form. Target buffer must be
  pre-allocated and large enough (each WideChar will use at most three bytes
  in UTF-8 encoding).                                                            <br>
  RFC 2279 (http://www.ietf.org/rfc/rfc2279.txt) describes the conversion:       <br>
  $0000..$007F => $00..$7F                                                       <br>
  $0080..$07FF => 110[bit10..bit6] 10[bit5..bit0]                                <br>
  $0800..$FFFF => 1110[bit15..bit12] 10[bit11..bit6] 10[bit5..bit0]
  @param   unicodeBuf   Buffer of WideChars.
  @param   uniByteCount Size of unicodeBuf, in bytes.
  @param   utf8Buf      Pre-allocated buffer for UTF-8 encoded result.
  @returns Number of bytes used in utf8Buf buffer.
  @since   2.01
}
function WideCharBufToUTF8Buf(const unicodeBuf; uniByteCount: integer;
  var utf8Buf): integer;
var
  iwc: integer;
  pch: PChar;
  pwc: PWideChar;
12 3 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -