📄 adxbase.pas
字号:
(***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is TurboPower Async Professional
*
* The Initial Developer of the Original Code is
* TurboPower Software
*
* Portions created by the Initial Developer are Copyright (C) 1991-2002
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
*
* ***** END LICENSE BLOCK ***** *)
{*********************************************************}
{* ADXBASE.PAS 4.06 *}
{*********************************************************}
{* XML parser Base types and conversion routines *}
{*********************************************************}
{Global defines potentially affecting this unit}
{$I AWDEFINE.INC}
unit AdXBase;
interface
uses
classes,
OOMisc;
{===System functions=================================================}
type
TApdUcs4Char = Longint;
TApdUtf8Char = string[6];
DOMChar = WideChar;
PDOMChar = PWideChar;
{ Character encoding types}
TApdCharEncoding = (ceUnknown, ceUTF8);
{The TApdMemoryStream class is used to expose TMemoryStream's SetPointer
method.}
TApdMemoryStream = class(TMemoryStream)
public
procedure SetPointer(Ptr : Pointer; Size : Longint);
end;
TApdFileStream = class(TFileStream)
FFileName : string;
public
constructor CreateEx(Mode : Word; const FileName : string);
property Filename : string read FFileName;
end;
{ Utility methods }
function ApxPos(const aSubStr, aString : DOMString) : Integer;
function ApxRPos(const sSubStr, sTerm : DOMString) : Integer;
{ character conversion routines }
function ApxIso88591ToUcs4(aInCh : AnsiChar;
var aOutCh : TApdUcs4Char) : Boolean;
function ApxUcs4ToIso88591(aInCh : TApdUcs4Char;
var aOutCh : AnsiChar) : Boolean;
function ApxUcs4ToWideChar(const aInChar : TApdUcs4Char;
var aOutWS : DOMChar) : Boolean;
function ApxUtf16ToUcs4(aInChI,
aInChII : DOMChar;
var aOutCh : TApdUcs4Char;
var aBothUsed : Boolean) : Boolean;
function ApxUcs4ToUtf8(aInCh : TApdUcs4Char;
var aOutCh : TApdUtf8Char) : Boolean;
function ApxUtf8ToUcs4(const aInCh : TApdUtf8Char;
aBytes : Integer;
var aOutCh : TApdUcs4Char) : Boolean;
{ UTF specials }
function ApxGetLengthUtf8(const aCh : AnsiChar) : byte;
{ character classes }
function ApxIsBaseChar(aCh : TApdUcs4Char) : Boolean;
function ApxIsChar(const aCh : TApdUcs4Char) : Boolean;
function ApxIsCombiningChar(aCh : TApdUcs4Char) : Boolean;
function ApxIsDigit(aCh : TApdUcs4Char) : Boolean;
function ApxIsExtender(aCh : TApdUcs4Char) : Boolean;
function ApxIsIdeographic(aCh : TApdUcs4Char) : Boolean;
function ApxIsLetter(aCh : TApdUcs4Char) : Boolean;
function ApxIsNameChar(aCh : TApdUcs4Char) : Boolean;
function ApxIsNameCharFirst(aCh : TApdUcs4Char) : Boolean;
function ApxIsPubidChar(aCh : TApdUcs4Char) : Boolean;
function ApxIsSpace(aCh : TApdUcs4Char) : Boolean;
implementation
uses
sysutils;
{== Utility methods ==================================================}
function ApxPos(const aSubStr, aString : DOMString) : Integer;
begin
Result := AnsiPos(aSubStr, aString);
end;
{--------}
function ApxRPos(const sSubStr, sTerm : DOMString) : Integer;
var
cLast : DOMChar;
i, j : Integer;
begin
j := Length(sSubStr);
cLast := sSubStr[j];
for i := Length(sTerm) downto j do begin
if (sTerm[i] = cLast) and
(Copy(sTerm, i - j + 1, j) = sSubStr) then begin
Result := i - j + 1;
Exit;
end;
end;
Result := 0;
end;
{===character conversion routines====================================}
function ApxIso88591ToUcs4(aInCh : AnsiChar;
var aOutCh : TApdUcs4Char) : boolean;
begin
{Note: the conversion from ISO-8859-1 to UCS-4 is very simple: the
result is the original character}
aOutCh := ord(aInCh);
Result := true; {cannot fail}
end;
{--------}
function ApxUcs4ToIso88591(aInCh : TApdUcs4Char;
var aOutCh : AnsiChar) : Boolean;
begin
{Note: the conversion from UCS-4 to ISO-8859-1 is very simple: if
the character is contained in a byte, the result is the
original character; otherwise the conversion cannot be done}
aInCh := abs(aInCh);
if (($00 <= aInCh) and (aInCh <= $FF)) then begin
aOutCh := AnsiChar(aInCh and $FF);
Result := true;
end
else begin
Result := false;
aOutCh := #0;
end;
end;
{--------}
function ApxUcs4ToWideChar(const aInChar : TApdUcs4Char;
var aOutWS : DOMChar) : Boolean;
var
Temp : Longint;
begin
Temp := abs(aInChar);
if (Temp < $10000) then begin
aOutWS := DOMChar(Temp);
Result := True;
end else if (Temp <= $10FFFF) then begin
dec(Temp, $10000);
Temp := $DC00 or (Temp and $3FF);
Temp := $D800 or (Temp shr 10);
aOutWS := DOMChar(Temp);
Result := True;
end else begin
aOutWS := #0;
Result := False;
end;
end;
{--------}
function ApxUtf16ToUcs4(aInChI,
aInChII : DOMChar;
var aOutCh : TApdUcs4Char;
var aBothUsed : Boolean) : Boolean;
begin
aBothUsed := False;
if (aInChI < #$D800) or (aInChI > #$DFFF) then begin
aOutCh := Integer(aInChI);
Result := True;
end
else if (aInChI < #$DC00) and
((#$DC00 <= aInChII) and (aInChII <= #$DFFF)) then begin
aOutCh := ((integer(aInChI) and $3FF) shl 10) or
(integer(aInChII) and $3FF);
aBothUsed := True;
Result := True;
end
else begin
Result := False;
aOUtCh := 0;
end;
end;
{--------}
function ApxUcs4ToUtf8(aInCh : TApdUcs4Char;
var aOutCh : TApdUtf8Char) : Boolean;
begin
aInCh := abs(aInCh);
{if the UCS-4 value is $00 to $7f, no conversion is required}
if (aInCh < $80) then begin
aOutCh[0] := #1;
aOutCh[1] := AnsiChar(aInCh);
end
{if the UCS-4 value is $80 to $7ff, a two character string is
produced}
else if (aInCh < $800) then begin
aOutCh[0] := #2;
aOutCh[1] := AnsiChar($C0 or (aInCh shr 6));
aOutCh[2] := AnsiChar($80 or (aInCh and $3F));
end
{if the UCS-4 value is $800 to $ffff, a three character string is
produced}
else if (aInCh < $10000) then begin
aOutCh[0] := #3;
aOutCh[1] := AnsiChar($E0 or (aInCh shr 12));
aOutCh[2] := AnsiChar($80 or ((aInCh shr 6) and $3F));
aOutCh[3] := AnsiChar($80 or (aInCh and $3F));
end
{NOTE: the following if clauses will be very rarely used since the
majority of characters will be unicode characters: $0000 to
$FFFF}
{if the UCS-4 value is $10000 to $1fffff, a four character string
is produced}
else if (aInCh < $200000) then begin
aOutCh[0] := #4;
aOutCh[1] := AnsiChar($F0 or (aInCh shr 18));
aOutCh[2] := AnsiChar($80 or ((aInCh shr 12) and $3F));
aOutCh[3] := AnsiChar($80 or ((aInCh shr 6) and $3F));
aOutCh[4] := AnsiChar($80 or (aInCh and $3F));
end
{if the UCS-4 value is $200000 to $3ffffff, a five character
string is produced}
else if (aInCh < $4000000) then begin
aOutCh[0] := #5;
aOutCh[1] := AnsiChar($F8 or (aInCh shr 24));
aOutCh[2] := AnsiChar($80 or ((aInCh shr 18) and $3F));
aOutCh[3] := AnsiChar($80 or ((aInCh shr 12) and $3F));
aOutCh[4] := AnsiChar($80 or ((aInCh shr 6) and $3F));
aOutCh[5] := AnsiChar($80 or (aInCh and $3F));
end
{for all other UCS-4 values, a six character string is produced}
else begin
aOutCh[0] := #6;
aOutCh[1] := AnsiChar($FC or (aInCh shr 30));
aOutCh[2] := AnsiChar($80 or ((aInCh shr 24) and $3F));
aOutCh[3] := AnsiChar($80 or ((aInCh shr 18) and $3F));
aOutCh[4] := AnsiChar($80 or ((aInCh shr 12) and $3F));
aOutCh[5] := AnsiChar($80 or ((aInCh shr 6) and $3F));
aOutCh[6] := AnsiChar($80 or (aInCh and $3F));
end;
Result := True; {cannot fail}
end;
{--------}
function ApxUtf8ToUcs4(const aInCh : TApdUtf8Char;
aBytes : Integer;
var aOutCh : TApdUcs4Char) : Boolean;
var
InFirstByte : AnsiChar;
InCharLen : Integer;
i : Integer;
begin
InFirstByte := aInCh[1];
InCharLen := Length(aInCh);
{the length of the UTF-8 character cannot be zero and must match
that of the first ASCII character in the string}
if ((InCharLen = 0) or
(InCharLen <> aBytes)) then begin
Result := False;
aOutCh := 0;
Exit;
end;
{all subsequent characters must have the most significant bit set
and the next to most significant digit clear; we'll test for this
as we go along}
{get the bits from the first ASCII character}
if (InFirstByte <= #$7F) then
aOutCh := Ord(InFirstByte)
else if (InFirstByte <= #$DF) then
aOutCh := Ord(InFirstByte) and $1F
else if (InFirstByte <= #$EF) then
aOutCh := Ord(InFirstByte) and $0F
else if (InFirstByte <= #$F7) then
aOutCh := Ord(InFirstByte) and $07
else if (InFirstByte <= #$FB) then
aOutCh := Ord(InFirstByte) and $03
else
aOutCh := Ord(InFirstByte) and $01;
{get the bits from the remaining ASCII characters}
for i := 2 to InCharLen do begin
if ((Byte(aInCh[i]) and $C0) <> $80) then begin
Result := False;
aOutCh := 0;
Exit;
end;
aOutCh := (aOutCh shl 6) or (Byte(aInCh[i]) and $3F);
end;
{success}
Result := True;
end;
{====================================================================}
{===UTF specials=====================================================}
function ApxGetLengthUtf8(const aCh : AnsiChar) : Byte;
begin
if (aCh <= #$7F) then
Result := 1
else if (aCh <= #$BF) then
Result := 0 { $80--$BF is an error }
else if (aCh <= #$DF) then
Result := 2
else if (aCh <= #$EF) then
Result := 3
else if (aCh <= #$F7) then
Result := 4
else if (aCh <= #$FB) then
Result := 5
else if (aCh <= #$FD) then
Result := 6
else
Result := 0; { $FE, $FF is an error }
end;
{====================================================================}
{===character classes================================================}
function ApxIsBaseChar(aCh : TApdUcs4Char) : boolean;
begin
Result := (($0041 <= aCh) and (aCh <= $005A)) or
(($0061 <= aCh) and (aCh <= $007A)) or
(($00C0 <= aCh) and (aCh <= $00D6)) or
(($00D8 <= aCh) and (aCh <= $00F6)) or
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -