📄 jclunicode.pas
字号:
{**************************************************************************************************}
{ }
{ Project JEDI Code Library (JCL) }
{ }
{ The contents of this file are subject to the Mozilla Public License Version 1.1 (the "License"); }
{ you may not use this file except in compliance with the License. You may obtain a copy of the }
{ License at http://www.mozilla.org/MPL/ }
{ }
{ Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF }
{ ANY KIND, either express or implied. See the License for the specific language governing rights }
{ and limitations under the License. }
{ }
{ The Original Code is JclUnicode.pas. }
{ }
{ The Initial Developer of the Original Code is Mike Lischke (public att lischke-online dott de). }
{ Portions created by Mike Lischke are Copyright (C) 1999-2000 Mike Lischke. All Rights Reserved. }
{ }
{ Contributor(s): }
{ Marcel van Brakel }
{ Andreas Hausladen (ahuser) }
{ Mike Lischke }
{ Flier Lu (flier) }
{ Robert Marquardt (marquardt) }
{ Robert Rossmair (rrossmair) }
{ Olivier Sannier (obones) }
{ Matthias Thoma (mthoma) }
{ Petr Vones (pvones) }
{ }
{**************************************************************************************************}
{ }
{ Various Unicode related routines }
{ }
{**************************************************************************************************}
// Last modified: $Date: 2005/03/08 08:33:23 $
// For history see end of file
unit JclUnicode;
{$I jcl.inc}
// Copyright (c) 1999-2000 Mike Lischke (public att lischke-online dott de)
//
// 19-SEP-2003: (changes by Andreas Hausladen)
// - added OWN_WIDESTRING_MEMMGR for faster memory management in TWideStringList
// under Windows
// - fixed: TWideStringList.Destroy does not set OnChange and OnChanging to nil before calling Clear
//
//
// 29-MAR-2002: MT
// - WideNormalize now returns strings with normalization mode nfNone unchanged.
// - Bug fix in WideCompose: Raised exception when Result of WideComposeHangul was an
// empty string. (#0000044)
// - Bug fix in WideAdjustLineBreaks
// - Added Asserts where needed.
// - TWideStrings.IndexOfName now takes care of NormalizeForm as well.
// - TWideStrings.IndexOf now takes care of NormalizeForm as well.
// - TWideStringList.Find now uses the same NormalizationForm for the search string as it uses
// within the list itself.
//
// 29-NOV-2001:
// - bug fix
// 06-JUN-2001:
// - small changes
// 28-APR-2001:
// - bug fixes
// 05-APR-2001:
// - bug fixes
// 23-MAR-2001:
// - WideSameText
// - small changes
// 10-FEB-2001:
// - bug fix in StringToWideStringEx and WideStringToStringEx
// 05-FEB-2001:
// - TWideStrings.GetSeparatedText changed (no separator anymore after the last line)
// 29-JAN-2001:
// - PrepareUnicodeData
// - LoadInProgress critical section is now created at init time to avoid critical thread races
// - bug fixes
// 26-JAN-2001:
// - ExpandANSIString
// - TWideStrings.SaveUnicode is by default True now
// 20..21-JAN-2001:
// - StrUpperW, StrLowerW and StrTitleW removed because they potentially would need
// a reallocation to work correctly (use the WideString versions instead)
// - further improvements related to internal data
// - introduced TUnicodeBlock
// - CodeBlockFromChar improved
// 07-JAN-2001:
// optimized access to character properties, combining class etc.
// 06-JAN-2001:
// TWideStrings and TWideStringList improved
// APR-DEC 2000: versions 2.1 - 2.6
// - preparation for public release
// - additional conversion routines
// - JCL compliance
// - character properties unified
// - character properties data and lookup improvements
// - reworked Unicode data resource file
// - improved simple string comparison routines (StrCompW, StrLCompW etc., include surrogate fix)
// - special case folding data for language neutral case insensitive comparisons included
// - optimized decomposition
// - composition and normalization support
// - normalization conformance tests applied
// - bug fixes
// FEB-MAR 2000: version 2.0
// - Unicode regular expressions (URE) search class (TURESearch)
// - generic search engine base class for both the Boyer-Moore and the RE search class
// - whole word only search in UTBM, bug fixes in UTBM
// - string decomposition (including hangul)
// OCT/99 - JAN/2000: version 1.0
// - basic Unicode implementation, more than 100 WideString/UCS2 and UCS4 core functions
// - TWideStrings and TWideStringList classes
// - Unicode Tuned Boyer-Moore search class (TUTBMSearch)
// - low and high level Unicode/Wide* functions
// - low level Unicode UCS4 data import and functions
// - helper functions
//
// Version 2.9
// This unit contains routines and classes to manage and work with Unicode/WideString strings.
// You need Delphi 4 or higher to compile this code.
//
// Publicly available low level functions are all preceded by "Unicode..." (e.g.
// in UnicodeToUpper) while the high level functions use the Str... or Wide...
// naming scheme (e.g. StrLICompW and WideUpperCase).
//
// The normalization implementation in this unit has successfully and completely passed the
// official normative conformance testing as of Annex 9 in Technical Report #15
// (Unicode Standard Annex #15, http://www.unicode.org/unicode/reports/tr15, from 2000-08-31).
//
// Open issues:
// - Yet to do things in the URE class are:
// - check all character classes if they match correctly
// - optimize rebuild of DFA (build only when pattern changes)
// - set flag parameter of ExecuteURE
// - add \d any decimal digit
// \D any character that is not a decimal digit
// \s any whitespace character
// \S any character that is not a whitespace character
// \w any "word" character
// \W any "non-word" character
// - The wide string classes still compare text with functions provided by the
// particular system. This works usually fine under WinNT/W2K (although also
// there are limitations like maximum text lengths). Under Win9x conversions
// from and to MBCS are necessary which are bound to a particular locale and
// so very limited in general use. These comparisons should be changed so that
// the code in this unit is used.
interface
uses
{$IFDEF MSWINDOWS}
Windows,
{$ENDIF MSWINDOWS}
Classes,
JclBase;
{$IFNDEF FPC}
{$IFDEF MSWINDOWS}
{$DEFINE OWN_WIDESTRING_MEMMGR}
{$ENDIF MSWINDOWS}
{$ENDIF ~FPC}
{$IFDEF SUPPORTS_WIDESTRING}
const
// definitions of often used characters:
// Note: Use them only to test for one specific character, not to determine character
// classes (like white spaces), because Unicode often defines many code points
// that belong to the same class. For class tests your best option is to use the various
// UnicodeIs* functions.
WideNull = WideChar(#0);
WideTabulator = WideChar(#9);
WideSpace = WideChar(#32);
// logical line breaks
// (WideLF/WideLineFeed and WideCR/WideCarriageReturn are two names for the same value)
WideLF = WideChar(#10);
WideLineFeed = WideChar(#10);
WideVerticalTab = WideChar(#11);
WideFormFeed = WideChar(#12);
WideCR = WideChar(#13);
WideCarriageReturn = WideChar(#13);
// typed WideString constant holding the two character sequence CR + LF
WideCRLF: WideString = #13#10;
WideLineSeparator = WideChar($2028);
WideParagraphSeparator = WideChar($2029);
// byte order marks for Unicode files
// Unicode text files (in UTF-16 format) should contain the byte order mark $FEFF as first
// character to identify such a file clearly. Depending on the byte order of the system where
// the file was created the first 16 bit value reads either as $FEFF or, byte swapped, as $FFFE.
// NOTE(review): the LSB/MSB names presumably describe what a little endian reader sees - confirm.
BOM_LSB_FIRST = WideChar($FEFF);
BOM_MSB_FIRST = WideChar($FFFE);
type
// Unicode transformation formats (UTF) data types
// UTF-7 and UTF-8 work on single byte code units. They are declared as AnsiChar rather than
// Char because Char is a two byte WideChar on Unicode enabled compilers (Delphi 2009 and up),
// which would silently double the size of these code units. On compilers where Char is a
// single byte type this declaration is identical to the former "= Char".
PUTF7 = ^UTF7;
UTF7 = AnsiChar;
PUTF8 = ^UTF8;
UTF8 = AnsiChar;
// UTF-16 code unit (two bytes)
PUTF16 = ^UTF16;
UTF16 = WideChar;
// UTF-32 code unit (four bytes)
PUTF32 = ^UTF32;
UTF32 = Cardinal;
// Universal Character Set (UCS) data types
// UCS4 holds a code point in four bytes, UCS2 in two bytes
PUCS4 = ^UCS4;
UCS4 = Cardinal;
PUCS2 = PWideChar;
UCS2 = WideChar;
// dynamic arrays of UCS2 and UCS4 elements
TUCS2Array = array of UCS2;
TUCS4Array = array of UCS4;
// various predefined or otherwise useful character property categories
// The members are grouped into normative, informative, bidirectional and self defined
// categories. The declaration order determines the ordinal values, so new members must not be
// inserted between existing ones if ordinal values are stored anywhere.
TCharacterCategory = (
// normative categories
ccLetterUppercase,
ccLetterLowercase,
ccLetterTitlecase,
ccMarkNonSpacing,
ccMarkSpacingCombining,
ccMarkEnclosing,
ccNumberDecimalDigit,
ccNumberLetter,
ccNumberOther,
ccSeparatorSpace,
ccSeparatorLine,
ccSeparatorParagraph,
ccOtherControl,
ccOtherFormat,
ccOtherSurrogate,
ccOtherPrivate,
ccOtherUnassigned,
// informative categories
ccLetterModifier,
ccLetterOther,
ccPunctuationConnector,
ccPunctuationDash,
ccPunctuationOpen,
ccPunctuationClose,
ccPunctuationInitialQuote,
ccPunctuationFinalQuote,
ccPunctuationOther,
ccSymbolMath,
ccSymbolCurrency,
ccSymbolModifier,
ccSymbolOther,
// bidirectional categories
ccLeftToRight,
ccLeftToRightEmbedding,
ccLeftToRightOverride,
ccRightToLeft,
ccRightToLeftArabic,
ccRightToLeftEmbedding,
// spelled with a capital "O" to match ccLeftToRightOverride; identifiers are case
// insensitive, so code using the former spelling ccRightToLeftoverride still compiles
ccRightToLeftOverride,
ccPopDirectionalFormat,
ccEuropeanNumber,
ccEuropeanNumberSeparator,
ccEuropeanNumberTerminator,
ccArabicNumber,
ccCommonNumberSeparator,
ccBoundaryNeutral,
ccSegmentSeparator, // this includes tab and vertical tab
ccWhiteSpace,
ccOtherNeutrals,
// self defined categories, they do not appear in the Unicode data file
ccComposed, // can be decomposed
ccNonBreaking,
ccSymmetric, // has left and right forms
ccHexDigit,
ccQuotationMark,
ccMirroring,
ccSpaceOther,
ccAssigned // means there is a definition in the Unicode standard
);
// a combination of any of the 55 categories declared above
TCharacterCategories = set of TCharacterCategory;
// four forms of normalization are defined (nfC, nfD, nfKC and nfKD);
// nfNone is an additional value meaning that no normalization is applied at all:
TNormalizationForm = (
nfNone, // do not normalize
nfC, // canonical decomposition followed by canonical composition (this is most often used)
nfD, // canonical decomposition
nfKC, // compatibility decomposition followed by a canonical composition
nfKD // compatibility decomposition
);
// An Unicode block usually corresponds to a particular language script but
// can also represent special characters, musical symbols and the like.
TUnicodeBlock = (
ubUndefined,
ubBasicLatin,
ubLatin1Supplement,
ubLatinExtendedA,
ubLatinExtendedB,
ubIPAExtensions,
ubSpacingModifierLetters,
ubCombiningDiacriticalMarks,
ubGreek,
ubCyrillic,
ubArmenian,
ubHebrew,
ubArabic,
ubSyriac,
ubThaana,
ubDevanagari,
ubBengali,
ubGurmukhi,
ubGujarati,
ubOriya,
ubTamil,
ubTelugu,
ubKannada,
ubMalayalam,
ubSinhala,
ubThai,
ubLao,
ubTibetan,
ubMyanmar,
ubGeorgian,
ubHangulJamo,
ubEthiopic,
ubCherokee,
ubUnifiedCanadianAboriginalSyllabics,
ubOgham,
ubRunic,
ubKhmer,
ubMongolian,
ubLatinExtendedAdditional,
ubGreekExtended,
ubGeneralPunctuation,
ubSuperscriptsAndSubscripts,
ubCurrencySymbols,
ubCombiningMarksForSymbols,
ubLetterlikeSymbols,
ubNumberForms,
ubArrows,
ubMathematicalOperators,
ubMiscellaneousTechnical,
ubControlPictures,
ubOpticalCharacterRecognition,
ubEnclosedAlphanumerics,
ubBoxDrawing,
ubBlockElements,
ubGeometricShapes,
ubMiscellaneousSymbols,
ubDingbats,
ubBraillePatterns,
ubCJKRadicalsSupplement,
ubKangxiRadicals,
ubIdeographicDescriptionCharacters,
ubCJKSymbolsAndPunctuation,
ubHiragana,
ubKatakana,
ubBopomofo,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -