📄 synachar.pas
字号:
{==============================================================================|
| Project : Ararat Synapse | 005.001.000 |
|==============================================================================|
| Content: Charset conversion support |
|==============================================================================|
| Copyright (c)1999-2004, Lukas Gebauer |
| All rights reserved. |
| |
| Redistribution and use in source and binary forms, with or without |
| modification, are permitted provided that the following conditions are met: |
| |
| Redistributions of source code must retain the above copyright notice, this |
| list of conditions and the following disclaimer. |
| |
| Redistributions in binary form must reproduce the above copyright notice, |
| this list of conditions and the following disclaimer in the documentation |
| and/or other materials provided with the distribution. |
| |
| Neither the name of Lukas Gebauer nor the names of its contributors may |
| be used to endorse or promote products derived from this software without |
| specific prior written permission. |
| |
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR |
| ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH |
| DAMAGE. |
|==============================================================================|
| The Initial Developer of the Original Code is Lukas Gebauer (Czech Republic).|
| Portions created by Lukas Gebauer are Copyright (c)2000-2004. |
| All Rights Reserved. |
|==============================================================================|
| Contributor(s): |
|==============================================================================|
| History: see HISTORY.HTM from distribution package |
| (Found at URL: http://www.ararat.cz/synapse/) |
|==============================================================================}
{: @abstract(Charset conversion support)
This unit contains routines for a lot of charset conversions.
It uses built-in conversion tables or the external Iconv library. Iconv is used
when the needed conversion is known by the Iconv library. When the Iconv library
is not found, or Iconv does not know the requested conversion, the internal
routines are used for the conversion. (You can disable Iconv support from your
program too!)
The internal routines know all major charsets for Europe and America. For
East-Asian charsets you must use the Iconv library!
}
{$IFDEF FPC}
{$MODE DELPHI}
{$ENDIF}
{$Q-}
{$H+}
unit synachar;
interface
type
{:Type with all supported charsets.
 The members listed before the marker comment inside the declaration are the
 ones not flagged as Iconv-only; everything after the marker is supported by
 the Iconv library only.}
TMimeChar = (ISO_8859_1, ISO_8859_2, ISO_8859_3, ISO_8859_4, ISO_8859_5,
ISO_8859_6, ISO_8859_7, ISO_8859_8, ISO_8859_9, ISO_8859_10, ISO_8859_13,
ISO_8859_14, ISO_8859_15, CP1250, CP1251, CP1252, CP1253, CP1254, CP1255,
CP1256, CP1257, CP1258, KOI8_R, CP895, CP852, UCS_2, UCS_4, UTF_8, UTF_7,
UTF_7mod, UCS_2LE, UCS_4LE,
//the following members are supported by Iconv only...
UTF_16, UTF_16LE, UTF_32, UTF_32LE, C99, JAVA, ISO_8859_16, KOI8_U, KOI8_RU,
CP862, CP866, MAC, MACCE, MACICE, MACCRO, MACRO, MACCYR, MACUK, MACGR, MACTU,
MACHEB, MACAR, MACTH, ROMAN8, NEXTSTEP, ARMASCII, GEORGIAN_AC, GEORGIAN_PS,
KOI8_T, MULELAO, CP1133, TIS620, CP874, VISCII, TCVN, ISO_IR_14, JIS_X0201,
JIS_X0208, JIS_X0212, GB1988_80, GB2312_80, ISO_IR_165, ISO_IR_149, EUC_JP,
SHIFT_JIS, CP932, ISO_2022_JP, ISO_2022_JP1, ISO_2022_JP2, GB2312, CP936,
GB18030, ISO_2022_CN, ISO_2022_CNE, HZ, EUC_TW, BIG5, CP950, BIG5_HKSCS,
EUC_KR, CP949, CP1361, ISO_2022_KR, CP737, CP775, CP853, CP855, CP857,
CP858, CP860, CP861, CP863, CP864, CP865, CP869, CP1125);
{:Set of any charsets (any combination of @link(TMimeChar) members).}
TMimeSetChar = set of TMimeChar;
const
{:Set of charsets supported by Iconv library only.
 These are the members of TMimeChar declared from UTF_16 up to and including
 CP1125 (the last member), so the set is written as a single subrange.}
IconvOnlyChars: set of TMimeChar = [UTF_16..CP1125];
{:Set of charsets supported by internal routines only.}
NoIconvChars: set of TMimeChar = [CP895, UTF_7mod];
{:Null character replacement table. (Usable for disabling character
 replacement.) It holds a single zero entry.}
Replace_None: array[0..0] of Word =
(0);
{:Character replacement table for removing Czech diacritics.
 The 60 entries are laid out two per line: an accented letter followed by the
 plain ASCII letter. NOTE(review): presumably the consumer reads them as
 (source, replacement) pairs - confirm against the TransformTable handling in
 the implementation.}
Replace_Czech: array[0..59] of Word =
(
$00E1, $0061, // a-acute -> a
$010D, $0063, // c-caron -> c
$010F, $0064, // d-caron -> d
$010E, $0044, // D-caron -> D (uppercase form, listed next to its lowercase)
$00E9, $0065, // e-acute -> e
$011B, $0065, // e-caron -> e
$00ED, $0069, // i-acute -> i
$0148, $006E, // n-caron -> n
$00F3, $006F, // o-acute -> o
$0159, $0072, // r-caron -> r
$0161, $0073, // s-caron -> s
$0165, $0074, // t-caron -> t
$00FA, $0075, // u-acute -> u
$016F, $0075, // u-ring -> u
$00FD, $0079, // y-acute -> y
$017E, $007A, // z-caron -> z
$00C1, $0041, // A-acute -> A
$010C, $0043, // C-caron -> C
$00C9, $0045, // E-acute -> E
$011A, $0045, // E-caron -> E
$00CD, $0049, // I-acute -> I
$0147, $004E, // N-caron -> N
$00D3, $004F, // O-acute -> O
$0158, $0052, // R-caron -> R
$0160, $0053, // S-caron -> S
$0164, $0054, // T-caron -> T
$00DA, $0055, // U-acute -> U
$016E, $0055, // U-ring -> U
$00DD, $0059, // Y-acute -> Y
$017D, $005A // Z-caron -> Z
);
var
{:Set this to @true to disable Iconv support globally. The initial value is
 @false, which leaves Iconv support enabled.}
DisableIconv: Boolean = False;
{==============================================================================}
{:Convert Value from one charset to another. See: @link(CharsetConversionEx)}
function CharsetConversion(const Value: AnsiString; CharFrom: TMimeChar;
CharTo: TMimeChar): AnsiString;
{:Convert Value from one charset to another with additional character
 conversion given by TransformTable.
 See: @link(Replace_None) and @link(Replace_Czech)}
function CharsetConversionEx(const Value: AnsiString; CharFrom: TMimeChar;
CharTo: TMimeChar; const TransformTable: array of Word): AnsiString;
{:Convert Value from one charset to another with additional character conversion.
 This function is similar to @link(CharsetConversionEx), but you can disable
 the transliteration of unconvertible characters.}
function CharsetConversionTrans(Value: AnsiString; CharFrom: TMimeChar;
CharTo: TMimeChar; const TransformTable: array of Word; Translit: Boolean): AnsiString;
{:Returns the charset used by the operating system.}
function GetCurCP: TMimeChar;
{:Returns the charset used by the operating system as its OEM charset (in a
 Windows DOS box, for example).}
function GetCurOEMCP: TMimeChar;
{:Converts a string holding a charset name to TMimeChar.}
function GetCPFromID(Value: AnsiString): TMimeChar;
{:Converts a TMimeChar to a string holding the name of the charset.}
function GetIDFromCP(Value: TMimeChar): AnsiString;
{:Returns @true when Value needs to be converted (it is not 7-bit ASCII).}
function NeedCharsetConversion(const Value: AnsiString): Boolean;
{:Finds the best target charset from a set of TMimeChars, i.e. the one with the
 minimal count of unconvertible characters.}
function IdealCharsetCoding(const Value: AnsiString; CharFrom: TMimeChar;
CharTo: TMimeSetChar): TMimeChar;
{:Returns the BOM (Byte Order Mark) for the given unicode charset.}
function GetBOM(Value: TMimeChar): AnsiString;
{:Converts a binary string with unicode content to a WideString.}
function StringToWide(const Value: AnsiString): WideString;
{:Converts a WideString to a binary string with unicode content.}
function WideToString(const Value: WideString): AnsiString;
{==============================================================================}
implementation
uses
{$IFDEF LINUX}
Libc,
{$ELSE}
Windows,
{$ENDIF}
SysUtils,
synautil, synacode, synaicnv;
//character transcoding tables X to UCS-2
{
//dummy table
$0080, $0081, $0082, $0083, $0084, $0085, $0086, $0087,
$0088, $0089, $008A, $008B, $008C, $008D, $008E, $008F,
$0090, $0091, $0092, $0093, $0094, $0095, $0096, $0097,
$0098, $0099, $009A, $009B, $009C, $009D, $009E, $009F,
$00A0, $00A1, $00A2, $00A3, $00A4, $00A5, $00A6, $00A7,
$00A8, $00A9, $00AA, $00AB, $00AC, $00AD, $00AE, $00AF,
$00B0, $00B1, $00B2, $00B3, $00B4, $00B5, $00B6, $00B7,
$00B8, $00B9, $00BA, $00BB, $00BC, $00BD, $00BE, $00BF,
$00C0, $00C1, $00C2, $00C3, $00C4, $00C5, $00C6, $00C7,
$00C8, $00C9, $00CA, $00CB, $00CC, $00CD, $00CE, $00CF,
$00D0, $00D1, $00D2, $00D3, $00D4, $00D5, $00D6, $00D7,
$00D8, $00D9, $00DA, $00DB, $00DC, $00DD, $00DE, $00DF,
$00E0, $00E1, $00E2, $00E3, $00E4, $00E5, $00E6, $00E7,
$00E8, $00E9, $00EA, $00EB, $00EC, $00ED, $00EE, $00EF,
$00F0, $00F1, $00F2, $00F3, $00F4, $00F5, $00F6, $00F7,
$00F8, $00F9, $00FA, $00FB, $00FC, $00FD, $00FE, $00FF
}
const
{Latin-1 (ISO 8859-1)
Danish, Dutch, English, Faeroese, Finnish, French, German, Icelandic,
Irish, Italian, Norwegian, Portuguese, Spanish and Swedish.
The table is indexed by the byte value 128..255 and holds one 16-bit code per
byte. Every entry equals its own index, so for this charset the table is an
identity mapping.
}
CharISO_8859_1: array[128..255] of Word =
(
$0080, $0081, $0082, $0083, $0084, $0085, $0086, $0087,
$0088, $0089, $008A, $008B, $008C, $008D, $008E, $008F,
$0090, $0091, $0092, $0093, $0094, $0095, $0096, $0097,
$0098, $0099, $009A, $009B, $009C, $009D, $009E, $009F,
$00A0, $00A1, $00A2, $00A3, $00A4, $00A5, $00A6, $00A7,
$00A8, $00A9, $00AA, $00AB, $00AC, $00AD, $00AE, $00AF,
$00B0, $00B1, $00B2, $00B3, $00B4, $00B5, $00B6, $00B7,
$00B8, $00B9, $00BA, $00BB, $00BC, $00BD, $00BE, $00BF,
$00C0, $00C1, $00C2, $00C3, $00C4, $00C5, $00C6, $00C7,
$00C8, $00C9, $00CA, $00CB, $00CC, $00CD, $00CE, $00CF,
$00D0, $00D1, $00D2, $00D3, $00D4, $00D5, $00D6, $00D7,
$00D8, $00D9, $00DA, $00DB, $00DC, $00DD, $00DE, $00DF,
$00E0, $00E1, $00E2, $00E3, $00E4, $00E5, $00E6, $00E7,
$00E8, $00E9, $00EA, $00EB, $00EC, $00ED, $00EE, $00EF,
$00F0, $00F1, $00F2, $00F3, $00F4, $00F5, $00F6, $00F7,
$00F8, $00F9, $00FA, $00FB, $00FC, $00FD, $00FE, $00FF
);
{Latin-2 (ISO 8859-2)
Albanian, Czech, English, German, Hungarian, Polish, Romanian,
Serbo-Croatian, Slovak, Slovene and Swedish.
The table is indexed by the byte value 128..255 and holds one 16-bit code per
byte. Entries 128..159 are identity-mapped; the charset-specific letters
start at index 160.
}
CharISO_8859_2: array[128..255] of Word =
(
$0080, $0081, $0082, $0083, $0084, $0085, $0086, $0087,
$0088, $0089, $008A, $008B, $008C, $008D, $008E, $008F,
$0090, $0091, $0092, $0093, $0094, $0095, $0096, $0097,
$0098, $0099, $009A, $009B, $009C, $009D, $009E, $009F,
$00A0, $0104, $02D8, $0141, $00A4, $013D, $015A, $00A7,
$00A8, $0160, $015E, $0164, $0179, $00AD, $017D, $017B,
$00B0, $0105, $02DB, $0142, $00B4, $013E, $015B, $02C7,
$00B8, $0161, $015F, $0165, $017A, $02DD, $017E, $017C,
$0154, $00C1, $00C2, $0102, $00C4, $0139, $0106, $00C7,
$010C, $00C9, $0118, $00CB, $011A, $00CD, $00CE, $010E,
$0110, $0143, $0147, $00D3, $00D4, $0150, $00D6, $00D7,
$0158, $016E, $00DA, $0170, $00DC, $00DD, $0162, $00DF,
$0155, $00E1, $00E2, $0103, $00E4, $013A, $0107, $00E7,
$010D, $00E9, $0119, $00EB, $011B, $00ED, $00EE, $010F,
$0111, $0144, $0148, $00F3, $00F4, $0151, $00F6, $00F7,
$0159, $016F, $00FA, $0171, $00FC, $00FD, $0163, $02D9
);
{Latin-3 (ISO 8859-3)
Afrikaans, Catalan, English, Esperanto, French, Galician,
German, Italian, Maltese and Turkish.
The table is indexed by the byte value 128..255 and holds one 16-bit code per
byte. Entries 128..159 are identity-mapped. Byte positions that have no
character assigned in this charset hold the value FFFD (hex), the Unicode
replacement character.
NOTE(review): how the conversion routines treat FFFD is not visible in this
part of the file - confirm in the implementation.
}
CharISO_8859_3: array[128..255] of Word =
(
$0080, $0081, $0082, $0083, $0084, $0085, $0086, $0087,
$0088, $0089, $008A, $008B, $008C, $008D, $008E, $008F,
$0090, $0091, $0092, $0093, $0094, $0095, $0096, $0097,
$0098, $0099, $009A, $009B, $009C, $009D, $009E, $009F,
$00A0, $0126, $02D8, $00A3, $00A4, $FFFD, $0124, $00A7,
$00A8, $0130, $015E, $011E, $0134, $00AD, $FFFD, $017B,
$00B0, $0127, $00B2, $00B3, $00B4, $00B5, $0125, $00B7,
$00B8, $0131, $015F, $011F, $0135, $00BD, $FFFD, $017C,
$00C0, $00C1, $00C2, $FFFD, $00C4, $010A, $0108, $00C7,
$00C8, $00C9, $00CA, $00CB, $00CC, $00CD, $00CE, $00CF,
$FFFD, $00D1, $00D2, $00D3, $00D4, $0120, $00D6, $00D7,
$011C, $00D9, $00DA, $00DB, $00DC, $016C, $015C, $00DF,
$00E0, $00E1, $00E2, $FFFD, $00E4, $010B, $0109, $00E7,
$00E8, $00E9, $00EA, $00EB, $00EC, $00ED, $00EE, $00EF,
$FFFD, $00F1, $00F2, $00F3, $00F4, $0121, $00F6, $00F7,
$011D, $00F9, $00FA, $00FB, $00FC, $016D, $015D, $02D9
);
{Latin-4 (ISO 8859-4)
Danish, English, Estonian, Finnish, German, Greenlandic,
Lappish, Latvian, Lithuanian, Norwegian and Swedish.
The table is indexed by the byte value 128..255 and holds one 16-bit code per
byte. Entries 128..159 are identity-mapped; the charset-specific letters
start at index 160.
}
CharISO_8859_4: array[128..255] of Word =
(
$0080, $0081, $0082, $0083, $0084, $0085, $0086, $0087,
$0088, $0089, $008A, $008B, $008C, $008D, $008E, $008F,
$0090, $0091, $0092, $0093, $0094, $0095, $0096, $0097,
$0098, $0099, $009A, $009B, $009C, $009D, $009E, $009F,
$00A0, $0104, $0138, $0156, $00A4, $0128, $013B, $00A7,
$00A8, $0160, $0112, $0122, $0166, $00AD, $017D, $00AF,
$00B0, $0105, $02DB, $0157, $00B4, $0129, $013C, $02C7,
$00B8, $0161, $0113, $0123, $0167, $014A, $017E, $014B,
$0100, $00C1, $00C2, $00C3, $00C4, $00C5, $00C6, $012E,
$010C, $00C9, $0118, $00CB, $0116, $00CD, $00CE, $012A,
$0110, $0145, $014C, $0136, $00D4, $00D5, $00D6, $00D7,
$00D8, $0172, $00DA, $00DB, $00DC, $0168, $016A, $00DF,
$0101, $00E1, $00E2, $00E3, $00E4, $00E5, $00E6, $012F,
$010D, $00E9, $0119, $00EB, $0117, $00ED, $00EE, $012B,
$0111, $0146, $014D, $0137, $00F4, $00F5, $00F6, $00F7,
$00F8, $0173, $00FA, $00FB, $00FC, $0169, $016B, $02D9
);
{CYRILLIC (ISO 8859-5)
Bulgarian, Belarusian, English, Macedonian, Russian,
Serbo-Croatian and Ukrainian.
The table is indexed by the byte value 128..255 and holds one 16-bit code per
byte. Entries 128..159 are identity-mapped; from index 160 on the entries are
mostly codes in the 0401..045F (hex) range.
}
CharISO_8859_5: array[128..255] of Word =
(
$0080, $0081, $0082, $0083, $0084, $0085, $0086, $0087,
$0088, $0089, $008A, $008B, $008C, $008D, $008E, $008F,
$0090, $0091, $0092, $0093, $0094, $0095, $0096, $0097,
$0098, $0099, $009A, $009B, $009C, $009D, $009E, $009F,
$00A0, $0401, $0402, $0403, $0404, $0405, $0406, $0407,
$0408, $0409, $040A, $040B, $040C, $00AD, $040E, $040F,
$0410, $0411, $0412, $0413, $0414, $0415, $0416, $0417,
$0418, $0419, $041A, $041B, $041C, $041D, $041E, $041F,
$0420, $0421, $0422, $0423, $0424, $0425, $0426, $0427,
$0428, $0429, $042A, $042B, $042C, $042D, $042E, $042F,
$0430, $0431, $0432, $0433, $0434, $0435, $0436, $0437,
$0438, $0439, $043A, $043B, $043C, $043D, $043E, $043F,
$0440, $0441, $0442, $0443, $0444, $0445, $0446, $0447,
$0448, $0449, $044A, $044B, $044C, $044D, $044E, $044F,
$2116, $0451, $0452, $0453, $0454, $0455, $0456, $0457,
$0458, $0459, $045A, $045B, $045C, $00A7, $045E, $045F
);
{ARABIC
}
CharISO_8859_6: array[128..255] of Word =
(
$0080, $0081, $0082, $0083, $0084, $0085, $0086, $0087,
$0088, $0089, $008A, $008B, $008C, $008D, $008E, $008F,
$0090, $0091, $0092, $0093, $0094, $0095, $0096, $0097,
$0098, $0099, $009A, $009B, $009C, $009D, $009E, $009F,
$00A0, $FFFD, $FFFD, $FFFD, $00A4, $FFFD, $FFFD, $FFFD,
$FFFD, $FFFD, $FFFD, $FFFD, $060C, $00AD, $FFFD, $FFFD,
$FFFD, $FFFD, $FFFD, $FFFD, $FFFD, $FFFD, $FFFD, $FFFD,
$FFFD, $FFFD, $FFFD, $061B, $FFFD, $FFFD, $FFFD, $061F,
$FFFD, $0621, $0622, $0623, $0624, $0625, $0626, $0627,
$0628, $0629, $062A, $062B, $062C, $062D, $062E, $062F,
$0630, $0631, $0632, $0633, $0634, $0635, $0636, $0637,
$0638, $0639, $063A, $FFFD, $FFFD, $FFFD, $FFFD, $FFFD,
$0640, $0641, $0642, $0643, $0644, $0645, $0646, $0647,
$0648, $0649, $064A, $064B, $064C, $064D, $064E, $064F,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -