// File: scim_pinyin.h
/** @file scim_pinyin.h
* @brief the definitions of pinyin related classes and structs.
*/
/*
* Smart Chinese Input Method
*
* Copyright (c) 2002 James Su <suzhe@turbolinux.com.cn>
*
* $Id: scim_pinyin.h,v 1.2 2004/07/17 07:05:32 Lu Mu Exp $
*
*/
#if !defined (__SCIM_PINYIN_H)
#define __SCIM_PINYIN_H
//#include "config.h"
#if defined (HAVE_HASH_MAP)
#include <hash_map>
#elif defined (HAVE_EXT_HASH_MAP)
#include <ext/hash_map>
#else
#include <map>
#endif
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#ifndef X86
#include <algo.h>
#endif
#include <stdint.h>
#include <linux/types.h>
/*
#ifdef __STDC_ISO_10646__
typedef wchar_t ucs4_t;
#else
typedef uint32 ucs4_t;
#endif
*/
/* Short names for the <stdint.h> fixed-width unsigned integers. */
typedef uint16_t uint16;
typedef uint32_t uint32;
typedef uint64_t uint64;

/*
 * Character unit used for wide (Chinese) text in this header.
 * It is an unsigned short here; the wchar_t / uint32 variants in the
 * commented-out section just above are disabled.
 * NOTE(review): std::basic_string<unsigned short> depends on the standard
 * library providing char_traits for a non-character type -- confirm this
 * on every toolchain the project targets.
 */
typedef unsigned short ucs4_t;

/* Narrow (byte) string and wide string built on ucs4_t. */
typedef std::string String;
typedef std::basic_string<ucs4_t> WideString;

/* NOTE(review): presumably the maximum length of one pinyin key string --
 * the macro is not used in the visible part of this header; confirm. */
#define SCIM_PINYIN_KEY_MAXLEN 7

/*
 * All-ones frequency value. The expression has type int (value -1) and
 * becomes the maximum value when converted to an unsigned type such as
 * uint32.
 */
#define SCIM_MAX_CHAR_FREQUENCY (~0)
//using namespace scim;
/*
class PinyinError: public Exception {
public:
PinyinError (const String& what_arg)
: Exception (String("Pinyin: ") + what_arg) { }
};
*/
// ---------------------------------------------------------------------
// Forward declarations, so the typedefs and extern tables below can name
// these types before their full definitions appear.
// ---------------------------------------------------------------------

// Plain data structs.
struct PinyinToken;
struct PinyinParsedKey;
struct PinyinCustomSettings;

// Core classes.
class PinyinKey;
class PinyinEntry;
class PinyinTable;
class PinyinValidator;

// Comparison classes for PinyinKey.
class PinyinKeyLessThan;
class PinyinKeyEqualTo;
class PinyinKeyExactLessThan;
class PinyinKeyExactEqualTo;

// Pair and container aliases.
typedef std::pair<ucs4_t, uint32> CharFrequencyPair;
typedef std::vector<ucs4_t> CharVector;
typedef std::vector<PinyinKey> PinyinKeyVector;
typedef std::vector<PinyinParsedKey> PinyinParsedKeyVector;

// Token tables; PinyinKey indexes them by its initial / final value.
// They are only declared here -- the definitions are not in this header.
extern const PinyinToken scim_pinyin_initials [];
extern const PinyinToken scim_pinyin_finals [];

// Default settings and default validator, likewise defined elsewhere.
extern const PinyinCustomSettings scim_default_custom_settings;
extern const PinyinValidator scim_default_pinyin_validator;
/**
 * Identifiers of the pinyin initial element.
 *
 * A pinyin key is made of three tokens:
 *   Initial -- such as B P M F D T N L etc.
 *   Final   -- such as A O E I U V etc.
 *   Tone    -- can be 1, 2, 3, 4 and 5.
 *
 * The numeric values are spelled out explicitly because PinyinKey stores
 * them in a 6-bit field and uses them as an index into
 * scim_pinyin_initials; do not renumber them casually.
 */
enum PinyinInitial
{
    SCIM_PINYIN_ZeroInitial   =  0,   /**< zero initial; indicates an invalid initial */
    SCIM_PINYIN_Bo            =  1,
    SCIM_PINYIN_Ci            =  2,
    SCIM_PINYIN_Chi           =  3,
    SCIM_PINYIN_De            =  4,
    SCIM_PINYIN_Fo            =  5,
    SCIM_PINYIN_Ge            =  6,
    SCIM_PINYIN_He            =  7,
    SCIM_PINYIN_Ji            =  8,
    SCIM_PINYIN_Ke            =  9,
    SCIM_PINYIN_Le            = 10,
    SCIM_PINYIN_Mo            = 11,
    SCIM_PINYIN_Ne            = 12,
    SCIM_PINYIN_Po            = 13,
    SCIM_PINYIN_Qi            = 14,
    SCIM_PINYIN_Ri            = 15,
    SCIM_PINYIN_Si            = 16,
    SCIM_PINYIN_Shi           = 17,
    SCIM_PINYIN_Te            = 18,
    SCIM_PINYIN_Wo            = 19,
    SCIM_PINYIN_Xi            = 20,
    SCIM_PINYIN_Yi            = 21,
    SCIM_PINYIN_Zi            = 22,
    SCIM_PINYIN_Zhi           = 23,

    SCIM_PINYIN_LastInitial   = SCIM_PINYIN_Zhi,             /**< the last initial */
    SCIM_PINYIN_InitialNumber = SCIM_PINYIN_LastInitial + 1  /**< count of values, zero initial included */
};
/**
 * Identifiers of the pinyin final element.
 *
 * The numeric values are spelled out explicitly because PinyinKey stores
 * them in a 6-bit field and uses them as an index into
 * scim_pinyin_finals; do not renumber them casually.
 */
enum PinyinFinal
{
    SCIM_PINYIN_ZeroFinal   =  0,   /**< zero final; indicates an invalid final */
    SCIM_PINYIN_A           =  1,
    SCIM_PINYIN_Ai          =  2,
    SCIM_PINYIN_An          =  3,
    SCIM_PINYIN_Ang         =  4,
    SCIM_PINYIN_Ao          =  5,
    SCIM_PINYIN_E           =  6,
    SCIM_PINYIN_Ei          =  7,
    SCIM_PINYIN_En          =  8,
    SCIM_PINYIN_Eng         =  9,
    SCIM_PINYIN_Er          = 10,
    SCIM_PINYIN_I           = 11,
    SCIM_PINYIN_Ia          = 12,
    SCIM_PINYIN_Ian         = 13,
    SCIM_PINYIN_Iang        = 14,
    SCIM_PINYIN_Iao         = 15,
    SCIM_PINYIN_Ie          = 16,
    SCIM_PINYIN_In          = 17,
    SCIM_PINYIN_Ing         = 18,
    SCIM_PINYIN_Iong        = 19,
    SCIM_PINYIN_Iou         = 20,
    SCIM_PINYIN_Iu          = 21,
    SCIM_PINYIN_Ng          = 22,
    SCIM_PINYIN_O           = 23,
    SCIM_PINYIN_Ong         = 24,
    SCIM_PINYIN_Ou          = 25,
    SCIM_PINYIN_U           = 26,
    SCIM_PINYIN_Ua          = 27,
    SCIM_PINYIN_Uai         = 28,
    SCIM_PINYIN_Uan         = 29,
    SCIM_PINYIN_Uang        = 30,
    SCIM_PINYIN_Ue          = 31,
    SCIM_PINYIN_Uei         = 32,
    SCIM_PINYIN_Uen         = 33,
    SCIM_PINYIN_Ueng        = 34,
    SCIM_PINYIN_Ui          = 35,
    SCIM_PINYIN_Un          = 36,
    SCIM_PINYIN_Uo          = 37,
    SCIM_PINYIN_V           = 38,
    SCIM_PINYIN_Van         = 39,
    SCIM_PINYIN_Ve          = 40,
    SCIM_PINYIN_Vn          = 41,

    SCIM_PINYIN_LastFinal   = SCIM_PINYIN_Vn,            /**< the last final */
    SCIM_PINYIN_FinalNumber = SCIM_PINYIN_LastFinal + 1  /**< count of values, zero final included */
};
/**
 * Identifiers of the pinyin tone element.
 *
 * PinyinKey stores these values in a 4-bit field.
 */
enum PinyinTone
{
    SCIM_PINYIN_ZeroTone   = 0,   /**< zero tone; this will be matched with all other tones */
    SCIM_PINYIN_First      = 1,
    SCIM_PINYIN_Second     = 2,
    SCIM_PINYIN_Third      = 3,
    SCIM_PINYIN_Fourth     = 4,
    SCIM_PINYIN_Fifth      = 5,

    SCIM_PINYIN_LastTone   = SCIM_PINYIN_Fifth,        /**< the last tone */
    SCIM_PINYIN_ToneNumber = SCIM_PINYIN_LastTone + 1  /**< count of values, zero tone included */
};
/**
 * Description of one pinyin token (an initial or a final).
 *
 * The tables scim_pinyin_initials and scim_pinyin_finals are arrays of
 * this struct. Member order is significant for any aggregate
 * initialisers of those tables, so it must not be rearranged.
 */
struct PinyinToken
{
    char   str[8];    /**< ASCII name of the token. */
    ucs4_t wstr[4];   /**< Chinese name of the token, in unicode. */
    int    len;       /**< length of the ASCII name. */
    int    wlen;      /**< length of the Chinese name. */
};
/**
 * Pinyin key class.
 *
 * A pinyin key is composed of an initial, a final and a tone, and
 * represents one or several Chinese ideographs.
 *
 * The three parts are stored in bit-fields of 6, 6 and 4 bits.
 */
class PinyinKey
{
    unsigned int m_initial : 6;    /**< pinyin initial */
    unsigned int m_final   : 6;    /**< pinyin final */
    unsigned int m_tone    : 4;    /**< pinyin tone */

    /*
    friend class PinyinKeyLessThan;
    friend class PinyinKeyEqualTo;
    */
    friend class PinyinKeyExactLessThan;
    friend class PinyinKeyExactEqualTo;

public:
    /**
     * constructor.
     *
     * the default constructor of class PinyinKey; with no arguments it
     * builds the zero (invalid) key.
     */
    PinyinKey (PinyinInitial initial = SCIM_PINYIN_ZeroInitial,
               PinyinFinal   final   = SCIM_PINYIN_ZeroFinal,
               PinyinTone    tone    = SCIM_PINYIN_ZeroTone)
        : m_initial (initial),
          m_final (final),
          m_tone (tone) {
    }

    /**
     * constructor.
     *
     * construct a PinyinKey object from a key string, with
     * specified validator.
     *
     * The fields are set to the zero key first: set_key() is defined out
     * of line, and if it returns without storing a value (for example
     * for an unusable key string) the object must still hold defined
     * values rather than indeterminate ones.
     *
     * @sa PinyinValidator
     */
    PinyinKey (const PinyinValidator &validator,
               const char            *key)
        : m_initial (SCIM_PINYIN_ZeroInitial),
          m_final (SCIM_PINYIN_ZeroFinal),
          m_tone (SCIM_PINYIN_ZeroTone) {
        set_key (validator, key);
    }

    /**
     * read the pinyin key value from a key string.
     *
     * @param validator a PinyinValidator object to validate the key.
     * @param key a ASCII string including one or more pinyin keys.
     * @param keylen length of key; defaults to -1.
     *        NOTE(review): -1 presumably means "use the whole
     *        zero-terminated string" -- confirm in the implementation.
     * @return the number of characters used by this pinyin key.
     */
    int set_key (const PinyinValidator &validator,
                 const char            *key,
                 int                    keylen = -1);

    /**
     * set the pinyin key value to initial, final and tone.
     *
     * Called with no arguments it resets the key to the zero key.
     */
    void set_key (PinyinInitial initial = SCIM_PINYIN_ZeroInitial,
                  PinyinFinal   final   = SCIM_PINYIN_ZeroFinal,
                  PinyinTone    tone    = SCIM_PINYIN_ZeroTone) {
        m_initial = initial;
        m_final   = final;
        m_tone    = tone;
    }

    /**
     * set only the pinyin initial of this key.
     */
    void set_initial (PinyinInitial initial = SCIM_PINYIN_ZeroInitial) {
        m_initial = initial;
    }

    /**
     * set only the pinyin final of this key.
     */
    void set_final (PinyinFinal final = SCIM_PINYIN_ZeroFinal) {
        m_final = final;
    }

    /**
     * set only the pinyin tone of this key.
     */
    void set_tone (PinyinTone tone = SCIM_PINYIN_ZeroTone) {
        m_tone = tone;
    }

    /**
     * get pinyin initial of this key.
     */
    PinyinInitial get_initial () const {
        return static_cast<PinyinInitial>(m_initial);
    }

    /**
     * get pinyin final of this key.
     */
    PinyinFinal get_final () const {
        return static_cast<PinyinFinal>(m_final);
    }

    /**
     * get pinyin tone of this key.
     */
    PinyinTone get_tone () const {
        return static_cast<PinyinTone>(m_tone);
    }

    /**
     * get the ASCII name of pinyin initial of this key.
     *
     * The name is read from scim_pinyin_initials, indexed by the initial.
     */
    const char* get_initial_string () const {
        return scim_pinyin_initials [m_initial].str;
    }

    /*
     * Disabled: get the Chinese name of pinyin initial of this key.
     *
     * const ucs4_t* get_initial_wide_string () const {
     *     return scim_pinyin_initials [m_initial].wstr;
     * }
     */

    /**
     * get the ASCII name of pinyin final of this key.
     *
     * The name is read from scim_pinyin_finals, indexed by the final.
     */
    const char* get_final_string () const {
        return scim_pinyin_finals [m_final].str;
    }

    /*
     * Disabled: get the Chinese name of pinyin final of this key.
     *
     * const ucs4_t* get_final_wide_string () const {
     *     return scim_pinyin_finals [m_final].wstr;
     * }
     */

    /**
     * get the ASCII name of this key.
     */
    String get_key_string () const;

    /*
     * Disabled: get the Chinese name of this key.
     *
     * WideString get_key_wide_string () const;
     */

    /**
     * check whether this is the zero (invalid) key.
     *
     * This is a query only; it does not modify the key.
     *
     * @return true if both the initial and the final are zero. The tone
     *         is not taken into account.
     */
    bool zero() const {
        return m_initial == SCIM_PINYIN_ZeroInitial &&
               m_final   == SCIM_PINYIN_ZeroFinal;
    }

    /**
     * output the pinyin key in text format.
     *
     * @param os the ostream object to output to.
     */
    std::ostream& output_text (std::ostream &os) const;

    /*
     * Disabled: output the pinyin key in binary format.
     *
     * @param os the ostream object to output to.
     * std::ostream& output_binary (std::ostream &os) const;
     */

    /**
     * input the pinyin key in text format.
     *
     * @param validator the PinyinValidator object to validate the key.
     * @param is the istream object to input from.
     */
    std::istream& input_text (const PinyinValidator &validator, std::istream &is);

    /*
     * Disabled: input the pinyin key in binary format.
     *
     * @param validator the PinyinValidator object to validate the key.
     * @param is the istream object to input from.
     * std::istream& input_binary (const PinyinValidator &validator, std::istream &is);
     */

    /**
     * exact equality: initial, final and tone must all be the same.
     */
    bool operator == (PinyinKey rhs) const {
        return m_initial == rhs.m_initial && m_final == rhs.m_final && m_tone == rhs.m_tone;
    }

    /**
     * negation of operator ==.
     */
    bool operator != (PinyinKey rhs) const {
        return ! (*this == rhs);
    }

private:
    /**
     * apply additional pinyin rules to the initial, final pair.
     *
     * for example:
     *
     *  SCIM_PINYIN_ZeroInitial, SCIM_PINYIN_I ==> SCIM_PINYIN_Yi, SCIM_PINYIN_I
     *  SCIM_PINYIN_Ne, SCIM_PINYIN_Ve ==> SCIM_PINYIN_Ne, SCIM_PINYIN_Ue
     *
     * etc.
     */
    void apply_additional_rules (PinyinInitial &initial,
                                 PinyinFinal   &final);

    /**
     * translate an ASCII string into pinyin initial.
     *
     * @param initial a PinyinInitial object reference to store the result.
     * @param key the ASCII key string.
     * @param keylen the length of the key string.
     * @return the number of chars actually translated.
     */
    int parse_initial (PinyinInitial &initial,
                       const char    *key,
                       int            keylen);

    /**
     * translate an ASCII string into pinyin final.
     *
     * @param final a PinyinFinal object reference to store the result.
     * @param key the ASCII key string.
     * @param keylen the length of the key string.
     * @return the number of chars actually translated.
     */
    int parse_final (PinyinFinal &final,
                     const char  *key,
                     int          keylen);

    /**
     * translate an ASCII string into pinyin tone.
     *
     * @param tone a PinyinTone object reference to store the result.
     * @param key the ASCII key string.
     * @return the number of chars actually translated.
     */
    int parse_tone (PinyinTone &tone,
                    const char *key);

    /**
     * translate an ASCII string into initial, final and tone.
     *
     * @param initial store the result of pinyin initial.
     * @param final store the result of pinyin final.
     * @param tone store the result of pinyin tone.
     * @param key the ASCII key string.
     * @param keylen the length of key.
     * @return the number of chars actually translated.
     */
    int parse_key (PinyinInitial &initial,
                   PinyinFinal   &final,
                   PinyinTone    &tone,
                   const char    *key,
                   int            keylen);

    /**
     * load the key from its two-byte packed form.
     *
     * Layout (the inverse of combine_to_bytes):
     *   byte0 bits 0-5 : initial
     *   byte0 bits 6-7 : low two bits of the final
     *   byte1 bits 0-3 : high four bits of the final
     *   byte1 bits 4-7 : tone
     *
     * Each decoded value is reduced modulo the number of enumerators of
     * its kind, so bytes holding out-of-range values still produce an
     * in-range initial, final and tone.
     */
    void extract_from_bytes (unsigned char byte0,
                             unsigned char byte1) {
        m_initial = (byte0 & 0x3F) % SCIM_PINYIN_InitialNumber;
        m_final   = ((byte0 >> 6) | ((byte1 & 0xF) << 2)) % SCIM_PINYIN_FinalNumber;
        m_tone    = (byte1 >> 4) % SCIM_PINYIN_ToneNumber;
    }

    /**
     * store the key in its two-byte packed form.
     *
     * @param bytes destination; bytes[0] and bytes[1] are written.
     *
     * The layout is the one described at extract_from_bytes. The casts
     * deliberately keep only the low eight bits of each combined value:
     * the upper bits of (m_final << 6) belong in bytes[1], not bytes[0].
     */
    void combine_to_bytes (unsigned char *bytes) const {
        bytes[0] = static_cast<unsigned char>(m_initial | (m_final << 6));
        bytes[1] = static_cast<unsigned char>((m_final >> 2) | (m_tone << 4));
    }

public:
    /**
     * translate an ASCII key string into a set of valid PinyinKey objects.
     *
     * @param validator to validate the result pinyin keys.
     * @param vec a PinyinParsedKey vector to store the result keys.
     * @param key a zero ending ASCII string.
     * @return the number of chars actually parsed.
     */
    static int parse_pinyin_key (const PinyinValidator &validator,
                                 PinyinParsedKeyVector &vec,
                                 const char            *key);

    /**
     * same as above, but the results are stored as plain PinyinKey
     * objects in a PinyinKeyVector.
     */
    static int parse_pinyin_key (const PinyinValidator &validator,
                                 PinyinKeyVector       &vec,
                                 const char            *key);
};
/**
 * Size, in bytes, of the bitmap held by PinyinValidator.
 *
 * It is (initials * finals * tones) / 8 + 1, which is enough room for one
 * bit per possible (initial, final, tone) combination.
 */
const int PinyinValidatorBitmapSize =
    (SCIM_PINYIN_InitialNumber * SCIM_PINYIN_FinalNumber * SCIM_PINYIN_ToneNumber) / 8 + 1;

/**
 * Validator of PinyinKey object.
 *
 * This class is used to validate a PinyinKey object.
 */
class PinyinValidator
{
    /*
     * Disabled member: pinyin custom settings.
     * different custom settings will lead to different validators.
     *
     * PinyinCustomSettings m_custom;
     */

    /* NOTE(review): presumably one bit per key combination; how the bits
     * are indexed and what a set bit means is defined in the
     * implementation, not in this header. */
    char m_bitmap [PinyinValidatorBitmapSize];

public:
    /**
     * constructor.
     *
     * (A leading "const PinyinCustomSettings &custom" parameter is
     * currently disabled.)
     *
     * @param table optional PinyinTable; defaults to NULL.
     */
    PinyinValidator (const PinyinTable *table = NULL);

    /**
     * initialize the validator with the specified PinyinTable.
     *
     * (A leading "const PinyinCustomSettings &custom" parameter is
     * currently disabled.)
     *
     * @param table optional PinyinTable; defaults to NULL.
     */
    void initialize (const PinyinTable *table = NULL);

    /**
     * overloaded operator () function to validate a pinyin key.
     *
     * @param key the key to be validated.
     * @return true = the key is valid, otherwise it's invalid.
     */
    bool operator () (PinyinKey key) const;
};
/**
* a binary functional class to do less than comparison
* between two pinyin keys.
*
* the user custom settings will be taken account into the comparison.
class PinyinKeyLessThan
: public std::binary_function <PinyinKey, PinyinKey, bool>
{
PinyinCustomSettings m_custom;
public:
PinyinKeyLessThan (const PinyinCustomSettings &custom)
: m_custom (custom) {}
bool operator () (PinyinKey lhs,
PinyinKey rhs) const;
};
*/
/**
* a binary functional class to do equal to comparison
* between two pinyin keys.
class PinyinKeyEqualTo
: public std::binary_function <PinyinKey, PinyinKey, bool>
{
PinyinCustomSettings m_custom;
public:
PinyinKeyEqualTo (const PinyinCustomSettings &custom)
: m_custom (custom) {}
bool operator () (PinyinKey lhs,
PinyinKey rhs) const;
};
*/
/**
* a binary functional class to do bitwise less than comparison
* [listing ends here; the remainder of this comment and of the header is not available]