/**
 * a binary functional class to do bitwise less than comparison
 * between two pinyin keys.
 */
class PinyinKeyExactLessThan
: public std::binary_function <PinyinKey, PinyinKey, bool>
{
public:
bool operator () (PinyinKey lhs,
PinyinKey rhs) const {
if (lhs.m_initial < rhs.m_initial)
return true;
else if (lhs.m_initial == rhs.m_initial) {
if (lhs.m_final < rhs.m_final)
return true;
else if (lhs.m_final == rhs.m_final &&
lhs.m_tone < rhs.m_tone)
return true;
}
return false;
}
};
/**
* a binary functional class to do bitwise equal to comparison
* between two pinyin keys.
*/
class PinyinKeyExactEqualTo
: public std::binary_function <PinyinKey, PinyinKey, bool>
{
public:
bool operator () (PinyinKey lhs,
PinyinKey rhs) const {
if (lhs.m_initial == rhs.m_initial &&
lhs.m_final == rhs.m_final &&
lhs.m_tone == rhs.m_tone)
return true;
return false;
}
};
/**
 * A PinyinKey that additionally records which part of the parsed
 * string it was taken from.
 */
struct PinyinParsedKey : public PinyinKey
{
    int m_pos;      /**< the position of this key in the whole string. */
    int m_length;   /**< the length of string used by this key. */

    /**
     * Construct a parsed key.
     *
     * @param pos     position of the key in the whole string.
     * @param length  length of the string used by the key.
     * @param initial initial passed on to the PinyinKey base.
     * @param final   final passed on to the PinyinKey base.
     * @param tone    tone passed on to the PinyinKey base.
     */
    PinyinParsedKey (int pos = 0,
                     int length = 0,
                     PinyinInitial initial = SCIM_PINYIN_ZeroInitial,
                     PinyinFinal final = SCIM_PINYIN_ZeroFinal,
                     PinyinTone tone = SCIM_PINYIN_ZeroTone)
        : PinyinKey (initial, final, tone),
          m_pos (pos),
          m_length (length)
    {
    }

    /** @return the key's position in the whole string. */
    int get_pos () const {
        return m_pos;
    }

    /** @return the length of the key string. */
    int get_length () const {
        return m_length;
    }

    /** @return the position just past the key, i.e. position + length. */
    int get_end_pos () const {
        return get_pos () + get_length ();
    }

    /** Set the key's position. */
    void set_pos (int pos) {
        this->m_pos = pos;
    }

    /** Set the key's length. */
    void set_length (int length) {
        this->m_length = length;
    }
};
/**
 * "Less than" comparison on the char part (first) of a CharFrequencyPair.
 *
 * The mixed overloads let a bare ucs4_t be compared against a pair, which
 * is what the searches over a vector of pairs with a plain char value need.
 */
class CharFrequencyPairLessThanByChar {
public:
    /** pair < pair, by char. */
    bool operator () (const CharFrequencyPair &lhs, const CharFrequencyPair &rhs) const
    {
        return lhs.first < rhs.first;
    }

    /** pair < char. */
    bool operator () (const CharFrequencyPair &lhs, ucs4_t rhs) const
    {
        return lhs.first < rhs;
    }

    /** char < pair. */
    bool operator () (ucs4_t lhs, const CharFrequencyPair &rhs) const
    {
        return lhs < rhs.first;
    }
};
/**
 * "Greater than" comparison on the char part (first) of a CharFrequencyPair.
 *
 * The mixed overloads allow a bare ucs4_t on either side.
 */
class CharFrequencyPairGreaterThanByChar {
public:
    /** pair > pair, by char. */
    bool operator () (const CharFrequencyPair &lhs, const CharFrequencyPair &rhs) const
    {
        return lhs.first > rhs.first;
    }

    /** pair > char. */
    bool operator () (const CharFrequencyPair &lhs, ucs4_t rhs) const
    {
        return lhs.first > rhs;
    }

    /** char > pair. */
    bool operator () (ucs4_t lhs, const CharFrequencyPair &rhs) const
    {
        return lhs > rhs.first;
    }
};
/**
 * "Less than" comparison on the frequency part (second) of a
 * CharFrequencyPair.
 *
 * The mixed overloads allow a bare uint32 frequency on either side.
 */
class CharFrequencyPairLessThanByFrequency {
public:
    /** pair < pair, by frequency. */
    bool operator () (const CharFrequencyPair &lhs, const CharFrequencyPair &rhs) const
    {
        return lhs.second < rhs.second;
    }

    /** pair < frequency. */
    bool operator () (const CharFrequencyPair &lhs, uint32 rhs) const
    {
        return lhs.second < rhs;
    }

    /** frequency < pair. */
    bool operator () (uint32 lhs, const CharFrequencyPair &rhs) const
    {
        return lhs < rhs.second;
    }
};
/**
 * "Greater than" comparison on the frequency part (second) of a
 * CharFrequencyPair.
 *
 * The mixed overloads allow a bare uint32 frequency on either side.
 */
class CharFrequencyPairGreaterThanByFrequency {
public:
    /** pair > pair, by frequency. */
    bool operator () (const CharFrequencyPair &lhs, const CharFrequencyPair &rhs) const
    {
        return lhs.second > rhs.second;
    }

    /** pair > frequency. */
    bool operator () (const CharFrequencyPair &lhs, uint32 rhs) const
    {
        return lhs.second > rhs;
    }

    /** frequency > pair. */
    bool operator () (uint32 lhs, const CharFrequencyPair &rhs) const
    {
        return lhs > rhs.second;
    }
};
/**
 * "Less than" comparison on a CharFrequencyPair that looks at the char
 * (first) and falls back to the frequency (second) when neither char
 * orders before the other.
 */
class CharFrequencyPairLessThanByCharAndFrequency {
public:
    /**
     * @return true if lhs.first < rhs.first, or if neither char is greater
     *         than the other and lhs.second < rhs.second.
     */
    bool operator () (const CharFrequencyPair &lhs, const CharFrequencyPair &rhs) const
    {
        return (lhs.first < rhs.first) ||
               (!(lhs.first > rhs.first) && lhs.second < rhs.second);
    }
};
/**
 * "Greater than" comparison on a CharFrequencyPair that looks at the char
 * (first) and falls back to the frequency (second) when neither char
 * orders after the other.
 */
class CharFrequencyPairGreaterThanByCharAndFrequency {
public:
    /**
     * @return true if lhs.first > rhs.first, or if neither char is less
     *         than the other and lhs.second > rhs.second.
     */
    bool operator () (const CharFrequencyPair &lhs, const CharFrequencyPair &rhs) const
    {
        return (lhs.first > rhs.first) ||
               (!(lhs.first < rhs.first) && lhs.second > rhs.second);
    }
};
/**
 * Equality comparison on the char part (first) of a CharFrequencyPair;
 * the frequency is ignored.
 */
class CharFrequencyPairEqualToByChar {
public:
    /** @return true if both pairs carry the same char. */
    bool operator () (const CharFrequencyPair &lhs, const CharFrequencyPair &rhs) const
    {
        return lhs.first == rhs.first;
    }
};
/**
* A PinyinEntry has a pinyin key and a set of ucs4_t,
* whose pronouncation are same as the key.
*/
class PinyinEntry
{
PinyinKey m_key;
/**< the pinyin key of this entry */
std::vector <CharFrequencyPair> m_chars;
/**< the vector to store the chars and their frequencies */
public:
/**
* constructor
*/
PinyinEntry (PinyinKey key)
: m_key (key) {}
/**
* copy constructor
*/
PinyinEntry (const PinyinEntry &entry)
: m_key (entry.m_key), m_chars (entry.m_chars) {}
/**
* constructor.
* read this entry from a stream.
*/
PinyinEntry (const PinyinValidator &validator,
std::istream &is
/*bool binary = false*/) {
/*if (binary) input_binary (validator, is);
else*/ input_text (validator, is);
}
/**
* copy operator.
*/
const PinyinEntry& operator = (const PinyinEntry &entry) {
if (this != &entry) {
m_key = entry.m_key;
m_chars = entry.m_chars;
}
return *this;
}
/**
* set pinyin key of this entry.
*/
void set_key (PinyinKey key) {
m_key = key;
}
/**
* get the pinyin key of this entry.
*/
PinyinKey get_key () const {
return m_key;
}
/**
* check if this entry has the char.
*/
bool has_char (ucs4_t c) const {
return std::binary_search (
m_chars.begin (),
m_chars.end (),
c,
CharFrequencyPairLessThanByChar ());
}
/**
* sort all chars.
*/
void sort () {
std::sort (m_chars.begin(), m_chars.end());
}
/**
* clear this entry.
*/
void clear () {
std::vector <CharFrequencyPair> ().swap (m_chars);
}
/**
* return entry size (number of chars).
*/
size_t size () const {
return m_chars.size();
}
/**
* insert a char into this entry.
*/
void insert (const CharFrequencyPair &ch) {
std::vector<CharFrequencyPair>::iterator i =
std::lower_bound (
m_chars.begin (),
m_chars.end (),
ch.first,
CharFrequencyPairLessThanByChar ());
if (i != m_chars.end () && i->first == ch.first) {
if (ch.second > i->second)
i->second = ch.second;
} else {
m_chars.insert (i, ch);
}
}
/**
* erase a char from this entry.
*/
void erase (ucs4_t c) {
std::vector<CharFrequencyPair>::iterator i =
std::lower_bound (
m_chars.begin (),
m_chars.end (),
c,
CharFrequencyPairLessThanByChar ());
if (i != m_chars.end() && i->first == c) m_chars.erase (i);
}
/**
* get the char at position index.
*/
ucs4_t get_char_by_index (unsigned int index) const {
return m_chars [index].first;
}
/**
* get the char with its frequency.
*/
const CharFrequencyPair & get_char_with_frequency_by_index (unsigned int index) const {
return m_chars [index];
}
int get_all_chars (std::vector<ucs4_t> &vec) const {
for (std::vector<CharFrequencyPair>::const_iterator i = m_chars.begin ();
i != m_chars.end (); ++ i)
vec.push_back (i->first);
return vec.size ();
}
int get_all_chars_with_frequencies (std::vector<CharFrequencyPair> &vec) const {
for (std::vector<CharFrequencyPair>::const_iterator i = m_chars.begin ();
i != m_chars.end (); ++ i)
vec.push_back (*i);
return vec.size ();
}
uint32 get_char_frequency (ucs4_t ch) const {
std::vector<CharFrequencyPair>::const_iterator i =
std::lower_bound (
m_chars.begin (),
m_chars.end (),
ch,
CharFrequencyPairLessThanByChar ());
if (i != m_chars.end() && i->first == ch)
return i->second;
return 0;
}
void set_char_frequency (ucs4_t ch, uint32 freq) {
std::vector<CharFrequencyPair>::iterator i =
std::lower_bound (
m_chars.begin (),
m_chars.end (),
ch,
CharFrequencyPairLessThanByChar ());
if (i != m_chars.end() && i->first == ch)
i->second = freq;
}
void refresh_char_frequency (ucs4_t ch, uint32 shift) {
std::vector<CharFrequencyPair>::iterator i =
std::lower_bound (
m_chars.begin (),
m_chars.end (),
ch,
CharFrequencyPairLessThanByChar ());
if (i != m_chars.end() && i->first == ch) {
uint32 delta = (SCIM_MAX_CHAR_FREQUENCY - i->second);
if (delta) {
delta >>= shift;
if (!delta) ++ delta;
i->second = i->second + delta;
}
}
}
/**
* @sa get_key
*/
operator PinyinKey () const {
return m_key;
}
/**
* output the content of this entry to ostream in text format.
*/
std::ostream& output_text (std::ostream &os) const;
/**
* read the content of this entry from istream in text format.
*/
std::istream& input_text (const PinyinValidator &validator, std::istream &is);
/**
* output in binary format.
std::ostream& output_binary (std::ostream &os) const;
*/
/**
* input in binary format.
std::istream& input_binary (const PinyinValidator &validator, std::istream &is);
*/
};
/**
* a table to store all of the Hanzi characters and their pinyin keys,
* held as a vector of PinyinEntry objects.
*
* Only number_of_entry () and clear () are defined in this class body;
* every other member function is declared here and defined elsewhere.
*/
class PinyinTable
{
/*
* Disabled reverse-map typedefs, kept for reference:
#if defined (HAVE_HASH_MAP)
typedef std::hash_multimap<ucs4_t,PinyinKey, std::hash <unsigned long> > ReversePinyinMap;
#elif defined (HAVE_EXT_HASH_MAP)
typedef __gnu_cxx::hash_multimap<ucs4_t,PinyinKey, __gnu_cxx::hash <unsigned long> > ReversePinyinMap;
#else
typedef std::multimap<ucs4_t, PinyinKey> ReversePinyinMap;
#endif
typedef std::pair<ucs4_t,PinyinKey> ReversePinyinPair;
*/
typedef std::vector<PinyinEntry> PinyinEntryVector;
/**
* the vector to store all of the pinyin entries.
*/
PinyinEntryVector m_table;
/**
* (disabled) the multimap to store reverse pinyin map.
*
* The reverse pinyin map is used to do Hanzi -> Pinyin mapping.
ReversePinyinMap m_revmap;
*/
/**
* (disabled) indicates whether the reverse map is OK.
bool m_revmap_ok;
*/
/**
* less than function object of PinyinKey.
*/
PinyinKeyExactLessThan m_pinyin_key_less;
/**
* equal to function object of PinyinKey.
*/
PinyinKeyExactEqualTo m_pinyin_key_equal;
/**
* the validator to validate all of the pinyin keys.
* Stored as a raw pointer.
* NOTE(review): presumably not owned by the table and expected to outlive
* it - confirm against the implementation.
*/
const PinyinValidator *m_validator;
public:
/**
* constructor.
*
* The former custom-settings parameter is commented out in the signature.
*
* @param validator the validator to validate all of the pinyin keys.
* @param tablefile the file name of pinyin table; defaults to NULL.
* NOTE(review): presumably NULL means "do not load a file" - confirm.
*/
PinyinTable (/*const PinyinCustomSettings &custom,*/
const PinyinValidator *validator,
const char *tablefile = NULL);
/**
* constructor taking the table content from a stream instead of a file name.
*
* @param validator the validator to validate all of the pinyin keys.
* @param is the stream holding the table.
*/
PinyinTable (/*const PinyinCustomSettings &custom,*/
const PinyinValidator *validator,
std::istream &is);
/*
* Stream and file I/O.
* NOTE(review): output () and save_table () still take a binary flag, while
* the binary I/O of PinyinEntry is commented out in this header - confirm
* whether the flag is honoured by the implementation.
*/
bool output (std::ostream &os, bool binary = false) const;
bool input (std::istream &is);
bool load_table (const char *tablefile);
bool save_table (const char *tablefile, bool binary = false) const;
/*
* Disabled declarations, kept for reference:
void update_custom_settings (const PinyinCustomSettings &custom,
const PinyinValidator *validator);
int get_all_chars (std::vector<ucs4_t> &vec) const;
int get_all_chars_with_frequencies (std::vector<CharFrequencyPair> &vec) const;
*/
/*
* Lookups.
* NOTE(review): each presumably fills vec with the matches and returns a
* count - confirm against the implementation.
*/
int find_chars (std::vector<ucs4_t> &vec, PinyinKey key) const;
int find_chars_with_frequencies (std::vector<CharFrequencyPair> &vec, PinyinKey key) const;
int find_keys (PinyinKeyVector &vec, ucs4_t code);
int find_key_strings (std::vector<PinyinKeyVector> &vec, const WideString & str);
/*
* Remove a char, with the key given either as a string or as a PinyinKey.
*/
void erase (ucs4_t hz, const char *key);
void erase (ucs4_t hz, PinyinKey key);
/*
* Frequency accessors; key defaults to a default-constructed PinyinKey.
* NOTE(review): the meaning of the default key (all keys vs. none) is not
* visible here - confirm.
*/
uint32 get_char_frequency (ucs4_t ch, PinyinKey key = PinyinKey ());
void set_char_frequency (ucs4_t ch, uint32 freq, PinyinKey key = PinyinKey ());
/**
* grow the char frequency by 1/(2^shift).
*/
void refresh (ucs4_t hz, uint32 shift = 31, PinyinKey key = PinyinKey ());
/*
* Add a char, with the key given either as a string or as a PinyinKey.
*/
void insert (ucs4_t hz, const char *key);
void insert (ucs4_t hz, PinyinKey key);
/*
* NOTE(review): size () is declared separately from number_of_entry (), so
* it presumably counts something other than entries (e.g. chars) - confirm.
*/
size_t size () const;
/* number of PinyinEntry objects held in m_table. */
size_t number_of_entry () const { return m_table.size (); }
// clear this table: removes every entry from m_table.
void clear () {
m_table.clear ();
// m_revmap.clear ();
//m_revmap_ok = false;
}
/*
* Key tests, with the key given either as a string or as a PinyinKey.
*/
bool has_key (const char *key) const;
bool has_key (PinyinKey key) const;
private:
/**
* sort all pinyin entries.
*/
void sort ();
// Disabled reverse-map helpers:
// void create_reverse_map ();
//void insert_to_reverse_map (ucs4_t code, PinyinKey key);
//void erase_from_reverse_map (ucs4_t code, PinyinKey key);
/*
* NOTE(review): presumably returns the entry whose key matches exactly, or
* m_table.end () if there is none - confirm against the implementation.
*/
PinyinEntryVector::iterator find_exact_entry (PinyinKey key);
/*
* Helper taking an output vector of key vectors, a scratch key buffer, an
* array of key vectors and an index/len pair.
* NOTE(review): presumably the recursive worker behind find_key_strings () -
* confirm against the implementation.
*/
void create_pinyin_key_vector_vector (std::vector<PinyinKeyVector> &vv,
PinyinKeyVector &key_buffer,
PinyinKeyVector *key_vectors,
int index,
int len);
};
/**
 * Stream insertion for PinyinKey; forwards to PinyinKey::output_text ().
 *
 * @return whatever output_text () returns for os.
 */
inline std::ostream&
operator << (std::ostream& os, PinyinKey key)
{
    std::ostream &written = key.output_text (os);
    return written;
}
/**
 * Stream insertion for PinyinEntry; forwards to PinyinEntry::output_text ().
 *
 * @return whatever output_text () returns for os.
 */
inline std::ostream&
operator << (std::ostream& os, const PinyinEntry &entry)
{
    std::ostream &written = entry.output_text (os);
    return written;
}
/**
* @brief Write a wide char to ostream.
*
* The content written into the ostream will be converted into utf-8 encoding.
* Only the declaration appears here; the definition lives elsewhere.
*
* @param os the stream to be written.
* @param wc the wide char (ucs4_t) to be written to the stream.
* @return the same stream object reference.
*/
std::ostream & utf8_write_wchar (std::ostream &os, ucs4_t wc);
#endif
/*
vi:ts=4:nowrap:ai
*/