📄 suggest.cpp
字号:
// Copyright 2000 by Kevin Atkinson under the terms of the LGPL// suggest.cc Suggestion code for Aspell// The magic behind my spell checker comes from merging Lawrence// Philips excellent metaphone algorithm and Ispell's near miss// strategy which is inserting a space or hyphen, interchanging two// adjacent letters, changing one letter, deleting a letter, or adding// a letter.// // The process goes something like this.// // 1. Convert the misspelled word to its soundslike equivalent (its// metaphone for English words).// // 2. Find words that have the same soundslike pattern.//// 3. Find words that have similar soundslike patterns. A similar// soundlike pattern is a pattern that is obtained by// interchanging two adjacent letters, changing one letter,// deleting a letter, or adding a letter.//// 4. Score the result list and return the words with the lowest// score. The score is roughly the weighed average of the edit// distance of the word to the misspelled word, the soundslike// equivalent of the two words, and the phoneme of the two words.// The edit distance is the weighed total of the number of// deletions, insertions, exchanges, or adjacent swaps needed to// make one string equivalent to the other.//// Please note that the soundlike equivalent is a rough approximation// of how the words sounds. It is not the phoneme of the word by any// means. For more information on the metaphone algorithm please see// the file metaphone.cc which included a detailed description of it.// NOTES TO SELF// Consider jump starting the distance alogo. when to words have the same// prefix// Consider short circling the distance algo. when to words have the same// suffix#include "getdata.hpp"#include "fstream.hpp"#include "aspeller.hpp"#include "asuggest.hpp"#include "basic_list.hpp"#include "clone_ptr-t.hpp"#include "config.hpp"#include "data.hpp"#include "editdist.hpp"#include "editdist2.hpp"#include "file_data_util.hpp"#include "errors.hpp"#include "hash-t.hpp"#include "language.hpp"#include "leditdist.hpp"#include "speller_impl.hpp"#include "suggest.hpp"//#define DEBUG_SUGGESTusing namespace aspeller;using namespace acommon;using namespace std;namespace aspeller_default_suggest { typedef vector<String> NearMissesFinal; template <class Iterator> inline Iterator preview_next (Iterator i) { return ++i; } // // OriginalWord stores infomation about the original misspelled word // for convince and speed. // struct OriginalWord { String word; String word_stripped; String soundslike; String phoneme; CasePattern case_pattern; OriginalWord() {} OriginalWord (const String &w, const String &sl, const String &p) : word(w), soundslike(sl), phoneme(p) {} OriginalWord (const String &w, const String &sl, const String &p, const String &l, CasePattern cp) : word(w), word_stripped(l), soundslike(sl), phoneme(p), case_pattern(cp) {} }; // // struct ScoreWordSound - used for storing the possible words while // they are being processed. // struct ScoreWordSound { const char * word; const char * word_stripped; int score; int soundslike_score; bool count; ReplacementList::VirEmul * repl_list; ScoreWordSound() {repl_list = 0;} ~ScoreWordSound() {delete repl_list;} }; inline int compare (const ScoreWordSound &lhs, const ScoreWordSound &rhs) { int temp = lhs.score - rhs.score; if (temp) return temp; return strcmp(lhs.word,rhs.word); } inline bool operator < (const ScoreWordSound & lhs, const ScoreWordSound & rhs) { return compare(lhs, rhs) < 0; } inline bool operator <= (const ScoreWordSound & lhs, const ScoreWordSound & rhs) { return compare(lhs, rhs) <= 0; } inline bool operator == (const ScoreWordSound & lhs, const ScoreWordSound & rhs) { return compare(lhs, rhs) == 0; } typedef BasicList<ScoreWordSound> NearMisses; class Score { protected: const Language * lang; OriginalWord original_word; SuggestParms parms; public: Score(const Language *l, const String &w, const SuggestParms & p) : lang(l), original_word(w, l->to_soundslike(w.c_str()), l->to_phoneme(w.c_str()), to_stripped(*l, w), case_pattern(*l, w)), parms(p) {} String fix_case(const String & word) { return aspeller::fix_case(*lang,original_word.case_pattern,word); } }; class Working : public Score { int threshold; unsigned int max_word_length; SpellerImpl * speller; NearMisses scored_near_misses; NearMisses near_misses; NearMissesFinal * near_misses_final; BasicList<String> strings; static const bool do_count = true; static const bool dont_count = false; static const bool do_need_alloc = true; static const bool dont_need_alloc = false; void try_sound(const char *, int ms); void add_nearmiss(const char * word, int ms, bool count, bool need_alloc, ReplacementList::VirEmul * rl = 0) { near_misses.push_front(ScoreWordSound()); ScoreWordSound & d = near_misses.front(); if (need_alloc) { strings.push_front(word); d.word = strings.front().c_str(); } else { d.word = word; } if (parms.use_typo_analysis) { unsigned int l = strlen(word); if (l > max_word_length) max_word_length = l; } else { if (!is_stripped(*lang,word)) { strings.push_front(to_stripped(*lang,word)); d.word_stripped = strings.front().c_str(); } else { d.word_stripped = d.word; } } d.soundslike_score = ms; d.count = count; d.repl_list = rl; } int needed_level(int want, int soundslike_score) { int n = (100*want - parms.soundslike_weight*soundslike_score) /(parms.word_weight*parms.edit_distance_weights.min); return n > 0 ? n : 0; } int weighted_average(int soundslike_score, int word_score) { return (parms.word_weight*word_score + parms.soundslike_weight*soundslike_score)/100; } int skip_first_couple(NearMisses::iterator & i) { int k = 0; while (preview_next(i) != scored_near_misses.end()) // skip over the first couple of items as they should // not be counted in the threshold score. { if (!i->count) { ++i; } else if (k == parms.skip) { break; } else { ++k; ++i; } } return k; } void try_others(); void score_list(); void transfer(); public: Working(SpellerImpl * m, const Language *l, const String & w, const SuggestParms & p) : Score(l,w,p), threshold(1), max_word_length(0), speller(m) {} void get_suggestions(NearMissesFinal &sug); void get_suggestions_ultra(NearMissesFinal &sug); }; // // try_sound - tries the soundslike string if there is a match add // the possable words to near_misses // void Working::try_sound (const char * m, int ms) { // sound is the object in the list which is a lot smaller than m for (SpellerImpl::DataSetCollection::const_iterator i = speller->data_set_collection().begin(); i != speller->data_set_collection().end(); ++i) { if (!i->use_to_suggest) continue; if (i->data_set->basic_type == DataSet::basic_word_set) { BasicWordSet::Emul e = static_cast<const BasicWordSet *> (i->data_set)->words_w_soundslike(m); BasicWordInfo w; String word; while ((w = e.next())) { w.get_word(word, i->local_info.convert); add_nearmiss(word.c_str(), ms, do_count, do_need_alloc); } } else { BasicReplacementSet::Emul e = static_cast<const BasicReplacementSet *>(i->data_set)->repls_w_soundslike(m); ReplacementList repl; while (! (repl = e.next()).empty() ) add_nearmiss(repl.misspelled_word, ms, dont_count, dont_need_alloc, repl.elements); } } } // // try_others - tries to come up with possible suggestions // void Working::try_others () { const char *replace_list = lang->soundslike_chars(); const String & word = original_word.word; const String & soundslike = original_word.soundslike; String::size_type i; String new_soundslike; new_soundslike.reserve(soundslike.size() + 1); char a,b; const char * c; // Insert a space or hyphone if (word.size() >= 4) { char * new_word = new char[word.size() + 2]; strncpy(new_word, word.data(), word.size()); new_word[word.size() + 1] = '\0'; new_word[word.size() + 0] = new_word[word.size() - 1]; for (i = word.size() - 2; i >= 2; --i) { new_word[i+1] = new_word[i]; new_word[i] = '\0'; if (speller->check(new_word) && speller->check(new_word + i + 1)) { new_word[i] = ' '; add_nearmiss(new_word, parms.edit_distance_weights.del2, dont_count, do_need_alloc); new_word[i] = '-'; add_nearmiss(new_word, parms.edit_distance_weights.del2, dont_count, do_need_alloc); } } delete[] new_word; } if (parms.soundslike_level == 1) { try_sound(soundslike.c_str(), 0); // Change one letter new_soundslike = soundslike; for (i = 0; i != soundslike.size(); ++i) { for (c=replace_list; *c; ++c) { if (*c == soundslike[i]) continue; new_soundslike[i] = *c; try_sound(new_soundslike.c_str(),parms.edit_distance_weights.sub); } new_soundslike[i] = soundslike[i]; } // Interchange two adjacent letters. for (i = 0; i+1 != soundslike.size(); ++i) { a = new_soundslike[i]; b = new_soundslike[i+1]; new_soundslike[i] = b; new_soundslike[i+1] = a; try_sound(new_soundslike.c_str(),parms.edit_distance_weights.swap); new_soundslike[i] = a; new_soundslike[i+1] = b; } // Add one letter new_soundslike += ' '; i = new_soundslike.size()-1; while(true) { for (c=replace_list; *c; ++c) { new_soundslike[i] = *c; try_sound(new_soundslike.c_str(),parms.edit_distance_weights.del1); } if (i == 0) break; new_soundslike[i] = new_soundslike[i-1]; --i; } // Delete one letter if (soundslike.size() > 1) { new_soundslike = soundslike; a = new_soundslike[new_soundslike.size() - 1]; new_soundslike.resize(new_soundslike.size() - 1); i = new_soundslike.size(); while (true) { try_sound(new_soundslike.c_str(),parms.edit_distance_weights.del2); if (i == 0) break; b = a; a = new_soundslike[i-1]; new_soundslike[i-1] = b; --i; } } } else { const char * original_soundslike = original_word.soundslike.c_str(); for (SpellerImpl::DataSetCollection::const_iterator i = speller->data_set_collection().begin(); i != speller->data_set_collection().end(); ++i) { if (!i->use_to_suggest) continue; if (i->data_set->basic_type == DataSet::basic_word_set) { const BasicWordSet * data_set = static_cast<const BasicWordSet *>(i->data_set); BasicWordSet::SoundslikeEmul els = data_set->soundslike_elements(); SoundslikeWord sw; while ( (sw = els.next()) == true) { int score = limit2_edit_distance(original_soundslike, sw.soundslike, parms.edit_distance_weights); if (score < LARGE_NUM) { BasicWordSet::Emul e = data_set->words_w_soundslike(sw); BasicWordInfo bw; String word; while ((bw = e.next())) { bw.get_word(word, i->local_info.convert); add_nearmiss(word.c_str(), score, do_count, do_need_alloc); } } } } else { const BasicReplacementSet * repl_set = static_cast<const BasicReplacementSet *>(i->data_set);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -