📄 suggest.cpp
字号:
BasicWordSet::SoundslikeEmul els = repl_set->soundslike_elements(); SoundslikeWord w; while ( (w = els.next()) == true) { int score = limit2_edit_distance(original_soundslike, w.soundslike, parms.edit_distance_weights); if (score < LARGE_NUM) { BasicReplacementSet::Emul e = repl_set->repls_w_soundslike(w); ReplacementList repl; while (! (repl = e.next()).empty() ) add_nearmiss(repl.misspelled_word, score, dont_count, dont_need_alloc, repl.elements); } } } } } } void Working::score_list() { if (near_misses.empty()) return; bool no_soundslike = strcmp(speller->lang().soundslike_name(), "none") == 0; if (parms.use_typo_analysis) { parms.set_original_word_size(original_word.word.size()); NearMisses::iterator i; int word_score; unsigned int j; vector<unsigned char> original(original_word.word.size() + 1); for (j = 0; j != original_word.word.size(); ++j) original[j] = lang->to_normalized(original_word.word[j]); original[j] = 0; vector<unsigned char> word(max_word_length + 1); for (i = near_misses.begin(); i != near_misses.end(); ++i) { for (j = 0; (i->word)[j] != 0; ++j) word[j] = lang->to_normalized((i->word)[j]); word[j] = 0; word_score = typo_edit_distance(&*word.begin(), &*original.begin(), parms.typo_edit_distance_weights); i->score = weighted_average(i->soundslike_score, word_score); } near_misses.swap(scored_near_misses); scored_near_misses.sort(); i = scored_near_misses.begin(); if (i == scored_near_misses.end()) return; skip_first_couple(i); threshold = i->score + parms.span; if (threshold < parms.edit_distance_weights.max) threshold = parms.edit_distance_weights.max; } else { parms.set_original_word_size(original_word.word.size()); NearMisses::iterator i; NearMisses::iterator prev; int word_score; near_misses.push_front(ScoreWordSound()); // the first item will NEVER be looked at. scored_near_misses.push_front(ScoreWordSound()); scored_near_misses.front().score = -1; // this item will only be looked at when sorting so // make it a small value to keep it at the front. int try_for = (parms.word_weight*parms.edit_distance_weights.max)/100; while (true) { try_for += (parms.word_weight*parms.edit_distance_weights.max)/100; // put all pairs whose score <= initial_limit*max_weight // into the scored list prev = near_misses.begin(); i = prev; ++i; while (i != near_misses.end()) { int level = needed_level(try_for, i->soundslike_score); if (no_soundslike) word_score = i->soundslike_score; else if (level >= int(i->soundslike_score/parms.edit_distance_weights.min)) word_score = edit_distance(original_word.word_stripped.c_str(), i->word_stripped, level, level, parms.edit_distance_weights); else word_score = LARGE_NUM; if (word_score < LARGE_NUM) { i->score = weighted_average(i->soundslike_score, word_score); scored_near_misses.splice_into(near_misses,prev,i); i = prev; // Yes this is right due to the slice ++i; } else { prev = i; ++i; } } scored_near_misses.sort(); i = scored_near_misses.begin(); ++i; if (i == scored_near_misses.end()) continue; int k = skip_first_couple(i); if ((k == parms.skip && i->score <= try_for) || prev == near_misses.begin() ) // or no more left in near_misses break; } threshold = i->score + parms.span; if (threshold < parms.edit_distance_weights.max) threshold = parms.edit_distance_weights.max;# ifdef DEBUG_SUGGEST cout << "Threshold is: " << threshold << endl; cout << "try_for: " << try_for << endl; cout << "Size of scored: " << scored_near_misses.size() << endl; cout << "Size of ! scored: " << near_misses.size() << endl;# endif //if (threshold - try_for <= parms.edit_distance_weights.max/2) return; prev = near_misses.begin(); i = prev; ++i; while (i != near_misses.end()) { int initial_level = needed_level(try_for, i->soundslike_score); int max_level = needed_level(threshold, i->soundslike_score); if (no_soundslike) word_score = i->soundslike_score; else if (initial_level < max_level) word_score = edit_distance(original_word.word_stripped.c_str(), i->word_stripped, initial_level+1,max_level, parms.edit_distance_weights); else word_score = LARGE_NUM; if (word_score < LARGE_NUM) { i->score = weighted_average(i->soundslike_score, word_score); scored_near_misses.splice_into(near_misses,prev,i); i = prev; // Yes this is right due to the slice ++i; } else { prev = i; ++i; } } scored_near_misses.sort(); scored_near_misses.pop_front(); } } void Working::transfer() {# ifdef DEBUG_SUGGEST cout << endl << endl << original_word.word << '\t' << original_word.soundslike << '\t' << endl;# endif int c = 1; hash_set<String,HashString<String> > duplicates_check; String final_word; pair<hash_set<String,HashString<String> >::iterator, bool> dup_pair; for (NearMisses::const_iterator i = scored_near_misses.begin(); i != scored_near_misses.end() && c <= parms.limit && ( i->score <= threshold || c <= 3 ); ++i, ++c) {# ifdef DEBUG_SUGGEST cout << i->word << '\t' << i->score << '\t' << lang->to_soundslike(i->word) << endl;# endif if (i->repl_list != 0) { const char * word; string::size_type pos; while((word = i->repl_list->next()) != 0) { dup_pair = duplicates_check.insert(fix_case(word)); if (dup_pair.second && ((pos = dup_pair.first->find(' '), pos == String::npos) ? (bool)speller->check(*dup_pair.first) : (speller->check((String)dup_pair.first->substr(0,pos)) && speller->check((String)dup_pair.first->substr(pos+1))) )) near_misses_final->push_back(*dup_pair.first); } } else { dup_pair = duplicates_check.insert(fix_case(i->word)); if (dup_pair.second ) near_misses_final->push_back(*dup_pair.first); } } } void Working::get_suggestions(NearMissesFinal & sug) { near_misses_final = & sug; if (original_word.soundslike.empty()) return; try_others(); score_list(); transfer(); } class SuggestionListImpl : public SuggestionList { struct Parms { typedef const char * Value; typedef NearMissesFinal::const_iterator Iterator; Iterator end; Parms(Iterator e) : end(e) {} bool endf(Iterator e) const {return e == end;} Value end_state() const {return 0;} Value deref(Iterator i) const {return i->c_str();} }; public: NearMissesFinal suggestions; SuggestionList * clone() const {return new SuggestionListImpl(*this);} void assign(const SuggestionList * other) { *this = *static_cast<const SuggestionListImpl *>(other); } bool empty() const { return suggestions.empty(); } Size size() const { return suggestions.size(); } VirEmul * elements() const { return new MakeVirEnumeration<Parms, StringEnumeration> (suggestions.begin(), Parms(suggestions.end())); } }; class SuggestImpl : public Suggest { SpellerImpl * speller_; SuggestionListImpl suggestion_list; SuggestParms parms_; public: SuggestImpl(SpellerImpl * m) : speller_(m), parms_(m->config()->retrieve("sug-mode")) {parms_.fill_distance_lookup(m->config(), m->lang());} SuggestImpl(SpellerImpl * m, const SuggestParms & p) : speller_(m), parms_(p) {parms_.fill_distance_lookup(m->config(), m->lang());} PosibErr<void> set_mode(ParmString mode) { return parms_.set(mode); } double score(const char *base, const char *other) { //parms_.set_original_word_size(strlen(base)); //Score s(&speller_->lang(),base,parms_); //string sl = speller_->lang().to_soundslike(other); //ScoreWordSound sws(other, sl.c_str()); //s.score(sws); //return sws.score; return -1; } SuggestionList & suggest(const char * word); }; SuggestionList & SuggestImpl::suggest(const char * word) { # ifdef DEBUG_SUGGEST cout << "=========== begin suggest " << word << " ===========\n";# endif parms_.set_original_word_size(strlen(word)); suggestion_list.suggestions.resize(0); Working sug(speller_, &speller_->lang(),word,parms_); sug.get_suggestions(suggestion_list.suggestions);# ifdef DEBUG_SUGGEST cout << "^^^^^^^^^^^ end suggest " << word << " ^^^^^^^^^^^\n";# endif return suggestion_list; } }namespace aspeller { Suggest * new_default_suggest(SpellerImpl * m) { return new aspeller_default_suggest::SuggestImpl(m); } Suggest * new_default_suggest(SpellerImpl * m, const SuggestParms & p) { return new aspeller_default_suggest::SuggestImpl(m,p); } PosibErr<void> SuggestParms::set(ParmString mode) { if (mode != "normal" && mode != "fast" && mode != "ultra" && mode != "bad-spellers") return make_err(bad_value, "sug-mode", mode, "one of ultra, fast, normal, or bad-spellers"); edit_distance_weights.del1 = 95; edit_distance_weights.del2 = 95; edit_distance_weights.swap = 90; edit_distance_weights.sub = 100; edit_distance_weights.similar = 10; edit_distance_weights.max = 100; edit_distance_weights.min = 90; normal_soundslike_weight = 50; small_word_soundslike_weight = 15; small_word_threshold = 4; soundslike_weight = normal_soundslike_weight; word_weight = 100 - normal_soundslike_weight; skip = 2; limit = 100; if (mode == "normal") { use_typo_analysis = true; soundslike_level = 2; // either one or two span = 50; } else if (mode == "fast") { use_typo_analysis = true; soundslike_level = 1; // either one or two span = 50; } else if (mode == "ultra") { use_typo_analysis = false; soundslike_level = 1; // either one or two span = 50; } else if (mode == "bad-spellers") { use_typo_analysis = false; normal_soundslike_weight = 55; small_word_threshold = 0; soundslike_level = 2; // either one or two span = 125; limit = 1000; } else { abort(); // this should NEVER happen. } return no_err; } PosibErr<void> SuggestParms::fill_distance_lookup(const Config * c, const Language & l) { TypoEditDistanceWeights & w = typo_edit_distance_weights; String keyboard = c->retrieve("keyboard"); if (keyboard == "none") { use_typo_analysis = false; } else { FStream in; String file, dir1, dir2; fill_data_dir(c, dir1, dir2); find_file(file, dir1, dir2, keyboard, ".kbd"); RET_ON_ERR(in.open(file.c_str(), "r")); int c = l.max_normalized() + 1; w.repl .init(c); w.extra.init(c); for (int i = 0; i != c; ++i) { for (int j = 0; j != c; ++j) { w.repl (i,j) = w.repl_dis2; w.extra(i,j) = w.extra_dis2; } } String key, data; while (getdata_pair(in, key, data)) { if (key.size() != 2) return make_err(bad_file_format, file); w.repl (l.to_normalized(key[0]), l.to_normalized(key[1])) = w.repl_dis1; w.repl (l.to_normalized(key[1]), l.to_normalized(key[0])) = w.repl_dis1; w.extra(l.to_normalized(key[0]), l.to_normalized(key[1])) = w.extra_dis1; w.extra(l.to_normalized(key[1]), l.to_normalized(key[0])) = w.extra_dis1; } for (int i = 0; i != c; ++i) { w.repl(i,i) = 0; w.extra(i,i) = w.extra_dis1; } } return no_err; } SuggestParms * SuggestParms::clone() const { return new SuggestParms(*this); } void SuggestParms::set_original_word_size(int size) { if (size <= small_word_threshold) { soundslike_weight = small_word_soundslike_weight; } else { soundslike_weight = normal_soundslike_weight; } word_weight = 100 - soundslike_weight; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -