📄 permngram.cpp
/////////////////////////////////////////////////////////////////////////
// File:        permngram.cpp
// Description: Character n-gram permuter
// Author:      Thomas Kielbus
// Created:     Wed Sep 12 11:26:43 PDT 2007
//
// (C) Copyright 2007, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
/////////////////////////////////////////////////////////////////////////

#include "const.h"
#include "permngram.h"
#include "permnum.h"
#include "debug.h"
#include "permute.h"
#include "dawg.h"
#include "tordvars.h"
#include "stopper.h"
#include "globals.h"
#include "context.h"

#include <math.h>
#include <ctype.h>

// Ratio to control the relative importance of the classifier and the ngram
// in the final score of a classification unit. Must be >= 0 and <= 1.
// A value of 1.0 uses only the shape classifier score.
// A value of 0.0 uses only the ngram score.
double_VAR(classifier_score_ngram_score_ratio, 0.7, "");

// Rating adjustment multiplier for words not in the DAWG. Must be >= 1.
double_VAR(non_dawg_prefix_rating_adjustment, 1.5, "");

// HypothesisPrefix represents a word prefix during the search of the
// character-level n-gram model based permuter.
// It holds the data needed to create the corresponding A_CHOICE.
// Note that the string stored in the word_ data member always begins with a
// space character. This is used by the n-gram model to score the word.
// HypothesisPrefix also contains the node in the DAWG that is reached when
// searching for the corresponding prefix.
class HypothesisPrefix {
 public:
  HypothesisPrefix();
  HypothesisPrefix(const HypothesisPrefix& prefix,
                   A_CHOICE* choice,
                   bool end_of_word,
                   EDGE_ARRAY dawg);

  double rating() const {return rating_;}
  double certainty() const {return certainty_;}
  const char* word() const {return word_;}
  const char* unichar_lengths() const {return unichar_lengths_;}
  const float* certainty_array() const {return certainty_array_;}
  bool is_dawg_prefix() const {return is_dawg_prefix_;}
  NODE_REF dawg_node() const {return dawg_node_;}

 private:
  double rating_;
  double certainty_;
  char word_[UNICHAR_LEN * MAX_WERD_LENGTH + 2];
  char unichar_lengths_[MAX_WERD_LENGTH + 1];
  float certainty_array_[MAX_WERD_LENGTH + 1];
  NODE_REF dawg_node_;
  bool is_dawg_prefix_;
};

// HypothesisPrefix is the class used as nodes in HypothesisPrefixLists
typedef HypothesisPrefix HypothesisPrefixListNode;
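// Illustrative note (added for exposition, not in the original source): for a
// two-character prefix "ab" made of single-byte characters, word_ holds " ab"
// (the leading space lets the n-gram model score the first character in a
// word-start context), unichar_lengths_ holds the byte lengths {1, 1}
// followed by '\0', and certainty_array_ holds the classifier certainty of
// each of the two characters.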
// HypothesisPrefixList maintains a sorted list of HypothesisPrefixes. The size
// is bounded by the argument given to the constructor.
// For the sake of simplicity, the current implementation is not as efficient
// as it could be. The list is represented by a static array of pointers to
// its elements. All nodes are stored in positions from 0 to (size() - 1).
class HypothesisPrefixList {
 public:
  HypothesisPrefixList(int size_bound);
  ~HypothesisPrefixList();

  void add_node(HypothesisPrefix* node);
  int size() const {return _size;}
  void clear();
  const HypothesisPrefix& node(int index) {return *_list_nodes[index];}

 private:
  HypothesisPrefix** _list_nodes;
  int _size_bound;
  int _size;
};

// Return the classifier_score_ngram_score_ratio for a given choice string.
// The classification decision for characters like comma and period should
// be based only on shape rather than on shape and n-gram score.
// Return 1.0 for them, the default classifier_score_ngram_score_ratio
// otherwise.
static double get_classifier_score_ngram_score_ratio(const char* choice);

// Permute the given char_choices using a character-level n-gram model and
// return the best word choice found.
// This is performed by maintaining a HypothesisPrefixList of
// HypothesisPrefixes. For each character position, each possible character
// choice is appended to the best current prefixes to create the list of best
// prefixes at the next character position.
A_CHOICE *ngram_permute_and_select(CHOICES_LIST char_choices,
                                   float rating_limit,
                                   EDGE_ARRAY dawg) {
  if (array_count (char_choices) <= MAX_WERD_LENGTH) {
    CHOICES choices;
    int char_index_max = array_count(char_choices);
    HypothesisPrefixList list_1(20);
    HypothesisPrefixList list_2(20);
    HypothesisPrefixList* current_list = &list_1;
    HypothesisPrefixList* next_list = &list_2;
    HypothesisPrefix* initial_node = new HypothesisPrefix();
    current_list->add_node(initial_node);
    for (int char_index = 0; char_index < char_index_max; ++char_index) {
      iterate_list(choices, (CHOICES) array_index(char_choices, char_index)) {
        A_CHOICE* choice = (A_CHOICE *) first_node(choices);
        for (int node_index = 0;
             node_index < current_list->size();
             ++node_index) {
          // Append this choice to the current node
          HypothesisPrefix* new_node = new HypothesisPrefix(
              current_list->node(node_index),
              choice,
              char_index == char_index_max - 1,
              dawg);
          next_list->add_node(new_node);
        }
      }
      // Clear current list and switch lists
      current_list->clear();
      HypothesisPrefixList* temp_list = current_list;
      current_list = next_list;
      next_list = temp_list;

      // Give up if the current best rating is worse than rating_limit
      if (current_list->node(0).rating() > rating_limit)
        return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
    }
    const HypothesisPrefix& best_word = current_list->node(0);
    A_CHOICE* best_choice = new_choice (best_word.word() + 1,
                                        best_word.unichar_lengths(),
                                        best_word.rating(),
                                        best_word.certainty(), -1,
                                        valid_word(best_word.word() + 1) ?
                                            SYSTEM_DAWG_PERM : TOP_CHOICE_PERM);
    LogNewWordChoice(best_choice,
                     best_word.is_dawg_prefix() ?
                         1.0 : non_dawg_prefix_rating_adjustment,
                     const_cast<float*>(best_word.certainty_array()));
    return best_choice;
  } else {
    return new_choice (NULL, NULL, MAXFLOAT, -MAXFLOAT, -1, NO_PERM);
  }
}

double get_classifier_score_ngram_score_ratio(const char* choice) {
  if (!strcmp(",", choice) || !strcmp(".", choice))
    return 1.0;
  else
    return classifier_score_ngram_score_ratio;
}

// Initial HypothesisPrefix constructor used to create the first state of the
// search.
HypothesisPrefix::HypothesisPrefix() {
  rating_ = 0;
  certainty_ = MAXFLOAT;
  strcpy(word_, " ");
  unichar_lengths_[0] = '\0';
  dawg_node_ = 0;
  is_dawg_prefix_ = true;
}
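// Note added for exposition (not in the original source): the search therefore
// starts from a single empty prefix whose word is only the leading space,
// whose rating is 0 and whose DAWG node is the root; every later prefix is
// built from it by the appending constructor below.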
// Main constructor to create a new HypothesisPrefix by appending a character
// choice (A_CHOICE) to an existing HypothesisPrefix. This constructor takes
// care of copying the original prefix's data members, appends the character
// choice to the word and updates its rating using a character-level n-gram
// model. The state in the DAWG is also updated.
HypothesisPrefix::HypothesisPrefix(const HypothesisPrefix& prefix,
                                   A_CHOICE* choice,
                                   bool end_of_word,
                                   EDGE_ARRAY dawg) {
  char* word_ptr = word_;
  const char* prefix_word_ptr = prefix.word_;

  // Copy first space character
  *(word_ptr++) = *(prefix_word_ptr++);

  // Copy existing word, unichar_lengths, certainty_array
  int char_index;
  for (char_index = 0; prefix.unichar_lengths_[char_index] != '\0';
       ++char_index) {
    for (int char_subindex = 0;
         char_subindex < prefix.unichar_lengths_[char_index];
         ++char_subindex) {
      *(word_ptr++) = *(prefix_word_ptr++);
    }
    unichar_lengths_[char_index] = prefix.unichar_lengths_[char_index];
    certainty_array_[char_index] = prefix.certainty_array_[char_index];
  }

  // If choice is empty, use a space character instead
  const char* class_string_choice = *class_string(choice) == '\0' ?
      " " : class_string(choice);

  // Update certainty
  certainty_ = min(prefix.certainty_, class_certainty(choice));

  // Append choice to the word
  strcpy(word_ptr, class_string_choice);
  unichar_lengths_[char_index] = strlen(class_string_choice);
  unichar_lengths_[char_index + 1] = '\0';

  // Append choice certainty to the certainty array
  certainty_array_[char_index] = class_certainty(choice);

  // Copy DAWG node state
  dawg_node_ = prefix.dawg_node_;
  is_dawg_prefix_ = prefix.is_dawg_prefix_;

  // Verify DAWG and update dawg_node_ if the current prefix is already valid
  if (is_dawg_prefix_) {
    for (int char_subindex = 0; class_string_choice[char_subindex] != '\0';
         ++char_subindex) {
      // Verify each byte of the appended character. Note that word_ptr points
      // to the first byte so (word_ptr - (word_ + 1)) is the index of the
      // first new byte in the string that starts at (word_ + 1).
      int current_byte_index = word_ptr - (word_ + 1) + char_subindex;
      if (!letter_is_okay(dawg, &dawg_node_, current_byte_index, '\0',
                          word_ + 1, end_of_word &&
                          class_string_choice[char_subindex + 1] == '\0')) {
        dawg_node_ = NO_EDGE;
        is_dawg_prefix_ = false;
        break;
      }
    }
  }

  // Copy the prefix rating
  rating_ = prefix.rating_;

  // Compute rating of current character
  double probability = probability_in_context(prefix.word_, -1,
                                              class_string_choice, -1);

  // If last character of the word, take the following space into account
  if (end_of_word)
    probability *= probability_in_context(word_, -1, " ", -1);

  double local_classifier_score_ngram_score_ratio =
      get_classifier_score_ngram_score_ratio(class_string_choice);

  double classifier_rating = class_probability(choice);
  double ngram_rating = -log(probability) / log(2.0);
  double mixed_rating =
      local_classifier_score_ngram_score_ratio * classifier_rating +
      (1 - local_classifier_score_ngram_score_ratio) * ngram_rating;

  // If the current word is not a valid prefix, adjust the rating of the
  // character being appended. If it used to be a valid prefix, compensate for
  // previous adjustments.
  if (!is_dawg_prefix_) {
    if (prefix.is_dawg_prefix_)
      rating_ *= non_dawg_prefix_rating_adjustment;
    mixed_rating *= non_dawg_prefix_rating_adjustment;
  }

  // Update rating by adding the rating of the character being appended.
  rating_ += mixed_rating;
}
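// Worked example (added for exposition, not in the original source): with the
// default classifier_score_ngram_score_ratio of 0.7, a character whose
// classifier rating is 2.0 and whose probability in context is 0.25 (an
// n-gram rating of -log2(0.25) = 2.0) gets a mixed rating of
// 0.7 * 2.0 + 0.3 * 2.0 = 2.0; if the prefix just fell out of the DAWG, both
// the accumulated rating and this mixed rating are multiplied by
// non_dawg_prefix_rating_adjustment (1.5 by default).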
// Create an empty HypothesisPrefixList. Its maximum size is set to the given
// bound.
HypothesisPrefixList::HypothesisPrefixList(int size_bound):
    _size_bound(size_bound),
    _size(0) {
  _list_nodes = new HypothesisPrefix*[_size_bound];
  for (int i = 0; i < _size_bound; ++i)
    _list_nodes[i] = NULL;
}

// Destroy a HypothesisPrefixList. All contained nodes are deleted as well.
HypothesisPrefixList::~HypothesisPrefixList() {
  this->clear();
  delete[] _list_nodes;
}

// Add a node to the HypothesisPrefixList. Maintains the sorted list property.
// Note that the HypothesisPrefixList takes ownership of the given node and
// might delete it if needed. It must therefore have been allocated on the
// heap.
void HypothesisPrefixList::add_node(HypothesisPrefix* node) {
  // Detect nodes that have a worse rating than the current maximum and treat
  // them separately.
  if (_size > 0 && _list_nodes[_size - 1]->rating() < node->rating()) {
    if (_size == _size_bound) {
      // The list is already full. This node will not be added.
      delete node;
    } else {
      // The list is not full. Add the node at the last position.
      _list_nodes[_size] = node;
      ++_size;
    }
    return;
  }
  // Find the correct position
  int node_index_target = 0;
  while (node_index_target < _size_bound &&
         _list_nodes[node_index_target] != NULL &&
         _list_nodes[node_index_target]->rating() < node->rating()) {
    ++node_index_target;
  }
  if (node_index_target >= _size_bound) {
    delete node;
  } else {
    // Move the next nodes down by one position, starting from the last one.
    int node_index_move = _size - 1;
    while (node_index_move >= node_index_target) {
      if (node_index_move == _size_bound - 1)
        delete _list_nodes[node_index_move];
      else
        _list_nodes[node_index_move + 1] = _list_nodes[node_index_move];
      _list_nodes[node_index_move] = NULL;
      --node_index_move;
    }
    // Insert the new node
    _list_nodes[node_index_target] = node;
    // Increment the size if the list was not already full
    if (_size < _size_bound)
      ++_size;
  }
}

// Delete all contained nodes and set the size of the HypothesisPrefixList to 0
void HypothesisPrefixList::clear() {
  for (int i = 0; i < _size_bound && _list_nodes[i] != NULL; ++i) {
    delete _list_nodes[i];
    _list_nodes[i] = NULL;
  }
  _size = 0;
}
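// Illustrative sketch (added for exposition, not part of the original file):
// with a size bound of 3, adding nodes rated 5.0, 2.0, 7.0 and then 1.0 to a
// HypothesisPrefixList leaves it holding {1.0, 2.0, 5.0}; the 7.0 node is
// deleted when the better 1.0 node displaces it, because the list owns its
// nodes. The bounded, sorted insertion behaves like the standalone helper
// below, shown with plain doubles instead of HypothesisPrefix nodes;
// beam_insert is a hypothetical name used only for this sketch.
#if 0
#include <vector>
#include <algorithm>

// Keep at most `bound` ratings, sorted ascending; drop the worst when full.
static void beam_insert(std::vector<double>* beam, size_t bound, double rating) {
  beam->insert(std::upper_bound(beam->begin(), beam->end(), rating), rating);
  if (beam->size() > bound)
    beam->pop_back();  // discard the highest (worst) rating
}
#endif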