permdawg.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 409 行

CPP
409
字号
/* -*-C-*-
 ********************************************************************************
 *
 * File:        permdawg.c  (Formerly permdawg.c)
 * Description: Builds candidate words by walking per-character choice lists
 *              through a DAWG, and scores the resulting word choices.
 * Author:       Mark Seaman, OCR Technology
 * Created:      Fri Oct 16 14:37:00 1987
 * Modified:     Tue Jul  9 15:43:18 1991 (Mark Seaman) marks@hpgrlt
 * Language:     C
 * Package:      N/A
 * Status:       Reusable Software Component
 *
 * (c) Copyright 1987, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *********************************************************************************/
/*----------------------------------------------------------------------
              I n c l u d e s
----------------------------------------------------------------------*/
#include "permdawg.h"
#include "debug.h"
#include "hyphen.h"
#include "permute.h"
#include "tordvars.h"
#include "context.h"
#include "stopper.h"
#include "freelist.h"
#include "globals.h"
#include "tprintf.h"
#include "cutil.h"
#include "dawg.h"
#include <ctype.h>
#include <string.h>   /* strcpy, strlen, memcpy are used directly below */

/*----------------------------------------------------------------------
              T y p e s
----------------------------------------------------------------------*/
/* Default rating multipliers for the ok_word / good_word / freq_word
 * tunables declared below. */
#define FREQ_WERD     1.0
#define GOOD_WERD     1.1
#define OK_WERD       1.3125
#define MAX_FREQ_EDGES    1500
/* Sentinel stored in rating_array slots that have not been rated yet.
 * Parenthesized so the macro is safe inside any expression. */
#define NO_RATING              (-1)

/*----------------------------------------------------------------------
              V a r i a b l e s
----------------------------------------------------------------------*/
/* DAWG loaded by init_permdawg() and released by end_permdawg(). */
static EDGE_ARRAY frequent_words;
/* Set to ok_word / good_word at the start of dawg_permute_and_select(). */
static float rating_margin;
/* Constant slack added to the pruning threshold in append_next_choice(). */
static float rating_pad = 5.0;

make_toggle_var (dawg_debug, 0, make_dawg_debug,
                 8, 10, set_dawg_debug, "DAWG Debug ");

make_float_var (ok_word, OK_WERD, make_ok_word,
                8, 17, set_ok_word, "Bad word adjustment");

make_float_var (good_word, GOOD_WERD, make_good_word,
                8, 18, set_good_word, "Good word adjustment");

make_float_var (freq_word, FREQ_WERD, make_freq_word,
                8, 19, set_freq_word, "Freq word adjustment");

/*----------------------------------------------------------------------
              F u n c t i o n s
----------------------------------------------------------------------*/
/**********************************************************************
 * adjust_word
 *
 * Assign an adjusted value to a string that is a word.  The value
 * that this word choice has is based on case and punctuation rules.
 *
 * The rating of best_choice is padded by RATING_PAD, multiplied by one
 * of three factors, and un-padded again:
 *   - freq_word: case is ok, punctuation status is < 1, and the word is
 *                found in the frequent_words DAWG (the permuter of the
 *                choice is also set to FREQ_DAWG_PERM);
 *   - good_word: case is ok and punctuation status is not -1;
 *   - ok_word:   otherwise.
 * The chosen factor is then passed to LogNewWordChoice() together with
 * certainty_array.
 *
 * best_choice      word choice whose rating is adjusted in place.
 * certainty_array  forwarded unchanged to LogNewWordChoice().
 **********************************************************************/
void adjust_word(A_CHOICE *best_choice, float *certainty_array) {
  char *this_word;
  int punct_status;
  int case_status;
  float adjust_factor;

  if (adjust_debug)
    tprintf ("%s %4.2f ",
      class_string (best_choice), class_probability (best_choice));

  this_word = class_string (best_choice);
  punct_status = punctuation_ok (this_word, class_lengths (best_choice));

  class_probability (best_choice) += RATING_PAD;
  /* Evaluate the case check once. The string and lengths do not change in
   * this function, so the debug branch below reuses this result (and
   * punct_status) instead of running both checks a second time. */
  case_status = (case_ok (this_word, class_lengths (best_choice)) != 0);
  if (case_status && punct_status != -1) {
    if (punct_status < 1 && word_in_dawg (frequent_words, this_word)) {
      class_probability (best_choice) *= freq_word;
      class_permuter (best_choice) = FREQ_DAWG_PERM;
      adjust_factor = freq_word;
      if (adjust_debug)
        tprintf(", F, %4.2f ", freq_word);
    }
    else {
      class_probability (best_choice) *= good_word;
      adjust_factor = good_word;
      if (adjust_debug)
        tprintf(", %4.2f ", good_word);
    }
  }
  else {
    class_probability (best_choice) *= ok_word;
    adjust_factor = ok_word;
    if (adjust_debug) {
      /* Report which check(s) failed: C = case, P = punctuation. */
      if (!case_status)
        tprintf(", C");
      if (punct_status == -1)
        tprintf(", P");
      tprintf(", %4.2f ", ok_word);
    }
  }

  class_probability (best_choice) -= RATING_PAD;

  LogNewWordChoice(best_choice, adjust_factor, certainty_array);

  if (adjust_debug)
    tprintf(" --> %4.2f\n", class_probability (best_choice));
}


/**********************************************************************
 * append_next_choice
 *
 * Check to see whether or not the next choice is worth appending to
 * the string being generated.  If so then keep going deeper into the
 * word.
 *
 * dawg, node       DAWG and current position in it.
 * permuter         permuter code stored in any choice created here.
 * word             buffer holding the word built so far; the string of
 *                  this_choice is copied in at unichar_offsets[char_index].
 *                  No bounds check is done here - the caller must supply
 *                  buffers that are large enough.
 * unichar_lengths  per-position byte lengths of the pieces in word
 *                  (entry char_index and the terminating 0 after it are
 *                  written here).
 * unichar_offsets  per-position byte offsets into word (entry
 *                  char_index + 1 is written here).
 * choices          per-position choice lists, passed on to dawg_permute().
 * char_index       position in the word being filled.
 * this_choice      candidate to append at char_index.
 * prevchar         string of the alternative tried just before this one
 *                  at the same position ("" for the first); only its
 *                  first byte is passed to letter_is_okay().
 * limit            set to the rating of every complete word accepted here.
 * rating           accumulated rating before this choice.
 * certainty        minimum certainty before this choice.
 * rating_array     first rating seen at each position (NO_RATING when
 *                  unset); used to prune much worse later candidates.
 * certainty_array  receives the certainty of this_choice at char_index.
 * word_ending      non-zero when char_index is the last position.
 * last_word        passed down the recursion; also required (together
 *                  with word_ending) for the hyphen branch.
 * result           list that new word choices are added to.
 **********************************************************************/
void append_next_choice(  /*previous option */
                        EDGE_ARRAY dawg,
                        NODE_REF node,
                        char permuter,
                        char *word,
                        char unichar_lengths[],
                        int unichar_offsets[],
                        CHOICES_LIST choices,
                        int char_index,
                        A_CHOICE *this_choice,
                        const char *prevchar,
                        float *limit,
                        float rating,
                        float certainty,
                        float *rating_array,
                        float *certainty_array,
                        int word_ending,
                        int last_word,
                        CHOICES *result) {
  A_CHOICE *better_choice;

  /* Add new character */
  strcpy(word + unichar_offsets[char_index], class_string (this_choice));
  unichar_lengths[char_index] = strlen(class_string (this_choice));
  unichar_lengths[char_index + 1] = 0;
  unichar_offsets[char_index + 1] = unichar_offsets[char_index] +
      unichar_lengths[char_index];
  /* An empty choice string is represented by a single space so that the
   * position still occupies one byte in the word. */
  if (word[unichar_offsets[char_index]] == '\0') {
    word[unichar_offsets[char_index]] = ' ';
    word[unichar_offsets[char_index] + 1] = '\0';
    unichar_lengths[char_index] = 1;
    unichar_lengths[char_index + 1] = 0;
    unichar_offsets[char_index + 1] = unichar_offsets[char_index] +
        unichar_lengths[char_index];
  }
  certainty_array[char_index] = class_certainty (this_choice);

  rating += class_probability (this_choice);
  certainty = min (class_certainty (this_choice), certainty);

  /* Prune bad subwords: the first rating seen at a position is recorded;
   * a later candidate is dropped when its rating exceeds that recorded
   * value scaled by rating_margin plus rating_pad. */
  if (rating_array[char_index] == NO_RATING) {
    rating_array[char_index] = rating;
  }
  else {
    if (rating_array[char_index] * rating_margin + rating_pad < rating) {
      if (dawg_debug) {
        tprintf("early pruned word rating=%4.2f, limit=%4.2f",
                rating, *limit);
        print_word_string(word);
        tprintf("\n");
      }
      return;
    }
  }

  /* Deal with hyphens */
  if (word_ending && last_word && word[unichar_offsets[char_index]] == '-' &&
      char_index > 0) {
    *limit = rating;
    if (dawg_debug)
      tprintf("new hyphen choice = %s\n", word);

    better_choice = new_choice (word, unichar_lengths, rating, certainty, -1, permuter);
    adjust_word(better_choice, certainty_array);
    push_on(*result, better_choice);
    set_hyphen_word(word, unichar_lengths, unichar_offsets, rating, node);
  }
  /* Look up char in DAWG */
  else {
    int sub_offset = 0;
    NODE_REF node_saved = node;
    /* Feed every byte of the newly appended piece to the DAWG; the
     * word-ending flag is only raised on the final byte of the piece. */
    while (sub_offset < unichar_lengths[char_index] &&
           letter_is_okay (dawg, &node, unichar_offsets[char_index] +
                           sub_offset, *prevchar, word, word_ending &&
                           sub_offset == unichar_lengths[char_index] - 1))
      ++sub_offset;
    if (sub_offset == unichar_lengths[char_index]) {
      /* Add a new word choice */
      if (word_ending) {
        if (dawg_debug == 1)
          tprintf("new choice = %s\n", word);
        *limit = rating;

        better_choice = new_choice (hyphen_tail (word), unichar_lengths +
                                    hyphen_base_size(),
                                    rating, certainty,
                                    -1, permuter);
        adjust_word (better_choice, &certainty_array[hyphen_base_size ()]);
        push_on(*result, better_choice);
      }
      else {
        /* Search the next letter */
        JOIN_ON (*result,
                 dawg_permute (dawg, node, permuter,
                               choices, char_index + 1, limit,
                               word, unichar_lengths, unichar_offsets, rating, certainty,
                               rating_array, certainty_array, last_word));
      }
    } else {
      if (dawg_debug == 1) {
        tprintf("letter not OK at char %d, index %d + sub index %d/%d\n",
                char_index, unichar_offsets[char_index],
                sub_offset, unichar_lengths[char_index]);
        tprintf("Word");
        print_word_string(word);
        tprintf("\nRejected tail");
        print_word_string(word + unichar_offsets[char_index]);
        tprintf("\n");
      }
      /* NOTE(review): node is not read again after this point in this
       * function; the restore is kept as in the original - confirm
       * whether it can be dropped. */
      if (node != 0)
        node = node_saved;
    }
  }
}


/**********************************************************************
 * dawg_permute
 *
 * Permute all the valid words that can be created with this starting
 * point.  The node (in the DAWG) and the word string define a base
 * from which to start adding the remaining character choices.
 *
 * Every alternative in the choice list for position char_index is
 * handed to append_next_choice(), which may recurse back into this
 * function for the next position.  Positions are offset by
 * hyphen_base_size() when indexing into choices.
 *
 * Returns the list of word choices collected (NIL when there are none).
 **********************************************************************/
CHOICES dawg_permute(EDGE_ARRAY dawg,
                     NODE_REF node,
                     char permuter,
                     CHOICES_LIST choices,
                     int char_index,
                     float *limit,
                     char *word,
                     char unichar_lengths[],
                     int unichar_offsets[],
                     float rating,
                     float certainty,
                     float *rating_array,
                     float *certainty_array,
                     int last_word) {
  CHOICES result = NIL;
  CHOICES c;
  const char *prevchar;   /* only ever read, never written through */
  int word_ending = FALSE;

  if (dawg_debug) {
    tprintf("dawg_permute (node=" REFFORMAT ", char_index=%d, limit=%f, word=",
             node, char_index, *limit);
    print_word_string(word);
    tprintf(", rating=%4.2f, certainty=%4.2f)\n",
             rating, certainty);
  }

  /* Check for EOW */
  if (1 + char_index == array_count (choices) + hyphen_base_size ())
    word_ending = TRUE;

  if (char_index < array_count (choices) + hyphen_base_size ()) {
    prevchar = NULL;
    iterate_list (c,
                  (CHOICES) array_index (choices,
                                         char_index - hyphen_base_size ())) {
      append_next_choice (dawg, node, permuter, word, unichar_lengths,
                          unichar_offsets, choices, char_index,
                          (A_CHOICE *) first_node (c),
                          prevchar != NULL ? prevchar : "", limit,
                          rating, certainty, rating_array, certainty_array,
                          word_ending, last_word, &result);
      /* Remember this alternative's string for the next iteration. */
      prevchar = best_string (c);
    }
  }

  if (result && (dawg_debug == 1))
    print_choices ("dawg_permute", result);
  return (result);
}


/**********************************************************************
 * dawg_permute_and_select
 *
 * Use a DAWG type data structure to enumerate all the valid strings
 * in some grammar.  Compare each of the choices against the best choice
 * so far.  Update the best choice if needed.
 *
 * string             label used when the resulting choices are printed.
 * dawg               DAWG to search.
 * permuter           permuter code stored in the generated choices.
 * character_choices  per-position choice lists for the word.
 * best_choice        current best; overwritten by clone_choice() when a
 *                    generated choice has a lower rating.
 * system_words       when non-zero and a hyphenated prefix is active,
 *                    the search starts from hyphen_state instead of 0.
 *
 * All generated choices are freed before returning.
 **********************************************************************/
void dawg_permute_and_select(const char *string,
                             EDGE_ARRAY dawg,
                             char permuter,
                             CHOICES_LIST character_choices,
                             A_CHOICE *best_choice,
                             inT16 system_words) {
  CHOICES result = NIL;
  char word[UNICHAR_LEN * MAX_WERD_LENGTH + 1];
  char unichar_lengths[MAX_WERD_LENGTH + 1];
  int unichar_offsets[MAX_WERD_LENGTH + 1];
  float certainty_array[MAX_WERD_LENGTH + 1];
  float rating_array[MAX_WERD_LENGTH + 1];
  float rating;
  int char_index;
  NODE_REF dawg_node = 0;

  /* Pruning margin ratio */
  rating_margin = ok_word / good_word;

  word[0] = '\0';
  unichar_lengths[0] = 0;
  unichar_offsets[0] = 0;
  /* The current best rating is the initial limit for the search. */
  rating = class_probability (best_choice);

  for (char_index = 0; char_index < MAX_WERD_LENGTH + 1; char_index++)
    rating_array[char_index] = NO_RATING;
  char_index = 0;

  /* Seed the buffers with the stored hyphenated prefix, if any, and start
   * permuting right after it. */
  if (!is_last_word () && hyphen_string) {
    strcpy(word, hyphen_string);
    strcpy(unichar_lengths, hyphen_unichar_lengths);
    memcpy(unichar_offsets, hyphen_unichar_offsets,
           (hyphen_base_size()) * sizeof (int));
    unichar_offsets[hyphen_base_size()] =
        unichar_offsets[hyphen_base_size() - 1] +
        unichar_lengths[hyphen_base_size() - 1];
    char_index = strlen (hyphen_unichar_lengths);
    if (system_words)
      dawg_node = hyphen_state;
  }
  result = dawg_permute (dawg, dawg_node, permuter, character_choices,
    char_index, &rating, word, unichar_lengths, unichar_offsets, 0.0, 0.0,
    rating_array, certainty_array, is_last_word ());

  if (display_ratings && result) {
    print_choices(string, result);
  }

  /* Keep the lowest-rated choice in best_choice and release the list. */
  while (result != NIL) {
    if (best_probability (result) < class_probability (best_choice)) {
      clone_choice (best_choice, first_node (result));
    }
    free_choice (first_node (result));
    pop_off(result);
  }
}


/**********************************************************************
 * init_permdawg
 *
 * Initialize the variables needed by this file.
 **********************************************************************/
/* Run the make_* routines of the four tunables declared at the top of
 * this file. */
void init_permdawg_vars() {
  make_dawg_debug();
  make_ok_word();
  make_good_word();
  make_freq_word();
}

/* Load the frequent-word DAWG from language_data_path_prefix + "freq-dawg". */
void init_permdawg() {
  STRING name;
  name = language_data_path_prefix;
  name += "freq-dawg";
  frequent_words = read_squished_dawg(name.string());
}

/* Release the frequent-word DAWG and clear the pointer. */
void end_permdawg() {
  memfree(frequent_words);
  frequent_words = NULL;
}


/**********************************************************************
 * test_freq_words()
 *
 * Tests a word against the frequent word dawg
 *
 * Returns the result of word_in_dawg() for word.
 **********************************************************************/
int test_freq_words(const char *word) {
  return (word_in_dawg (frequent_words, word));
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?