docqual.cpp

来自「一ＯＣＲ的相关资料。．希望对研究ＯＣＲ的朋友有所帮助．」· C++ 代码 · 共 1,454 行 · 第 1/4 页
CPP
1,454 行
          }        }      }    }  }}/************************************************************************* * reject_whole_page() * Dont believe any of it - set the reject map to 00..00 in all words * *************************************************************************/void reject_whole_page(PAGE_RES_IT &page_res_it) {   page_res_it.restart_page ();  while (page_res_it.word () != NULL) {    page_res_it.word ()->reject_map.rej_word_doc_rej ();    page_res_it.forward ();  }                                 //whole page is rejected  page_res_it.page_res->rejected = TRUE;}void tilde_crunch(PAGE_RES_IT &page_res_it) {   WERD_RES *word;  GARBAGE_LEVEL garbage_level;  PAGE_RES_IT copy_it;  BOOL8 prev_potential_marked = FALSE;  BOOL8 found_terrible_word = FALSE;  int dict_type;  BOOL8 ok_dict_word;  page_res_it.restart_page ();  while (page_res_it.word () != NULL) {    word = page_res_it.word ();    if (crunch_early_convert_bad_unlv_chs)      convert_bad_unlv_chs(word);     if (crunch_early_merge_tess_fails)      merge_tess_fails(word);     if (word->reject_map.accept_count () != 0) {      found_terrible_word = FALSE;                                 //Forget earlier potential crunches      prev_potential_marked = FALSE;    }    else {      dict_type = dict_word (word->best_choice->string ().string ());      ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM);      garbage_level = garbage_word (word, ok_dict_word);      if ((garbage_level != G_NEVER_CRUNCH) &&      (terrible_word_crunch (word, garbage_level))) {        if (crunch_debug > 0) {          tprintf ("T CRUNCHING: \"%s\"\n",            word->best_choice->string ().string ());        }        word->unlv_crunch_mode = CR_KEEP_SPACE;        if (prev_potential_marked) {          while (copy_it.word () != word) {            if (crunch_debug > 0) {              tprintf ("P1 CRUNCHING: \"%s\"\n",                copy_it.word ()->best_choice->string ().                string ());            }            copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;            copy_it.forward ();          }          prev_potential_marked = FALSE;        }        found_terrible_word = TRUE;      }      else if ((garbage_level != G_NEVER_CRUNCH) &&        (potential_word_crunch (word,      garbage_level, ok_dict_word))) {        if (found_terrible_word) {          if (crunch_debug > 0) {            tprintf ("P2 CRUNCHING: \"%s\"\n",              word->best_choice->string ().string ());          }          word->unlv_crunch_mode = CR_KEEP_SPACE;        }        else if (!prev_potential_marked) {          copy_it = page_res_it;          prev_potential_marked = TRUE;          if (crunch_debug > 1) {            tprintf ("P3 CRUNCHING: \"%s\"\n",              word->best_choice->string ().string ());          }        }      }      else {        found_terrible_word = FALSE;                                 //Forget earlier potential crunches        prev_potential_marked = FALSE;        if (crunch_debug > 2) {          tprintf ("NO CRUNCH: \"%s\"\n",            word->best_choice->string ().string ());        }      }    }    page_res_it.forward ();  }}BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) {   float rating_per_ch;  int adjusted_len;  int crunch_mode = 0;  if ((word->best_choice->string ().length () == 0) ||    (strspn (word->best_choice->string ().string (), " ") ==    word->best_choice->string ().length ()))    crunch_mode = 1;  else {    adjusted_len = word->reject_map.length ();    if (adjusted_len > crunch_rating_max)      adjusted_len = crunch_rating_max;    rating_per_ch = word->best_choice->rating () / adjusted_len;    if (rating_per_ch > crunch_terrible_rating)      crunch_mode = 2;    else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))      crunch_mode = 3;    else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&      (garbage_level != G_OK))      crunch_mode = 4;    else if ((rating_per_ch > crunch_poor_garbage_rate) &&      (garbage_level != G_OK))      crunch_mode = 5;  }  if (crunch_mode > 0) {    if (crunch_debug > 2) {      tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",        crunch_mode, word->best_choice->string ().string ());    }    return TRUE;  }  else    return FALSE;}BOOL8 potential_word_crunch(WERD_RES *word,                            GARBAGE_LEVEL garbage_level,                            BOOL8 ok_dict_word) {  float rating_per_ch;  int adjusted_len;  char *str = (char *) word->best_choice->string ().string ();  BOOL8 word_crunchable;  int poor_indicator_count = 0;  word_crunchable =    !crunch_leave_accept_strings ||    (word->reject_map.length () < 3) ||    ((acceptable_word_string (str) == AC_UNACCEPTABLE) && !ok_dict_word);  adjusted_len = word->reject_map.length ();  if (adjusted_len > 10)    adjusted_len = 10;  rating_per_ch = word->best_choice->rating () / adjusted_len;  if (rating_per_ch > crunch_pot_poor_rate) {    if (crunch_debug > 2) {      tprintf ("Potential poor rating on \"%s\"\n",        word->best_choice->string ().string ());    }    poor_indicator_count++;  }  if (word_crunchable &&  (word->best_choice->certainty () < crunch_pot_poor_cert)) {    if (crunch_debug > 2) {      tprintf ("Potential poor cert on \"%s\"\n",        word->best_choice->string ().string ());    }    poor_indicator_count++;  }  if (garbage_level != G_OK) {    if (crunch_debug > 2) {      tprintf ("Potential garbage on \"%s\"\n",        word->best_choice->string ().string ());    }    poor_indicator_count++;  }  return (poor_indicator_count >= crunch_pot_indicators);}void tilde_delete(PAGE_RES_IT &page_res_it) {   WERD_RES *word;  PAGE_RES_IT copy_it;  BOOL8 deleting_from_bol = FALSE;  BOOL8 marked_delete_point = FALSE;  INT16 debug_delete_mode;  CRUNCH_MODE delete_mode;  INT16 x_debug_delete_mode;  CRUNCH_MODE x_delete_mode;  page_res_it.restart_page ();  while (page_res_it.word () != NULL) {    word = page_res_it.word ();    delete_mode = word_deletable (word, debug_delete_mode);    if (delete_mode != CR_NONE) {      if (word->word->flag (W_BOL) || deleting_from_bol) {        if (crunch_debug > 0) {          tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",            debug_delete_mode,            word->best_choice->string ().string ());        }        word->unlv_crunch_mode = delete_mode;        deleting_from_bol = TRUE;      }      else if (word->word->flag (W_EOL)) {        if (marked_delete_point) {          while (copy_it.word () != word) {            x_delete_mode = word_deletable (copy_it.word (),              x_debug_delete_mode);            if (crunch_debug > 0) {              tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",                x_debug_delete_mode,                copy_it.word ()->best_choice->string ().                string ());            }            copy_it.word ()->unlv_crunch_mode = x_delete_mode;            copy_it.forward ();          }        }        if (crunch_debug > 0) {          tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",            debug_delete_mode,            word->best_choice->string ().string ());        }        word->unlv_crunch_mode = delete_mode;        deleting_from_bol = FALSE;        marked_delete_point = FALSE;      }      else {        if (!marked_delete_point) {          copy_it = page_res_it;          marked_delete_point = TRUE;        }      }    }    else {      deleting_from_bol = FALSE;                                 //Forget earlier potential crunches      marked_delete_point = FALSE;    }    /*      The following step has been left till now as the tess fails are used to      determine if the word is deletable.    */    if (!crunch_early_merge_tess_fails)      merge_tess_fails(word);     page_res_it.forward ();  }}void convert_bad_unlv_chs(  //word to do                          WERD_RES *word_res) {  char *ptr;                     //string ptr  int i;  ptr = (char *) word_res->best_choice->string ().string ();  for (i = 0; i < word_res->reject_map.length (); i++) {    if (ptr[i] == '~') {      ptr[i] = '-';      if (word_res->reject_map[i].accepted ())        word_res->reject_map[i].setrej_unlv_rej ();    }    if (ptr[i] == '^') {      ptr[i] = ' ';      if (word_res->reject_map[i].accepted ())        word_res->reject_map[i].setrej_unlv_rej ();    }  }}/********************************************************************** * merge_tess_fails * * Change pairs of tess failures to a single one **********************************************************************/void merge_tess_fails(  //word to do                      WERD_RES *word_res) {  char *ptr;                     //string ptr  PBLOB_IT blob_it;              //blobs  int i = 0;  int len;  len = strlen (word_res->best_choice->string ().string ());  ASSERT_HOST (word_res->reject_map.length () == len);  ASSERT_HOST (word_res->outword->blob_list ()->length () == len);  ptr = (char *) word_res->best_choice->string ().string ();  blob_it = word_res->outword->blob_list ();  while (*ptr != '\0') {    if ((*ptr == ' ') && (*(ptr + 1) == ' ')) {      strcpy (ptr + 1, ptr + 2); //shuffle up      word_res->reject_map.remove_pos (i);      merge_blobs (blob_it.data_relative (1), blob_it.data ());      delete blob_it.extract (); //get rid of spare    }    else {      i++;      ptr++;    }    blob_it.forward ();  }  len = strlen (word_res->best_choice->string ().string ());  ASSERT_HOST (word_res->reject_map.length () == len);  ASSERT_HOST (word_res->outword->blob_list ()->length () == len);}GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {   enum STATES  {    JUNK,    FIRST_UPPER,    FIRST_LOWER,    FIRST_NUM,    SUBSEQUENT_UPPER,    SUBSEQUENT_LOWER,    SUBSEQUENT_NUM  };  char *str = (char *) word->best_choice->string ().string ();  STATES state = JUNK;  int len = 0;  int isolated_digits = 0;  int isolated_alphas = 0;  int bad_char_count = 0;  int tess_rejs = 0;  int dodgy_chars = 0;  int ok_chars;  char last_char = ' ';  int alpha_repetition_count = 0;  int longest_alpha_repetition_count = 0;  int longest_lower_run_len = 0;  int lower_string_count = 0;  int longest_upper_run_len = 0;  int upper_string_count = 0;  int total_alpha_count = 0;  int total_digit_count = 0;  for (; *str != '\0'; str++) {    len++;    if (isupper (*str)) {      total_alpha_count++;      switch (state) {        case SUBSEQUENT_UPPER:        case FIRST_UPPER:          state = SUBSEQUENT_UPPER;          upper_string_count++;
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?
docqual.cpp

docqual.cpp - 源码说明

⌨️ 快捷键说明