📄 reject.cpp

📁 一些ＯＣＲ的相关资料，希望对研究ＯＣＲ的朋友有所帮助。
💻 CPP
📖 第 1 页 / 共 4 页
字号:
上一页 1 2 34
                         BOOL8 checked_dict_word,                         BOOL8 sensible_word,                         BOOL8 centre,                         BOOL8 good_quality_word) {  INT16 accept_level;            //0 Very clearly matched  //1 Clearly top  //2 Top but poor match  //3 Next & poor top match  //4 Next but good top match  //5 No chance  BOOL8 good_top_choice;  BOOL8 excellent_top_choice;  BOOL8 confusion_match = FALSE;  BOOL8 dodgy_char = !isalnum (tess_ch);  good_top_choice = (top_score > nn_reject_threshold) &&    (nn_reject_head_and_shoulders * top_score > next_score);  excellent_top_choice = good_top_choice &&    (top_score > nn_dodgy_char_threshold);  if (top == tess_ch) {    if (excellent_top_choice)      accept_level = 0;    else if (good_top_choice)      accept_level = 1;          //Top correct and well matched    else      accept_level = 2;          //Top correct but poor match  }  else if ((nn_conf_1Il &&    STRING (conflict_set_I_l_1).contains (tess_ch) &&    STRING (conflict_set_I_l_1).contains (top)) ||    (nn_conf_hyphen &&    STRING (conflict_set_hyphen).contains (tess_ch) &&    STRING (conflict_set_hyphen).contains (top)) ||    (nn_conf_Ss &&    STRING (conflict_set_S_s).contains (tess_ch) &&  STRING (conflict_set_S_s).contains (top))) {    confusion_match = TRUE;    if (good_top_choice)      accept_level = 1;          //Good top confusion    else      accept_level = 2;          //Poor top confusion  }  else if ((nn_conf_1Il &&    STRING (conflict_set_I_l_1).contains (tess_ch) &&    STRING (conflict_set_I_l_1).contains (next)) ||    (nn_conf_hyphen &&    STRING (conflict_set_hyphen).contains (tess_ch) &&    STRING (conflict_set_hyphen).contains (next)) ||    (nn_conf_Ss &&    STRING (conflict_set_S_s).contains (tess_ch) &&  STRING (conflict_set_S_s).contains (next))) {    confusion_match = TRUE;    if (!good_top_choice)      accept_level = 3;          //Next confusion and top match dodgy    else      accept_level = 4;          //Next 
confusion and good top match  }  else if (next == tess_ch) {    if (!good_top_choice)      accept_level = 3;          //Next match and top match dodgy    else      accept_level = 4;          //Next match and good top match  }  else    accept_level = 5;  /* Could allow some match flexibility here sS$ etc */  /* Now set confirmation level according to how much we can believe the tess    char. */  if ((accept_level == 0) && !confusion_match)    return 3;  if ((accept_level <= 1) &&    (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)    return 3;  if ((accept_level == 2) &&    !confusion_match && !dodgy_char &&    good_quality_word &&    dict_word &&    (checked_dict_word || !nn_double_check_dict) && sensible_word)    return 2;  if (confusion_match &&    (accept_level <= nn_conf_accept_level) &&    (good_quality_word ||    (!nn_conf_test_good_qual &&    !STRING (conflict_set_I_l_1).contains (tess_ch))) &&    (dict_word || !nn_conf_test_dict) &&    (checked_dict_word || !nn_conf_double_check_dict) &&    (sensible_word || !nn_conf_test_sensible))    return 1;  if (!confusion_match &&    nn_lax &&    (accept_level == 3) &&    (good_quality_word || !nn_conf_test_good_qual) &&    (dict_word || !nn_conf_test_dict) &&    (sensible_word || !nn_conf_test_sensible))    return 1;  else    return 0;}/************************************************************************* * dont_allow_dubious_chars() * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong * if adjacent to a reject. 
*************************************************************************/void dont_allow_dubious_chars(WERD_RES *word) {  int i = 0;  int rej_pos;  int word_len = word->reject_map.length ();  while (i < word_len) {    /* Find next reject */    while ((i < word_len) && (word->reject_map[i].accepted ()))      i++;    if (i < word_len) {      rej_pos = i;      /* Reject dubious chars to the left */      i--;      while ((i >= 0) &&        STRING (dubious_chars_left_of_reject).contains (word->        best_choice->        string ()      [i])) {        word->reject_map[i--].setrej_dubious ();      }      /* Skip adjacent rejects */      for (i = rej_pos;        (i < word_len) && (word->reject_map[i].rejected ()); i++);      /* Reject dubious chars to the right */      while ((i < word_len) &&        STRING (dubious_chars_right_of_reject).contains (word->        best_choice->        string ()      [i])) {        word->reject_map[i++].setrej_dubious ();      }    }  }}/************************************************************************* * dont_allow_1Il() * Dont unreject LONE accepted 1Il conflict set chars *************************************************************************/void dont_allow_1Il(WERD_RES *word) {  int i = 0;  int word_len = word->reject_map.length ();  const char *s = word->best_choice->string ().string ();  BOOL8 accepted_1Il = FALSE;  for (i = 0; i < word_len; i++) {    if (word->reject_map[i].accepted ()) {      if (STRING (conflict_set_I_l_1).contains (s[i]))        accepted_1Il = TRUE;      else {        if (isalnum (s[i]))          return;                // >=1 non 1Il ch accepted      }    }  }  if (!accepted_1Il)    return;                      //Nothing to worry about  for (i = 0; i < word_len; i++) {    if (STRING (conflict_set_I_l_1).contains (s[i]) &&      word->reject_map[i].accepted ())      word->reject_map[i].setrej_postNN_1Il ();  }}INT16 count_alphanums(  //how many alphanums                      WERD_RES *word) {  int count = 
0;  int i;  for (i = 0; i < word->reject_map.length (); i++) {    if ((word->reject_map[i].accepted ()) &&      (isalnum (word->best_choice->string ()[i])))      count++;  }  return count;}void reject_mostly_rejects(  //rej all if most rejectd                           WERD_RES *word) {  /* Reject the whole of the word if the fraction of rejects exceeds a limit */  if ((float) word->reject_map.reject_count () / word->reject_map.length () >=    rej_whole_of_mostly_reject_word_fract)    word->reject_map.rej_word_mostly_rej ();}BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {  INT16 char_quality;  INT16 accepted_char_quality;  if (word->best_choice->string ().length () <= 1)    return FALSE;  if (!STRING (ok_repeated_ch_non_alphanum_wds).    contains (word->best_choice->string ()[0]))    return FALSE;  if (!repeated_ch_string (word->best_choice->string ().string ()))    return FALSE;  word_char_quality(word, row, &char_quality, &accepted_char_quality);  if ((word->best_choice->string ().length () == char_quality) &&    (char_quality == accepted_char_quality))    return TRUE;  else    return FALSE;}BOOL8 repeated_ch_string(const char *rep_ch_str) {  char c;  if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {    return FALSE;  }  c = *rep_ch_str;  rep_ch_str++;  while (*rep_ch_str == c) {    rep_ch_str++;  }  if (*rep_ch_str == '\0')    return TRUE;  return FALSE;}INT16 safe_dict_word(const char *s) {  int dict_word_type;  dict_word_type = dict_word (s);  if (dict_word_type == DOC_DAWG_PERM)    return 0;  else    return dict_word_type;}void flip_hyphens(WERD_RES *word) {  char *str = (char *) word->best_choice->string ().string ();  int i = 0;  PBLOB_IT outword_it;  int prev_right = -9999;  int next_left;  BOX out_box;  float aspect_ratio;  if (tessedit_lower_flip_hyphen <= 1)    return;  outword_it.set_to_list (word->outword->blob_list ());  for (outword_it.mark_cycle_pt ();  !outword_it.cycled_list (); outword_it.forward (), i++) {    out_box = 
outword_it.data ()->bounding_box ();    if (outword_it.at_last ())      next_left = 9999;    else      next_left = outword_it.data_relative (1)->bounding_box ().left ();    /*      Dont touch small or touching blobs - it is too dangerous    */    if ((out_box.width () > 8 * word->denorm.scale ()) &&    (out_box.left () > prev_right) && (out_box.right () < next_left)) {      aspect_ratio = out_box.width () / (float) out_box.height ();      if (str[i] == '.') {        if (aspect_ratio >= tessedit_upper_flip_hyphen) {          /* Certain HYPHEN */          str[i] = '-';          if (word->reject_map[i].rejected ())            word->reject_map[i].setrej_hyphen_accept ();        }        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&          word->reject_map[i].accepted ())                                 //Suspected HYPHEN          word->reject_map[i].setrej_hyphen ();      }      else if (str[i] == '-') {        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&          (word->reject_map[i].rejected ()))          word->reject_map[i].setrej_hyphen_accept ();        //Certain HYPHEN        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&          (word->reject_map[i].accepted ()))                                 //Suspected HYPHEN          word->reject_map[i].setrej_hyphen ();      }    }    prev_right = out_box.right ();  }}void flip_0O(WERD_RES *word) {  char *str = (char *) word->best_choice->string ().string ();  int i;  PBLOB_IT outword_it;  BOX out_box;  if (!tessedit_flip_0O)    return;  outword_it.set_to_list (word->outword->blob_list ());  for (i = 0, outword_it.mark_cycle_pt ();  !outword_it.cycled_list (); i++, outword_it.forward ()) {    if (isupper (str[i]) || isdigit (str[i])) {      out_box = outword_it.data ()->bounding_box ();      if ((out_box.top () < bln_baseline_offset + bln_x_height) ||        (out_box.bottom () > bln_baseline_offset + bln_x_height / 4))        return;                  //Beware words with sub/superscripts    }  }  for (i 
= 1; str[i] != '\0'; i++, outword_it.forward ()) {    if ((str[i] == '0') || (str[i] == 'O')) {      /* A0A */      if (non_O_upper (str[i - 1]) && non_O_upper (str[i + 1])) {        str[i] = 'O';      }      /* A00A */      if (non_O_upper (str[i - 1]) &&        ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&      non_O_upper (str[i + 2])) {        str[i] = 'O';        str[i + 1] = 'O';        i++;      }      /* AA0<non digit or end of word> */      if ((i > 1) &&        non_O_upper (str[i - 2]) &&        non_O_upper (str[i - 1]) &&        !isdigit (str[i + 1]) &&      (str[i + 1] != 'l') && (str[i + 1] != 'I')) {        str[i] = 'O';      }      /* 9O9 */      if (non_0_digit (str[i - 1]) && non_0_digit (str[i + 1])) {        str[i] = '0';      }      /* 9OOO */      if (non_0_digit (str[i - 1]) &&        ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&      ((str[i + 2] == '0') || (str[i + 2] == 'O'))) {        str[i] = '0';        str[i + 1] = '0';        str[i + 2] = '0';        i += 2;      }      /* 9OO<non upper> */      if (non_0_digit (str[i - 1]) &&        ((str[i + 1] == '0') || (str[i + 1] == 'O')) &&      !isupper (str[i + 2])) {        str[i] = '0';        str[i + 1] = '0';        i++;      }      /* 9O<non upper> */      if (non_0_digit (str[i - 1]) && !isupper (str[i + 1])) {        str[i] = '0';      }      /* 9[.,]OOO.. */      if ((i > 1) &&        ((str[i - 1] == '.') || (str[i - 1] == ',')) &&      (isdigit (str[i - 2]) || (str[i - 2] == 'O'))) {        if (str[i - 2] == 'O')          str[i - 2] = '0';        while ((str[i] == 'O') || (str[i] == '0')) {          str[i++] = '0';        }        i--;      }    }  }}BOOL8 non_O_upper(char c) {  return isupper (c) && (c != 'O');}BOOL8 non_0_digit(char c) {  return isdigit (c) && (c != '0');}
上一页 1 2 34
💿 文件大小 2763 K
👤 上传用户 danlong
📂 所属分类其他书籍
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -