reject.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 1,776 行 · 第 1/4 页

CPP
1,776
字号
  else if ((nn_conf_1Il &&    STRING (conflict_set_I_l_1).contains (tess_ch) &&    STRING (conflict_set_I_l_1).contains (top)) ||    (nn_conf_hyphen &&    STRING (conflict_set_hyphen).contains (tess_ch) &&    STRING (conflict_set_hyphen).contains (top)) ||    (nn_conf_Ss &&    STRING (conflict_set_S_s).contains (tess_ch) &&  STRING (conflict_set_S_s).contains (top))) {    confusion_match = TRUE;    if (good_top_choice)      accept_level = 1;          //Good top confusion    else      accept_level = 2;          //Poor top confusion  }  else if ((nn_conf_1Il &&    STRING (conflict_set_I_l_1).contains (tess_ch) &&    STRING (conflict_set_I_l_1).contains (next)) ||    (nn_conf_hyphen &&    STRING (conflict_set_hyphen).contains (tess_ch) &&    STRING (conflict_set_hyphen).contains (next)) ||    (nn_conf_Ss &&    STRING (conflict_set_S_s).contains (tess_ch) &&  STRING (conflict_set_S_s).contains (next))) {    confusion_match = TRUE;    if (!good_top_choice)      accept_level = 3;          //Next confusion and top match dodgy    else      accept_level = 4;          //Next confusion and good top match  }  else if (next == tess_ch) {    if (!good_top_choice)      accept_level = 3;          //Next match and top match dodgy    else      accept_level = 4;          //Next match and good top match  }  else    accept_level = 5;  /* Could allow some match flexibility here sS$ etc */  /* Now set confirmation level according to how much we can believe the tess    char. */  if ((accept_level == 0) && !confusion_match)    return 3;  if ((accept_level <= 1) &&    (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match)    return 3;  if ((accept_level == 2) &&    !confusion_match && !dodgy_char &&    good_quality_word &&    dict_word &&    (checked_dict_word || !nn_double_check_dict) && sensible_word)    return 2;  if (confusion_match &&    (accept_level <= nn_conf_accept_level) &&    (good_quality_word ||    (!nn_conf_test_good_qual &&    !STRING (conflict_set_I_l_1).contains (tess_ch))) &&    (dict_word || !nn_conf_test_dict) &&    (checked_dict_word || !nn_conf_double_check_dict) &&    (sensible_word || !nn_conf_test_sensible))    return 1;  if (!confusion_match &&    nn_lax &&    (accept_level == 3) &&    (good_quality_word || !nn_conf_test_good_qual) &&    (dict_word || !nn_conf_test_dict) &&    (sensible_word || !nn_conf_test_sensible))    return 1;  else    return 0;}/************************************************************************* * dont_allow_dubious_chars() * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong * if adjacent to a reject. *************************************************************************/void dont_allow_dubious_chars(WERD_RES *word) {  int i = 0;  int offset = 0;  int rej_pos;  int word_len = word->reject_map.length ();  while (i < word_len) {    /* Find next reject */    while ((i < word_len) && (word->reject_map[i].accepted ()))    {      offset += word->best_choice->lengths()[i];      i++;    }    if (i < word_len) {      rej_pos = i;      /* Reject dubious chars to the left */      i--;      offset -= word->best_choice->lengths()[i];      while ((i >= 0) &&        STRING (dubious_chars_left_of_reject).contains (word->        best_choice->        string ()      [offset])) {        word->reject_map[i--].setrej_dubious ();        offset -= word->best_choice->lengths()[i];      }      /* Skip adjacent rejects */      for (i = rej_pos;        (i < word_len) && (word->reject_map[i].rejected ());           offset += word->best_choice->lengths()[i++]);      /* Reject dubious chars to the right */      while ((i < word_len) &&        STRING (dubious_chars_right_of_reject).contains (word->        best_choice->        string ()      [offset])) {        offset += word->best_choice->lengths()[i];        word->reject_map[i++].setrej_dubious ();      }    }  }}/************************************************************************* * dont_allow_1Il() * Dont unreject LONE accepted 1Il conflict set chars *************************************************************************/void dont_allow_1Il(WERD_RES *word) {  int i = 0;  int offset;  int word_len = word->reject_map.length ();  const char *s = word->best_choice->string ().string ();  const char *lengths = word->best_choice->lengths ().string ();  BOOL8 accepted_1Il = FALSE;  for (i = 0, offset = 0; i < word_len;       offset += word->best_choice->lengths()[i++]) {    if (word->reject_map[i].accepted ()) {      if (STRING (conflict_set_I_l_1).contains (s[offset]))        accepted_1Il = TRUE;      else {        if (unicharset.get_isalpha (s + offset, lengths[i]) ||            unicharset.get_isdigit (s + offset, lengths[i]))          return;                // >=1 non 1Il ch accepted      }    }  }  if (!accepted_1Il)    return;                      //Nothing to worry about  for (i = 0, offset = 0; i < word_len;       offset += word->best_choice->lengths()[i++]) {    if (STRING (conflict_set_I_l_1).contains (s[offset]) &&      word->reject_map[i].accepted ())      word->reject_map[i].setrej_postNN_1Il ();  }}inT16 count_alphanums(  //how many alphanums                      WERD_RES *word) {  int count = 0;  int i;  int offset;  for (i = 0, offset = 0; i < word->reject_map.length ();       offset += word->best_choice->lengths()[i++]) {    if ((word->reject_map[i].accepted ()) &&      (unicharset.get_isalpha (word->best_choice->string ().string() + offset,                               word->best_choice->lengths ()[i]) ||       unicharset.get_isdigit (word->best_choice->string ().string() + offset,                               word->best_choice->lengths ()[i])))      count++;  }  return count;}void reject_mostly_rejects(  //rej all if most rejectd                           WERD_RES *word) {  /* Reject the whole of the word if the fraction of rejects exceeds a limit */  if ((float) word->reject_map.reject_count () / word->reject_map.length () >=    rej_whole_of_mostly_reject_word_fract)    word->reject_map.rej_word_mostly_rej ();}BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) {  inT16 char_quality;  inT16 accepted_char_quality;  if (word->best_choice->lengths ().length () <= 1)    return FALSE;  if (!STRING (ok_repeated_ch_non_alphanum_wds).    contains (word->best_choice->string ()[0]))    return FALSE;  if (!repeated_ch_string (word->best_choice->string ().string (),                           word->best_choice->lengths ().string ()))    return FALSE;  word_char_quality(word, row, &char_quality, &accepted_char_quality);  if ((word->best_choice->lengths ().length () == char_quality) &&    (char_quality == accepted_char_quality))    return TRUE;  else    return FALSE;}BOOL8 repeated_ch_string(const char *rep_ch_str,                         const char *lengths) {  UNICHAR_ID c;  if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) {    return FALSE;  }  c = unicharset.unichar_to_id(rep_ch_str, *lengths);  rep_ch_str += *(lengths++);  while (*rep_ch_str != '\0' &&         unicharset.unichar_to_id(rep_ch_str, *lengths) == c) {    rep_ch_str++;  }  if (*rep_ch_str == '\0')    return TRUE;  return FALSE;}inT16 safe_dict_word(const char *s) {  int dict_word_type;  dict_word_type = dict_word (s);  if (dict_word_type == DOC_DAWG_PERM)    return 0;  else    return dict_word_type;}void flip_hyphens(WERD_RES *word) {  char *str = (char *) word->best_choice->string ().string ();  int i = 0;  int offset = 0;  PBLOB_IT outword_it;  int prev_right = -9999;  int next_left;  TBOX out_box;  float aspect_ratio;  if (tessedit_lower_flip_hyphen <= 1)    return;  outword_it.set_to_list (word->outword->blob_list ());  for (outword_it.mark_cycle_pt ();  !outword_it.cycled_list (); outword_it.forward (),           offset += word->best_choice->lengths()[i++]) {    out_box = outword_it.data ()->bounding_box ();    if (outword_it.at_last ())      next_left = 9999;    else      next_left = outword_it.data_relative (1)->bounding_box ().left ();    /*      Dont touch small or touching blobs - it is too dangerous    */    if ((out_box.width () > 8 * word->denorm.scale ()) &&    (out_box.left () > prev_right) && (out_box.right () < next_left)) {      aspect_ratio = out_box.width () / (float) out_box.height ();      if (str[offset] == '.') {        if (aspect_ratio >= tessedit_upper_flip_hyphen) {          /* Certain HYPHEN */          str[offset] = '-';          if (word->reject_map[i].rejected ())            word->reject_map[i].setrej_hyphen_accept ();        }        if ((aspect_ratio > tessedit_lower_flip_hyphen) &&          word->reject_map[i].accepted ())                                 //Suspected HYPHEN          word->reject_map[i].setrej_hyphen ();      }      else if (str[offset] == '-') {        if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&          (word->reject_map[i].rejected ()))          word->reject_map[i].setrej_hyphen_accept ();        //Certain HYPHEN        if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&          (word->reject_map[i].accepted ()))                                 //Suspected HYPHEN          word->reject_map[i].setrej_hyphen ();      }    }    prev_right = out_box.right ();  }}void flip_0O(WERD_RES *word) {  char *str = (char *) word->best_choice->string ().string ();  char *lengths = (char *) word->best_choice->lengths ().string ();  int i;  int offset;  PBLOB_IT outword_it;  TBOX out_box;  if (!tessedit_flip_0O)    return;  outword_it.set_to_list (word->outword->blob_list ());  for (i = 0, offset = 0, outword_it.mark_cycle_pt ();  !outword_it.cycled_list (); offset += lengths[i++], outword_it.forward ()) {    if (unicharset.get_isupper (str + offset, lengths[i]) ||        unicharset.get_isdigit (str + offset, lengths[i])) {      out_box = outword_it.data ()->bounding_box ();      if ((out_box.top () < bln_baseline_offset + bln_x_height) ||        (out_box.bottom () > bln_baseline_offset + bln_x_height / 4))        return;                  //Beware words with sub/superscripts    }  }  for (i = 1, offset = lengths[0]; str[offset] != '\0';       offset += lengths[i++], outword_it.forward ()) {    if (lengths[i] == 1 &&        ((str[offset] == '0') || (str[offset] == 'O'))) {      /* A0A */      if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&          lengths[i + 1] > 0 &&          non_O_upper (str + offset + lengths[i], lengths[i + 1])) {        str[offset] = 'O';      }      /* A00A */      if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&        ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||         (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&          lengths[i + 2] > 0 &&          non_O_upper (str + offset + lengths[i] + lengths[i + 1],                       lengths[i + 2])) {        str[offset] = 'O';        str[offset + lengths[i]] = 'O';        offset += lengths[i++];      }      /* AA0<non digit or end of word> */      if ((i > 1) &&        non_O_upper (str + offset - lengths[i - 1] - lengths[i - 2],                     lengths[i - 2]) &&        non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) &&          lengths[i + 1] > 0 &&        !unicharset.get_isdigit (str + offset + lengths[i], lengths[i + 1]) &&          (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'l') &&          (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'I')) {        str[offset] = 'O';      }      /* 9O9 */      if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&          lengths[i + 1] > 0 &&          non_0_digit (str + offset + lengths[i], lengths[i + 1])) {        str[offset] = '0';      }      /* 9OOO */      if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&          ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||           (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&          ((lengths[i + 2] == 1 &&            str[offset + lengths[i] + lengths[i + 1]] == '0') ||           (lengths[i + 2] == 1 &&            str[offset + lengths[i] + lengths[i + 1]] == 'O'))) {        str[offset] = '0';        str[offset + lengths[i]] = '0';        str[offset + lengths[i] + lengths[i + 1]] = '0';        offset += lengths[i++];        offset += lengths[i++];      }      /* 9OO<non upper> */      if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&          ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') ||           (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) &&          lengths[i + 2] > 0 &&          !unicharset.get_isupper (str + offset + lengths[i] + lengths[i + 1],                                   lengths[i + 2])) {        str[offset] = '0';        str[offset + lengths[i]] = '0';        offset += lengths[i++];      }      /* 9O<non upper> */      if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) &&          lengths[i + 1] > 0 &&          !unicharset.get_isupper (str + offset + lengths[i], lengths[i + 1])) {        str[offset] = '0';      }      /* 9[.,]OOO.. */      if ((i > 1) &&        ((lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == '.') ||         (lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == ',')) &&          (unicharset.get_isdigit (str + offset -                                   lengths[i - 1] - lengths[i - 2],                                   lengths[i - 2]) ||           (lengths[i - 2] == 1 &&            str[offset - lengths[i - 1] - lengths[i - 2]] == 'O'))) {        if (lengths[i - 2] == 1 &&            str[offset - lengths[i - 1] - lengths[i - 2]] == 'O')          str[offset - lengths[i - 1] - lengths[i - 2]] = '0';        while (lengths[i] == 1 &&               ((str[offset] == 'O') || (str[offset] == '0'))) {          str[offset] = '0';          offset += lengths[i++];        }        i--;        offset -= lengths[i];      }    }  }}BOOL8 non_O_upper(const char* str, int length) {  return unicharset.get_isupper (str, length) &&      (!unicharset.eq(unicharset.unichar_to_id(str, length), "O"));}BOOL8 non_0_digit(const char* str, int length) {  return unicharset.get_isdigit (str, length) &&      (!unicharset.eq(unicharset.unichar_to_id(str, length), "0"));}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?