reject.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 1,776 行 · 第 1/4 页
CPP
1,776 行
else if ((nn_conf_1Il && STRING (conflict_set_I_l_1).contains (tess_ch) && STRING (conflict_set_I_l_1).contains (top)) || (nn_conf_hyphen && STRING (conflict_set_hyphen).contains (tess_ch) && STRING (conflict_set_hyphen).contains (top)) || (nn_conf_Ss && STRING (conflict_set_S_s).contains (tess_ch) && STRING (conflict_set_S_s).contains (top))) { confusion_match = TRUE; if (good_top_choice) accept_level = 1; //Good top confusion else accept_level = 2; //Poor top confusion } else if ((nn_conf_1Il && STRING (conflict_set_I_l_1).contains (tess_ch) && STRING (conflict_set_I_l_1).contains (next)) || (nn_conf_hyphen && STRING (conflict_set_hyphen).contains (tess_ch) && STRING (conflict_set_hyphen).contains (next)) || (nn_conf_Ss && STRING (conflict_set_S_s).contains (tess_ch) && STRING (conflict_set_S_s).contains (next))) { confusion_match = TRUE; if (!good_top_choice) accept_level = 3; //Next confusion and top match dodgy else accept_level = 4; //Next confusion and good top match } else if (next == tess_ch) { if (!good_top_choice) accept_level = 3; //Next match and top match dodgy else accept_level = 4; //Next match and good top match } else accept_level = 5; /* Could allow some match flexibility here sS$ etc */ /* Now set confirmation level according to how much we can believe the tess char. */ if ((accept_level == 0) && !confusion_match) return 3; if ((accept_level <= 1) && (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match) return 3; if ((accept_level == 2) && !confusion_match && !dodgy_char && good_quality_word && dict_word && (checked_dict_word || !nn_double_check_dict) && sensible_word) return 2; if (confusion_match && (accept_level <= nn_conf_accept_level) && (good_quality_word || (!nn_conf_test_good_qual && !STRING (conflict_set_I_l_1).contains (tess_ch))) && (dict_word || !nn_conf_test_dict) && (checked_dict_word || !nn_conf_double_check_dict) && (sensible_word || !nn_conf_test_sensible)) return 1; if (!confusion_match && nn_lax && (accept_level == 3) && (good_quality_word || !nn_conf_test_good_qual) && (dict_word || !nn_conf_test_dict) && (sensible_word || !nn_conf_test_sensible)) return 1; else return 0;}/************************************************************************* * dont_allow_dubious_chars() * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong * if adjacent to a reject. *************************************************************************/void dont_allow_dubious_chars(WERD_RES *word) { int i = 0; int offset = 0; int rej_pos; int word_len = word->reject_map.length (); while (i < word_len) { /* Find next reject */ while ((i < word_len) && (word->reject_map[i].accepted ())) { offset += word->best_choice->lengths()[i]; i++; } if (i < word_len) { rej_pos = i; /* Reject dubious chars to the left */ i--; offset -= word->best_choice->lengths()[i]; while ((i >= 0) && STRING (dubious_chars_left_of_reject).contains (word-> best_choice-> string () [offset])) { word->reject_map[i--].setrej_dubious (); offset -= word->best_choice->lengths()[i]; } /* Skip adjacent rejects */ for (i = rej_pos; (i < word_len) && (word->reject_map[i].rejected ()); offset += word->best_choice->lengths()[i++]); /* Reject dubious chars to the right */ while ((i < word_len) && STRING (dubious_chars_right_of_reject).contains (word-> best_choice-> string () [offset])) { offset += word->best_choice->lengths()[i]; word->reject_map[i++].setrej_dubious (); } } }}/************************************************************************* * dont_allow_1Il() * Dont unreject LONE accepted 1Il conflict set chars *************************************************************************/void dont_allow_1Il(WERD_RES *word) { int i = 0; int offset; int word_len = word->reject_map.length (); const char *s = word->best_choice->string ().string (); const char *lengths = word->best_choice->lengths ().string (); BOOL8 accepted_1Il = FALSE; for (i = 0, offset = 0; i < word_len; offset += word->best_choice->lengths()[i++]) { if (word->reject_map[i].accepted ()) { if (STRING (conflict_set_I_l_1).contains (s[offset])) accepted_1Il = TRUE; else { if (unicharset.get_isalpha (s + offset, lengths[i]) || unicharset.get_isdigit (s + offset, lengths[i])) return; // >=1 non 1Il ch accepted } } } if (!accepted_1Il) return; //Nothing to worry about for (i = 0, offset = 0; i < word_len; offset += word->best_choice->lengths()[i++]) { if (STRING (conflict_set_I_l_1).contains (s[offset]) && word->reject_map[i].accepted ()) word->reject_map[i].setrej_postNN_1Il (); }}inT16 count_alphanums( //how many alphanums WERD_RES *word) { int count = 0; int i; int offset; for (i = 0, offset = 0; i < word->reject_map.length (); offset += word->best_choice->lengths()[i++]) { if ((word->reject_map[i].accepted ()) && (unicharset.get_isalpha (word->best_choice->string ().string() + offset, word->best_choice->lengths ()[i]) || unicharset.get_isdigit (word->best_choice->string ().string() + offset, word->best_choice->lengths ()[i]))) count++; } return count;}void reject_mostly_rejects( //rej all if most rejectd WERD_RES *word) { /* Reject the whole of the word if the fraction of rejects exceeds a limit */ if ((float) word->reject_map.reject_count () / word->reject_map.length () >= rej_whole_of_mostly_reject_word_fract) word->reject_map.rej_word_mostly_rej ();}BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { inT16 char_quality; inT16 accepted_char_quality; if (word->best_choice->lengths ().length () <= 1) return FALSE; if (!STRING (ok_repeated_ch_non_alphanum_wds). contains (word->best_choice->string ()[0])) return FALSE; if (!repeated_ch_string (word->best_choice->string ().string (), word->best_choice->lengths ().string ())) return FALSE; word_char_quality(word, row, &char_quality, &accepted_char_quality); if ((word->best_choice->lengths ().length () == char_quality) && (char_quality == accepted_char_quality)) return TRUE; else return FALSE;}BOOL8 repeated_ch_string(const char *rep_ch_str, const char *lengths) { UNICHAR_ID c; if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) { return FALSE; } c = unicharset.unichar_to_id(rep_ch_str, *lengths); rep_ch_str += *(lengths++); while (*rep_ch_str != '\0' && unicharset.unichar_to_id(rep_ch_str, *lengths) == c) { rep_ch_str++; } if (*rep_ch_str == '\0') return TRUE; return FALSE;}inT16 safe_dict_word(const char *s) { int dict_word_type; dict_word_type = dict_word (s); if (dict_word_type == DOC_DAWG_PERM) return 0; else return dict_word_type;}void flip_hyphens(WERD_RES *word) { char *str = (char *) word->best_choice->string ().string (); int i = 0; int offset = 0; PBLOB_IT outword_it; int prev_right = -9999; int next_left; TBOX out_box; float aspect_ratio; if (tessedit_lower_flip_hyphen <= 1) return; outword_it.set_to_list (word->outword->blob_list ()); for (outword_it.mark_cycle_pt (); !outword_it.cycled_list (); outword_it.forward (), offset += word->best_choice->lengths()[i++]) { out_box = outword_it.data ()->bounding_box (); if (outword_it.at_last ()) next_left = 9999; else next_left = outword_it.data_relative (1)->bounding_box ().left (); /* Dont touch small or touching blobs - it is too dangerous */ if ((out_box.width () > 8 * word->denorm.scale ()) && (out_box.left () > prev_right) && (out_box.right () < next_left)) { aspect_ratio = out_box.width () / (float) out_box.height (); if (str[offset] == '.') { if (aspect_ratio >= tessedit_upper_flip_hyphen) { /* Certain HYPHEN */ str[offset] = '-'; if (word->reject_map[i].rejected ()) word->reject_map[i].setrej_hyphen_accept (); } if ((aspect_ratio > tessedit_lower_flip_hyphen) && word->reject_map[i].accepted ()) //Suspected HYPHEN word->reject_map[i].setrej_hyphen (); } else if (str[offset] == '-') { if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word->reject_map[i].rejected ())) word->reject_map[i].setrej_hyphen_accept (); //Certain HYPHEN if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word->reject_map[i].accepted ())) //Suspected HYPHEN word->reject_map[i].setrej_hyphen (); } } prev_right = out_box.right (); }}void flip_0O(WERD_RES *word) { char *str = (char *) word->best_choice->string ().string (); char *lengths = (char *) word->best_choice->lengths ().string (); int i; int offset; PBLOB_IT outword_it; TBOX out_box; if (!tessedit_flip_0O) return; outword_it.set_to_list (word->outword->blob_list ()); for (i = 0, offset = 0, outword_it.mark_cycle_pt (); !outword_it.cycled_list (); offset += lengths[i++], outword_it.forward ()) { if (unicharset.get_isupper (str + offset, lengths[i]) || unicharset.get_isdigit (str + offset, lengths[i])) { out_box = outword_it.data ()->bounding_box (); if ((out_box.top () < bln_baseline_offset + bln_x_height) || (out_box.bottom () > bln_baseline_offset + bln_x_height / 4)) return; //Beware words with sub/superscripts } } for (i = 1, offset = lengths[0]; str[offset] != '\0'; offset += lengths[i++], outword_it.forward ()) { if (lengths[i] == 1 && ((str[offset] == '0') || (str[offset] == 'O'))) { /* A0A */ if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) && lengths[i + 1] > 0 && non_O_upper (str + offset + lengths[i], lengths[i + 1])) { str[offset] = 'O'; } /* A00A */ if (non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) && ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') || (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) && lengths[i + 2] > 0 && non_O_upper (str + offset + lengths[i] + lengths[i + 1], lengths[i + 2])) { str[offset] = 'O'; str[offset + lengths[i]] = 'O'; offset += lengths[i++]; } /* AA0<non digit or end of word> */ if ((i > 1) && non_O_upper (str + offset - lengths[i - 1] - lengths[i - 2], lengths[i - 2]) && non_O_upper (str + offset - lengths[i - 1], lengths[i - 1]) && lengths[i + 1] > 0 && !unicharset.get_isdigit (str + offset + lengths[i], lengths[i + 1]) && (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'l') && (lengths[i + 1] != 1 || str[offset + lengths[i]] != 'I')) { str[offset] = 'O'; } /* 9O9 */ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) && lengths[i + 1] > 0 && non_0_digit (str + offset + lengths[i], lengths[i + 1])) { str[offset] = '0'; } /* 9OOO */ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) && ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') || (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) && ((lengths[i + 2] == 1 && str[offset + lengths[i] + lengths[i + 1]] == '0') || (lengths[i + 2] == 1 && str[offset + lengths[i] + lengths[i + 1]] == 'O'))) { str[offset] = '0'; str[offset + lengths[i]] = '0'; str[offset + lengths[i] + lengths[i + 1]] = '0'; offset += lengths[i++]; offset += lengths[i++]; } /* 9OO<non upper> */ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) && ((lengths[i + 1] == 1 && str[offset + lengths[i]] == '0') || (lengths[i + 1] == 1 && str[offset + lengths[i]] == 'O')) && lengths[i + 2] > 0 && !unicharset.get_isupper (str + offset + lengths[i] + lengths[i + 1], lengths[i + 2])) { str[offset] = '0'; str[offset + lengths[i]] = '0'; offset += lengths[i++]; } /* 9O<non upper> */ if (non_0_digit (str + offset - lengths[i - 1], lengths[i - 1]) && lengths[i + 1] > 0 && !unicharset.get_isupper (str + offset + lengths[i], lengths[i + 1])) { str[offset] = '0'; } /* 9[.,]OOO.. */ if ((i > 1) && ((lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == '.') || (lengths[i - 1] == 1 && str[offset - lengths[i - 1]] == ',')) && (unicharset.get_isdigit (str + offset - lengths[i - 1] - lengths[i - 2], lengths[i - 2]) || (lengths[i - 2] == 1 && str[offset - lengths[i - 1] - lengths[i - 2]] == 'O'))) { if (lengths[i - 2] == 1 && str[offset - lengths[i - 1] - lengths[i - 2]] == 'O') str[offset - lengths[i - 1] - lengths[i - 2]] = '0'; while (lengths[i] == 1 && ((str[offset] == 'O') || (str[offset] == '0'))) { str[offset] = '0'; offset += lengths[i++]; } i--; offset -= lengths[i]; } } }}BOOL8 non_O_upper(const char* str, int length) { return unicharset.get_isupper (str, length) && (!unicharset.eq(unicharset.unichar_to_id(str, length), "O"));}BOOL8 non_0_digit(const char* str, int length) { return unicharset.get_isdigit (str, length) && (!unicharset.eq(unicharset.unichar_to_id(str, length), "0"));}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?