📄 reject.cpp
字号:
BOOL8 checked_dict_word, BOOL8 sensible_word, BOOL8 centre, BOOL8 good_quality_word) { INT16 accept_level; //0 Very clearly matched //1 Clearly top //2 Top but poor match //3 Next & poor top match //4 Next but good top match //5 No chance BOOL8 good_top_choice; BOOL8 excellent_top_choice; BOOL8 confusion_match = FALSE; BOOL8 dodgy_char = !isalnum (tess_ch); good_top_choice = (top_score > nn_reject_threshold) && (nn_reject_head_and_shoulders * top_score > next_score); excellent_top_choice = good_top_choice && (top_score > nn_dodgy_char_threshold); if (top == tess_ch) { if (excellent_top_choice) accept_level = 0; else if (good_top_choice) accept_level = 1; //Top correct and well matched else accept_level = 2; //Top correct but poor match } else if ((nn_conf_1Il && STRING (conflict_set_I_l_1).contains (tess_ch) && STRING (conflict_set_I_l_1).contains (top)) || (nn_conf_hyphen && STRING (conflict_set_hyphen).contains (tess_ch) && STRING (conflict_set_hyphen).contains (top)) || (nn_conf_Ss && STRING (conflict_set_S_s).contains (tess_ch) && STRING (conflict_set_S_s).contains (top))) { confusion_match = TRUE; if (good_top_choice) accept_level = 1; //Good top confusion else accept_level = 2; //Poor top confusion } else if ((nn_conf_1Il && STRING (conflict_set_I_l_1).contains (tess_ch) && STRING (conflict_set_I_l_1).contains (next)) || (nn_conf_hyphen && STRING (conflict_set_hyphen).contains (tess_ch) && STRING (conflict_set_hyphen).contains (next)) || (nn_conf_Ss && STRING (conflict_set_S_s).contains (tess_ch) && STRING (conflict_set_S_s).contains (next))) { confusion_match = TRUE; if (!good_top_choice) accept_level = 3; //Next confusion and top match dodgy else accept_level = 4; //Next confusion and good top match } else if (next == tess_ch) { if (!good_top_choice) accept_level = 3; //Next match and top match dodgy else accept_level = 4; //Next match and good top match } else accept_level = 5; /* Could allow some match flexibility here sS$ etc */ /* Now set confirmation level according to how much we can believe the tess char. */ if ((accept_level == 0) && !confusion_match) return 3; if ((accept_level <= 1) && (!nn_conf_strict_on_dodgy_chs || !dodgy_char) && !confusion_match) return 3; if ((accept_level == 2) && !confusion_match && !dodgy_char && good_quality_word && dict_word && (checked_dict_word || !nn_double_check_dict) && sensible_word) return 2; if (confusion_match && (accept_level <= nn_conf_accept_level) && (good_quality_word || (!nn_conf_test_good_qual && !STRING (conflict_set_I_l_1).contains (tess_ch))) && (dict_word || !nn_conf_test_dict) && (checked_dict_word || !nn_conf_double_check_dict) && (sensible_word || !nn_conf_test_sensible)) return 1; if (!confusion_match && nn_lax && (accept_level == 3) && (good_quality_word || !nn_conf_test_good_qual) && (dict_word || !nn_conf_test_dict) && (sensible_word || !nn_conf_test_sensible)) return 1; else return 0;}/************************************************************************* * dont_allow_dubious_chars() * Let Rejects "eat" into adjacent "dubious" chars. I.e those prone to be wrong * if adjacent to a reject. *************************************************************************/void dont_allow_dubious_chars(WERD_RES *word) { int i = 0; int rej_pos; int word_len = word->reject_map.length (); while (i < word_len) { /* Find next reject */ while ((i < word_len) && (word->reject_map[i].accepted ())) i++; if (i < word_len) { rej_pos = i; /* Reject dubious chars to the left */ i--; while ((i >= 0) && STRING (dubious_chars_left_of_reject).contains (word-> best_choice-> string () [i])) { word->reject_map[i--].setrej_dubious (); } /* Skip adjacent rejects */ for (i = rej_pos; (i < word_len) && (word->reject_map[i].rejected ()); i++); /* Reject dubious chars to the right */ while ((i < word_len) && STRING (dubious_chars_right_of_reject).contains (word-> best_choice-> string () [i])) { word->reject_map[i++].setrej_dubious (); } } }}/************************************************************************* * dont_allow_1Il() * Dont unreject LONE accepted 1Il conflict set chars *************************************************************************/void dont_allow_1Il(WERD_RES *word) { int i = 0; int word_len = word->reject_map.length (); const char *s = word->best_choice->string ().string (); BOOL8 accepted_1Il = FALSE; for (i = 0; i < word_len; i++) { if (word->reject_map[i].accepted ()) { if (STRING (conflict_set_I_l_1).contains (s[i])) accepted_1Il = TRUE; else { if (isalnum (s[i])) return; // >=1 non 1Il ch accepted } } } if (!accepted_1Il) return; //Nothing to worry about for (i = 0; i < word_len; i++) { if (STRING (conflict_set_I_l_1).contains (s[i]) && word->reject_map[i].accepted ()) word->reject_map[i].setrej_postNN_1Il (); }}INT16 count_alphanums( //how many alphanums WERD_RES *word) { int count = 0; int i; for (i = 0; i < word->reject_map.length (); i++) { if ((word->reject_map[i].accepted ()) && (isalnum (word->best_choice->string ()[i]))) count++; } return count;}void reject_mostly_rejects( //rej all if most rejectd WERD_RES *word) { /* Reject the whole of the word if the fraction of rejects exceeds a limit */ if ((float) word->reject_map.reject_count () / word->reject_map.length () >= rej_whole_of_mostly_reject_word_fract) word->reject_map.rej_word_mostly_rej ();}BOOL8 repeated_nonalphanum_wd(WERD_RES *word, ROW *row) { INT16 char_quality; INT16 accepted_char_quality; if (word->best_choice->string ().length () <= 1) return FALSE; if (!STRING (ok_repeated_ch_non_alphanum_wds). contains (word->best_choice->string ()[0])) return FALSE; if (!repeated_ch_string (word->best_choice->string ().string ())) return FALSE; word_char_quality(word, row, &char_quality, &accepted_char_quality); if ((word->best_choice->string ().length () == char_quality) && (char_quality == accepted_char_quality)) return TRUE; else return FALSE;}BOOL8 repeated_ch_string(const char *rep_ch_str) { char c; if ((rep_ch_str == NULL) || (*rep_ch_str == '\0')) { return FALSE; } c = *rep_ch_str; rep_ch_str++; while (*rep_ch_str == c) { rep_ch_str++; } if (*rep_ch_str == '\0') return TRUE; return FALSE;}INT16 safe_dict_word(const char *s) { int dict_word_type; dict_word_type = dict_word (s); if (dict_word_type == DOC_DAWG_PERM) return 0; else return dict_word_type;}void flip_hyphens(WERD_RES *word) { char *str = (char *) word->best_choice->string ().string (); int i = 0; PBLOB_IT outword_it; int prev_right = -9999; int next_left; BOX out_box; float aspect_ratio; if (tessedit_lower_flip_hyphen <= 1) return; outword_it.set_to_list (word->outword->blob_list ()); for (outword_it.mark_cycle_pt (); !outword_it.cycled_list (); outword_it.forward (), i++) { out_box = outword_it.data ()->bounding_box (); if (outword_it.at_last ()) next_left = 9999; else next_left = outword_it.data_relative (1)->bounding_box ().left (); /* Dont touch small or touching blobs - it is too dangerous */ if ((out_box.width () > 8 * word->denorm.scale ()) && (out_box.left () > prev_right) && (out_box.right () < next_left)) { aspect_ratio = out_box.width () / (float) out_box.height (); if (str[i] == '.') { if (aspect_ratio >= tessedit_upper_flip_hyphen) { /* Certain HYPHEN */ str[i] = '-'; if (word->reject_map[i].rejected ()) word->reject_map[i].setrej_hyphen_accept (); } if ((aspect_ratio > tessedit_lower_flip_hyphen) && word->reject_map[i].accepted ()) //Suspected HYPHEN word->reject_map[i].setrej_hyphen (); } else if (str[i] == '-') { if ((aspect_ratio >= tessedit_upper_flip_hyphen) && (word->reject_map[i].rejected ())) word->reject_map[i].setrej_hyphen_accept (); //Certain HYPHEN if ((aspect_ratio <= tessedit_lower_flip_hyphen) && (word->reject_map[i].accepted ())) //Suspected HYPHEN word->reject_map[i].setrej_hyphen (); } } prev_right = out_box.right (); }}void flip_0O(WERD_RES *word) { char *str = (char *) word->best_choice->string ().string (); int i; PBLOB_IT outword_it; BOX out_box; if (!tessedit_flip_0O) return; outword_it.set_to_list (word->outword->blob_list ()); for (i = 0, outword_it.mark_cycle_pt (); !outword_it.cycled_list (); i++, outword_it.forward ()) { if (isupper (str[i]) || isdigit (str[i])) { out_box = outword_it.data ()->bounding_box (); if ((out_box.top () < bln_baseline_offset + bln_x_height) || (out_box.bottom () > bln_baseline_offset + bln_x_height / 4)) return; //Beware words with sub/superscripts } } for (i = 1; str[i] != '\0'; i++, outword_it.forward ()) { if ((str[i] == '0') || (str[i] == 'O')) { /* A0A */ if (non_O_upper (str[i - 1]) && non_O_upper (str[i + 1])) { str[i] = 'O'; } /* A00A */ if (non_O_upper (str[i - 1]) && ((str[i + 1] == '0') || (str[i + 1] == 'O')) && non_O_upper (str[i + 2])) { str[i] = 'O'; str[i + 1] = 'O'; i++; } /* AA0<non digit or end of word> */ if ((i > 1) && non_O_upper (str[i - 2]) && non_O_upper (str[i - 1]) && !isdigit (str[i + 1]) && (str[i + 1] != 'l') && (str[i + 1] != 'I')) { str[i] = 'O'; } /* 9O9 */ if (non_0_digit (str[i - 1]) && non_0_digit (str[i + 1])) { str[i] = '0'; } /* 9OOO */ if (non_0_digit (str[i - 1]) && ((str[i + 1] == '0') || (str[i + 1] == 'O')) && ((str[i + 2] == '0') || (str[i + 2] == 'O'))) { str[i] = '0'; str[i + 1] = '0'; str[i + 2] = '0'; i += 2; } /* 9OO<non upper> */ if (non_0_digit (str[i - 1]) && ((str[i + 1] == '0') || (str[i + 1] == 'O')) && !isupper (str[i + 2])) { str[i] = '0'; str[i + 1] = '0'; i++; } /* 9O<non upper> */ if (non_0_digit (str[i - 1]) && !isupper (str[i + 1])) { str[i] = '0'; } /* 9[.,]OOO.. */ if ((i > 1) && ((str[i - 1] == '.') || (str[i - 1] == ',')) && (isdigit (str[i - 2]) || (str[i - 2] == 'O'))) { if (str[i - 2] == 'O') str[i - 2] = '0'; while ((str[i] == 'O') || (str[i] == '0')) { str[i++] = '0'; } i--; } } }}BOOL8 non_O_upper(char c) { return isupper (c) && (c != 'O');}BOOL8 non_0_digit(char c) { return isdigit (c) && (c != '0');}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -