📄 docqual.cpp
字号:
} } } } }}/************************************************************************* * reject_whole_page() * Dont believe any of it - set the reject map to 00..00 in all words * *************************************************************************/void reject_whole_page(PAGE_RES_IT &page_res_it) { page_res_it.restart_page (); while (page_res_it.word () != NULL) { page_res_it.word ()->reject_map.rej_word_doc_rej (); page_res_it.forward (); } //whole page is rejected page_res_it.page_res->rejected = TRUE;}void tilde_crunch(PAGE_RES_IT &page_res_it) { WERD_RES *word; GARBAGE_LEVEL garbage_level; PAGE_RES_IT copy_it; BOOL8 prev_potential_marked = FALSE; BOOL8 found_terrible_word = FALSE; int dict_type; BOOL8 ok_dict_word; page_res_it.restart_page (); while (page_res_it.word () != NULL) { word = page_res_it.word (); if (crunch_early_convert_bad_unlv_chs) convert_bad_unlv_chs(word); if (crunch_early_merge_tess_fails) merge_tess_fails(word); if (word->reject_map.accept_count () != 0) { found_terrible_word = FALSE; //Forget earlier potential crunches prev_potential_marked = FALSE; } else { dict_type = dict_word (word->best_choice->string ().string ()); ok_dict_word = (dict_type > 0) && (dict_type != DOC_DAWG_PERM); garbage_level = garbage_word (word, ok_dict_word); if ((garbage_level != G_NEVER_CRUNCH) && (terrible_word_crunch (word, garbage_level))) { if (crunch_debug > 0) { tprintf ("T CRUNCHING: \"%s\"\n", word->best_choice->string ().string ()); } word->unlv_crunch_mode = CR_KEEP_SPACE; if (prev_potential_marked) { while (copy_it.word () != word) { if (crunch_debug > 0) { tprintf ("P1 CRUNCHING: \"%s\"\n", copy_it.word ()->best_choice->string (). string ()); } copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE; copy_it.forward (); } prev_potential_marked = FALSE; } found_terrible_word = TRUE; } else if ((garbage_level != G_NEVER_CRUNCH) && (potential_word_crunch (word, garbage_level, ok_dict_word))) { if (found_terrible_word) { if (crunch_debug > 0) { tprintf ("P2 CRUNCHING: \"%s\"\n", word->best_choice->string ().string ()); } word->unlv_crunch_mode = CR_KEEP_SPACE; } else if (!prev_potential_marked) { copy_it = page_res_it; prev_potential_marked = TRUE; if (crunch_debug > 1) { tprintf ("P3 CRUNCHING: \"%s\"\n", word->best_choice->string ().string ()); } } } else { found_terrible_word = FALSE; //Forget earlier potential crunches prev_potential_marked = FALSE; if (crunch_debug > 2) { tprintf ("NO CRUNCH: \"%s\"\n", word->best_choice->string ().string ()); } } } page_res_it.forward (); }}BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level) { float rating_per_ch; int adjusted_len; int crunch_mode = 0; if ((word->best_choice->string ().length () == 0) || (strspn (word->best_choice->string ().string (), " ") == word->best_choice->string ().length ())) crunch_mode = 1; else { adjusted_len = word->reject_map.length (); if (adjusted_len > crunch_rating_max) adjusted_len = crunch_rating_max; rating_per_ch = word->best_choice->rating () / adjusted_len; if (rating_per_ch > crunch_terrible_rating) crunch_mode = 2; else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) crunch_mode = 3; else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) && (garbage_level != G_OK)) crunch_mode = 4; else if ((rating_per_ch > crunch_poor_garbage_rate) && (garbage_level != G_OK)) crunch_mode = 5; } if (crunch_mode > 0) { if (crunch_debug > 2) { tprintf ("Terrible_word_crunch (%d) on \"%s\"\n", crunch_mode, word->best_choice->string ().string ()); } return TRUE; } else return FALSE;}BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word) { float rating_per_ch; int adjusted_len; char *str = (char *) word->best_choice->string ().string (); BOOL8 word_crunchable; int poor_indicator_count = 0; word_crunchable = !crunch_leave_accept_strings || (word->reject_map.length () < 3) || ((acceptable_word_string (str) == AC_UNACCEPTABLE) && !ok_dict_word); adjusted_len = word->reject_map.length (); if (adjusted_len > 10) adjusted_len = 10; rating_per_ch = word->best_choice->rating () / adjusted_len; if (rating_per_ch > crunch_pot_poor_rate) { if (crunch_debug > 2) { tprintf ("Potential poor rating on \"%s\"\n", word->best_choice->string ().string ()); } poor_indicator_count++; } if (word_crunchable && (word->best_choice->certainty () < crunch_pot_poor_cert)) { if (crunch_debug > 2) { tprintf ("Potential poor cert on \"%s\"\n", word->best_choice->string ().string ()); } poor_indicator_count++; } if (garbage_level != G_OK) { if (crunch_debug > 2) { tprintf ("Potential garbage on \"%s\"\n", word->best_choice->string ().string ()); } poor_indicator_count++; } return (poor_indicator_count >= crunch_pot_indicators);}void tilde_delete(PAGE_RES_IT &page_res_it) { WERD_RES *word; PAGE_RES_IT copy_it; BOOL8 deleting_from_bol = FALSE; BOOL8 marked_delete_point = FALSE; INT16 debug_delete_mode; CRUNCH_MODE delete_mode; INT16 x_debug_delete_mode; CRUNCH_MODE x_delete_mode; page_res_it.restart_page (); while (page_res_it.word () != NULL) { word = page_res_it.word (); delete_mode = word_deletable (word, debug_delete_mode); if (delete_mode != CR_NONE) { if (word->word->flag (W_BOL) || deleting_from_bol) { if (crunch_debug > 0) { tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode, word->best_choice->string ().string ()); } word->unlv_crunch_mode = delete_mode; deleting_from_bol = TRUE; } else if (word->word->flag (W_EOL)) { if (marked_delete_point) { while (copy_it.word () != word) { x_delete_mode = word_deletable (copy_it.word (), x_debug_delete_mode); if (crunch_debug > 0) { tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", x_debug_delete_mode, copy_it.word ()->best_choice->string (). string ()); } copy_it.word ()->unlv_crunch_mode = x_delete_mode; copy_it.forward (); } } if (crunch_debug > 0) { tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n", debug_delete_mode, word->best_choice->string ().string ()); } word->unlv_crunch_mode = delete_mode; deleting_from_bol = FALSE; marked_delete_point = FALSE; } else { if (!marked_delete_point) { copy_it = page_res_it; marked_delete_point = TRUE; } } } else { deleting_from_bol = FALSE; //Forget earlier potential crunches marked_delete_point = FALSE; } /* The following step has been left till now as the tess fails are used to determine if the word is deletable. */ if (!crunch_early_merge_tess_fails) merge_tess_fails(word); page_res_it.forward (); }}void convert_bad_unlv_chs( //word to do WERD_RES *word_res) { char *ptr; //string ptr int i; ptr = (char *) word_res->best_choice->string ().string (); for (i = 0; i < word_res->reject_map.length (); i++) { if (ptr[i] == '~') { ptr[i] = '-'; if (word_res->reject_map[i].accepted ()) word_res->reject_map[i].setrej_unlv_rej (); } if (ptr[i] == '^') { ptr[i] = ' '; if (word_res->reject_map[i].accepted ()) word_res->reject_map[i].setrej_unlv_rej (); } }}/********************************************************************** * merge_tess_fails * * Change pairs of tess failures to a single one **********************************************************************/void merge_tess_fails( //word to do WERD_RES *word_res) { char *ptr; //string ptr PBLOB_IT blob_it; //blobs int i = 0; int len; len = strlen (word_res->best_choice->string ().string ()); ASSERT_HOST (word_res->reject_map.length () == len); ASSERT_HOST (word_res->outword->blob_list ()->length () == len); ptr = (char *) word_res->best_choice->string ().string (); blob_it = word_res->outword->blob_list (); while (*ptr != '\0') { if ((*ptr == ' ') && (*(ptr + 1) == ' ')) { strcpy (ptr + 1, ptr + 2); //shuffle up word_res->reject_map.remove_pos (i); merge_blobs (blob_it.data_relative (1), blob_it.data ()); delete blob_it.extract (); //get rid of spare } else { i++; ptr++; } blob_it.forward (); } len = strlen (word_res->best_choice->string ().string ()); ASSERT_HOST (word_res->reject_map.length () == len); ASSERT_HOST (word_res->outword->blob_list ()->length () == len);}GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word) { enum STATES { JUNK, FIRST_UPPER, FIRST_LOWER, FIRST_NUM, SUBSEQUENT_UPPER, SUBSEQUENT_LOWER, SUBSEQUENT_NUM }; char *str = (char *) word->best_choice->string ().string (); STATES state = JUNK; int len = 0; int isolated_digits = 0; int isolated_alphas = 0; int bad_char_count = 0; int tess_rejs = 0; int dodgy_chars = 0; int ok_chars; char last_char = ' '; int alpha_repetition_count = 0; int longest_alpha_repetition_count = 0; int longest_lower_run_len = 0; int lower_string_count = 0; int longest_upper_run_len = 0; int upper_string_count = 0; int total_alpha_count = 0; int total_digit_count = 0; for (; *str != '\0'; str++) { len++; if (isupper (*str)) { total_alpha_count++; switch (state) { case SUBSEQUENT_UPPER: case FIRST_UPPER: state = SUBSEQUENT_UPPER; upper_string_count++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -