📄 docqual.cpp
字号:
delete_word(tessword); //get rid of it initial_it.set_to_list (init_word->blob_list ()); init_blobs_left = initial_it.length (); outword_it.set_to_list (word->outword->blob_list ()); for (outword_it.mark_cycle_pt (); !outword_it.cycled_list (); outword_it.forward ()) { out_box = outword_it.data ()->bounding_box (); /* Skip any initial blobs LEFT of current outword blob */ while (!initial_it.at_last () && (initial_it.data ()->bounding_box ().left () < out_box.left ())) { initial_it.forward (); init_blobs_left--; } /* See if current outword blob matches any initial blob with the same left coord. (Normally only one but possibly more - in unknown order) */ i = 0; matched = FALSE; do { test_blob = initial_it.data_relative (i++); matched = crude_match_blobs (test_blob, outword_it.data ()); if (matched && (word->reject_map[j].accept_if_good_quality ()) && (docqual_excuse_outline_errs || (count_outline_errs (word->best_choice->string ()[j], outword_it.data ()->out_list ()-> length ()) == 0))) word->reject_map[j].setrej_quality_accept (); } while (!matched && (init_blobs_left - i > 0) && (i < 129) && !initial_it.at_last () && test_blob->bounding_box ().left () == out_box.left ()); j++; } delete init_word;}void print_boxes(WERD *word) { PBLOB_IT it; BOX box; it.set_to_list (word->blob_list ()); for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { box = it.data ()->bounding_box (); box.print (); }}INT16 count_outline_errs(char c, INT16 outline_count) { int expected_outline_count; if (STRING (outlines_odd).contains (c)) return 0; //Dont use this char else if (STRING (outlines_2).contains (c)) expected_outline_count = 2; else expected_outline_count = 1; return abs (outline_count - expected_outline_count);}void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) { if ((tessedit_good_quality_unrej && good_quality_doc)) unrej_good_quality_words(page_res_it); doc_and_block_rejection(page_res_it, good_quality_doc); page_res_it.restart_page (); while (page_res_it.word () != NULL) { insert_rej_cblobs (page_res_it.word ()); page_res_it.forward (); } if (unlv_tilde_crunching) { tilde_crunch(page_res_it); tilde_delete(page_res_it); }}/************************************************************************* * unrej_good_quality_words() * Accept potential rejects in words which pass the following checks: * - Contains a potential reject * - Word looks like a sensible alpha word. * - Word segmentation is the same as the original image * - All characters have the expected number of outlines * NOTE - the rejection counts are recalculated after unrejection * - CANT do it in a single pass without a bit of fiddling * - keep it simple but inefficient *************************************************************************/void unrej_good_quality_words( //unreject potential PAGE_RES_IT &page_res_it) { WERD_RES *word; ROW_RES *current_row; BLOCK_RES *current_block; int i; page_res_it.restart_page (); while (page_res_it.word () != NULL) { check_debug_pt (page_res_it.word (), 100); if (bland_unrej) { word = page_res_it.word (); for (i = 0; i < word->reject_map.length (); i++) { if (word->reject_map[i].accept_if_good_quality ()) word->reject_map[i].setrej_quality_accept (); } page_res_it.forward (); } else if ((page_res_it.row ()->char_count > 0) && ((page_res_it.row ()->rej_count / (float) page_res_it.row ()->char_count) <= quality_rowrej_pc)) { word = page_res_it.word (); if (word->reject_map.quality_recoverable_rejects () && (tessedit_unrej_any_wd || acceptable_word_string (word->best_choice->string ().string ()) != AC_UNACCEPTABLE)) { unrej_good_chs (word, page_res_it.row ()->row); } page_res_it.forward (); } else { /* Skip to end of dodgy row */ current_row = page_res_it.row (); while ((page_res_it.word () != NULL) && (page_res_it.row () == current_row)) page_res_it.forward (); } check_debug_pt (page_res_it.word (), 110); } page_res_it.restart_page (); page_res_it.page_res->char_count = 0; page_res_it.page_res->rej_count = 0; current_block = NULL; current_row = NULL; while (page_res_it.word () != NULL) { if (current_block != page_res_it.block ()) { current_block = page_res_it.block (); current_block->char_count = 0; current_block->rej_count = 0; } if (current_row != page_res_it.row ()) { current_row = page_res_it.row (); current_row->char_count = 0; current_row->rej_count = 0; current_row->whole_word_rej_count = 0; } page_res_it.rej_stat_word (); page_res_it.forward (); }}/************************************************************************* * doc_and_block_rejection() * * If the page has too many rejects - reject all of it. * If any block has too many rejects - reject all words in the block *************************************************************************/void doc_and_block_rejection( //reject big chunks PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) { INT16 block_no = 0; INT16 row_no = 0; BLOCK_RES *current_block; ROW_RES *current_row; BOOL8 rej_word; BOOL8 prev_word_rejected; INT16 char_quality; INT16 accepted_char_quality; if ((page_res_it.page_res->rej_count * 100.0 / page_res_it.page_res->char_count) > tessedit_reject_doc_percent) { reject_whole_page(page_res_it); #ifndef SECURE_NAMES if (tessedit_debug_doc_rejection) { tprintf ("REJECT ALL #chars: %d #Rejects: %d; \n", page_res_it.page_res->char_count, page_res_it.page_res->rej_count); } #endif } else { #ifndef SECURE_NAMES if (tessedit_debug_doc_rejection) tprintf ("NO PAGE REJECTION #chars: %d # Rejects: %d; \n", page_res_it.page_res->char_count, page_res_it.page_res->rej_count); #endif /* Walk blocks testing for block rejection */ page_res_it.restart_page (); while (page_res_it.word () != NULL) { current_block = page_res_it.block (); if (current_block->block->text_region () != NULL) block_no = current_block->block->text_region ()->id_no (); else block_no = -1; if ((page_res_it.block ()->char_count > 0) && ((page_res_it.block ()->rej_count * 100.0 / page_res_it.block ()->char_count) > tessedit_reject_block_percent)) { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n", block_no, page_res_it.block ()->char_count, page_res_it.block ()->rej_count); #endif prev_word_rejected = FALSE; while ((page_res_it.word () != NULL) && (page_res_it.block () == current_block)) { if (tessedit_preserve_blk_rej_perfect_wds) { rej_word = (page_res_it.word ()->reject_map.reject_count () > 0) || (page_res_it.word ()->reject_map.length () < tessedit_preserve_min_wd_len); if (rej_word && tessedit_dont_blkrej_good_wds && !(page_res_it.word ()->reject_map.length () < tessedit_preserve_min_wd_len) && (acceptable_word_string (page_res_it.word ()->best_choice->string (). string ()) != AC_UNACCEPTABLE)) { word_char_quality (page_res_it.word (), page_res_it.row ()->row, &char_quality, &accepted_char_quality); rej_word = char_quality != page_res_it.word ()->reject_map.length (); } } else rej_word = TRUE; if (rej_word) { /* Reject spacing if both current and prev words are rejected. NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated more space errors. */ if (tessedit_use_reject_spaces && prev_word_rejected && (page_res_it.prev_row () == page_res_it.row ()) && (page_res_it.word ()->word->space () == 1)) page_res_it.word ()->reject_spaces = TRUE; page_res_it.word ()->reject_map.rej_word_block_rej (); } prev_word_rejected = rej_word; page_res_it.forward (); } } else { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n", block_no, page_res_it.block ()->char_count, page_res_it.block ()->rej_count); #endif /* Walk rows in block testing for row rejection */ row_no = 0; while ((page_res_it.word () != NULL) && (page_res_it.block () == current_block)) { current_row = page_res_it.row (); row_no++; /* Reject whole row if: fraction of chars on row which are rejected exceed a limit AND fraction rejects which occur in WHOLE WERD rejects is LESS THAN a limit */ if ((page_res_it.row ()->char_count > 0) && ((page_res_it.row ()->rej_count * 100.0 / page_res_it.row ()->char_count) > tessedit_reject_row_percent) && ((page_res_it.row ()->whole_word_rej_count * 100.0 / page_res_it.row ()->rej_count) < tessedit_whole_wd_rej_row_percent)) { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("REJECTING ROW %d #chars: %d; #Rejects: %d\n", row_no, page_res_it.row ()->char_count, page_res_it.row ()->rej_count); #endif prev_word_rejected = FALSE; while ((page_res_it.word () != NULL) && (page_res_it.row () == current_row)) { /* Preserve words on good docs unless they are mostly rejected*/ if (!tessedit_row_rej_good_docs && good_quality_doc) { rej_word = page_res_it.word ()->reject_map. reject_count () / (float) page_res_it.word ()->reject_map. length () > tessedit_good_doc_still_rowrej_wd; } /* Preserve perfect words anyway */ else if (tessedit_preserve_row_rej_perfect_wds) { rej_word = (page_res_it.word ()->reject_map. reject_count () > 0) || (page_res_it.word ()->reject_map. length () < tessedit_preserve_min_wd_len); if (rej_word && tessedit_dont_rowrej_good_wds && !(page_res_it.word ()->reject_map. length () < tessedit_preserve_min_wd_len) && (acceptable_word_string (page_res_it.word ()->best_choice-> string ().string ()) != AC_UNACCEPTABLE)) { word_char_quality (page_res_it.word (), page_res_it.row ()->row, &char_quality, &accepted_char_quality); rej_word = char_quality != page_res_it.word ()->reject_map.length (); } } else rej_word = TRUE; if (rej_word) { /* Reject spacing if both current and prev words are rejected. NOTE - this is NOT restricted to FUZZY spaces. - When tried this generated more space errors. */ if (tessedit_use_reject_spaces && prev_word_rejected && (page_res_it.prev_row () == page_res_it.row ()) && (page_res_it.word ()->word->space () == 1)) page_res_it.word ()->reject_spaces = TRUE; page_res_it.word ()->reject_map. rej_word_row_rej(); } prev_word_rejected = rej_word; page_res_it.forward (); } } else { #ifndef SECURE_NAMES if (tessedit_debug_block_rejection) tprintf ("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n", row_no, page_res_it.row ()->char_count, page_res_it.row ()->rej_count); #endif while ((page_res_it.word () != NULL) && (page_res_it.row () == current_row)) page_res_it.forward ();
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -