📄 control.cpp
字号:
reject_suspect_fullstops (page_res_it.word ()); page_res_it.rej_stat_word (); chars_in_word = page_res_it.word ()->reject_map.length (); rejects_in_word = page_res_it.word ()->reject_map.reject_count (); blob_quality = word_blob_quality (page_res_it.word (), page_res_it.row ()->row); doc_blob_quality += blob_quality; outline_errs = word_outline_errs (page_res_it.word ()); doc_outline_errs += outline_errs; word_char_quality (page_res_it.word (), page_res_it.row ()->row, &all_char_quality, &accepted_all_char_quality); doc_char_quality += all_char_quality; permuter_type = page_res_it.word ()->best_choice->permuter (); if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) || (permuter_type == USER_DAWG_PERM)) { good_char_count += chars_in_word - rejects_in_word; doc_good_char_quality += accepted_all_char_quality; } check_debug_pt (page_res_it.word (), 80); if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) page_res_it.word ()->reject_map.rej_word_bad_quality (); check_debug_pt (page_res_it.word (), 90); page_res_it.forward (); } page_res_it.restart_page (); while (!tessedit_test_adaption && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) { if (monitor != NULL) monitor->ocr_alive = TRUE; if (tessedit_cluster_adaption_mode != 0) adapt_to_good_samples (page_res_it.word (), &char_clusters, &chars_waiting); page_res_it.forward (); } #ifndef SECURE_NAMES if (tessedit_debug_quality_metrics) { tprintf ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", page_res->char_count, page_res->rej_count, page_res->rej_count / (float) page_res->char_count, doc_blob_quality, doc_blob_quality / (float) page_res->char_count, doc_outline_errs, doc_outline_errs / (float) page_res->char_count, doc_char_quality, doc_char_quality / (float) page_res->char_count, doc_good_char_quality, good_char_count > 0 ? doc_good_char_quality / (float) good_char_count : 0.0); } #endif good_quality_doc = (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc) && (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) && (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) && (doc_char_quality / (float) page_res->char_count >= quality_char_pc); /* Do whole document or whole block rejection pass*/ if (!tessedit_test_adaption) { set_global_loc_code(LOC_DOC_BLK_REJ); quality_based_rejection(page_res_it, good_quality_doc); } font_recognition_pass(page_res_it); /* Write results pass */ set_global_loc_code(LOC_WRITE_RESULTS); // This is now redundant, but retained commented so show how to obtain // bounding boxes and style information. // output_pass (page_res_it, false);}/********************************************************************** * classify_word_pass1 * * Baseline normalize the word and pass it to Tess. **********************************************************************/void classify_word_pass1( //recog one word WERD_RES *word, //word to do ROW *row, BOOL8 cluster_adapt, CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { WERD *bln_word; //baseline norm copy //detailed results BLOB_CHOICE_LIST_CLIST blob_choices; BOOL8 adapt_ok; const char *rejmap; INT16 index; STRING mapstr = ""; char *match_string; char word_string[1024]; if (matcher_fp != NULL) { fgets (word_string, 1023, correct_fp); if ((match_string = strchr (word_string, '\r')) != NULL) *match_string = '\0'; if ((match_string = strchr (word_string, '\n')) != NULL) *match_string = '\0'; if (word_string[0] != '\0') { word->word->set_text (word_string); word_answer = (char *) word->word->text (); } else word_answer = NULL; } check_debug_pt (word, 0); matcher_pass = 0; bln_word = make_bln_copy (word->word, row, row->x_height (), &word->denorm); word->best_choice = tess_segment_pass1 (bln_word, &word->denorm, tess_default_matcher, word->raw_choice, &blob_choices, word->outword); /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. */ if ((word->best_choice->string ().length () == 0) || (strspn (word->best_choice->string ().string (), " ") == word->best_choice->string ().length ())) { word->done = FALSE; //Try again on pass2 - adaption may help word->tess_failed = TRUE; word->reject_map.initialise (word->best_choice->string ().length ()); word->reject_map.rej_word_tess_failure (); } else { word->tess_failed = FALSE; if ((word->best_choice->string ().length () != word->outword->blob_list ()->length ()) || (word->best_choice->string ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", word->best_choice->string ().string (), word->best_choice->string ().length (), word->outword->blob_list ()->length (), blob_choices.length ()); } ASSERT_HOST (word->best_choice->string ().length () == word->outword->blob_list ()->length ()); ASSERT_HOST (word->best_choice->string ().length () == blob_choices.length ()); /* The adaption step used to be here. It has been moved to after make_reject_map so that we know whether the word will be accepted in the first pass or not. This move will PREVENT adaption to words containing double quotes because the word will not be identical to what tess thinks its best choice is. (See CurrentBestChoiceIs in danj/microfeatures/stopper.c which is used by AdaptableWord in danj/microfeatures/adaptmatch.c) */ if (word->word->flag (W_REP_CHAR)) { fix_rep_char(word); } else { fix_quotes ((char *) word->best_choice->string ().string (), //turn to double word->outword, &blob_choices); if (tessedit_fix_hyphens) //turn 2 to 1 fix_hyphens ((char *) word->best_choice->string ().string (), word->outword, &blob_choices); record_certainty (word->best_choice->certainty (), 1); //accounting word->tess_accepted = tess_acceptable_word (word->best_choice, word->raw_choice); word->tess_would_adapt = tess_adaptable_word (word->outword, word->best_choice, word->raw_choice); // Also sets word->done flag make_reject_map (word, &blob_choices, row, 1); adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode); if (cluster_adapt) adapt_to_good_samples(word, char_clusters, chars_waiting); if (adapt_ok || tessedit_tess_adapt_to_rejmap) { if (!tessedit_tess_adapt_to_rejmap) rejmap = NULL; else { ASSERT_HOST (word->reject_map.length () == word->best_choice->string ().length ()); for (index = 0; index < word->reject_map.length (); index++) { if (adapt_ok || word->reject_map[index].accepted ()) mapstr += '1'; else mapstr += '0'; } rejmap = mapstr.string (); } //adapt to it tess_adapter (word->outword, &word->denorm, word->best_choice->string ().string (), word->raw_choice->string ().string (), rejmap); } if (tessedit_enable_doc_dict) tess_add_doc_word (word->best_choice); set_word_fonts(word, &blob_choices); } } if (tessedit_print_text) { write_cooked_text (bln_word, word->best_choice->string (), word->done, FALSE, stdout); } delete bln_word; blob_choices.deep_clear ();}/********************************************************************** * classify_word_pass2 * * Control what to do with the word in pass 2 **********************************************************************/void classify_word_pass2( //word to do WERD_RES *word, ROW *row) { BOOL8 done_this_pass = FALSE; WERD_RES new_x_ht_word (word->word); float new_x_ht = 0.0; INT16 old_xht_reject_count; INT16 new_xht_reject_count; INT16 old_xht_accept_count; INT16 new_xht_accept_count; BOOL8 accept_new_x_ht = FALSE; INT16 old_chs_in_wd; INT16 new_chs_in_wd; INT16 old_word_quality; INT16 new_word_quality; INT16 dummy; set_global_subloc_code(SUBLOC_NORM); check_debug_pt (word, 30); if (!word->done || tessedit_training_tess || tessedit_training_wiseowl || tessedit_dump_choices) { word->x_height = row->x_height (); word->caps_height = 0.0; if (word->outword != NULL) { delete word->outword; //get rid of junk delete word->best_choice; delete word->raw_choice; } match_word_pass2 (word, row, row->x_height ()); done_this_pass = TRUE; check_debug_pt (word, 40); } if (!word->tess_failed && !word->word->flag (W_REP_CHAR)) { set_global_subloc_code(SUBLOC_FIX_XHT); if ((tessedit_xht_fiddles_on_done_wds || !word->done) && (tessedit_xht_fiddles_on_no_rej_wds || (word->reject_map.reject_count () > 0))) { if ((x_ht_check_word_occ >= 2) && word_occ_first) check_block_occ(word); if (tessedit_redo_xheight) re_estimate_x_ht(word, &new_x_ht); if (((x_ht_check_word_occ >= 2) && !word_occ_first) || ((x_ht_check_word_occ >= 1) && (new_x_ht > 0))) check_block_occ(word); } if (new_x_ht > 0) { old_chs_in_wd = word->reject_map.length (); /* Re-estimated x_ht error suggests a rematch is worthwhile. */ new_x_ht_word.x_height = new_x_ht; new_x_ht_word.caps_height = 0.0; match_word_pass2 (&new_x_ht_word, row, new_x_ht_word.x_height); if (!new_x_ht_word.tess_failed) { if ((x_ht_check_word_occ >= 1) && word_occ_first) check_block_occ(&new_x_ht_word); re_estimate_x_ht(&new_x_ht_word, &new_x_ht); if ((x_ht_check_word_occ >= 1) && !word_occ_first) check_block_occ(&new_x_ht_word); old_xht_reject_count = word->reject_map.reject_count (); old_xht_accept_count = old_chs_in_wd - old_xht_reject_count; new_xht_reject_count = new_x_ht_word.reject_map.reject_count (); new_chs_in_wd = new_x_ht_word.reject_map.length (); new_xht_accept_count = new_chs_in_wd - new_xht_reject_count; accept_new_x_ht = ((new_xht_accept_count > old_xht_accept_count) || ((new_xht_accept_count == old_xht_accept_count) && (new_xht_accept_count > 0))) && (!new_x_ht_word.guessed_x_ht || !new_x_ht_word.guessed_caps_ht); if (accept_new_x_ht && x_ht_quality_check) { word_char_quality(word, row, &old_word_quality, &dummy); word_char_quality(&new_x_ht_word, row, &new_word_quality, &dummy); if (old_word_quality > new_word_quality) accept_new_x_ht = FALSE; } if (accept_new_x_ht && (x_ht_stringency > 0)) { accept_new_x_ht = (count_alphanums (&new_x_ht_word) > x_ht_stringency); if (!accept_new_x_ht && rej_use_xht) { if (debug_x_ht_level >= 1) tprintf ("Failed stringency test so reject original word\n"); word->reject_map.rej_word_xht_fixup (); } } #ifndef SECURE_NAMES if (debug_x_ht_level >= 1) { tprintf ("New XHT Match:: %s ", word->best_choice->string ().string ()); word->reject_map.print (debug_fp); tprintf (" -> %s ", new_x_ht_word.best_choice->string ().string ()); new_x_ht_word.reject_map.print (debug_fp); tprintf (" %s->%s %s %s\n", word->guessed_x_ht ? "GUESS" : "CERT", new_x_ht_word.guessed_x_ht ? "GUESS" : "CERT", new_x_ht > 0.1 ? "STILL DOUBT" : "OK", accept_new_x_ht ? "ACCEPTED" : ""); } #endif } if (accept_new_x_ht) { /* The new x_ht is deemed superior so put the final results in the real word and destroy the old results */ delete word->outword; //get rid of junk word->outword = new_x_ht_word.outword; word->denorm = new_x_ht_word.denorm; delete word->best_choice; word->best_choice = new_x_ht_word.best_choice; delete word->raw_choice; word->raw_choice = new_x_ht_word.raw_choice; word->reject_map = new_x_ht_word.reject_map; word->done = new_x_ht_word.done; done_this_pass = TRUE; } else { /* The new x_ht is no better, so destroy the copy word and put any uncertain x or cap ht estimate back to default. (I.e. dont blame me if its bad!) Conditionally, use any ammended block occ chars. */ //get rid of junk delete new_x_ht_word.outword; delete new_x_ht_word.best_choice; delete new_x_ht_word.raw_choice; } //to keep new destructor happy new_x_ht_word.outword = NULL; //to keep new destructor happy new_x_ht_word.best_choice = NULL; //to keep new destructor happy new_x_ht_word.raw_choice = NULL; if (rej_mostly_reject_mode == 2) { reject_mostly_rejects(word); tprintf ("Rejecting mostly rejects on %s ", word->best_choice->string ().string ()); } } set_global_subloc_code(SUBLOC_NORM); if (done_this_pass && !word->done && tessedit_save_stats) SaveBadWord (word->best_choice->string ().string (), word->best_choice->certainty ()); record_certainty (word->best_choice->certainty (), 2); //accounting }#ifndef GRAPHICS_DISABLED if (tessedit_draw_outwords) { if (fx_win == NO_WINDOW) create_fx_win(); clear_fx_win(); word->outword->plot (fx_win); make_picture_current(fx_win); }#endif set_global_subloc_code(SUBLOC_NORM); if (tessedit_print_text) { write_cooked_text (word->outword, word->best_choice->string (), word->done, done_this_pass, stdout); } check_debug_pt (word, 50);}/********************************************************************** * match_word_pass2 * * Baseline normalize the word and pass it to Tess. **********************************************************************/void match_word_pass2( //recog one word WERD_RES *word, //word to do ROW *row, float x_height) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -