control.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 1,843 行 · 第 1/5 页
CPP
1,843 行
/* Pass 2 */ page_res_it.restart_page (); word_index = 0; while (!tessedit_test_adaption && page_res_it.word () != NULL) { set_global_loc_code(LOC_PASS2); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 80 + 10 * word_index / word_count; if ((monitor->end_time != 0 && clock() > monitor->end_time) || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, dict_words))) return; }//changed by jetsoft//specific to its needs to extract one word when need if (target_word_box) { TBOX current_word_box=page_res_it.word ()->word->bounding_box(); FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); if (!target_word_box->contains(center_pt)) { page_res_it.forward (); continue; } }//end jetsoft classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row); if (tessedit_em_adaption_mode > 0) collect_ems_for_adaption (page_res_it.word (), &em_clusters, &ems_waiting); if (tessedit_cluster_adapt_after_pass2 && tessedit_cluster_adaption_mode != 0) collect_characters_for_adaption (page_res_it.word (), &char_clusters, &chars_waiting); page_res_it.forward (); } /* Another pass */ set_global_loc_code(LOC_FUZZY_SPACE); if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word) fix_fuzzy_spaces(monitor, word_count, page_res); if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0) // Initially ems only print_em_stats(&em_clusters, &ems_waiting); /* Pass 3 - used for checking confusion sets */ page_res_it.restart_page (); word_index = 0; while (!tessedit_test_adaption && page_res_it.word () != NULL) { set_global_loc_code(LOC_MM_ADAPT); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 95 + 5 * word_index / word_count; } check_debug_pt (page_res_it.word (), 70); /* Use good matches to sort out confusions *///changed by jetsoft//specific to its needs to extract one word when need if (target_word_box) { TBOX current_word_box=page_res_it.word ()->word->bounding_box(); FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); if (!target_word_box->contains(center_pt)) { page_res_it.forward (); continue; } }// end jetsoft if (tessedit_em_adaption_mode != 0) adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting); if (tessedit_cluster_adapt_after_pass2 && tessedit_cluster_adaption_mode != 0) adapt_to_good_samples (page_res_it.word (), &char_clusters, &chars_waiting); if (tessedit_reject_fullstops && strchr (page_res_it.word ()->best_choice->string ().string (), '.') != NULL) reject_all_fullstops (page_res_it.word ()); else if (tessedit_reject_suspect_fullstops && strchr (page_res_it.word ()->best_choice->string (). string (), '.') != NULL) reject_suspect_fullstops (page_res_it.word ()); page_res_it.rej_stat_word (); chars_in_word = page_res_it.word ()->reject_map.length (); rejects_in_word = page_res_it.word ()->reject_map.reject_count (); blob_quality = word_blob_quality (page_res_it.word (), page_res_it.row ()->row); doc_blob_quality += blob_quality; outline_errs = word_outline_errs (page_res_it.word ()); doc_outline_errs += outline_errs; word_char_quality (page_res_it.word (), page_res_it.row ()->row, &all_char_quality, &accepted_all_char_quality); doc_char_quality += all_char_quality; uinT8 permuter_type = page_res_it.word ()->best_choice->permuter (); if ((permuter_type == SYSTEM_DAWG_PERM) || (permuter_type == FREQ_DAWG_PERM) || (permuter_type == USER_DAWG_PERM)) { good_char_count += chars_in_word - rejects_in_word; doc_good_char_quality += accepted_all_char_quality; } check_debug_pt (page_res_it.word (), 80); if (tessedit_reject_bad_qual_wds && (blob_quality == 0) && (outline_errs >= chars_in_word)) page_res_it.word ()->reject_map.rej_word_bad_quality (); check_debug_pt (page_res_it.word (), 90); page_res_it.forward (); } page_res_it.restart_page (); while (!tessedit_test_adaption && tessedit_cluster_adapt_after_pass3 && page_res_it.word () != NULL) { if (monitor != NULL) monitor->ocr_alive = TRUE;//changed by jetsoft//specific to its needs to extract one word when need if (target_word_box) { TBOX current_word_box=page_res_it.word ()->word->bounding_box(); FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); if (!target_word_box->contains(center_pt)) { page_res_it.forward (); continue; } }//end jetsoft if (tessedit_cluster_adaption_mode != 0) adapt_to_good_samples (page_res_it.word (), &char_clusters, &chars_waiting); page_res_it.forward (); } #ifndef SECURE_NAMES if (tessedit_debug_quality_metrics) { tprintf ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n", page_res->char_count, page_res->rej_count, page_res->rej_count / (float) page_res->char_count, doc_blob_quality, doc_blob_quality / (float) page_res->char_count, doc_outline_errs, doc_outline_errs / (float) page_res->char_count, doc_char_quality, doc_char_quality / (float) page_res->char_count, doc_good_char_quality, good_char_count > 0 ? doc_good_char_quality / (float) good_char_count : 0.0); } #endif BOOL8 good_quality_doc = (page_res->rej_count / (float) page_res->char_count <= quality_rej_pc) && (doc_blob_quality / (float) page_res->char_count >= quality_blob_pc) && (doc_outline_errs / (float) page_res->char_count <= quality_outline_pc) && (doc_char_quality / (float) page_res->char_count >= quality_char_pc); /* Do whole document or whole block rejection pass*/ if (!tessedit_test_adaption) { set_global_loc_code(LOC_DOC_BLK_REJ); quality_based_rejection(page_res_it, good_quality_doc); } font_recognition_pass(page_res_it); /* Write results pass */ set_global_loc_code(LOC_WRITE_RESULTS); // This is now redundant, but retained commented so show how to obtain // bounding boxes and style information.////changed by jetsoft//needed for dll to output memory structure if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv)) output_pass(page_res_it, ocr_char_space() > 0, target_word_box);// end jetsoft}/********************************************************************** * classify_word_pass1 * * Baseline normalize the word and pass it to Tess. **********************************************************************/void classify_word_pass1( //recog one word WERD_RES *word, //word to do ROW *row, BOOL8 cluster_adapt, CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { WERD *bln_word; //baseline norm copy //detailed results BLOB_CHOICE_LIST_CLIST local_blob_choices; BLOB_CHOICE_LIST_CLIST *blob_choices; BOOL8 adapt_ok; const char *rejmap; inT16 index; STRING mapstr = ""; char *match_string; char word_string[1024]; if (save_best_choices) blob_choices = new BLOB_CHOICE_LIST_CLIST(); else blob_choices = &local_blob_choices; if (matcher_fp != NULL) { fgets (word_string, 1023, correct_fp); if ((match_string = strchr (word_string, '\r')) != NULL) *match_string = '\0'; if ((match_string = strchr (word_string, '\n')) != NULL) *match_string = '\0'; if (word_string[0] != '\0') { word->word->set_text (word_string); word_answer = (char *) word->word->text (); } else word_answer = NULL; } check_debug_pt (word, 0); matcher_pass = 0; bln_word = make_bln_copy (word->word, row, word->x_height, &word->denorm); word->best_choice = tess_segment_pass1 (bln_word, &word->denorm, tess_default_matcher, word->raw_choice, blob_choices, word->outword); /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. */ if ((word->best_choice->lengths ().length () == 0) || (strspn (word->best_choice->string ().string (), " ") == word->best_choice->string ().length ())) { word->done = FALSE; //Try again on pass2 - adaption may help word->tess_failed = TRUE; word->reject_map.initialise (word->best_choice->lengths ().length ()); word->reject_map.rej_word_tess_failure (); } else { word->tess_failed = FALSE; if ((word->best_choice->lengths ().length () != word->outword->blob_list ()->length ()) || (word->best_choice->lengths ().length () != blob_choices->length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", word->best_choice->string ().string (), word->best_choice->lengths ().length (), word->outword->blob_list ()->length (), blob_choices->length ()); } ASSERT_HOST (word->best_choice->lengths ().length () == word->outword->blob_list ()->length ()); ASSERT_HOST (word->best_choice->lengths ().length () == blob_choices->length ()); /* The adaption step used to be here. It has been moved to after make_reject_map so that we know whether the word will be accepted in the first pass or not. This move will PREVENT adaption to words containing double quotes because the word will not be identical to what tess thinks its best choice is. (See CurrentBestChoiceIs in danj/microfeatures/stopper.c which is used by AdaptableWord in danj/microfeatures/adaptmatch.c) */ if (word->word->flag (W_REP_CHAR)) { fix_rep_char(word); } else { fix_quotes (word->best_choice, //turn to double word->outword, blob_choices); if (tessedit_fix_hyphens) //turn 2 to 1 fix_hyphens (word->best_choice, word->outword, blob_choices); record_certainty (word->best_choice->certainty (), 1); //accounting word->tess_accepted = tess_acceptable_word (word->best_choice, word->raw_choice); word->tess_would_adapt = tess_adaptable_word (word->outword, word->best_choice, word->raw_choice); // Also sets word->done flag make_reject_map (word, blob_choices, row, 1); adapt_ok = word_adaptable (word, tessedit_tess_adaption_mode); if (cluster_adapt) adapt_to_good_samples(word, char_clusters, chars_waiting); if (adapt_ok || tessedit_tess_adapt_to_rejmap) { if (!tessedit_tess_adapt_to_rejmap) rejmap = NULL; else { ASSERT_HOST (word->reject_map.length () == word->best_choice->lengths ().length ()); for (index = 0; index < word->reject_map.length (); index++) { if (adapt_ok || word->reject_map[index].accepted ()) mapstr += '1'; else mapstr += '0'; } rejmap = mapstr.string (); } //adapt to it tess_adapter (word->outword, &word->denorm, *word->best_choice, *word->raw_choice, rejmap); } if (tessedit_enable_doc_dict) tess_add_doc_word (word->best_choice); set_word_fonts(word, blob_choices); } }#if 0 if (tessedit_print_text) { write_cooked_text (bln_word, word->best_choice->string (), word->done, FALSE, stdout); }#endif delete bln_word; // Save best choices in the WERD_CHOICE if needed if (blob_choices != &local_blob_choices) word->best_choice->set_blob_choices(blob_choices); else blob_choices->deep_clear();}/********************************************************************** * classify_word_pass2 * * Control what to do with the word in pass 2 **********************************************************************/void classify_word_pass2( //word to do
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?