applybox.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 892 行 · 第 1/3 页
CPP
892 行
} else ok_row_count++; } } min_samples = 9999; for (i = 0; i < unicharset_boxes.size(); i++) { if (tgt_char_counts[i] > labelled_char_counts[i]) { if (labelled_char_counts[i] <= 1) { tprintf ("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d:\n", labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]); PrintString(unicharset_boxes.id_to_unichar(i)); } else { rebalance_needed = TRUE; if (applybox_debug > 0) tprintf ("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n", unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]); } } if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) { min_samples = labelled_char_counts[i]; *min_uch_id = i; } } while (applybox_rebalance && rebalance_needed) { block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); left = word->bounding_box ().left (); if (*word->text () != '\0') uch_id = unicharset_boxes.unichar_to_id(word->text ()); else uch_id = -1; at_dupe_of_prev_word = ((row == prev_row) && (left = prev_left) && (uch_id == prev_uch_id)); if ((uch_id != -1) && (labelled_char_counts[uch_id] > 1) && (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) && (!at_dupe_of_prev_word)) { /* Duplicate the word to rebalance the labelled samples */ if (applybox_debug > 9) { tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id)); word->bounding_box ().print (); } duplicate_word = new WERD; *duplicate_word = *word; word_it.add_after_then_move (duplicate_word); rebalance_count++; labelled_char_counts[uch_id]++; } prev_row = row; prev_left = left; prev_uch_id = uch_id; } } } rebalance_needed = FALSE; for (i = 0; i < unicharset_boxes.size(); i++) { if ((tgt_char_counts[i] > labelled_char_counts[i]) && (labelled_char_counts[i] > 1)) { rebalance_needed = TRUE; break; } } } /* Now final check - count labelled blobs */ final_labelled_blob_count = 0; block_it.set_to_list (block_list); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); word_it.sort (word_comparator); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen (word->text ()) > 0) && (word->gblob_list ()->length () == 1)) final_labelled_blob_count++; } } }}void report_failed_box(inT16 boxfile_lineno, inT16 boxfile_charno, TBOX box, const char *box_ch, const char *err_msg) { if (applybox_debug > 4) tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n", boxfile_lineno, boxfile_charno, box_ch, box.left (), box.bottom (), box.right (), box.top (), err_msg);}void apply_box_training(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; WERD_IT word_it; WERD *word; WERD *bln_word; WERD copy_outword; // copy to denorm PBLOB_IT blob_it; DENORM denorm; inT16 count = 0; char unichar[UNICHAR_LEN + 1]; unichar[UNICHAR_LEN] = '\0'; tprintf ("Generating training data\n"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); if ((strlen (word->text ()) > 0) && (word->gblob_list ()->length () == 1)) { /* Here is a word with a single unichar label and a single blob so train on it */ bln_word = make_bln_copy (word, row, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); strncpy(unichar, word->text (), UNICHAR_LEN); tess_training_tester (blob_it.data (), //single blob &denorm, TRUE, //correct unichar, //correct character strlen(unichar), //character length NULL); copy_outword = *(bln_word); copy_outword.baseline_denormalise (&denorm); blob_it.set_to_list (copy_outword.blob_list ()); delete bln_word; count++; } } } } tprintf ("Generated training data for %d blobs\n", count);}void apply_box_testing(BLOCK_LIST *block_list) { BLOCK_IT block_it(block_list); ROW_IT row_it; ROW *row; inT16 row_count = 0; WERD_IT word_it; WERD *word; WERD *bln_word; inT16 word_count = 0; PBLOB_IT blob_it; DENORM denorm; inT16 count = 0; char ch[2]; WERD *outword; //bln best choice //segmentation WERD_CHOICE *best_choice; //tess output WERD_CHOICE *raw_choice; //top choice permuter //detailed results BLOB_CHOICE_LIST_CLIST blob_choices; inT16 char_count = 0; inT16 correct_count = 0; inT16 err_count = 0; inT16 rej_count = 0; #ifndef SECURE_NAMES WERDSTATS wordstats; //As from newdiff #endif char tess_rej_str[3]; char tess_long_str[3]; ch[1] = '\0'; strcpy (tess_rej_str, "|A"); strcpy (tess_long_str, "|B"); for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); row_count++; word_count = 0; word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); word_count++; if ((strlen (word->text ()) == 1) && !STRING (applybox_test_exclusions).contains (*word->text ()) && (word->gblob_list ()->length () == 1)) { /* Here is a word with a single char label and a single blob so test it */ bln_word = make_bln_copy (word, row, row->x_height (), &denorm); blob_it.set_to_list (bln_word->blob_list ()); ch[0] = *word->text (); char_count++; best_choice = tess_segment_pass1 (bln_word, &denorm, tess_default_matcher, raw_choice, &blob_choices, outword); /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. */ if ((best_choice->lengths ().length () == 0) || (strspn (best_choice->string ().string (), " ") == best_choice->string ().length ())) { rej_count++; tprintf ("%d:%d: \"%s\" -> TESS FAILED\n", row_count, word_count, ch); #ifndef SECURE_NAMES wordstats.word (tess_rej_str, 2, ch, 1); #endif } else { if ((best_choice->lengths ().length () != outword->blob_list ()->length ()) || (best_choice->lengths ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", best_choice->string ().string (), best_choice->lengths ().length (), outword->blob_list ()->length (), blob_choices.length ()); } ASSERT_HOST (best_choice->lengths ().length () == outword->blob_list ()->length ()); ASSERT_HOST (best_choice->lengths ().length () == blob_choices.length ()); fix_quotes (best_choice, //turn to double outword, &blob_choices); if (strcmp (best_choice->string ().string (), ch) != 0) { err_count++; tprintf ("%d:%d: \"%s\" -> \"%s\"\n", row_count, word_count, ch, best_choice->string ().string ()); } else correct_count++; #ifndef SECURE_NAMES if (best_choice->string ().length () > 2) wordstats.word (tess_long_str, 2, ch, 1); else wordstats.word ((char *) best_choice->string (). string (), best_choice->string ().length (), ch, 1); #endif } delete bln_word; delete outword; delete best_choice; delete raw_choice; blob_choices.deep_clear (); count++; } } } } #ifndef SECURE_NAMES wordstats.print (1, 100.0); wordstats.conf_matrix (); tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n", char_count, correct_count, rej_count, err_count); #endif}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?