applybox.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 892 行 · 第 1/3 页

CPP
892
字号
      }      else        ok_row_count++;    }  }  min_samples = 9999;  for (i = 0; i < unicharset_boxes.size(); i++) {    if (tgt_char_counts[i] > labelled_char_counts[i]) {      if (labelled_char_counts[i] <= 1) {        tprintf          ("APPLY_BOXES: FATALITY - %d labelled samples of \"%s\" - target is %d:\n",          labelled_char_counts[i], unicharset_boxes.id_to_unichar(i), tgt_char_counts[i]);        PrintString(unicharset_boxes.id_to_unichar(i));      }      else {        rebalance_needed = TRUE;        if (applybox_debug > 0)          tprintf            ("APPLY_BOXES: REBALANCE REQD \"%s\" - target of %d from %d labelled samples\n",            unicharset_boxes.id_to_unichar(i), tgt_char_counts[i], labelled_char_counts[i]);      }    }    if ((min_samples > labelled_char_counts[i]) && (tgt_char_counts[i] > 0)) {      min_samples = labelled_char_counts[i];      *min_uch_id = i;    }  }  while (applybox_rebalance && rebalance_needed) {    block_it.set_to_list (block_list);    for (block_it.mark_cycle_pt ();    !block_it.cycled_list (); block_it.forward ()) {      row_it.set_to_list (block_it.data ()->row_list ());      for (row_it.mark_cycle_pt ();      !row_it.cycled_list (); row_it.forward ()) {        row = row_it.data ();        word_it.set_to_list (row->word_list ());        for (word_it.mark_cycle_pt ();        !word_it.cycled_list (); word_it.forward ()) {          word = word_it.data ();          left = word->bounding_box ().left ();          if (*word->text () != '\0')            uch_id = unicharset_boxes.unichar_to_id(word->text ());          else            uch_id = -1;          at_dupe_of_prev_word = ((row == prev_row) &&            (left = prev_left) &&            (uch_id == prev_uch_id));          if ((uch_id != -1) &&            (labelled_char_counts[uch_id] > 1) &&            (tgt_char_counts[uch_id] > labelled_char_counts[uch_id]) &&          (!at_dupe_of_prev_word)) {            /* Duplicate the word to rebalance the labelled samples */            if (applybox_debug > 9) {              tprintf ("Duping \"%s\" from ", unicharset_boxes.id_to_unichar(uch_id));              word->bounding_box ().print ();            }            duplicate_word = new WERD;            *duplicate_word = *word;            word_it.add_after_then_move (duplicate_word);            rebalance_count++;            labelled_char_counts[uch_id]++;          }          prev_row = row;          prev_left = left;          prev_uch_id = uch_id;        }      }    }    rebalance_needed = FALSE;    for (i = 0; i < unicharset_boxes.size(); i++) {      if ((tgt_char_counts[i] > labelled_char_counts[i]) &&      (labelled_char_counts[i] > 1)) {        rebalance_needed = TRUE;        break;      }    }  }  /* Now final check - count labelled blobs */  final_labelled_blob_count = 0;  block_it.set_to_list (block_list);  for (block_it.mark_cycle_pt ();  !block_it.cycled_list (); block_it.forward ()) {    row_it.set_to_list (block_it.data ()->row_list ());    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {      row = row_it.data ();      word_it.set_to_list (row->word_list ());      word_it.sort (word_comparator);      for (word_it.mark_cycle_pt ();      !word_it.cycled_list (); word_it.forward ()) {        word = word_it.data ();        if ((strlen (word->text ()) > 0) &&          (word->gblob_list ()->length () == 1))          final_labelled_blob_count++;      }    }  }}void report_failed_box(inT16 boxfile_lineno,                       inT16 boxfile_charno,                       TBOX box,                       const char *box_ch,                       const char *err_msg) {  if (applybox_debug > 4)    tprintf ("APPLY_BOXES: boxfile %1d/%1d/%s ((%1d,%1d),(%1d,%1d)): %s\n",      boxfile_lineno,      boxfile_charno,      box_ch,      box.left (), box.bottom (), box.right (), box.top (), err_msg);}void apply_box_training(BLOCK_LIST *block_list) {  BLOCK_IT block_it(block_list);  ROW_IT row_it;  ROW *row;  WERD_IT word_it;  WERD *word;  WERD *bln_word;  WERD copy_outword;             // copy to denorm  PBLOB_IT blob_it;  DENORM denorm;  inT16 count = 0;  char unichar[UNICHAR_LEN + 1];  unichar[UNICHAR_LEN] = '\0';  tprintf ("Generating training data\n");  for (block_it.mark_cycle_pt ();  !block_it.cycled_list (); block_it.forward ()) {    row_it.set_to_list (block_it.data ()->row_list ());    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {      row = row_it.data ();      word_it.set_to_list (row->word_list ());      for (word_it.mark_cycle_pt ();      !word_it.cycled_list (); word_it.forward ()) {        word = word_it.data ();        if ((strlen (word->text ()) > 0) &&        (word->gblob_list ()->length () == 1)) {          /* Here is a word with a single unichar label and a single blob so train on it */          bln_word =            make_bln_copy (word, row, row->x_height (), &denorm);          blob_it.set_to_list (bln_word->blob_list ());          strncpy(unichar, word->text (), UNICHAR_LEN);          tess_training_tester (blob_it.data (),                                 //single blob            &denorm, TRUE,       //correct            unichar,             //correct character            strlen(unichar),     //character length            NULL);          copy_outword = *(bln_word);          copy_outword.baseline_denormalise (&denorm);          blob_it.set_to_list (copy_outword.blob_list ());          delete bln_word;          count++;        }      }    }  }  tprintf ("Generated training data for %d blobs\n", count);}void apply_box_testing(BLOCK_LIST *block_list) {  BLOCK_IT block_it(block_list);  ROW_IT row_it;  ROW *row;  inT16 row_count = 0;  WERD_IT word_it;  WERD *word;  WERD *bln_word;  inT16 word_count = 0;  PBLOB_IT blob_it;  DENORM denorm;  inT16 count = 0;  char ch[2];  WERD *outword;                 //bln best choice  //segmentation  WERD_CHOICE *best_choice;      //tess output  WERD_CHOICE *raw_choice;       //top choice permuter                                 //detailed results  BLOB_CHOICE_LIST_CLIST blob_choices;  inT16 char_count = 0;  inT16 correct_count = 0;  inT16 err_count = 0;  inT16 rej_count = 0;  #ifndef SECURE_NAMES  WERDSTATS wordstats;           //As from newdiff  #endif  char tess_rej_str[3];  char tess_long_str[3];  ch[1] = '\0';  strcpy (tess_rej_str, "|A");  strcpy (tess_long_str, "|B");  for (block_it.mark_cycle_pt ();  !block_it.cycled_list (); block_it.forward ()) {    row_it.set_to_list (block_it.data ()->row_list ());    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {      row = row_it.data ();      row_count++;      word_count = 0;      word_it.set_to_list (row->word_list ());      for (word_it.mark_cycle_pt ();      !word_it.cycled_list (); word_it.forward ()) {        word = word_it.data ();        word_count++;        if ((strlen (word->text ()) == 1) &&          !STRING (applybox_test_exclusions).contains (*word->text ())        && (word->gblob_list ()->length () == 1)) {          /* Here is a word with a single char label and a single blob so test it */          bln_word =            make_bln_copy (word, row, row->x_height (), &denorm);          blob_it.set_to_list (bln_word->blob_list ());          ch[0] = *word->text ();          char_count++;          best_choice = tess_segment_pass1 (bln_word,            &denorm,            tess_default_matcher,            raw_choice,            &blob_choices, outword);          /*            Test for TESS screw up on word. Recog_word has already ensured that the            choice list, outword blob lists and best_choice string are the same            length. A TESS screw up is indicated by a blank filled or 0 length string.          */          if ((best_choice->lengths ().length () == 0) ||            (strspn (best_choice->string ().string (), " ") ==          best_choice->string ().length ())) {            rej_count++;            tprintf ("%d:%d: \"%s\" -> TESS FAILED\n",              row_count, word_count, ch);            #ifndef SECURE_NAMES            wordstats.word (tess_rej_str, 2, ch, 1);            #endif          }          else {            if ((best_choice->lengths ().length () !=              outword->blob_list ()->length ()) ||              (best_choice->lengths ().length () !=            blob_choices.length ())) {              tprintf                ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n",                best_choice->string ().string (),                best_choice->lengths ().length (),                outword->blob_list ()->length (),                blob_choices.length ());            }            ASSERT_HOST (best_choice->lengths ().length () ==              outword->blob_list ()->length ());            ASSERT_HOST (best_choice->lengths ().length () ==              blob_choices.length ());            fix_quotes (best_choice,                                 //turn to double              outword, &blob_choices);            if (strcmp (best_choice->string ().string (), ch) != 0) {              err_count++;              tprintf ("%d:%d: \"%s\" -> \"%s\"\n",                row_count, word_count, ch,                best_choice->string ().string ());            }            else              correct_count++;            #ifndef SECURE_NAMES            if (best_choice->string ().length () > 2)              wordstats.word (tess_long_str, 2, ch, 1);            else              wordstats.word ((char *) best_choice->string ().                string (),                best_choice->string ().length (), ch,                1);            #endif          }          delete bln_word;          delete outword;          delete best_choice;          delete raw_choice;          blob_choices.deep_clear ();          count++;        }      }    }  }  #ifndef SECURE_NAMES  wordstats.print (1, 100.0);  wordstats.conf_matrix ();  tprintf ("Tested %d chars: %d correct; %d rejected by tess; %d errs\n",    char_count, correct_count, rej_count, err_count);  #endif}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?