wordseg.cpp

来自「一OCR的相关资料。.希望对研究OCR的朋友有所帮助.」· C++ 代码 · 共 621 行 · 第 1/2 页

CPP
621
字号
INT32 row_words2(                  //compute space size                 TO_BLOCK *block,  //block it came from                 TO_ROW *row,      //row to operate on                 INT32 maxwidth,   //max expected space size                 FCOORD rotation,  //for drawing                 BOOL8 testing_on  //for debug                ) {  BOOL8 testing_row;             //contains testpt  BOOL8 prev_valid;              //if decent size  BOOL8 this_valid;              //current blob big enough  INT32 prev_x;                  //end of prev blob  INT32 min_width;               //min interesting width  INT32 valid_count;             //good gaps  INT32 total_count;             //total gaps  INT32 cluster_count;           //no of clusters  INT32 prev_count;              //previous cluster_count  INT32 gap_index;               //which cluster  INT32 smooth_factor;           //for smoothing stats  BLOBNBOX *blob;                //current blob  float lower, upper;            //clustering parameters  ICOORD testpt;  BOX blob_box;                  //bounding box                                 //iterator  BLOBNBOX_IT blob_it = row->blob_list ();  STATS gap_stats (0, maxwidth);                                 //gap sizes  float gaps[BLOCK_STATS_CLUSTERS];  STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1];  //clusters  testpt = ICOORD (textord_test_x, textord_test_y);  smooth_factor =    (INT32) (block->xheight * textord_wordstats_smooth_factor + 1.5);  //      if (testing_on)  //              tprintf("Row smooth factor=%d\n",smooth_factor);  prev_valid = FALSE;  prev_x = -MAX_INT16;  testing_row = FALSE;                                 //min blob size  min_width = (INT32) block->pr_space;  total_count = 0;  for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {    blob = blob_it.data ();    if (!blob->joined_to_prev ()) {      blob_box = blob->bounding_box ();      this_valid = blob_box.width () >= min_width;      this_valid = TRUE;      if (this_valid && 
prev_valid      && blob_box.left () - prev_x < maxwidth) {        gap_stats.add (blob_box.left () - prev_x, 1);      }      total_count++;             //count possibles      prev_x = blob_box.right ();      prev_valid = this_valid;    }  }  valid_count = gap_stats.get_total ();  if (valid_count < total_count * textord_words_minlarge) {    gap_stats.clear ();    prev_x = -MAX_INT16;    for (blob_it.mark_cycle_pt (); !blob_it.cycled_list ();    blob_it.forward ()) {      blob = blob_it.data ();      if (!blob->joined_to_prev ()) {        blob_box = blob->bounding_box ();        if (blob_box.left () - prev_x < maxwidth) {          gap_stats.add (blob_box.left () - prev_x, 1);        }        prev_x = blob_box.right ();      }    }  }  if (gap_stats.get_total () == 0) {    row->min_space = 0;          //no evidence    row->max_nonspace = 0;    return 0;  }  cluster_count = 0;  lower = block->xheight * words_initial_lower;  upper = block->xheight * words_initial_upper;  gap_stats.smooth (smooth_factor);  do {    prev_count = cluster_count;    cluster_count = gap_stats.cluster (lower, upper,      textord_spacesize_ratioprop,      BLOCK_STATS_CLUSTERS, cluster_stats);  }  while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS);  if (cluster_count < 1) {    row->min_space = 0;    row->max_nonspace = 0;    return 0;  }  for (gap_index = 0; gap_index < cluster_count; gap_index++)    gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5);  //get medians  if (testing_on) {    tprintf ("cluster_count=%d:", cluster_count);    for (gap_index = 0; gap_index < cluster_count; gap_index++)      tprintf (" %g(%d)", gaps[gap_index],        cluster_stats[gap_index + 1].get_total ());    tprintf ("\n");  }  //Try to find proportional non-space and space for row.  
for (gap_index = 0; gap_index < cluster_count    && gaps[gap_index] > block->max_nonspace; gap_index++);  if (gap_index < cluster_count)    lower = gaps[gap_index];     //most frequent below  else {    if (testing_on)      tprintf ("No cluster below block threshold!, using default=%g\n",        block->pr_nonsp);    lower = block->pr_nonsp;  }  for (gap_index = 0; gap_index < cluster_count    && gaps[gap_index] <= block->max_nonspace; gap_index++);  if (gap_index < cluster_count)    upper = gaps[gap_index];     //most frequent above  else {    if (testing_on)      tprintf ("No cluster above block threshold!, using default=%g\n",        block->pr_space);    upper = block->pr_space;  }  row->min_space =    (INT32) ceil (upper - (upper - lower) * textord_words_definite_spread);  row->max_nonspace =    (INT32) floor (lower + (upper - lower) * textord_words_definite_spread);  row->space_threshold = (row->max_nonspace + row->min_space) / 2;  row->space_size = upper;  row->kern_size = lower;  if (testing_on) {    if (testing_row) {      tprintf ("GAP STATS\n");      gap_stats.print (stdout, TRUE);      tprintf ("SPACE stats\n");      cluster_stats[2].print (stdout, FALSE);      tprintf ("NONSPACE stats\n");      cluster_stats[1].print (stdout, FALSE);    }    tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n",      row->intercept (), row->min_space, upper,      row->max_nonspace, lower);  }  return 1;}/********************************************************************** * make_real_words * * Convert a TO_BLOCK to a BLOCK. 
**********************************************************************/void make_real_words(                  //find lines                     TO_BLOCK *block,  //block to do                     FCOORD rotation   //for drawing                    ) {  TO_ROW *row;                   //current row  TO_ROW_IT row_it = block->get_rows ();  ROW *real_row = NULL;          //output row  ROW_IT real_row_it = block->block->row_list ();  if (row_it.empty ())    return;                      //empty block  for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {    row = row_it.data ();    if (row->blob_list ()->empty () && !row->rep_words.empty ()) {      real_row = make_rep_words (row, block);    }    else if (!row->blob_list ()->empty ()) {      //                      tprintf("Row pitch_decision=%d",row->pitch_decision);      if (row->pitch_decision == PITCH_DEF_FIXED        || row->pitch_decision == PITCH_CORR_FIXED)        real_row = fixed_pitch_words (row, rotation);      else if (row->pitch_decision == PITCH_DEF_PROP        || row->pitch_decision == PITCH_CORR_PROP)        real_row = make_prop_words (row, rotation);      else        ASSERT_HOST(FALSE);     }    if (real_row != NULL) {                                 //put row in block      real_row_it.add_after_then_move (real_row);    }  }  block->block->set_stats (block->fixed_pitch == 0, (INT16) block->kern_size,    (INT16) block->space_size,    (INT16) block->fixed_pitch);  block->block->check_pitch ();}/********************************************************************** * make_rep_words * * Fabricate a real row from only the repeated blob words. * Get the xheight from the block as it may be more meaningful. 
**********************************************************************/ROW *make_rep_words(                 //make a row                    TO_ROW *row,     //row to convert                    TO_BLOCK *block  //block it lives in                   ) {  INT32 xstarts[2];              //ends of row  ROW *real_row;                 //output row  BOX word_box;                  //bounding box  double coeffs[3];              //spline                                 //iterator  WERD_IT word_it = &row->rep_words;  if (word_it.empty ())    return NULL;  word_box = word_it.data ()->bounding_box ();  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ())    word_box += word_it.data ()->bounding_box ();  xstarts[0] = word_box.left ();  xstarts[1] = word_box.right ();  coeffs[0] = 0;  coeffs[1] = row->line_m ();  coeffs[2] = row->line_c ();  row->xheight = block->xheight;  real_row = new ROW (row,    (INT16) block->kern_size, (INT16) block->space_size);  word_it.set_to_list (real_row->word_list ());                                 //put words in row  word_it.add_list_after (&row->rep_words);  real_row->recalc_bounding_box ();  return real_row;}/********************************************************************** * make_real_word * * Construct a WERD from a given number of adjacent entries in a * list of BLOBNBOXs. 
**********************************************************************/
// Consumes blobcount entries from the list under box_it.  Each BLOBNBOX
// is extracted from its list and deleted; the blob () or cblob () it
// carried is handed to the new word.  An entry flagged joined_to_prev
// does not start a new blob: its outlines are appended to the last blob
// already collected and its own blob object is deleted.
//
// The word is built from the blob () list when that list is non-empty,
// otherwise from the cblob () list.  blanks is raised to at least 1.
// Flags set on the result: W_BOL when bol is set; W_FUZZY_SP when
// fuzzy_sp is set, else W_FUZZY_NON when fuzzy_non is set; W_EOL when the
// iterator reports at_first () once the entries have been consumed.
//
// NOTE(review): if the first consumed entry is flagged joined_to_prev,
// data () is called on an iterator over a still-empty list - confirm
// that callers never start a word on such an entry.
WERD *make_real_word(                      //make a WERD
                     BLOBNBOX_IT *box_it,  //iterator
                     INT32 blobcount,      //no of blobs to use
                     BOOL8 bol,            //start of line
                     BOOL8 fuzzy_sp,       //fuzzy space
                     BOOL8 fuzzy_non,      //fuzzy non-space
                     UINT8 blanks          //no of blanks
                    ) {
  OUTLINE_IT p_out_it;           //outlines of a blob () entry
  C_OUTLINE_IT c_out_it;         //outlines of a cblob () entry
  PBLOB_LIST p_list;             //blob () entries collected for the word
  C_BLOB_LIST c_list;            //cblob () entries collected for the word
  PBLOB_IT p_it = &p_list;
  C_BLOB_IT c_it = &c_list;

  for (INT32 taken = 0; taken < blobcount; ++taken) {
    BLOBNBOX *shell = box_it->extract ();

    if (!shell->joined_to_prev ()) {
      // Starts a new blob in the word.
      if (shell->blob () != NULL)
        p_it.add_after_then_move (shell->blob ());
      else if (shell->cblob () != NULL)
        c_it.add_after_then_move (shell->cblob ());
    }
    else if (shell->blob () != NULL) {
      // Fold the outlines into the most recently added blob.
      p_out_it.set_to_list (p_it.data ()->out_list ());
      p_out_it.move_to_last ();
      p_out_it.add_list_after (shell->blob ()->out_list ());
      delete shell->blob ();
    }
    else if (shell->cblob () != NULL) {
      c_out_it.set_to_list (c_it.data ()->out_list ());
      c_out_it.move_to_last ();
      c_out_it.add_list_after (shell->cblob ()->out_list ());
      delete shell->cblob ();
    }

    delete shell;
    box_it->forward ();          //next one
  }

  if (blanks < 1)
    blanks = 1;

                                 //make real word
  WERD *word = !p_it.empty ()
    ? new WERD (&p_list, blanks, NULL)
    : new WERD (&c_list, blanks, NULL);

  if (bol)
    word->set_flag (W_BOL, TRUE);

  if (fuzzy_sp) {
                                 //probably space
    word->set_flag (W_FUZZY_SP, TRUE);
  }
  else if (fuzzy_non) {
                                 //probably not
    word->set_flag (W_FUZZY_NON, TRUE);
  }

  if (box_it->at_first ())
    word->set_flag (W_EOL, TRUE);//at end of line

  return word;
}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?