wordseg.cpp
来自「一OCR的相关资料。.希望对研究OCR的朋友有所帮助.」· C++ 代码 · 共 621 行 · 第 1/2 页
CPP
621 行
INT32 row_words2( //compute space size TO_BLOCK *block, //block it came from TO_ROW *row, //row to operate on INT32 maxwidth, //max expected space size FCOORD rotation, //for drawing BOOL8 testing_on //for debug ) { BOOL8 testing_row; //contains testpt BOOL8 prev_valid; //if decent size BOOL8 this_valid; //current blob big enough INT32 prev_x; //end of prev blob INT32 min_width; //min interesting width INT32 valid_count; //good gaps INT32 total_count; //total gaps INT32 cluster_count; //no of clusters INT32 prev_count; //previous cluster_count INT32 gap_index; //which cluster INT32 smooth_factor; //for smoothing stats BLOBNBOX *blob; //current blob float lower, upper; //clustering parameters ICOORD testpt; BOX blob_box; //bounding box //iterator BLOBNBOX_IT blob_it = row->blob_list (); STATS gap_stats (0, maxwidth); //gap sizes float gaps[BLOCK_STATS_CLUSTERS]; STATS cluster_stats[BLOCK_STATS_CLUSTERS + 1]; //clusters testpt = ICOORD (textord_test_x, textord_test_y); smooth_factor = (INT32) (block->xheight * textord_wordstats_smooth_factor + 1.5); // if (testing_on) // tprintf("Row smooth factor=%d\n",smooth_factor); prev_valid = FALSE; prev_x = -MAX_INT16; testing_row = FALSE; //min blob size min_width = (INT32) block->pr_space; total_count = 0; for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!blob->joined_to_prev ()) { blob_box = blob->bounding_box (); this_valid = blob_box.width () >= min_width; this_valid = TRUE; if (this_valid && prev_valid && blob_box.left () - prev_x < maxwidth) { gap_stats.add (blob_box.left () - prev_x, 1); } total_count++; //count possibles prev_x = blob_box.right (); prev_valid = this_valid; } } valid_count = gap_stats.get_total (); if (valid_count < total_count * textord_words_minlarge) { gap_stats.clear (); prev_x = -MAX_INT16; for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!blob->joined_to_prev ()) { blob_box = 
blob->bounding_box (); if (blob_box.left () - prev_x < maxwidth) { gap_stats.add (blob_box.left () - prev_x, 1); } prev_x = blob_box.right (); } } } if (gap_stats.get_total () == 0) { row->min_space = 0; //no evidence row->max_nonspace = 0; return 0; } cluster_count = 0; lower = block->xheight * words_initial_lower; upper = block->xheight * words_initial_upper; gap_stats.smooth (smooth_factor); do { prev_count = cluster_count; cluster_count = gap_stats.cluster (lower, upper, textord_spacesize_ratioprop, BLOCK_STATS_CLUSTERS, cluster_stats); } while (cluster_count > prev_count && cluster_count < BLOCK_STATS_CLUSTERS); if (cluster_count < 1) { row->min_space = 0; row->max_nonspace = 0; return 0; } for (gap_index = 0; gap_index < cluster_count; gap_index++) gaps[gap_index] = cluster_stats[gap_index + 1].ile (0.5); //get medians if (testing_on) { tprintf ("cluster_count=%d:", cluster_count); for (gap_index = 0; gap_index < cluster_count; gap_index++) tprintf (" %g(%d)", gaps[gap_index], cluster_stats[gap_index + 1].get_total ()); tprintf ("\n"); } //Try to find proportional non-space and space for row. 
for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] > block->max_nonspace; gap_index++); if (gap_index < cluster_count) lower = gaps[gap_index]; //most frequent below else { if (testing_on) tprintf ("No cluster below block threshold!, using default=%g\n", block->pr_nonsp); lower = block->pr_nonsp; } for (gap_index = 0; gap_index < cluster_count && gaps[gap_index] <= block->max_nonspace; gap_index++); if (gap_index < cluster_count) upper = gaps[gap_index]; //most frequent above else { if (testing_on) tprintf ("No cluster above block threshold!, using default=%g\n", block->pr_space); upper = block->pr_space; } row->min_space = (INT32) ceil (upper - (upper - lower) * textord_words_definite_spread); row->max_nonspace = (INT32) floor (lower + (upper - lower) * textord_words_definite_spread); row->space_threshold = (row->max_nonspace + row->min_space) / 2; row->space_size = upper; row->kern_size = lower; if (testing_on) { if (testing_row) { tprintf ("GAP STATS\n"); gap_stats.print (stdout, TRUE); tprintf ("SPACE stats\n"); cluster_stats[2].print (stdout, FALSE); tprintf ("NONSPACE stats\n"); cluster_stats[1].print (stdout, FALSE); } tprintf ("Row at %g has minspace=%d(%g), max_non=%d(%g)\n", row->intercept (), row->min_space, upper, row->max_nonspace, lower); } return 1;}/********************************************************************** * make_real_words * * Convert a TO_BLOCK to a BLOCK. 
**********************************************************************/void make_real_words( //find lines TO_BLOCK *block, //block to do FCOORD rotation //for drawing ) { TO_ROW *row; //current row TO_ROW_IT row_it = block->get_rows (); ROW *real_row = NULL; //output row ROW_IT real_row_it = block->block->row_list (); if (row_it.empty ()) return; //empty block for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row = row_it.data (); if (row->blob_list ()->empty () && !row->rep_words.empty ()) { real_row = make_rep_words (row, block); } else if (!row->blob_list ()->empty ()) { // tprintf("Row pitch_decision=%d",row->pitch_decision); if (row->pitch_decision == PITCH_DEF_FIXED || row->pitch_decision == PITCH_CORR_FIXED) real_row = fixed_pitch_words (row, rotation); else if (row->pitch_decision == PITCH_DEF_PROP || row->pitch_decision == PITCH_CORR_PROP) real_row = make_prop_words (row, rotation); else ASSERT_HOST(FALSE); } if (real_row != NULL) { //put row in block real_row_it.add_after_then_move (real_row); } } block->block->set_stats (block->fixed_pitch == 0, (INT16) block->kern_size, (INT16) block->space_size, (INT16) block->fixed_pitch); block->block->check_pitch ();}/********************************************************************** * make_rep_words * * Fabricate a real row from only the repeated blob words. * Get the xheight from the block as it may be more meaningful. 
**********************************************************************/ROW *make_rep_words( //make a row TO_ROW *row, //row to convert TO_BLOCK *block //block it lives in ) { INT32 xstarts[2]; //ends of row ROW *real_row; //output row BOX word_box; //bounding box double coeffs[3]; //spline //iterator WERD_IT word_it = &row->rep_words; if (word_it.empty ()) return NULL; word_box = word_it.data ()->bounding_box (); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) word_box += word_it.data ()->bounding_box (); xstarts[0] = word_box.left (); xstarts[1] = word_box.right (); coeffs[0] = 0; coeffs[1] = row->line_m (); coeffs[2] = row->line_c (); row->xheight = block->xheight; real_row = new ROW (row, (INT16) block->kern_size, (INT16) block->space_size); word_it.set_to_list (real_row->word_list ()); //put words in row word_it.add_list_after (&row->rep_words); real_row->recalc_bounding_box (); return real_row;}/********************************************************************** * make_real_word * * Construct a WERD from a given number of adjacent entries in a * list of BLOBNBOXs. 
**********************************************************************/
/*
 * Extracts blobcount entries from box_it, deleting each BLOBNBOX after
 * its blob has been taken.  An entry that is not joined to its
 * predecessor contributes its blob()/cblob() as a new blob of the word;
 * an entry that is joined has its outlines appended to the blob most
 * recently added, and its own (now emptied) blob is deleted.
 *
 * The WERD is built from the PBLOB list when that list is non-empty,
 * otherwise from the C_BLOB list, with at least one blank.  W_BOL,
 * W_FUZZY_SP / W_FUZZY_NON and W_EOL are set from the arguments and from
 * box_it->at_first () after extraction.  The WERD is allocated with new
 * and returned.
 *
 * NOTE(review): a joined entry dereferences the current blob of the word,
 * so the first entry extracted is presumably never joined_to_prev --
 * confirm against callers.
 */
WERD *make_real_word(                      //make a WERD
                     BLOBNBOX_IT *box_it,  //iterator
                     INT32 blobcount,      //no of blobs to use
                     BOOL8 bol,            //start of line
                     BOOL8 fuzzy_sp,       //fuzzy space
                     BOOL8 fuzzy_non,      //fuzzy non-space
                     UINT8 blanks          //no of blanks
                    ) {
  OUTLINE_IT out_it;             //outlines
  C_OUTLINE_IT cout_it;
  PBLOB_LIST blobs;              //blobs in word
  C_BLOB_LIST cblobs;
  PBLOB_IT blob_it = &blobs;     //iterator
  C_BLOB_IT cblob_it = &cblobs;

  for (INT32 remaining = blobcount; remaining > 0; remaining--) {
    BLOBNBOX *bblob = box_it->extract ();  //current blob

    if (!bblob->joined_to_prev ()) {
      //Starts a new blob in the word.
      if (bblob->blob () != NULL)
        blob_it.add_after_then_move (bblob->blob ());
      else if (bblob->cblob () != NULL)
        cblob_it.add_after_then_move (bblob->cblob ());
    }
    else if (bblob->blob () != NULL) {
      //Merge outlines into the current PBLOB, then drop the donor.
      out_it.set_to_list (blob_it.data ()->out_list ());
      out_it.move_to_last ();
      out_it.add_list_after (bblob->blob ()->out_list ());
      delete bblob->blob ();
    }
    else if (bblob->cblob () != NULL) {
      //Merge outlines into the current C_BLOB, then drop the donor.
      cout_it.set_to_list (cblob_it.data ()->out_list ());
      cout_it.move_to_last ();
      cout_it.add_list_after (bblob->cblob ()->out_list ());
      delete bblob->cblob ();
    }

    delete bblob;
    box_it->forward ();          //next one
  }

  if (blanks < 1)
    blanks = 1;

                                 //make real word
  WERD *word = !blob_it.empty ()
    ? new WERD (&blobs, blanks, NULL)
    : new WERD (&cblobs, blanks, NULL);

  if (bol) {
    word->set_flag (W_BOL, TRUE);
  }
  if (fuzzy_sp) {
                                 //probably space
    word->set_flag (W_FUZZY_SP, TRUE);
  }
  else if (fuzzy_non) {
                                 //probably not
    word->set_flag (W_FUZZY_NON, TRUE);
  }
  if (box_it->at_first ()) {
                                 //at end of line
    word->set_flag (W_EOL, TRUE);
  }
  return word;
}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?