📄 tospace.cpp
字号:
float kern_estimate; float crude_threshold_estimate; INT16 small_gaps_count; INT16 total; //iterator BLOBNBOX_IT blob_it = row->blob_list (); STATS cert_space_gap_stats (0, MAXSPACING); STATS all_space_gap_stats (0, MAXSPACING); STATS small_gap_stats (0, MAXSPACING); BOX blob_box; BOX prev_blob_box; INT16 gap_width; INT32 end_of_row; INT32 row_length; kern_estimate = all_gap_stats->median (); crude_threshold_estimate = MAX (tosp_init_guess_kn_mult * kern_estimate, tosp_init_guess_xht_mult * row->xheight); small_gaps_count = stats_count_under (all_gap_stats, (INT16) ceil (crude_threshold_estimate)); total = all_gap_stats->get_total (); if ((total <= tosp_redo_kern_limit) || ((small_gaps_count / (float) total) < tosp_enough_small_gaps) || (total - small_gaps_count < 1)) { if (tosp_debug_level > 5) tprintf ("B:%d R:%d -- Cant do isolated row stats.\n", block_idx, row_idx); return FALSE; } blob_it.set_to_list (row->blob_list ()); blob_it.mark_cycle_pt (); end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); if (tosp_use_pre_chopping) blob_box = box_next_pre_chopped (&blob_it); else if (tosp_stats_use_xht_gaps) blob_box = reduced_box_next (row, &blob_it); else blob_box = box_next (&blob_it); row_length = end_of_row - blob_box.left (); prev_blob_box = blob_box; while (!blob_it.cycled_list ()) { if (tosp_use_pre_chopping) blob_box = box_next_pre_chopped (&blob_it); else if (tosp_stats_use_xht_gaps) blob_box = reduced_box_next (row, &blob_it); else blob_box = box_next (&blob_it); gap_width = blob_box.left () - prev_blob_box.right (); if (!ignore_big_gap (row, row_length, gapmap, prev_blob_box.right (), blob_box.left ()) && (gap_width > crude_threshold_estimate)) { if ((gap_width > tosp_fuzzy_space_factor2 * row->xheight) || ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && (!tosp_narrow_blobs_not_cert || (!narrow_blob (row, prev_blob_box) && !narrow_blob (row, blob_box)))) || (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box))) cert_space_gap_stats.add (gap_width, 1); all_space_gap_stats.add (gap_width, 1); } if (gap_width < crude_threshold_estimate) small_gap_stats.add (gap_width, 1); prev_blob_box = blob_box; } if (cert_space_gap_stats.get_total () >= tosp_enough_space_samples_for_median) //median row->space_size = cert_space_gap_stats.median (); else if (suspected_table && (cert_space_gap_stats.get_total () > 0)) //to avoid spaced row->space_size = cert_space_gap_stats.mean (); // 1's in tables else if (all_space_gap_stats.get_total () >= tosp_enough_space_samples_for_median) //median row->space_size = all_space_gap_stats.median (); else row->space_size = all_space_gap_stats.mean (); if (tosp_only_small_gaps_for_kern) row->kern_size = small_gap_stats.median (); else row->kern_size = all_gap_stats->median (); row->space_threshold = INT32 (floor ((row->space_size + row->kern_size) / 2)); /* Sanity check */ if ((row->kern_size >= row->space_threshold) || (row->space_threshold >= row->space_size) || (row->space_threshold <= 0)) { if (tosp_debug_level > 0) tprintf ("B:%d R:%d -- Isolated row stats SANITY FAILURE: %f %d %f\n", block_idx, row_idx, row->kern_size, row->space_threshold, row->space_size); row->kern_size = 0.0f; row->space_threshold = 0; row->space_size = 0.0f; return FALSE; } if (tosp_debug_level > 5) tprintf ("B:%d R:%d -- Isolated row stats: %f %d %f\n", block_idx, row_idx, row->kern_size, row->space_threshold, row->space_size); return TRUE;}INT16 stats_count_under(STATS *stats, INT16 threshold) { INT16 index; INT16 total = 0; for (index = 0; index < threshold; index++) total += stats->pile_count (index); return total;}/************************************************************************* * improve_row_threshold() * Try to recognise a "normal line" - * > 25 gaps * && space > 3 * kn && space > 10 * (I.e. reasonably large space and kn:sp ratio) * && > 3/4 # gaps < kn + (sp - kn)/3 * (I.e. most gaps are well away from space estimate) * && a gap of max( 3, (sp - kn)/3 ) empty histogram positions is found * somewhere in the histogram between kn and sp * THEN set the threshold and fuzzy limits to this gap - ie NO fuzzies * NO!!!!! the bristol line has "11" with a gap of 12 between the 1's!!! * try moving the default threshold to within this band but leave the * fuzzy limit calculation as at present. *************************************************************************/void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats) { float sp = row->space_size; float kn = row->kern_size; INT16 reqd_zero_width = 0; INT16 zero_width = 0; INT16 zero_start = 0; INT16 index = 0; if (tosp_debug_level > 10) tprintf ("Improve row threshold 0"); if ((all_gap_stats->get_total () <= 25) || (sp <= 10) || (sp <= 3 * kn) || (stats_count_under (all_gap_stats, (INT16) ceil (kn + (sp - kn) / 3 + 0.5)) < (0.75 * all_gap_stats->get_total ()))) return; if (tosp_debug_level > 10) tprintf (" 1"); /* Look for the first region of all 0's in the histogram which is wider than max( 3, (sp - kn)/3 ) and starts between kn and sp. If found, and current threshold is not within it, move the threshold so that is is just inside it. */ reqd_zero_width = (INT16) floor ((sp - kn) / 3 + 0.5); if (reqd_zero_width < 3) reqd_zero_width = 3; for (index = INT16 (ceil (kn)); index < INT16 (floor (sp)); index++) { if (all_gap_stats->pile_count (index) == 0) { if (zero_width == 0) zero_start = index; zero_width++; } else { if (zero_width >= reqd_zero_width) break; else { zero_width = 0; } } } index--; if (tosp_debug_level > 10) tprintf (" reqd_z_width: %d found %d 0's, starting %d; thresh: %d/n", reqd_zero_width, zero_width, zero_start, row->space_threshold); if ((zero_width < reqd_zero_width) || ((row->space_threshold >= zero_start) && (row->space_threshold <= index))) return; if (tosp_debug_level > 10) tprintf (" 2"); if (row->space_threshold < zero_start) { if (tosp_debug_level > 5) tprintf ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, index, row->space_threshold, zero_start); row->space_threshold = zero_start; } if (row->space_threshold > index) { if (tosp_debug_level > 5) tprintf ("Improve row kn:%5.2f sp:%5.2f 0's: %d -> %d thresh:%d -> %d\n", kn, sp, zero_start, index, row->space_threshold, index); row->space_threshold = index; }}/********************************************************************** * make_prop_words * * Convert a TO_BLOCK to a BLOCK. **********************************************************************/ROW *make_prop_words( //find lines TO_ROW *row, //row to make FCOORD rotation //for drawing ) { BOOL8 bol; //start of line /* prev_ values are for start of word being built. non prev_ values are for the gap between the word being built and the next one. */ BOOL8 prev_fuzzy_sp; //probably space BOOL8 prev_fuzzy_non; //probably not UINT8 prev_blanks; //in front of word BOOL8 fuzzy_sp; //probably space BOOL8 fuzzy_non; //probably not UINT8 blanks; //in front of word ROW *real_row; //output row OUTLINE_IT out_it; //outlines C_OUTLINE_IT cout_it; PBLOB_LIST blobs; //blobs in word C_BLOB_LIST cblobs; PBLOB_IT blob_it = &blobs; //iterator C_BLOB_IT cblob_it = &cblobs; WERD_LIST words; WERD_IT word_it; //new words WERD *word; //new word WERD_IT rep_char_it; //repeated char words INT32 next_rep_char_word_right = MAX_INT32; float repetition_spacing; //gap between repetitions INT32 xstarts[2]; //row ends double coeffs[3]; //quadratic INT32 prev_x; //end of prev blob BLOBNBOX *bblob; //current blob BOX blob_box; //bounding box BLOBNBOX_IT box_it; //iterator BOX prev_blob_box; BOX next_blob_box; INT16 prev_gap = MAX_INT16; INT16 current_gap = MAX_INT16; INT16 next_gap = MAX_INT16; INT16 prev_within_xht_gap = MAX_INT16; INT16 current_within_xht_gap = MAX_INT16; INT16 next_within_xht_gap = MAX_INT16; INT16 word_count = 0; static INT16 row_count = 0; row_count++; rep_char_it.set_to_list (&(row->rep_words)); if (!rep_char_it.empty ()) { next_rep_char_word_right = rep_char_it.data ()->bounding_box ().right (); } prev_x = -MAX_INT16; blob_it.set_to_list (&blobs); cblob_it.set_to_list (&cblobs); box_it.set_to_list (row->blob_list ()); word_it.set_to_list (&words); bol = TRUE; prev_blanks = 0; prev_fuzzy_sp = FALSE; prev_fuzzy_non = FALSE; if (!box_it.empty ()) { xstarts[0] = box_it.data ()->bounding_box ().left (); if (xstarts[0] > next_rep_char_word_right) { /* We need to insert a repeated char word at the start of the row */ word = rep_char_it.extract (); word_it.add_after_then_move (word); /* Set spaces before repeated char word */ word->set_flag (W_BOL, TRUE); bol = FALSE; word->set_blanks (0); //NO uncertainty word->set_flag (W_FUZZY_SP, FALSE); word->set_flag (W_FUZZY_NON, FALSE); xstarts[0] = word->bounding_box ().left (); /* Set spaces after repeated char word (and leave current word set) */ repetition_spacing = find_mean_blob_spacing (word); current_gap = box_it.data ()->bounding_box ().left () - next_rep_char_word_right; current_within_xht_gap = current_gap; if (current_gap > tosp_rep_space * repetition_spacing) { prev_blanks = (UINT8) floor (current_gap / row->space_size); if (prev_blanks < 1) prev_blanks = 1; } else prev_blanks = 0; if (tosp_debug_level > 5) tprintf ("Repch wd at BOL(%d, %d). rep spacing %5.2f; Rgap:%d ", box_it.data ()->bounding_box ().left (), box_it.data ()->bounding_box ().bottom (), repetition_spacing, current_gap); prev_fuzzy_sp = FALSE; prev_fuzzy_non = FALSE; if (rep_char_it.empty ()) { next_rep_char_word_right = MAX_INT32; } else { rep_char_it.forward (); next_rep_char_word_right = rep_char_it.data ()->bounding_box ().right (); } } peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); do { bblob = box_it.data (); blob_box = bblob->bounding_box (); if (bblob->joined_to_prev ()) { if (bblob->blob () != NULL) { out_it.set_to_list (blob_it.data ()->out_list ()); out_it.move_to_last (); out_it.add_list_after (bblob->blob ()->out_list ()); delete bblob->blob (); } else if (bblob->cblob () != NULL) { cout_it.set_to_list (cblob_it.data ()->out_list ()); cout_it.move_to_last (); cout_it.add_list_after (bblob->cblob ()->out_list ()); delete bblob->cblob (); } } else { if (bblob->blob () != NULL) blob_it.add_after_then_move (bblob->blob ()); else if (bblob->cblob () != NULL) cblob_it.add_after_then_move (bblob->cblob ()); prev_x = blob_box.right (); } box_it.forward (); //next one bblob = box_it.data (); blob_box = bblob->bounding_box (); if (!bblob->joined_to_prev () && (bblob->blob () != NULL || bblob->cblob () != NULL)) { /* Real Blob - not multiple outlines or pre-chopped */ prev_gap = current_gap; prev_within_xht_gap = current_within_xht_gap; prev_blob_box = next_blob_box; current_gap = next_gap; current_within_xht_gap = next_within_xht_gap; peek_at_next_gap(row, box_it, next_blob_box, next_gap, next_within_xht_gap); if ((blob_box.left () > next_rep_char_word_right) || (!tosp_only_use_xht_gaps &&
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -