📄 tospace.cpp
字号:
//iterator BLOBNBOX_IT blob_it = row->blob_list (); STATS all_gap_stats (0, MAXSPACING); STATS cert_space_gap_stats (0, MAXSPACING); STATS all_space_gap_stats (0, MAXSPACING); STATS small_gap_stats (0, MAXSPACING); BOX blob_box; BOX prev_blob_box; INT16 gap_width; INT16 real_space_threshold = 0; INT16 max = 0; INT16 index; INT16 large_gap_count = 0; BOOL8 suspected_table; INT32 max_max_nonspace; //upper bound BOOL8 good_block_space_estimate = block_space_gap_width > 0; INT32 end_of_row; INT32 row_length = 0; float sane_space; INT32 sane_threshold; /* Collect first pass stats for row */ if (!good_block_space_estimate) block_space_gap_width = INT16 (floor (row->xheight / 2)); if (!row->blob_list ()->empty ()) { if (tosp_threshold_bias1 > 0) real_space_threshold = block_non_space_gap_width + INT16 (floor (0.5 + tosp_threshold_bias1 * (block_space_gap_width - block_non_space_gap_width))); else real_space_threshold = //Old TO method (block_space_gap_width + block_non_space_gap_width) / 2; blob_it.set_to_list (row->blob_list ()); blob_it.mark_cycle_pt (); end_of_row = blob_it.data_relative (-1)->bounding_box ().right (); if (tosp_use_pre_chopping) blob_box = box_next_pre_chopped (&blob_it); else if (tosp_stats_use_xht_gaps) blob_box = reduced_box_next (row, &blob_it); else blob_box = box_next (&blob_it); row_length = end_of_row - blob_box.left (); prev_blob_box = blob_box; while (!blob_it.cycled_list ()) { if (tosp_use_pre_chopping) blob_box = box_next_pre_chopped (&blob_it); else if (tosp_stats_use_xht_gaps) blob_box = reduced_box_next (row, &blob_it); else blob_box = box_next (&blob_it); gap_width = blob_box.left () - prev_blob_box.right (); if (ignore_big_gap (row, row_length, gapmap, prev_blob_box.right (), blob_box.left ())) large_gap_count++; else { if (gap_width >= real_space_threshold) { if (!tosp_row_use_cert_spaces || (gap_width > tosp_fuzzy_space_factor2 * row->xheight) || ((gap_width > tosp_fuzzy_space_factor1 * row->xheight) && (!tosp_narrow_blobs_not_cert || (!narrow_blob (row, prev_blob_box) && !narrow_blob (row, blob_box)))) || (wide_blob (row, prev_blob_box) && wide_blob (row, blob_box))) cert_space_gap_stats.add (gap_width, 1); all_space_gap_stats.add (gap_width, 1); } else small_gap_stats.add (gap_width, 1); all_gap_stats.add (gap_width, 1); } prev_blob_box = blob_box; } } suspected_table = (large_gap_count > 1) || ((large_gap_count > 0) && (all_gap_stats.get_total () <= tosp_few_samples)); /* Now determine row kern size, space size and threshold */ if ((cert_space_gap_stats.get_total () >= tosp_enough_space_samples_for_median) || ((suspected_table || all_gap_stats.get_total () <= tosp_short_row) && cert_space_gap_stats.get_total () > 0)) old_to_method(row, &all_gap_stats, &cert_space_gap_stats, &small_gap_stats, block_space_gap_width, block_non_space_gap_width); else { if (!tosp_recovery_isolated_row_stats || !isolated_row_stats (row, gapmap, &all_gap_stats, suspected_table, block_idx, row_idx)) { if (tosp_row_use_cert_spaces && (tosp_debug_level > 5)) tprintf ("B:%d R:%d -- Inadequate certain spaces.\n", block_idx, row_idx); if (tosp_row_use_cert_spaces1 && good_block_space_estimate) { //Use block default row->space_size = block_space_gap_width; if (all_gap_stats.get_total () > tosp_redo_kern_limit) row->kern_size = all_gap_stats.median (); else row->kern_size = block_non_space_gap_width; row->space_threshold = INT32 (floor ((row->space_size + row->kern_size) / 2)); } else old_to_method(row, &all_gap_stats, &all_space_gap_stats, &small_gap_stats, block_space_gap_width, block_non_space_gap_width); } } if (tosp_improve_thresh && !suspected_table) improve_row_threshold(row, &all_gap_stats); /* Now lets try to be careful not to do anything silly with tables when we are ignoring big gaps*/ if (tosp_sanity_method == 0) { if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { if (tosp_debug_level > 0) tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, row_idx, row->kern_size, row->space_threshold, row->space_size); row->space_threshold = (INT32) (tosp_table_kn_sp_ratio * row->kern_size); row->space_size = MAX (row->space_threshold + 1, row->xheight); } } else if (tosp_sanity_method == 1) { sane_space = row->space_size; /* NEVER let space size get too close to kern size */ if ((row->space_size < tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5)) || ((row->space_size - row->kern_size) < (tosp_silly_kn_sp_gap * row->xheight))) { if (good_block_space_estimate && (block_space_gap_width >= tosp_min_sane_kn_sp * row->kern_size)) sane_space = block_space_gap_width; else sane_space = MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5), row->xheight / 2); if (tosp_debug_level > 0) tprintf ("B:%d R:%d -- DONT BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", block_idx, row_idx, row->kern_size, row->space_threshold, row->space_size, sane_space); row->space_size = sane_space; row->space_threshold = INT32 (floor ((row->space_size + row->kern_size) / 2)); } /* NEVER let threshold get VERY far away from kern */ sane_threshold = INT32 (floor (tosp_max_sane_kn_thresh * MAX (row->kern_size, 2.5))); if (row->space_threshold > sane_threshold) { if (tosp_debug_level > 0) tprintf ("B:%d R:%d -- DONT BELIEVE THRESH %3.2f %d %3.2f->%d.\n", block_idx, row_idx, row->kern_size, row->space_threshold, row->space_size, sane_threshold); row->space_threshold = sane_threshold; if (row->space_size <= sane_threshold) row->space_size = row->space_threshold + 1.0f; } /* Beware of tables - there may be NO spaces */ if (suspected_table) { sane_space = MAX (tosp_table_kn_sp_ratio * row->kern_size, tosp_table_xht_sp_ratio * row->xheight); sane_threshold = INT32 (floor ((sane_space + row->kern_size) / 2)); if ((row->space_size < sane_space) || (row->space_threshold < sane_threshold)) { if (tosp_debug_level > 0) tprintf ("B:%d R:%d -- SUSPECT NO SPACES %3.2f %d %3.2f.\n", block_idx, row_idx, row->kern_size, row->space_threshold, row->space_size); //the minimum sane value row->space_threshold = (INT32) sane_space; row->space_size = MAX (row->space_threshold + 1, row->xheight); } } } /* Now lets try to put some error limits on the threshold */ if (tosp_old_to_method) { /* Old textord made a space if gap >= threshold */ //NO FUZZY SPACES YET row->max_nonspace = row->space_threshold; //NO FUZZY SPACES YET row->min_space = row->space_threshold + 1; } else { /* Any gap greater than 0.6 x-ht is bound to be a space (isn't it:-) */ row->min_space = MIN (INT32 (ceil (tosp_fuzzy_space_factor * row->xheight)), INT32 (row->space_size)); if (row->min_space <= row->space_threshold) //Dont be silly row->min_space = row->space_threshold + 1; /* Lets try to guess the max certain kern gap by looking at the cluster of kerns for the row. The row is proportional so the kerns should cluster tightly at the bottom of the distribution. We also expect most gaps to be kerns. Find the maximum of the kern piles between 0 and twice the kern estimate. Piles before the first one with less than 1/10 the maximum number of samples can be taken as certain kerns. Of course, there are some cases where the kern peak and space peaks merge, so we will put an UPPER limit on the max certain kern gap of some fraction below the threshold. */ max_max_nonspace = INT32 ((row->space_threshold + row->kern_size) / 2); //default row->max_nonspace = max_max_nonspace; for (index = 0; index <= max_max_nonspace; index++) { if (all_gap_stats.pile_count (index) > max) max = all_gap_stats.pile_count (index); if ((index > row->kern_size) && (all_gap_stats.pile_count (index) < 0.1 * max)) { row->max_nonspace = index; break; } } } /* Yet another algorithm - simpler this time - just choose a fraction of the threshold to space range */ if ((tosp_fuzzy_sp_fraction > 0) && (row->space_size > row->space_threshold)) row->min_space = MAX (row->min_space, (INT32) ceil (row->space_threshold + tosp_fuzzy_sp_fraction * (row->space_size - row->space_threshold))); /* Ensure that ANY space less than some multiplier times the kern size is fuzzy. In tables there is a risk of erroneously setting a small space size when there are no real spaces. Sometimes tables have text squashed into columns so that the kn->sp ratio is small anyway - this means that we cant use this to force a wider separation - hence we rely on context to join any dubious breaks. */ if ((tosp_table_fuzzy_kn_sp_ratio > 0) && (suspected_table || tosp_fuzzy_limit_all)) row->min_space = MAX (row->min_space, (INT32) ceil (tosp_table_fuzzy_kn_sp_ratio * row->kern_size)); if ((tosp_fuzzy_kn_fraction > 0) && (row->kern_size < row->space_threshold)) row->max_nonspace = (INT32) floor (0.5 + row->kern_size + tosp_fuzzy_kn_fraction * (row->space_threshold - row->kern_size)); if (row->max_nonspace > row->space_threshold) //Dont be silly row->max_nonspace = row->space_threshold; if (tosp_debug_level > 5) tprintf ("B:%d R:%d L:%d-- Kn:%d Sp:%d Thr:%d -- Kn:%3.2f (%d) Thr:%d (%d) Sp:%3.2f\n", block_idx, row_idx, row_length, block_non_space_gap_width, block_space_gap_width, real_space_threshold, row->kern_size, row->max_nonspace, row->space_threshold, row->min_space, row->space_size);}void old_to_method( //estimate for block TO_ROW *row, STATS *all_gap_stats, STATS *space_gap_stats, STATS *small_gap_stats, INT16 block_space_gap_width, INT16 block_non_space_gap_width //estimate for block ) { /* Old to condition was > 2 */ if (space_gap_stats->get_total () >= tosp_enough_space_samples_for_median) { //Adequate samples /* Set space size to median of spaces BUT limits it if it seems wildly out */ row->space_size = space_gap_stats->median (); if (row->space_size > block_space_gap_width * 1.5) { if (tosp_old_to_bug_fix) row->space_size = block_space_gap_width * 1.5; else //BUG??? should be *1.5 row->space_size = block_space_gap_width; } if (row->space_size < (block_non_space_gap_width * 2) + 1) row->space_size = (block_non_space_gap_width * 2) + 1; } //Only 1 or 2 samples else if (space_gap_stats->get_total () >= 1) { //hence mean not median row->space_size = space_gap_stats->mean (); if (row->space_size > block_space_gap_width * 1.5) { if (tosp_old_to_bug_fix) row->space_size = block_space_gap_width * 1.5; else //BUG??? should be *1.5 row->space_size = block_space_gap_width; } if (row->space_size < (block_non_space_gap_width * 3) + 1) row->space_size = (block_non_space_gap_width * 3) + 1; } else //Use block default row->space_size = block_space_gap_width; if ((tosp_only_small_gaps_for_kern) && (small_gap_stats->get_total () > tosp_redo_kern_limit)) row->kern_size = small_gap_stats->median (); else if (all_gap_stats->get_total () > tosp_redo_kern_limit) row->kern_size = all_gap_stats->median (); else //old TO -SAME FOR ALL ROWS row->kern_size = block_non_space_gap_width; if (tosp_threshold_bias2 > 0) row->space_threshold = INT32 (floor (0.5 + row->kern_size + tosp_threshold_bias2 * (row->space_size - row->kern_size))); else /* NOTE old text ord uses (space_size + kern_size + 1)/2 as the threshold and holds this in a float. The use is with a >= test NEW textord uses an integer threshold and a > test It comes to the same thing. (Though there is a difference in that old textor has integer space_size and kern_size.) */ row->space_threshold = INT32 (floor ((row->space_size + row->kern_size) / 2));}/************************************************************************* * isolated_row_stats() * Set values for min_space, max_non_space based on row stats only *************************************************************************/BOOL8 isolated_row_stats(TO_ROW *row, GAPMAP *gapmap, STATS *all_gap_stats, BOOL8 suspected_table, INT16 block_idx, INT16 row_idx) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -