📄 topitch.cpp
字号:
/**********************************************************************
 * File:        topitch.cpp  (Formerly to_pitch.c)
 * Description: Code to determine fixed pitchness and the pitch if fixed.
 * Author:      Ray Smith
 * Created:     Tue Aug 24 16:57:29 BST 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#ifdef __UNIX__
#include <assert.h>
#endif
#include "stderr.h"
#include "blobbox.h"
#include "lmedsq.h"
#include "statistc.h"
#include "drawtord.h"
#include "makerow.h"
#include "pitsync1.h"
#include "pithsync.h"
#include "blobcmpl.h"
#include "tovars.h"
#include "wordseg.h"
#include "topitch.h"
#include "secname.h"

#define EXTERN

EXTERN BOOL_VAR (textord_all_prop, FALSE, "All doc is proportial text");
EXTERN BOOL_VAR (textord_debug_pitch_test, FALSE,
"Debug on fixed pitch test");
EXTERN BOOL_VAR (textord_disable_pitch_test, FALSE,
"Turn off dp fixed pitch algorithm");
EXTERN BOOL_VAR (textord_fast_pitch_test, FALSE,
"Do even faster pitch algorithm");
EXTERN BOOL_VAR (textord_debug_pitch_metric, FALSE,
"Write full metric stuff");
EXTERN BOOL_VAR (textord_show_row_cuts, FALSE, "Draw row-level cuts");
EXTERN BOOL_VAR (textord_show_page_cuts, FALSE, "Draw page-level cuts");
EXTERN BOOL_VAR (textord_pitch_cheat, FALSE,
"Use correct answer for fixed/prop");
EXTERN BOOL_VAR (textord_blockndoc_fixed, FALSE,
"Attempt whole doc/block fixed pitch");
EXTERN double_VAR (textord_projection_scale, 0.200, "Ding rate for mid-cuts");
EXTERN double_VAR (textord_balance_factor, 1.0,
"Ding rate for unbalanced char cells");
EXTERN double_VAR (textord_repch_width_variance, 0.2,
"Max width change of gap/blob");

#define FIXED_WIDTH_MULTIPLE  5
#define BLOCK_STATS_CLUSTERS  10
#define MAX_ALLOWED_PITCH     100  //max pixel pitch.

/**********************************************************************
 * compute_fixed_pitch
 *
 * Decide whether each row is fixed pitch individually.
 * Correlate definite and uncertain results to obtain an individual
 * result for each row in the TO_ROW class.
 *
 * Runs three passes over port_blocks:
 *   1. compute_block_pitch on every block;
 *   2. try_doc_fixed on the whole list and, only if that returns false,
 *      try_block_fixed per block, falling back to try_rows_fixed;
 *   3. fix_row_pitch on every row of every block.
 * Block and row numbers handed to the helpers are 1-based.
 **********************************************************************/

void compute_fixed_pitch(                             //determine pitch
                         ICOORD page_tr,              //top right
                         TO_BLOCK_LIST *port_blocks,  //input list
                         float gradient,              //page skew
                         FCOORD rotation,             //for drawing
                         BOOL8 testing_on             //correct orientation
                        ) {
  TO_BLOCK_IT block_it;          //iterator
  TO_BLOCK *block;               //current block;
  TO_ROW_IT row_it;              //row iterator
  TO_ROW *row;                   //current row
  int block_index;               //block number
  int row_index;                 //row number

#ifndef GRAPHICS_DISABLED
  if (textord_show_initial_words && testing_on) {
    if (to_win == NO_WINDOW)
      create_to_win(page_tr);
  }
#endif

  //pass 1: per-block pitch computation
  block_it.set_to_list (port_blocks);
  block_index = 1;
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
       block_it.forward ()) {
    block = block_it.data ();
    compute_block_pitch(block, rotation, block_index, testing_on);
    block_index++;
  }

  //pass 2: whole doc first, then whole block, then row by row
  if (!try_doc_fixed (page_tr, port_blocks, gradient)) {
    block_index = 1;
    for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
         block_it.forward ()) {
      block = block_it.data ();
      if (!try_block_fixed (block, block_index))
        try_rows_fixed(block, block_index, testing_on);
      block_index++;
    }
  }

  //pass 3: settle every row's decision
  block_index = 1;
  for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
       block_it.forward ()) {
    block = block_it.data ();
    row_it.set_to_list (block->get_rows ());
    row_index = 1;
    for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
         row_it.forward ()) {
      row = row_it.data ();
      fix_row_pitch(row, block, port_blocks, row_index, block_index);
      row_index++;
    }
    //&& binds tighter than ||; the parentheses only make that explicit
    if (testing_on
        && ((textord_debug_pitch_test
             && block->block->text_region () != NULL)
            || textord_blocksall_fixed || textord_blocksall_prop)) {
      tprintf ("Corr:");
      print_block_counts(block, block_index);
    }
    block_index++;
  }
#ifndef GRAPHICS_DISABLED
  if (textord_show_initial_words && testing_on) {
    overlap_picture_ops(TRUE);
  }
#endif
}


/**********************************************************************
 * tally_pitch_vote
 *
 * Add the vote of a single row to a running total.
 * Definite decisions count textord_words_veto_power, maybe/corrected
 * decisions count 1; fixed votes are positive and prop votes negative.
 * Any other pitch_decision leaves the total unchanged.
 * If stats is not NULL, fixed votes also add the row's fixed_pitch
 * (truncated to INT32) to stats with the same weight.
 **********************************************************************/

static void tally_pitch_vote(               //count one row's vote
                             TO_ROW *row,   //row that is voting
                             int *votes,    //running total to update
                             STATS *stats   //pitches of fixed voters or NULL
                            ) {
  if (row->pitch_decision == PITCH_DEF_FIXED) {
    *votes += textord_words_veto_power;
    if (stats != NULL)
      stats->add ((INT32) row->fixed_pitch, textord_words_veto_power);
  }
  else if (row->pitch_decision == PITCH_MAYBE_FIXED
           || row->pitch_decision == PITCH_CORR_FIXED) {
    (*votes)++;
    if (stats != NULL)
      stats->add ((INT32) row->fixed_pitch, 1);
  }
  else if (row->pitch_decision == PITCH_DEF_PROP)
    *votes -= textord_words_veto_power;
  else if (row->pitch_decision == PITCH_MAYBE_PROP
           || row->pitch_decision == PITCH_CORR_PROP)
    (*votes)--;
}


/**********************************************************************
 * fix_row_pitch
 *
 * Get a pitch_decision for this row by voting among similar rows in the
 * block, then similar rows over all the page, or any other rows at all.
 *
 * A row counts as "similar" to bad_row when its xheight (or
 * xheight + ascrise if bad_row->all_caps) lies strictly within
 * +/- textord_pitch_rowsimilarity of the same measure on bad_row.
 * Rows already decided as PITCH_DEF_FIXED or PITCH_DEF_PROP skip the
 * vote. On exit a row left as PITCH_CORR_FIXED has its spacing fields
 * derived from fixed_pitch; a prop row has fixed_pitch zeroed and its
 * char_cells cleared.
 **********************************************************************/

void fix_row_pitch(                        //get some value
                   TO_ROW *bad_row,        //row to fix
                   TO_BLOCK *bad_block,    //block of bad_row
                   TO_BLOCK_LIST *blocks,  //blocks to scan
                   INT32 row_target,       //number of row
                   INT32 block_target      //number of block
                  ) {
  const char *res_string;        //decision on line
  INT16 mid_cuts = 0;            //output of tune_row_pitch, unused here
  int block_votes;               //votes in block
  int like_votes;                //votes over page
  int other_votes;               //votes of unlike blocks
  int block_index;               //number of block
  int maxwidth;                  //max pitch
  TO_BLOCK_IT block_it = blocks; //block iterator
  TO_ROW_IT row_it;
  TO_BLOCK *block;               //current block
  TO_ROW *row;                   //current row
  float sp_sd = 0.0f;            //space deviation
  STATS block_stats;             //pitches in block
  STATS like_stats;              //pitches in page

  block_votes = like_votes = other_votes = 0;
  maxwidth = (INT32) ceil (bad_row->xheight * textord_words_maxspace);
  if (bad_row->pitch_decision != PITCH_DEF_FIXED
      && bad_row->pitch_decision != PITCH_DEF_PROP) {
    block_stats.set_range (0, maxwidth);
    like_stats.set_range (0, maxwidth);
    block_index = 1;
    for (block_it.mark_cycle_pt (); !block_it.cycled_list ();
         block_it.forward ()) {
      block = block_it.data ();
      row_it.set_to_list (block->get_rows ());
      for (row_it.mark_cycle_pt (); !row_it.cycled_list ();
           row_it.forward ()) {
        row = row_it.data ();
        if ((bad_row->all_caps
             && row->xheight + row->ascrise <
                (bad_row->xheight + bad_row->ascrise)
                * (1 + textord_pitch_rowsimilarity)
             && row->xheight + row->ascrise >
                (bad_row->xheight + bad_row->ascrise)
                * (1 - textord_pitch_rowsimilarity))
            || (!bad_row->all_caps
                && row->xheight <
                   bad_row->xheight * (1 + textord_pitch_rowsimilarity)
                && row->xheight >
                   bad_row->xheight * (1 - textord_pitch_rowsimilarity))) {
          //similar row: same block and other blocks are tallied apart
          if (block_index == block_target)
            tally_pitch_vote(row, &block_votes, &block_stats);
          else
            tally_pitch_vote(row, &like_votes, &like_stats);
        }
        else {
          //dissimilar row: only counted, its pitch is not recorded
          tally_pitch_vote(row, &other_votes, NULL);
        }
      }
      block_index++;
    }
    if (block_votes > textord_words_veto_power) {
      bad_row->fixed_pitch = block_stats.ile (0.5);
      bad_row->pitch_decision = PITCH_CORR_FIXED;
    }
    else if (like_votes > 0) {
      //block_votes <= textord_words_veto_power is implied here
      bad_row->fixed_pitch = like_stats.ile (0.5);
      bad_row->pitch_decision = PITCH_CORR_FIXED;
    }
    else {
      bad_row->pitch_decision = PITCH_CORR_PROP;
#ifndef SECURE_NAMES
      if (block_votes == 0 && like_votes == 0 && other_votes > 0
          && (textord_debug_pitch_test || textord_debug_pitch_metric))
        tprintf ("Warning:row %d of block %d set prop with no like rows against trend\n",
                 row_target, block_target);
#endif
    }
  }
  if (textord_debug_pitch_metric) {
    tprintf (":b_votes=%d:l_votes=%d:o_votes=%d",
             block_votes, like_votes, other_votes);
    if (bad_row->pitch_decision == PITCH_CORR_PROP
        || bad_row->pitch_decision == PITCH_DEF_PROP) {
      res_string = bad_block->block->text_region () != NULL ?
        (bad_block->block->text_region ()->
         is_prop ()? "CP" : "WP") : "XP";
    }
    else {
      res_string = bad_block->block->text_region () != NULL ?
        (bad_block->block->text_region ()->
         is_prop ()? "WF" : "CF") : "XF";
    }
    tprintf (":Blk=%d:Row=%d:%c:",
             block_target, row_target,
             bad_block->block->text_region () != NULL ?
             (bad_block->block->text_region ()->
              is_prop ()? 'P' : 'F') : 'X');
    tprintf ("x=%g:asc=%g:corr_res=%s\n",
             bad_row->xheight, bad_row->ascrise, res_string);
  }
  //cheat mode overrides the vote with the block's text_region answer
  if (textord_pitch_cheat && bad_block->block->text_region () != NULL)
    bad_row->pitch_decision =
      bad_block->block->text_region ()->
      is_prop ()? PITCH_CORR_PROP : PITCH_CORR_FIXED;
  if (bad_row->pitch_decision == PITCH_CORR_FIXED) {
    if (bad_row->fixed_pitch < textord_min_xheight) {
      if (block_votes > 0)
        bad_row->fixed_pitch = block_stats.ile (0.5);
      else if (block_votes == 0 && like_votes > 0)
        bad_row->fixed_pitch = like_stats.ile (0.5);
      else {
        tprintf ("Warning:guessing pitch as xheight on row %d, block %d\n",
                 row_target, block_target);
        bad_row->fixed_pitch = bad_row->xheight;
      }
    }
    //never let the pitch fall below the minimum xheight
    if (bad_row->fixed_pitch < textord_min_xheight)
      bad_row->fixed_pitch = (float) textord_min_xheight;
    bad_row->kern_size = bad_row->fixed_pitch / 4;
    bad_row->min_space = (INT32) (bad_row->fixed_pitch * 0.6);
    bad_row->max_nonspace = (INT32) (bad_row->fixed_pitch * 0.4);
    bad_row->space_threshold =
      (bad_row->min_space + bad_row->max_nonspace) / 2;
    bad_row->space_size = bad_row->fixed_pitch;
    if (bad_row->char_cells.empty ())
      tune_row_pitch (bad_row, &bad_row->projection,
                      bad_row->projection_left,
                      bad_row->projection_right,
                      (bad_row->fixed_pitch +
                       bad_row->max_nonspace * 3) / 4,
                      bad_row->fixed_pitch, sp_sd, mid_cuts,
                      &bad_row->char_cells, FALSE);
  }
  else if (bad_row->pitch_decision == PITCH_CORR_PROP
           || bad_row->pitch_decision == PITCH_DEF_PROP) {
    bad_row->fixed_pitch = 0.0f;
    bad_row->char_cells.clear ();
  }
}


/**********************************************************************
 * compute_block_pitch
 *
 * Decide whether each block is fixed pitch individually.
 **********************************************************************/

void compute_block_pitch(                    //process each block
                         TO_BLOCK *block,    //input list
                         FCOORD rotation,    //for drawing
                         INT32 block_index,  //block number
                         BOOL8 testing_on    //correct orientation
                        ) {
  BOX block_box;                 //bounding box

  block_box = block->block->bounding_box ();
  if (testing_on && textord_debug_pitch_test) {
    tprintf ("Block %d at (%d,%d)->(%d,%d)\n",
             block_index,
             block_box.left (), block_box.bottom (),
             block_box.right (), block_box.top ());
  }
  block->min_space = (INT32) floor (block->xheight
                                    * textord_words_default_minspace);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -