📄 fixspace.cpp
字号:
/******************************************************************
 * File:        fixspace.cpp  (Formerly fixspace.c)
 * Description: Implements a pass over the page res, exploring the alternative
 *              spacing possibilities, trying to use context to improve the
 *              word spacing
 * Author:      Phil Cheatle
 * Created:     Thu Oct 21 11:38:43 BST 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#include <ctype.h>
#include "reject.h"
#include "statistc.h"
#include "genblob.h"
#include "control.h"
#include "fixspace.h"
#include "tessvars.h"
#include "tessbox.h"
#include "secname.h"

#define EXTERN

EXTERN BOOL_VAR (fixsp_check_for_fp_noise_space, TRUE,
"Try turning noise to space in fixed pitch");
EXTERN BOOL_VAR (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
EXTERN BOOL_VAR (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
EXTERN INT_VAR (fixsp_non_noise_limit, 1,
"How many non-noise blbs either side?");
EXTERN double_VAR (fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
EXTERN BOOL_VAR (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
EXTERN BOOL_VAR (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
EXTERN BOOL_VAR (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
EXTERN BOOL_VAR (tessedit_test_uniform_wd_spacing, FALSE,
"Limit context word spacing");
EXTERN BOOL_VAR (tessedit_prefer_joined_punct, FALSE,
"Reward punctation joins");
EXTERN INT_VAR (fixsp_done_mode, 1, "What constitues done for spacing");
EXTERN INT_VAR (debug_fix_space_level, 0, "Contextual fixspace debug");
EXTERN STRING_VAR (numeric_punctuation, ".,",
"Punct. chs expected WITHIN numbers");

#define PERFECT_WERDS   999
#define MAXSPACING      128      /*max expected spacing in pix */

/*************************************************************************
 * note_fixspace_progress()
 * File-local helper for fix_fuzzy_spaces(): marks the monitor as alive and
 * updates its progress value as 90 + 5 * word_index / word_count.
 *
 * Does nothing when monitor is NULL. The progress field is left untouched
 * when word_count is zero, because the division would otherwise be undefined.
 *************************************************************************/
static void note_fixspace_progress(                               //report progress
                                   volatile ETEXT_DESC *monitor,  //progress monitor
                                   INT32 word_index,              //words seen so far
                                   INT32 word_count) {            //count of words in doc
  if (monitor == NULL)
    return;
  monitor->ocr_alive = TRUE;
  if (word_count != 0)
    monitor->progress = 90 + 5 * word_index / word_count;
}


/*************************************************************************
 * fix_fuzzy_spaces()
 * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
 * them as a sublist, process the sublist to find the optimal arrangement of
 * spaces then replace the sublist in the ROW_RES.
 *
 * monitor     progress monitor; may be NULL, in which case no progress is
 *             reported.
 * word_count  count of words in the document, used only to scale the
 *             reported progress.
 * page_res    page results whose block/row/word lists are walked and whose
 *             word lists are modified in place.
 *************************************************************************/
void fix_fuzzy_spaces(                               //find fuzzy words
                      volatile ETEXT_DESC *monitor,  //progress monitor
                      INT32 word_count,              //count of words in doc
                      PAGE_RES *page_res) {
  BLOCK_RES_IT block_res_it;     //iterators
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;
  WERD_RES_IT word_res_it_to;
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  INT16 new_length;
  BOOL8 prevent_null_wd_fixsp;   //DONT process blobless wds
  INT32 word_index;              //current word

  block_res_it.set_to_list (&page_res->block_res_list);
  word_index = 0;
  for (block_res_it.mark_cycle_pt ();
       !block_res_it.cycled_list (); block_res_it.forward ()) {
    row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
    for (row_res_it.mark_cycle_pt ();
         !row_res_it.cycled_list (); row_res_it.forward ()) {
      word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
      while (!word_res_it_from.at_last ()) {
        word_res = word_res_it_from.data ();
        // Step over words that do not start a fuzzy sequence: the current
        // word is not a combination and the NEXT word carries neither
        // W_FUZZY_NON nor W_FUZZY_SP.
        while (!word_res_it_from.at_last () &&
               !(word_res->combination ||
                 word_res_it_from.data_relative (1)->
                 word->flag (W_FUZZY_NON) ||
                 word_res_it_from.data_relative (1)->
                 word->flag (W_FUZZY_SP))) {
          fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
          word_res = word_res_it_from.forward ();
          word_index++;
          note_fixspace_progress (monitor, word_index, word_count);
        }

        if (!word_res_it_from.at_last ()) {
          // word_res_it_from marks the start of the sequence;
          // word_res_it_to is advanced to its last word.
          word_res_it_to = word_res_it_from;
          prevent_null_wd_fixsp =
            word_res->word->gblob_list ()->empty ();
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          word_res_it_to.forward ();
          word_index++;
          note_fixspace_progress (monitor, word_index, word_count);
          while (!word_res_it_to.at_last () &&
                 (word_res_it_to.data_relative (1)->
                  word->flag (W_FUZZY_NON) ||
                  word_res_it_to.data_relative (1)->
                  word->flag (W_FUZZY_SP))) {
            if (check_debug_pt (word_res, 60))
              debug_fix_space_level.set_value (10);
            if (word_res->word->gblob_list ()->empty ())
              prevent_null_wd_fixsp = TRUE;
            word_res = word_res_it_to.forward ();
          }
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          if (word_res->word->gblob_list ()->empty ())
            prevent_null_wd_fixsp = TRUE;
          if (prevent_null_wd_fixsp)
            // A word with an empty blob list was seen: skip the sequence.
            word_res_it_from = word_res_it_to;
          else {
            // Detach the sequence, explore its spacing, then splice the
            // result back in and step past the re-inserted words.
            fuzzy_space_words.assign_to_sublist (&word_res_it_from,
                                                 &word_res_it_to);
            fix_fuzzy_space_list (fuzzy_space_words,
                                  row_res_it.data ()->row);
            new_length = fuzzy_space_words.length ();
            word_res_it_from.add_list_before (&fuzzy_space_words);
            for (;
                 (!word_res_it_from.at_last () &&
                  (new_length > 0)); new_length--) {
              word_res_it_from.forward ();
            }
          }
          if (test_pt)
            debug_fix_space_level.set_value (0);
        }
        fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
        //Last word in row
      }
    }
  }
}


/*************************************************************************
 * fix_fuzzy_space_list()
 * Space explorer for one sequence of fuzzy-spaced words.
 *
 * Scores best_perm with eval_word_spacing(). Unless that score is already
 * PERFECT_WERDS, it builds a working permutation with initialise_search()
 * and repeatedly classifies, scores and transforms it. Whenever the working
 * permutation scores strictly higher, it is deep-copied into best_perm.
 * The loop ends when the best score is PERFECT_WERDS or the working
 * permutation is empty.
 *
 * best_perm  in: the original word sequence; out: the best arrangement found.
 * row        row passed through to match_current_words().
 *************************************************************************/
void fix_fuzzy_space_list(  //space explorer
                          WERD_RES_LIST &best_perm,
                          ROW *row) {
  INT16 best_score;
  WERD_RES_LIST current_perm;
  INT16 current_score;
  BOOL8 improved = FALSE;

                                 //default score
  best_score = eval_word_spacing (best_perm);
  dump_words (best_perm, best_score, 1, improved);

  if (best_score != PERFECT_WERDS)
    initialise_search(best_perm, current_perm);

  while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
    match_current_words(current_perm, row);
    current_score = eval_word_spacing (current_perm);
    dump_words (current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear ();
      best_perm.deep_copy (&current_perm);
      best_score = current_score;
      improved = TRUE;
    }
    // A perfect permutation is left alone; the loop condition then exits.
    if (current_score < PERFECT_WERDS)
      transform_to_next_perm(current_perm);
  }
  dump_words (best_perm, best_score, 3, improved);
}


/*************************************************************************
 * initialise_search()
 * Fill new_list with copies of every word in src_list that is not flagged
 * as a combination. Each copy has its combination and part_of_combo flags
 * cleared and is appended in source order.
 *
 * The copies are heap allocated here and handed to new_list via its
 * iterator.
 *************************************************************************/
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT src_it(&src_list);
  WERD_RES_IT new_it(&new_list);
  WERD_RES *src_wd;
  WERD_RES *new_wd;

  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    src_wd = src_it.data ();
    if (!src_wd->combination) {
      new_wd = new WERD_RES (*src_wd);
      new_wd->combination = FALSE;
      new_wd->part_of_combo = FALSE;
      new_it.add_after_then_move (new_wd);
    }
  }
}


/*************************************************************************
 * match_current_words()
 * Run classify_word_pass2() on every word in the list that is not part of
 * a combination and has no outword yet (outword == NULL).
 *************************************************************************/
void match_current_words(WERD_RES_LIST &words, ROW *row) {
  WERD_RES_IT word_it(&words);
  WERD_RES *word;

  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    if ((!word->part_of_combo) && (word->outword == NULL))
      classify_word_pass2(word, row);
  }
}


/*************************************************************************
 * eval_word_spacing()
 * The basic measure is the number of characters in contextually confirmed
 * words. (I.e the word is done)
 * If all words are contextually confirmed the evaluation is deemed perfect.
 *
 * Some fiddles are done to handle "1"s as these are VERY frequent causes of
 * fuzzy spaces. The problem with the basic measure is that "561 63" would score
 * the same as "56163", though given our knowledge that the space is fuzzy, and
 * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
 * is preferred.
 *
 * The solution is to NOT COUNT the score of any word which has a digit at one
 * end and a "1Il" as the character the other side of the space.
 *
 * Conversely, any character next to a "1" within a word is counted as a positive
 * score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of
 * the "1" joined).  "56163" would score 7 - all chars in a numeric word + 2
 * sides of a "1" joined.
* * The joined 1 rule is applied to any word REGARDLESS of contextual * confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contexutally * confirmed. The only score is from the joined 1. "PS7a713/7a" scores 2. * *************************************************************************/INT16 eval_word_spacing(WERD_RES_LIST &word_res_list) { WERD_RES_IT word_res_it(&word_res_list); INT16 total_score = 0; INT16 word_count = 0; INT16 done_word_count = 0; INT16 word_len; INT16 i; WERD_RES *word; //current word INT16 prev_word_score = 0; BOOL8 prev_word_done = FALSE; BOOL8 prev_char_1 = FALSE; //prev ch a "1/I/l"? BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0 BOOL8 current_char_1 = FALSE; BOOL8 current_word_ok_so_far; STRING punct_chars = "!\"`',.:;"; BOOL8 prev_char_punct = FALSE; BOOL8 current_char_punct = FALSE; BOOL8 word_done = FALSE; do { word = word_res_it.data (); word_done = fixspace_thinks_word_done (word); word_count++; if (word->tess_failed) { total_score += prev_word_score; if (prev_word_done) done_word_count++; prev_word_score = 0; prev_char_1 = FALSE; prev_char_digit = FALSE; prev_word_done = FALSE; } else { /* Can we add the prev word score and potentially count this word? 
Yes IF it didnt end in a 1 when the first char of this word is a digit AND it didnt end in a digit when the first char of this word is a 1 */ word_len = word->reject_map.length (); current_word_ok_so_far = FALSE; if (!((prev_char_1 && digit_or_numeric_punct (word, word->best_choice->string ()[0])) || (prev_char_digit && ((word_done && (word->best_choice->string ()[0] == '1')) || (!word_done && STRING (conflict_set_I_l_1).contains (word->best_choice-> string ()[0])))))) { total_score += prev_word_score; if (prev_word_done) done_word_count++; current_word_ok_so_far = word_done; } if ((current_word_ok_so_far) && (!tessedit_test_uniform_wd_spacing || ((word->best_choice->permuter () == NUMBER_PERM) || uniformly_spaced (word)))) { prev_word_done = TRUE; prev_word_score = word_len; } else { prev_word_done = FALSE; prev_word_score = 0; } if (fixsp_prefer_joined_1s) { /* Add 1 to total score for every joined 1 regardless of context and rejtn */ for (i = 0, prev_char_1 = FALSE; i < word_len; i++) { current_char_1 = word->best_choice->string ()[i] == '1'; if (prev_char_1 || (current_char_1 && (i > 0))) total_score++; prev_char_1 = current_char_1; } } /* Add 1 to total score for every joined punctuation regardless of context and rejtn */ if (tessedit_prefer_joined_punct) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -