fixspace.cpp

来自「一ＯＣＲ的相关资料。．希望对研究ＯＣＲ的朋友有所帮助．」· C++ 代码 · 共 975 行 · 第 1/3 页
CPP
975 行
/******************************************************************
 * File:        fixspace.cpp  (Formerly fixspace.c)
 * Description: Implements a pass over the page res, exploring the alternative
 *              spacing possibilities, trying to use context to improve the
 *              word spacing
 * Author:      Phil Cheatle
 * Created:     Thu Oct 21 11:38:43 BST 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#include <ctype.h>
#include "reject.h"
#include "statistc.h"
#include "genblob.h"
#include "control.h"
#include "fixspace.h"
#include "tessvars.h"
#include "tessbox.h"
#include "secname.h"

#define EXTERN

EXTERN BOOL_VAR (fixsp_check_for_fp_noise_space, TRUE,
"Try turning noise to space in fixed pitch");
EXTERN BOOL_VAR (fixsp_fp_eval, TRUE, "Use alternate evaluation for fp");
EXTERN BOOL_VAR (fixsp_noise_score_fixing, TRUE, "More sophisticated?");
EXTERN INT_VAR (fixsp_non_noise_limit, 1,
"How many non-noise blbs either side?");
EXTERN double_VAR (fixsp_small_outlines_size, 0.28, "Small if lt xht x this");
EXTERN BOOL_VAR (fixsp_ignore_punct, TRUE, "In uniform spacing calc");
EXTERN BOOL_VAR (fixsp_numeric_fix, TRUE, "Try to deal with numeric punct");
EXTERN BOOL_VAR (fixsp_prefer_joined_1s, TRUE, "Arbitrary boost");
EXTERN BOOL_VAR (tessedit_test_uniform_wd_spacing, FALSE,
"Limit context word spacing");
EXTERN BOOL_VAR (tessedit_prefer_joined_punct, FALSE,
"Reward punctation joins");
EXTERN INT_VAR (fixsp_done_mode, 1, "What constitues done for spacing");
EXTERN INT_VAR (debug_fix_space_level, 0, "Contextual fixspace debug");
EXTERN STRING_VAR (numeric_punctuation, ".,",
"Punct. chs expected WITHIN numbers");

#define PERFECT_WERDS   999
#define MAXSPACING      128      /*max expected spacing in pix */

/*************************************************************************
 * note_fixspace_progress()
 * Tell the progress monitor (if there is one) that the pass is still alive
 * and how far it has got.  Progress is reported as
 * 90 + 5 * word_index / word_count.
 *
 * monitor     progress monitor; NULL means "nothing to report to"
 * word_index  number of words stepped over so far
 * word_count  count of words in doc, as supplied by the caller
 *
 * A non-positive word_count would make the division meaningless (or fault
 * on zero), so in that case only the alive flag is refreshed.
 *************************************************************************/
static void note_fixspace_progress(volatile ETEXT_DESC *monitor,
                                   INT32 word_index,
                                   INT32 word_count) {
  if (monitor == NULL)
    return;
  monitor->ocr_alive = TRUE;
  if (word_count > 0)
    monitor->progress = 90 + 5 * word_index / word_count;
}


/*************************************************************************
 * fix_fuzzy_spaces()
 * Walk over the page finding sequences of words joined by fuzzy spaces. Extract
 * them as a sublist, process the sublist to find the optimal arrangement of
 * spaces then replace the sublist in the ROW_RES.
 *
 * monitor     progress monitor, may be NULL
 * word_count  count of words in doc (used only to scale the progress value)
 * page_res    page results whose blocks/rows/words are walked and updated
 *************************************************************************/

void fix_fuzzy_spaces(                               //find fuzzy words
                      volatile ETEXT_DESC *monitor,  //progress monitor
                      INT32 word_count,              //count of words in doc
                      PAGE_RES *page_res) {
  BLOCK_RES_IT block_res_it;     //iterators
  ROW_RES_IT row_res_it;
  WERD_RES_IT word_res_it_from;
  WERD_RES_IT word_res_it_to;
  WERD_RES *word_res;
  WERD_RES_LIST fuzzy_space_words;
  INT16 new_length;
  BOOL8 prevent_null_wd_fixsp;   //DONT process blobless wds
  INT32 word_index;              //current word

  block_res_it.set_to_list (&page_res->block_res_list);
  word_index = 0;
  for (block_res_it.mark_cycle_pt ();
       !block_res_it.cycled_list (); block_res_it.forward ()) {
    row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
    for (row_res_it.mark_cycle_pt ();
         !row_res_it.cycled_list (); row_res_it.forward ()) {
      word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
      while (!word_res_it_from.at_last ()) {
        word_res = word_res_it_from.data ();
        // Step over words that are not followed by a fuzzy space (and are
        // not combinations), handling each one on its own.
        while (!word_res_it_from.at_last () &&
               !(word_res->combination ||
                 word_res_it_from.data_relative (1)->
                 word->flag (W_FUZZY_NON) ||
                 word_res_it_from.data_relative (1)->
                 word->flag (W_FUZZY_SP))) {
          fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
          word_res = word_res_it_from.forward ();
          word_index++;
          note_fixspace_progress (monitor, word_index, word_count);
        }

        if (!word_res_it_from.at_last ()) {
          // word_res_it_from marks the start of a fuzzy run; advance
          // word_res_it_to to the last word of that run.
          word_res_it_to = word_res_it_from;
          prevent_null_wd_fixsp =
            word_res->word->gblob_list ()->empty ();
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          word_res_it_to.forward ();
          word_index++;
          note_fixspace_progress (monitor, word_index, word_count);
          while (!word_res_it_to.at_last () &&
                 (word_res_it_to.data_relative (1)->
                  word->flag (W_FUZZY_NON) ||
                  word_res_it_to.data_relative (1)->
                  word->flag (W_FUZZY_SP))) {
            if (check_debug_pt (word_res, 60))
              debug_fix_space_level.set_value (10);
            if (word_res->word->gblob_list ()->empty ())
              prevent_null_wd_fixsp = TRUE;
            word_res = word_res_it_to.forward ();
          }
          if (check_debug_pt (word_res, 60))
            debug_fix_space_level.set_value (10);
          if (word_res->word->gblob_list ()->empty ())
            prevent_null_wd_fixsp = TRUE;
          if (prevent_null_wd_fixsp)
            // A word in the run has no blobs: skip the whole run untouched.
            word_res_it_from = word_res_it_to;
          else {
            // Cut the run out, search its spacing permutations, then splice
            // the result back and step past it.
            fuzzy_space_words.assign_to_sublist (&word_res_it_from,
                                                 &word_res_it_to);
            fix_fuzzy_space_list (fuzzy_space_words,
                                  row_res_it.data ()->row);
            new_length = fuzzy_space_words.length ();
            word_res_it_from.add_list_before (&fuzzy_space_words);
            for (;
                 (!word_res_it_from.at_last () &&
                  (new_length > 0)); new_length--) {
              word_res_it_from.forward ();
            }
          }
          if (test_pt)
            debug_fix_space_level.set_value (0);
        }
        fix_sp_fp_word (word_res_it_from, row_res_it.data ()->row);
        //Last word in row
      }
    }
  }
}


/*************************************************************************
 * fix_fuzzy_space_list()
 * Search for a better spacing of the words in best_perm.
 *
 * The incoming list is scored with eval_word_spacing().  Unless that score
 * is already PERFECT_WERDS, a working copy (current_perm) is set up by
 * initialise_search() and then repeatedly classified, scored and advanced
 * with transform_to_next_perm().  Whenever the working copy scores strictly
 * higher than the best so far, best_perm is cleared and replaced by a deep
 * copy of it.  The loop stops when the best score is PERFECT_WERDS or the
 * working list is empty.
 *
 * best_perm  in: words to improve; out: best arrangement found
 * row        row passed through to match_current_words()
 *************************************************************************/
void fix_fuzzy_space_list(  //space explorer
                          WERD_RES_LIST &best_perm,
                          ROW *row) {
  INT16 best_score;
  WERD_RES_LIST current_perm;
  INT16 current_score;
  BOOL8 improved = FALSE;

                                 //default score
  best_score = eval_word_spacing (best_perm);
  dump_words (best_perm, best_score, 1, improved);

  if (best_score != PERFECT_WERDS)
    initialise_search(best_perm, current_perm);

  while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) {
    match_current_words(current_perm, row);
    current_score = eval_word_spacing (current_perm);
    dump_words (current_perm, current_score, 2, improved);
    if (current_score > best_score) {
      best_perm.clear ();
      best_perm.deep_copy (&current_perm);
      best_score = current_score;
      improved = TRUE;
    }
    if (current_score < PERFECT_WERDS)
      transform_to_next_perm(current_perm);
  }
  dump_words (best_perm, best_score, 3, improved);
}


/*************************************************************************
 * initialise_search()
 * Fill new_list with copies of the words in src_list that are not flagged
 * as combinations.  Each copy has its combination and part_of_combo flags
 * cleared before being appended.
 *
 * src_list  words to copy from (not modified here)
 * new_list  list that receives the newly allocated copies
 *************************************************************************/
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list) {
  WERD_RES_IT src_it(&src_list);
  WERD_RES_IT new_it(&new_list);
  WERD_RES *src_wd;
  WERD_RES *new_wd;

  for (src_it.mark_cycle_pt (); !src_it.cycled_list (); src_it.forward ()) {
    src_wd = src_it.data ();
    if (!src_wd->combination) {
      new_wd = new WERD_RES (*src_wd);
      new_wd->combination = FALSE;
      new_wd->part_of_combo = FALSE;
      new_it.add_after_then_move (new_wd);
    }
  }
}


/*************************************************************************
 * match_current_words()
 * Run classify_word_pass2() on every word in the list that is not part of
 * a combination and does not yet have an outword.
 *
 * words  list of words to classify
 * row    row handed to classify_word_pass2() for each word
 *************************************************************************/
void match_current_words(WERD_RES_LIST &words, ROW *row) {
  WERD_RES_IT word_it(&words);
  WERD_RES *word;

  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    if ((!word->part_of_combo) && (word->outword == NULL))
      classify_word_pass2(word, row);
  }
}


/*************************************************************************
 * eval_word_spacing()
 * The basic measure is the number of characters in contextually confirmed
 * words. (I.e the word is done)
 * If all words are contextually confirmed the evaluation is deemed perfect.
 *
 * Some fiddles are done to handle "1"s as these are VERY frequent causes of
 * fuzzy spaces. The problem with the basic measure is that "561 63" would score
 * the same as "56163", though given our knowledge that the space is fuzzy, and
 * that there is a "1" next to the fuzzy space, we need to ensure that "56163"
 * is preferred.
 *
 * The solution is to NOT COUNT the score of any word which has a digit at one
 * end and a "1Il" as the character the other side of the space.
 *
 * Conversely, any character next to a "1" within a word is counted as a positive
 * score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of
 * the "1" joined).  "56163" would score 7 - all chars in a numeric word + 2
 * sides of a "1" joined.
 *
 * The joined 1 rule is applied to any word REGARDLESS of contextual
 * confirmation.  Thus "PS7a71 3/7a" scores 1 (neither word is contextually
 * confirmed. The only score is from the joined 1. "PS7a713/7a" scores 2.
 *
 *************************************************************************/
INT16 eval_word_spacing(WERD_RES_LIST &word_res_list) {
  WERD_RES_IT word_res_it(&word_res_list);
  INT16 total_score = 0;
  INT16 word_count = 0;
  INT16 done_word_count = 0;
  INT16 word_len;
  INT16 i;
  WERD_RES *word;                //current word
  INT16 prev_word_score = 0;
  BOOL8 prev_word_done = FALSE;
  BOOL8 prev_char_1 = FALSE;     //prev ch a "1/I/l"?
  BOOL8 prev_char_digit = FALSE; //prev ch 2..9 or 0
  BOOL8 current_char_1 = FALSE;
  BOOL8 current_word_ok_so_far;
  STRING punct_chars = "!\"`',.:;";
  BOOL8 prev_char_punct = FALSE;
  BOOL8 current_char_punct = FALSE;
  BOOL8 word_done = FALSE;

  do {
    word = word_res_it.data ();
    word_done = fixspace_thinks_word_done (word);
    word_count++;
    if (word->tess_failed) {
      total_score += prev_word_score;
      if (prev_word_done)
        done_word_count++;
      prev_word_score = 0;
      prev_char_1 = FALSE;
      prev_char_digit = FALSE;
      prev_word_done = FALSE;
    }
    else {
      /*
        Can we add the prev word score and potentially count this word?
        Yes IF it didnt end in a 1 when the first char of this word is a digit
          AND it didnt end in a digit when the first char of this word is a 1
      */
      word_len = word->reject_map.length ();
      current_word_ok_so_far = FALSE;
      if (!((prev_char_1 &&
        digit_or_numeric_punct (word,
        word->best_choice->string ()[0])) ||
        (prev_char_digit &&
        ((word_done &&
        (word->best_choice->string ()[0] == '1')) ||
        (!word_done &&
        STRING (conflict_set_I_l_1).contains (word->best_choice->
      string ()[0])))))) {
        total_score += prev_word_score;
        if (prev_word_done)
          done_word_count++;
        current_word_ok_so_far = word_done;
      }

      if ((current_word_ok_so_far) &&
        (!tessedit_test_uniform_wd_spacing ||
        ((word->best_choice->permuter () == NUMBER_PERM) ||
      uniformly_spaced (word)))) {
        prev_word_done = TRUE;
        prev_word_score = word_len;
      }
      else {
        prev_word_done = FALSE;
        prev_word_score = 0;
      }
      if (fixsp_prefer_joined_1s) {
        /* Add 1 to total score for every joined 1 regardless of context and rejtn */

        for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
          current_char_1 = word->best_choice->string ()[i] == '1';
          if (prev_char_1 || (current_char_1 && (i > 0)))
            total_score++;
          prev_char_1 = current_char_1;
        }
      }

      /* Add 1 to total score for every joined punctuation regardless of context
        and rejtn */
      if (tessedit_prefer_joined_punct) {
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?
fixspace.cpp

fixspace.cpp - 源码说明

⌨️ 快捷键说明