docqual.cpp

来自「一ＯＣＲ的相关资料。．希望对研究ＯＣＲ的朋友有所帮助．」· C++ 代码 · 共 1,454 行 · 第 1/4 页
CPP
1,454 行
/******************************************************************
 * File:        docqual.cpp  (Formerly docqual.c)
 * Description: Document Quality Metrics
 * Author:		Phil Cheatle
 * Created:		Mon May  9 11:27:28 BST 1994
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#include          <ctype.h>
#include          "docqual.h"
#include          "tstruct.h"
#include          "tfacep.h"
#include          "reject.h"
#include          "tessvars.h"
#include          "genblob.h"
#include          "secname.h"

/* EXTERN expands to nothing in this translation unit, so every declaration
   below written as "EXTERN xxx_VAR (...)" is expanded with an empty EXTERN
   prefix here. */
#define EXTERN

/* Characters whose expected outline count is non-standard. */
EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
"Non standard number of outlines");

/* Unrejection / document, block and row rejection controls. */
EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
"Allow outline errs in unrejection?");
EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
"Reduce rejection on good docs");
EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
"%rej allowed before rej whole doc");
EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
"%rej allowed before rej whole block");
EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
"%rej allowed before rej whole row");
EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
"%of row rejects in whole word rejects which prevents whole row rejection");
EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
"Only rej partially rejected words in block rejection");
EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
"Only rej partially rejected words in row rejection");
EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
"Use word segmentation quality metric");
EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
"Use word segmentation quality metric");
EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
"Only preserve wds longer than this");
EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
"Apply row rejection to good docs");
EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
"rej good doc wd if more than this fraction rejected");
EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
"Reject all bad quality wds");
EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
"Output data to debug file");
EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
EXTERN double_VAR (quality_rowrej_pc, 1.1,
"good_quality_doc gte good char limit");

/* Word "crunching" controls. */
EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
"Mark v.bad words for tilde crunch");
EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
"Take out ~^ early?");

EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
"crunch garbage cert lt this");
EXTERN double_VAR (crunch_poor_garbage_rate, 60,
"crunch garbage rating lt this");

EXTERN double_VAR (crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this");
EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");

EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
EXTERN double_VAR (crunch_del_min_width, 3.0,
"Del if word width lt xht x this");
EXTERN double_VAR (crunch_del_high_word, 1.5,
"Del if word gt xht x this above bl");
EXTERN double_VAR (crunch_del_low_word, 0.5,
"Del if word gt xht x this below bl");
EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");

EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
EXTERN INT_VAR (crunch_pot_indicators, 1,
"How many potential indicators needed");

EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
"Dont touch sensible strings");
EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
"Dont pot crunch sensible strings");
EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
EXTERN INT_VAR (crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings");
/* Help text previously repeated the "lower case" wording of the lc variable. */
EXTERN INT_VAR (crunch_leave_uc_strings, 4,
"Dont crunch words with long upper case strings");
EXTERN INT_VAR (crunch_long_repetitions, 3,
"Crunch words with long repetitions");

EXTERN INT_VAR (crunch_debug, 0, "As it says");

/*************************************************************************
 * word_blob_quality()
 * How many blobs in the outword are identical to those of the inword?
 * ASSUME blobs in both initial word and outword are in ascending order of
 * left hand blob edge.
*************************************************************************/INT16 word_blob_quality(  //Blob seg changes                        WERD_RES *word,                        ROW *row) {  WERD *bln_word;                //BL norm init word  TWERD *tessword;               //tess format  WERD *init_word;               //BL norm init word  PBLOB_IT outword_it;  PBLOB_IT initial_it;  INT16 i;  INT16 init_blobs_left;  INT16 match_count = 0;  BOOL8 matched;  BOX out_box;  PBLOB *test_blob;  DENORM denorm;  float bln_xht;  if (word->word->gblob_list ()->empty ())    return 0;                                 //xht used for blnorm  bln_xht = bln_x_height / word->denorm.scale ();  bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);  /*    NOTE: Need to convert to tess format and back again to ensure that the    same float -> int rounding of coords is done to source wd as out wd before    comparison  */  //   if (!bln_word->flag(W_POLYGON))  //           tprintf( "NON POLYGON BLN WERD\n");  tessword = make_tess_word (bln_word, NULL);                                 //convert word  init_word = make_ed_word (tessword, bln_word);  //   if (!init_word->flag(W_POLYGON))  //         tprintf( "NON POLYGON INIT WERD\n");  //   tprintf( "SOURCE BLOBS-AFTER TESS:\n");  //   print_boxes( init_word );  //   tprintf( "OUTPUT BLOBS:\n");  //   print_boxes( word->outword );  initial_it.set_to_list (init_word->blob_list ());  init_blobs_left = initial_it.length ();  outword_it.set_to_list (word->outword->blob_list ());  delete bln_word;  delete_word(tessword);  //get rid of it  for (outword_it.mark_cycle_pt ();  !outword_it.cycled_list (); outword_it.forward ()) {    out_box = outword_it.data ()->bounding_box ();    /* Skip any initial blobs LEFT of current outword blob */    while (!initial_it.at_last () &&    (initial_it.data ()->bounding_box ().left () < out_box.left ())) {      initial_it.forward ();      init_blobs_left--;    }    /* See if current outword blob matches 
any initial blob with the same left      coord. (Normally only one but possibly more - in unknown order) */    i = 0;    matched = FALSE;    do {      test_blob = initial_it.data_relative (i++);      matched = crude_match_blobs (test_blob, outword_it.data ());      if (matched)        match_count++;    }    while (!matched &&      (init_blobs_left - i > 0) &&      (i < 129) &&      !initial_it.at_last () &&      test_blob->bounding_box ().left () == out_box.left ());  }  delete init_word;  return match_count;}/************************************************************************* * crude_match_blobs() * Check bounding boxes are the same and the number of outlines are the same. *************************************************************************/BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) {   BOX box1 = blob1->bounding_box ();  BOX box2 = blob2->bounding_box ();  if (box1.contains (box2) &&    box2.contains (box1) &&    (blob1->out_list ()->length () == blob1->out_list ()->length ()))    return TRUE;  else    return FALSE;}INT16 word_outline_errs(  //Outline count errs                        WERD_RES *word) {  PBLOB_IT outword_it;  INT16 i = 0;  INT16 err_count = 0;  outword_it.set_to_list (word->outword->blob_list ());  for (outword_it.mark_cycle_pt ();  !outword_it.cycled_list (); outword_it.forward ()) {    err_count += count_outline_errs (word->best_choice->string ()[i],      outword_it.data ()->out_list ()->      length ());    i++;  }  return err_count;}/************************************************************************* * word_char_quality() * Combination of blob quality and outline quality - how many good chars are * there? - I.e chars which pass the blob AND outline tests. 
*************************************************************************/void word_char_quality(  //Blob seg changes                       WERD_RES *word,                       ROW *row,                       INT16 *match_count,                       INT16 *accepted_match_count) {  WERD *bln_word;                //BL norm init word  TWERD *tessword;               //tess format  WERD *init_word;               //BL norm init word  PBLOB_IT outword_it;  PBLOB_IT initial_it;  INT16 i;  INT16 init_blobs_left;  BOOL8 matched;  BOX out_box;  PBLOB *test_blob;  DENORM denorm;  float bln_xht;  INT16 j = 0;  *match_count = 0;  *accepted_match_count = 0;  if (word->word->gblob_list ()->empty ())    return;                                 //xht used for blnorm  bln_xht = bln_x_height / word->denorm.scale ();  bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);  /*    NOTE: Need to convert to tess format and back again to ensure that the    same float -> int rounding of coords is done to source wd as out wd before    comparison  */  tessword = make_tess_word (bln_word, NULL);                                 //convert word  init_word = make_ed_word (tessword, bln_word);  delete bln_word;  delete_word(tessword);  //get rid of it  //   tprintf( "SOURCE BLOBS-AFTER TESS:\n");  //   print_boxes( init_word );  //   tprintf( "OUTPUT BLOBS:\n");  //   print_boxes( word->outword );  initial_it.set_to_list (init_word->blob_list ());  init_blobs_left = initial_it.length ();  outword_it.set_to_list (word->outword->blob_list ());  for (outword_it.mark_cycle_pt ();  !outword_it.cycled_list (); outword_it.forward ()) {    out_box = outword_it.data ()->bounding_box ();    /* Skip any initial blobs LEFT of current outword blob */    while (!initial_it.at_last () &&    (initial_it.data ()->bounding_box ().left () < out_box.left ())) {      initial_it.forward ();      init_blobs_left--;    }    /* See if current outword blob matches any initial blob with the same left      coord. 
(Normally only one but possibly more - in unknown order) */    i = 0;    matched = FALSE;    do {      test_blob = initial_it.data_relative (i++);      matched = crude_match_blobs (test_blob, outword_it.data ());      if (matched &&        (count_outline_errs (word->best_choice->string ()[j],        outword_it.data ()->out_list ()->length ())      == 0)) {        (*match_count)++;        if (word->reject_map[j].accepted ())          (*accepted_match_count)++;      }    }    while (!matched &&      (init_blobs_left - i > 0) &&      (i < 129) &&      !initial_it.at_last () &&      test_blob->bounding_box ().left () == out_box.left ());    j++;  }  delete init_word;}/************************************************************************* * unrej_good_chs() * Unreject POTENTIAL rejects if the blob passes the blob and outline checks *************************************************************************/void unrej_good_chs(WERD_RES *word, ROW *row) {   WERD *bln_word;                //BL norm init word  TWERD *tessword;               //tess format  WERD *init_word;               //BL norm init word  PBLOB_IT outword_it;  PBLOB_IT initial_it;  INT16 i;  INT16 init_blobs_left;  BOOL8 matched;  BOX out_box;  PBLOB *test_blob;  DENORM denorm;  float bln_xht;  INT16 j = 0;  if (word->word->gblob_list ()->empty ())    return;                                 //xht used for blnorm  bln_xht = bln_x_height / word->denorm.scale ();  bln_word = make_bln_copy (word->word, row, bln_xht, &denorm);  /*    NOTE: Need to convert to tess format and back again to ensure that the    same float -> int rounding of coords is done to source wd as out wd before    comparison  */  tessword = make_tess_word (bln_word, NULL);                                 //convert word  init_word = make_ed_word (tessword, bln_word);  delete bln_word;
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?
docqual.cpp

docqual.cpp - 源码说明

⌨️ 快捷键说明