📄 docqual.cpp
字号:
/******************************************************************
 * File:        docqual.cpp  (Formerly docqual.c)
 * Description: Document Quality Metrics
 * Author:      Phil Cheatle
 * Created:     Mon May 9 11:27:28 BST 1994
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#include <ctype.h>
#include "docqual.h"
#include "tstruct.h"
#include "tfacep.h"
#include "reject.h"
#include "tessvars.h"
#include "genblob.h"
#include "secname.h"

#define EXTERN

/* Tunable variables. Each declaration lists a name, a value and a
   description string. */

/* Outline-count expectations per character. */
EXTERN STRING_VAR (outlines_odd, "%| ", "Non standard number of outlines");
EXTERN STRING_VAR (outlines_2, "ij!?%\":;",
"Non standard number of outlines");
EXTERN BOOL_VAR (docqual_excuse_outline_errs, FALSE,
"Allow outline errs in unrejection?");

/* Document / block / row rejection controls. */
EXTERN BOOL_VAR (tessedit_good_quality_unrej, TRUE,
"Reduce rejection on good docs");
EXTERN BOOL_VAR (tessedit_use_reject_spaces, TRUE, "Reject spaces?");
EXTERN double_VAR (tessedit_reject_doc_percent, 65.00,
"%rej allowed before rej whole doc");
EXTERN double_VAR (tessedit_reject_block_percent, 45.00,
"%rej allowed before rej whole block");
EXTERN double_VAR (tessedit_reject_row_percent, 40.00,
"%rej allowed before rej whole row");
EXTERN double_VAR (tessedit_whole_wd_rej_row_percent, 70.00,
"%of row rejects in whole word rejects which prevents whole row rejection");
EXTERN BOOL_VAR (tessedit_preserve_blk_rej_perfect_wds, TRUE,
"Only rej partially rejected words in block rejection");
EXTERN BOOL_VAR (tessedit_preserve_row_rej_perfect_wds, TRUE,
"Only rej partially rejected words in row rejection");
EXTERN BOOL_VAR (tessedit_dont_blkrej_good_wds, FALSE,
"Use word segmentation quality metric");
EXTERN BOOL_VAR (tessedit_dont_rowrej_good_wds, FALSE,
"Use word segmentation quality metric");
EXTERN INT_VAR (tessedit_preserve_min_wd_len, 2,
"Only preserve wds longer than this");
EXTERN BOOL_VAR (tessedit_row_rej_good_docs, TRUE,
"Apply row rejection to good docs");
EXTERN double_VAR (tessedit_good_doc_still_rowrej_wd, 1.1,
"rej good doc wd if more than this fraction rejected");
EXTERN BOOL_VAR (tessedit_reject_bad_qual_wds, TRUE,
"Reject all bad quality wds");
EXTERN BOOL_VAR (tessedit_debug_doc_rejection, FALSE, "Page stats");
EXTERN BOOL_VAR (tessedit_debug_quality_metrics, FALSE,
"Output data to debug file");
EXTERN BOOL_VAR (bland_unrej, FALSE, "unrej potential with no chekcs");
EXTERN double_VAR (quality_rowrej_pc, 1.1,
"good_quality_doc gte good char limit");

/* Word crunching controls. */
EXTERN BOOL_VAR (unlv_tilde_crunching, TRUE,
"Mark v.bad words for tilde crunch");
EXTERN BOOL_VAR (crunch_early_merge_tess_fails, TRUE, "Before word crunch?");
EXTERN BOOL_EVAR (crunch_early_convert_bad_unlv_chs, FALSE,
"Take out ~^ early?");
EXTERN double_VAR (crunch_terrible_rating, 80.0, "crunch rating lt this");
EXTERN BOOL_VAR (crunch_terrible_garbage, TRUE, "As it says");
EXTERN double_VAR (crunch_poor_garbage_cert, -9.0,
"crunch garbage cert lt this");
EXTERN double_VAR (crunch_poor_garbage_rate, 60,
"crunch garbage rating lt this");
EXTERN double_VAR (crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this");
EXTERN BOOL_VAR (crunch_pot_garbage, TRUE, "POTENTIAL crunch garbage");
EXTERN double_VAR (crunch_del_rating, 60, "POTENTIAL crunch rating lt this");
EXTERN double_VAR (crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this");
EXTERN double_VAR (crunch_del_min_ht, 0.7, "Del if word ht lt xht x this");
EXTERN double_VAR (crunch_del_max_ht, 3.0, "Del if word ht gt xht x this");
EXTERN double_VAR (crunch_del_min_width, 3.0,
"Del if word width lt xht x this");
EXTERN double_VAR (crunch_del_high_word, 1.5,
"Del if word gt xht x this above bl");
EXTERN double_VAR (crunch_del_low_word, 0.5,
"Del if word gt xht x this below bl");
EXTERN double_VAR (crunch_small_outlines_size, 0.6, "Small if lt xht x this");
EXTERN INT_VAR (crunch_rating_max, 10, "For adj length in rating per ch");
EXTERN INT_VAR (crunch_pot_indicators, 1,
"How many potential indicators needed");
EXTERN BOOL_VAR (crunch_leave_ok_strings, TRUE,
"Dont touch sensible strings");
EXTERN BOOL_VAR (crunch_accept_ok, TRUE, "Use acceptability in okstring");
EXTERN BOOL_VAR (crunch_leave_accept_strings, FALSE,
"Dont pot crunch sensible strings");
EXTERN BOOL_VAR (crunch_include_numerals, FALSE, "Fiddle alpha figures");
EXTERN INT_VAR (crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings");
/* Description previously duplicated the lower-case text of the variable
   above; this one is the upper-case ("uc") counterpart. */
EXTERN INT_VAR (crunch_leave_uc_strings, 4,
"Dont crunch words with long upper case strings");
EXTERN INT_VAR (crunch_long_repetitions, 3,
"Crunch words with long repetitions");
EXTERN INT_VAR (crunch_debug, 0, "As it says");

/*************************************************************************
 * word_blob_quality()
 * How many blobs in the outword are identical to those of the inword?
 * ASSUME blobs in both initial word and outword are in ascending order of
 * left hand blob edge.
*************************************************************************/INT16 word_blob_quality( //Blob seg changes WERD_RES *word, ROW *row) { WERD *bln_word; //BL norm init word TWERD *tessword; //tess format WERD *init_word; //BL norm init word PBLOB_IT outword_it; PBLOB_IT initial_it; INT16 i; INT16 init_blobs_left; INT16 match_count = 0; BOOL8 matched; BOX out_box; PBLOB *test_blob; DENORM denorm; float bln_xht; if (word->word->gblob_list ()->empty ()) return 0; //xht used for blnorm bln_xht = bln_x_height / word->denorm.scale (); bln_word = make_bln_copy (word->word, row, bln_xht, &denorm); /* NOTE: Need to convert to tess format and back again to ensure that the same float -> int rounding of coords is done to source wd as out wd before comparison */ // if (!bln_word->flag(W_POLYGON)) // tprintf( "NON POLYGON BLN WERD\n"); tessword = make_tess_word (bln_word, NULL); //convert word init_word = make_ed_word (tessword, bln_word); // if (!init_word->flag(W_POLYGON)) // tprintf( "NON POLYGON INIT WERD\n"); // tprintf( "SOURCE BLOBS-AFTER TESS:\n"); // print_boxes( init_word ); // tprintf( "OUTPUT BLOBS:\n"); // print_boxes( word->outword ); initial_it.set_to_list (init_word->blob_list ()); init_blobs_left = initial_it.length (); outword_it.set_to_list (word->outword->blob_list ()); delete bln_word; delete_word(tessword); //get rid of it for (outword_it.mark_cycle_pt (); !outword_it.cycled_list (); outword_it.forward ()) { out_box = outword_it.data ()->bounding_box (); /* Skip any initial blobs LEFT of current outword blob */ while (!initial_it.at_last () && (initial_it.data ()->bounding_box ().left () < out_box.left ())) { initial_it.forward (); init_blobs_left--; } /* See if current outword blob matches any initial blob with the same left coord. 
(Normally only one but possibly more - in unknown order) */ i = 0; matched = FALSE; do { test_blob = initial_it.data_relative (i++); matched = crude_match_blobs (test_blob, outword_it.data ()); if (matched) match_count++; } while (!matched && (init_blobs_left - i > 0) && (i < 129) && !initial_it.at_last () && test_blob->bounding_box ().left () == out_box.left ()); } delete init_word; return match_count;}/************************************************************************* * crude_match_blobs() * Check bounding boxes are the same and the number of outlines are the same. *************************************************************************/BOOL8 crude_match_blobs(PBLOB *blob1, PBLOB *blob2) { BOX box1 = blob1->bounding_box (); BOX box2 = blob2->bounding_box (); if (box1.contains (box2) && box2.contains (box1) && (blob1->out_list ()->length () == blob1->out_list ()->length ())) return TRUE; else return FALSE;}INT16 word_outline_errs( //Outline count errs WERD_RES *word) { PBLOB_IT outword_it; INT16 i = 0; INT16 err_count = 0; outword_it.set_to_list (word->outword->blob_list ()); for (outword_it.mark_cycle_pt (); !outword_it.cycled_list (); outword_it.forward ()) { err_count += count_outline_errs (word->best_choice->string ()[i], outword_it.data ()->out_list ()-> length ()); i++; } return err_count;}/************************************************************************* * word_char_quality() * Combination of blob quality and outline quality - how many good chars are * there? - I.e chars which pass the blob AND outline tests. 
*************************************************************************/void word_char_quality( //Blob seg changes WERD_RES *word, ROW *row, INT16 *match_count, INT16 *accepted_match_count) { WERD *bln_word; //BL norm init word TWERD *tessword; //tess format WERD *init_word; //BL norm init word PBLOB_IT outword_it; PBLOB_IT initial_it; INT16 i; INT16 init_blobs_left; BOOL8 matched; BOX out_box; PBLOB *test_blob; DENORM denorm; float bln_xht; INT16 j = 0; *match_count = 0; *accepted_match_count = 0; if (word->word->gblob_list ()->empty ()) return; //xht used for blnorm bln_xht = bln_x_height / word->denorm.scale (); bln_word = make_bln_copy (word->word, row, bln_xht, &denorm); /* NOTE: Need to convert to tess format and back again to ensure that the same float -> int rounding of coords is done to source wd as out wd before comparison */ tessword = make_tess_word (bln_word, NULL); //convert word init_word = make_ed_word (tessword, bln_word); delete bln_word; delete_word(tessword); //get rid of it // tprintf( "SOURCE BLOBS-AFTER TESS:\n"); // print_boxes( init_word ); // tprintf( "OUTPUT BLOBS:\n"); // print_boxes( word->outword ); initial_it.set_to_list (init_word->blob_list ()); init_blobs_left = initial_it.length (); outword_it.set_to_list (word->outword->blob_list ()); for (outword_it.mark_cycle_pt (); !outword_it.cycled_list (); outword_it.forward ()) { out_box = outword_it.data ()->bounding_box (); /* Skip any initial blobs LEFT of current outword blob */ while (!initial_it.at_last () && (initial_it.data ()->bounding_box ().left () < out_box.left ())) { initial_it.forward (); init_blobs_left--; } /* See if current outword blob matches any initial blob with the same left coord. 
(Normally only one but possibly more - in unknown order) */ i = 0; matched = FALSE; do { test_blob = initial_it.data_relative (i++); matched = crude_match_blobs (test_blob, outword_it.data ()); if (matched && (count_outline_errs (word->best_choice->string ()[j], outword_it.data ()->out_list ()->length ()) == 0)) { (*match_count)++; if (word->reject_map[j].accepted ()) (*accepted_match_count)++; } } while (!matched && (init_blobs_left - i > 0) && (i < 129) && !initial_it.at_last () && test_blob->bounding_box ().left () == out_box.left ()); j++; } delete init_word;}/************************************************************************* * unrej_good_chs() * Unreject POTENTIAL rejects if the blob passes the blob and outline checks *************************************************************************/void unrej_good_chs(WERD_RES *word, ROW *row) { WERD *bln_word; //BL norm init word TWERD *tessword; //tess format WERD *init_word; //BL norm init word PBLOB_IT outword_it; PBLOB_IT initial_it; INT16 i; INT16 init_blobs_left; BOOL8 matched; BOX out_box; PBLOB *test_blob; DENORM denorm; float bln_xht; INT16 j = 0; if (word->word->gblob_list ()->empty ()) return; //xht used for blnorm bln_xht = bln_x_height / word->denorm.scale (); bln_word = make_bln_copy (word->word, row, bln_xht, &denorm); /* NOTE: Need to convert to tess format and back again to ensure that the same float -> int rounding of coords is done to source wd as out wd before comparison */ tessword = make_tess_word (bln_word, NULL); //convert word init_word = make_ed_word (tessword, bln_word); delete bln_word;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -