📄 control.cpp
字号:
/******************************************************************
 * File:        control.cpp  (Formerly control.c)
 * Description: Module-independent matcher controller.
 * Author:      Ray Smith
 * Created:     Thu Apr 23 11:09:58 BST 1992
 * ReHacked:    Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

// NOTE(review): the line structure below was restored from a copy in which
// all newlines had been collapsed; where each `//` comment ends was inferred
// from the text - confirm against the upstream file.

#include "mfcpch.h"
#include "mainblk.h"
#include <string.h>
#include <math.h>
#ifdef __UNIX__
#include <assert.h>
#include <unistd.h>
#include <errno.h>
#endif
#include <ctype.h>
#include "ocrclass.h"
#include "werdit.h"
#include "drawfx.h"
#include "tfacep.h"
#include "tessbox.h"
#include "tessvars.h"
//#include "fxtop.h"
#include "pgedit.h"
#include "reject.h"
#include "adaptions.h"
#include "charcut.h"
#include "fixxht.h"
#include "fixspace.h"
#include "genblob.h"
#include "docqual.h"
#include "control.h"
#include "secname.h"
#include "output.h"
#include "callcpp.h"
#include "notdll.h"
#include "tordvars.h"
#include "adaptmatch.h"

#define MIN_FONT_ROW_COUNT 8
#define MAX_XHEIGHT_DIFF 3

// EXTERN expands to nothing here, so every "EXTERN xxx_VAR" line below is a
// plain use of the xxx_VAR macro in this translation unit.
#define EXTERN
//extern "C" {
//EXTERN BOOL_VAR(tessedit_small_match,FALSE,"Use small matrix matcher");
//extern FILE* matcher_fp;
//extern FILE* correct_fp;
//};
BOOL_VAR (tessedit_small_match, FALSE, "Use small matrix matcher");
EXTERN BOOL_VAR (tessedit_print_text, FALSE, "Write text to stdout");
EXTERN BOOL_VAR (tessedit_draw_words, FALSE, "Draw source words");
EXTERN BOOL_VAR (tessedit_draw_outwords, FALSE, "Draw output words");
EXTERN BOOL_VAR (tessedit_training_wiseowl, FALSE, "Call WO to learn blobs");
EXTERN BOOL_VAR (tessedit_training_tess, FALSE, "Call Tess to learn blobs");
EXTERN BOOL_VAR (tessedit_matcher_is_wiseowl, FALSE, "Call WO to classify");
EXTERN BOOL_VAR (tessedit_dump_choices, FALSE, "Dump char choices");
EXTERN BOOL_VAR (tessedit_fix_fuzzy_spaces, TRUE,"Try to improve fuzzy spaces");
EXTERN BOOL_VAR (tessedit_unrej_any_wd, FALSE,"Dont bother with word plausibility");
EXTERN BOOL_VAR (tessedit_fix_hyphens, TRUE, "Crunch double hyphens?");
EXTERN BOOL_VAR (tessedit_reject_fullstops, FALSE, "Reject all fullstops");
EXTERN BOOL_VAR (tessedit_reject_suspect_fullstops, FALSE,"Reject suspect fullstops");
EXTERN BOOL_VAR (tessedit_redo_xheight, TRUE, "Check/Correct x-height");
EXTERN BOOL_VAR (tessedit_cluster_adaption_on, TRUE,"Do our own adaption - ems only");
EXTERN BOOL_VAR (tessedit_enable_doc_dict, TRUE,"Add words to the document dictionary");
EXTERN BOOL_VAR (word_occ_first, FALSE, "Do word occ before re-est xht");
EXTERN BOOL_VAR (tessedit_debug_fonts, FALSE, "Output font info per char");
EXTERN BOOL_VAR (tessedit_xht_fiddles_on_done_wds, TRUE,"Apply xht fix up even if done");
EXTERN BOOL_VAR (tessedit_xht_fiddles_on_no_rej_wds, TRUE,"Apply xht fix up even in no rejects");
EXTERN INT_VAR (x_ht_check_word_occ, 2, "Check Char Block occupancy");
EXTERN INT_VAR (x_ht_stringency, 1, "How many confirmed a/n to accept?");
EXTERN BOOL_VAR (x_ht_quality_check, TRUE, "Dont allow worse quality");
EXTERN BOOL_VAR (tessedit_debug_block_rejection, FALSE,"Block and Row stats");
EXTERN INT_VAR (debug_x_ht_level, 0, "Reestimate debug");
EXTERN BOOL_VAR (rej_use_xht, TRUE, "Individual rejection control");
EXTERN BOOL_VAR (debug_acceptable_wds, FALSE, "Dump word pass/fail chk");

/* Punctuation character sets */
EXTERN STRING_VAR (chs_leading_punct, "('`\"", "Leading punctuation");
EXTERN STRING_VAR (chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation");
EXTERN STRING_VAR (chs_trailing_punct2, ")'`\"","2nd Trailing punctuation");

/* Document quality thresholds */
EXTERN double_VAR (quality_rej_pc, 0.08,"good_quality_doc lte rejection limit");
EXTERN double_VAR (quality_blob_pc, 0.0,"good_quality_doc gte good blobs limit");
EXTERN double_VAR (quality_outline_pc, 1.0,"good_quality_doc lte outline error limit");
EXTERN double_VAR (quality_char_pc, 0.95,"good_quality_doc gte good char limit");
EXTERN INT_VAR (quality_min_initial_alphas_reqd, 2,"alphas in a good word");

/* Adaption controls */
EXTERN BOOL_VAR (tessedit_tess_adapt_to_rejmap, FALSE,"Use reject map to control Tesseract adaption");
EXTERN INT_VAR (tessedit_tess_adaption_mode, 0x27,"Adaptation decision algorithm for tess");
EXTERN INT_VAR (tessedit_em_adaption_mode, 0,"Adaptation decision algorithm for ems matrix matcher");
EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass1, FALSE,"Adapt using clusterer after pass 1");
// NOTE(review): the help text of the next two variables repeats
// "after pass 1" although the names say pass2 / pass3 - looks like a
// copy-paste slip; confirm before changing the strings.
EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass2, FALSE,"Adapt using clusterer after pass 1");
EXTERN BOOL_VAR (tessedit_cluster_adapt_after_pass3, FALSE,"Adapt using clusterer after pass 1");
EXTERN BOOL_VAR (tessedit_cluster_adapt_before_pass1, FALSE,"Adapt using clusterer before Tess adaping during pass 1");
EXTERN INT_VAR (tessedit_cluster_adaption_mode, 0,"Adaptation decision algorithm for matrix matcher");
EXTERN BOOL_VAR (tessedit_adaption_debug, FALSE,"Generate and print debug information for adaption");
EXTERN BOOL_VAR (tessedit_minimal_rej_pass1, FALSE,"Do minimal rejection on pass 1 output");
EXTERN BOOL_VAR (tessedit_test_adaption, FALSE,"Test adaption criteria");
EXTERN BOOL_VAR (tessedit_global_adaption, FALSE,"Adapt to all docs over time");
EXTERN BOOL_VAR (tessedit_matcher_log, FALSE, "Log matcher activity");
EXTERN INT_VAR (tessedit_test_adaption_mode, 3,"Adaptation decision algorithm for tess");

/* Debug test point */
EXTERN BOOL_VAR (test_pt, FALSE, "Test for point");
EXTERN double_VAR (test_pt_x, 99999.99, "xcoord");
EXTERN double_VAR (test_pt_y, 99999.99, "ycoord");

/* Debug flags defined in other modules */
extern int MatcherDebugLevel;
extern int display_ratings;
extern int
number_debug;extern int adjust_debug;/*extern "C" { extern int MatcherDebugLevel; extern int display_ratings; extern int number_debug; extern int adjust_debug;// extern int LearningDebugLevel; };*/FILE *choice_file = NULL; //Choice file ptrCLISTIZEH (PBLOB) CLISTIZE (PBLOB)/* DEBUGGING */INT16 blob_count(WERD *w) { return w->blob_list ()->length ();}/********************************************************************** * recog_pseudo_word * * Make a word from the selected blobs and run Tess on them. **********************************************************************/void recog_pseudo_word( //recognize blobs BLOCK_LIST *block_list, //blocks to check BOX &selection_box) { WERD *word; ROW *pseudo_row; //row of word BLOCK *pseudo_block; //block of word word = make_pseudo_word (block_list, selection_box, pseudo_block, pseudo_row); if (word != NULL) { recog_interactive(pseudo_block, pseudo_row, word); delete word; }}/********************************************************************** * recog_interactive * * Recognize a single word in interactive mode. 
**********************************************************************/BOOL8 recog_interactive( //recognize blobs BLOCK *, //block ROW *row, //row of word WERD *word //word to recognize ) { WERD_RES word_res(word); INT16 char_qual; INT16 good_char_qual; classify_word_pass2(&word_res, row); #ifndef SECURE_NAMES if (tessedit_debug_quality_metrics) { word_char_quality(&word_res, row, &char_qual, &good_char_qual); tprintf ("\n%d chars; word_blob_quality: %d; outline_errs: %d; char_quality: %d; good_char_quality: %d\n", word_res.reject_map.length (), word_blob_quality (&word_res, row), word_outline_errs (&word_res), char_qual, good_char_qual); } #endif return TRUE;}/********************************************************************** * recog_all_words() * * Walk the current block list applying the specified word processor function * to all words **********************************************************************/void recog_all_words( //process words PAGE_RES *page_res, //page structure volatile ETEXT_DESC *monitor //progress monitor ) { //reset page iterator PAGE_RES_IT page_res_it(page_res); INT16 chars_in_word; INT16 rejects_in_word; CHAR_SAMPLES_LIST em_clusters; CHAR_SAMPLE_LIST ems_waiting; CHAR_SAMPLES_LIST char_clusters; CHAR_SAMPLE_LIST chars_waiting; INT16 blob_quality = 0; INT16 outline_errs = 0; INT16 doc_blob_quality = 0; INT16 doc_outline_errs = 0; INT16 doc_char_quality = 0; INT16 all_char_quality; INT16 accepted_all_char_quality; INT16 good_char_count = 0; INT16 doc_good_char_quality = 0; const STRING *wordstr; const char *text; int i; BOOL8 good_quality_doc; UINT8 permuter_type; INT32 tess_adapt_mode = 0; INT32 word_count; //count of words in doc INT32 word_index; //current word if (tessedit_minimal_rej_pass1) { tessedit_test_adaption.set_value (TRUE); tessedit_minimal_rejection.set_value (TRUE); } if (tessedit_cluster_adapt_before_pass1) { tess_adapt_mode = tessedit_tess_adaption_mode; tessedit_tess_adaption_mode.set_value (0); 
tessedit_tess_adapt_to_rejmap.set_value (TRUE); } /* Pass 1 */ word_count = 0; if (monitor != NULL) { monitor->ocr_alive = TRUE; while (page_res_it.word () != NULL) { word_count++; page_res_it.forward (); } page_res_it.restart_page (); } else word_count = 1; word_index = 0; int dict_words = 0; while (page_res_it.word () != NULL) { set_global_loc_code(LOC_PASS1); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 30 + 50 * word_index / word_count; if ((monitor->end_time != 0 && clock() > monitor->end_time) || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, dict_words))) return; } classify_word_pass1 (page_res_it.word (), page_res_it.row ()->row, FALSE, NULL, NULL); if (tessedit_test_adaption && !tessedit_minimal_rejection) { if (!word_adaptable (page_res_it.word (), tessedit_test_adaption_mode)) page_res_it.word ()->reject_map.rej_word_tess_failure (); //FAKE PERM REJ else { wordstr = &(page_res_it.word ()->best_choice->string ()); /* Override rejection mechanisms for this word */ text = wordstr->string (); for (i = 0; text[i] != '\0'; i++) { if ((text[i] != ' ') && page_res_it.word ()->reject_map[i].rejected ()) page_res_it.word ()->reject_map[i]. setrej_minimal_rej_accept(); } } } if ((tessedit_cluster_adapt_after_pass1 || tessedit_cluster_adapt_after_pass3 || tessedit_cluster_adapt_before_pass1) && tessedit_cluster_adaption_mode != 0) { collect_characters_for_adaption (page_res_it.word (), &char_clusters, &chars_waiting); } // Count dict words. 
if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM) ++dict_words; page_res_it.forward (); } if (tessedit_cluster_adapt_before_pass1) tessedit_tess_adaption_mode.set_value (tess_adapt_mode); page_res_it.restart_page (); while ((tessedit_cluster_adapt_after_pass1 || tessedit_cluster_adapt_before_pass1) && page_res_it.word () != NULL) { if (monitor != NULL) monitor->ocr_alive = TRUE; if (tessedit_cluster_adapt_after_pass1) adapt_to_good_samples (page_res_it.word (), &char_clusters, &chars_waiting); else classify_word_pass1 (page_res_it.word (), page_res_it.row ()->row, TRUE, &char_clusters, &chars_waiting); page_res_it.forward (); } /* Pass 2 */ page_res_it.restart_page (); word_index = 0; while (!tessedit_test_adaption && page_res_it.word () != NULL) { set_global_loc_code(LOC_PASS2); word_index++; if (monitor != NULL) { monitor->ocr_alive = TRUE; monitor->progress = 80 + 10 * word_index / word_count; if ((monitor->end_time != 0 && clock() > monitor->end_time) || (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this, dict_words))) return; } classify_word_pass2 (page_res_it.word (), page_res_it.row ()->row); if (tessedit_em_adaption_mode > 0) collect_ems_for_adaption (page_res_it.word (), &em_clusters, &ems_waiting); if (tessedit_cluster_adapt_after_pass2 && tessedit_cluster_adaption_mode != 0) collect_characters_for_adaption (page_res_it.word (), &char_clusters, &chars_waiting); page_res_it.forward (); } /* Another pass */ set_global_loc_code(LOC_FUZZY_SPACE); if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces && !tessedit_word_for_word) fix_fuzzy_spaces(monitor, word_count, page_res); if (!tessedit_test_adaption && tessedit_em_adaption_mode != 0) // Initially ems only print_em_stats(&em_clusters, &ems_waiting); /* Pass 3 - used for checking confusion sets */ page_res_it.restart_page (); word_index = 0; while (!tessedit_test_adaption && page_res_it.word () != NULL) { set_global_loc_code(LOC_MM_ADAPT); word_index++; if (monitor != 
NULL) { monitor->ocr_alive = TRUE; monitor->progress = 95 + 5 * word_index / word_count; } check_debug_pt (page_res_it.word (), 70); /* Use good matches to sort out confusions */ if (tessedit_em_adaption_mode != 0) adapt_to_good_ems (page_res_it.word (), &em_clusters, &ems_waiting); if (tessedit_cluster_adapt_after_pass2 && tessedit_cluster_adaption_mode != 0) adapt_to_good_samples (page_res_it.word (), &char_clusters, &chars_waiting); if (tessedit_reject_fullstops && strchr (page_res_it.word ()->best_choice->string ().string (), '.') != NULL) reject_all_fullstops (page_res_it.word ()); else if (tessedit_reject_suspect_fullstops && strchr (page_res_it.word ()->best_choice->string (). string (), '.') != NULL)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -