📄 adaptions.cpp
字号:
/**********************************************************************
 * File:        adaptions.cpp  (Formerly adaptions.c)
 * Description: Functions used to adapt to blobs already confidently
 *              identified
 * Author:      Chris Newton
 * Created:     Thu Oct  7 10:17:28 BST 1993
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#ifdef __UNIX__
#include <assert.h>
#endif
#include <ctype.h>
#include <string.h>
#include "tessbox.h"
#include "tessvars.h"
#include "memry.h"
#include "mainblk.h"
#include "charcut.h"
#include "imgs.h"
#include "scaleimg.h"
#include "reject.h"
#include "control.h"
#include "adaptions.h"
#include "stopper.h"
#include "charsample.h"
#include "matmatch.h"
#include "secname.h"

INT32 demo_word = 0;

#define EXTERN

EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");

EXTERN double_VAR (tessedit_cluster_t1, 0.20,
"t1 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t2, 0.40,
"t2 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t3, 0.12,
"Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
"Largest fraction of characters in cluster for it to be used for adaption");
EXTERN INT_VAR (tessedit_cluster_min_size, 3,
"Smallest number of samples in a cluster for it to be used for adaption");
EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
"Generate and print debug information for adaption by clustering");
EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
"Use best sample from cluster when adapting");
EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
"Set reject map to enable cluster input to be measured");

EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
"Don't try to adapt to characters on this list");
EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
"Characters to be avoided when adapting");
EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
"Use prototypes when adapting");
EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
"Use prototypes as clusters are built");
EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
"Adapt to characters using reject map");
EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
"Adapt to all characters using, matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
"Only match samples against clusters for the same character");
EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");

EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
"Display cut images and matrix match for demo purposes");
EXTERN INT_VAR (tessedit_demo_word1, 62,
"Word number of first word to display");
EXTERN INT_VAR (tessedit_demo_word2, 64,
"Word number of second word to display");
EXTERN STRING_VAR (tessedit_demo_file, "academe",
"Name of document containing demo words");

/**********************************************************************
 * word_adaptable
 *
 * Decide whether a recognised word may be used for adaption.
 *
 * word: the word result to examine.
 * mode: bit set whose bit positions are given by the local MODES enum.
 *       A value of 0 means "no adaption" and always yields FALSE.
 *
 * The ADAPTABLE_WERD and ACCEPTABLE_WERD bits select which of the word's
 * tess_would_adapt / tess_accepted flags are OR-ed into the result.  If
 * neither selected flag is set the answer is FALSE.  The remaining bits
 * enable checks that can only veto the word (turn the answer to FALSE).
 *
 * Returns the accumulated status, or FALSE if any enabled check fails.
 **********************************************************************/
BOOL8 word_adaptable(  //should we adapt?
                     WERD_RES *word,
                     UINT16 mode) {
  BOOL8 status = FALSE;
  BITS16 flags(mode);

  // Bit positions within mode.
  enum MODES
  {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /*
  0: NO adaption
  */
  if (mode == 0) {
    return FALSE;
  }

  if (flags.bit (ADAPTABLE_WERD)) {
    status |= word->tess_would_adapt;
  }

  if (flags.bit (ACCEPTABLE_WERD)) {
    status |= word->tess_accepted;
  }

  if (!status) {                 // If not set then
    return FALSE;                // ignore other checks
  }

  // Veto unless the best choice came from one of the listed permuters.
  if (flags.bit (CHECK_DAWGS) &&
      (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
      (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
      (word->best_choice->permuter () != USER_DAWG_PERM) &&
      (word->best_choice->permuter () != NUMBER_PERM)) {
    return FALSE;
  }

  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
    return FALSE;
  }

  // Veto if the best choice string contains a space.
  if (flags.bit (CHECK_SPACES) &&
      (strchr (word->best_choice->string ().string (), ' ') != NULL)) {
    return FALSE;
  }

  //  if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
  if (flags.bit (CHECK_AMBIG_WERD) &&
      !NoDangerousAmbig(word->best_choice->string().string(), NULL)) {
    return FALSE;
  }

  return status;
}


/**********************************************************************
 * collect_ems_for_adaption
 *
 * Find the 'm' characters (and "rn" pairs) in an adaptable, reject-free
 * word, build a CHAR_SAMPLE for each one and hand it to cluster_sample().
 *
 * word:          the word result to take samples from.
 * char_clusters: cluster list passed through to cluster_sample().
 * chars_waiting: waiting list passed through to cluster_sample().
 *
 * Does nothing when tessedit_reject_ems or tessedit_reject_suspect_ems is
 * set, or when the word's bounding box is taller than a third of the page
 * image resolution.
 **********************************************************************/
void collect_ems_for_adaption(WERD_RES *word,
                              CHAR_SAMPLES_LIST *char_clusters,
                              CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  INT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  BOX pix_box;                   // box of imlines extent
  WERD copy_outword;             // copy to denorm
  PBLOB_IT copy_blob_it;
  OUTLINE_IT copy_outline_it;
  INT32 resolution = page_image.get_res ();

  if (tessedit_reject_ems || tessedit_reject_suspect_ems) {
    return;                      // Do nothing
  }

  if (word->word->bounding_box ().height () > resolution / 3) {
    return;
  }

  if (tessedit_demo_adaption) {
    // Make sure not set
    tessedit_display_mm.set_value (FALSE);
  }

  if (word_adaptable (word, tessedit_em_adaption_mode)
      && word->reject_map.reject_count () == 0
      && (strchr (word->best_choice->string ().string (), 'm') != NULL
          || (tessedit_process_rns
              && strstr (word->best_choice->string ().string (),
                         "rn") != NULL))) {
    if (tessedit_process_rns
        && strstr (word->best_choice->string ().string (), "rn") != NULL) {
      // Work on a copy of the word in which the blobs of every "rn" pair
      // are merged: the outlines of the following blob are appended to
      // the current blob and the following blob is deleted.
      copy_outword = *(word->outword);
      copy_blob_it.set_to_list (copy_outword.blob_list ());
      i = 0;
      while (word->best_choice->string ()[i] != '\0') {
        if (word->best_choice->string ()[i] == 'r'
            && word->best_choice->string ()[i + 1] == 'n') {
          copy_outline_it.set_to_list (copy_blob_it.data ()->
                                       out_list ());
          copy_outline_it.add_list_after (copy_blob_it.
                                          data_relative (1)->
                                          out_list ());
          copy_blob_it.forward ();
          delete (copy_blob_it.extract ());
          i++;
        }
        copy_blob_it.forward ();
        i++;
      }
    }
    else {
      copy_outword = *(word->outword);
    }

    copy_outword.baseline_denormalise (&word->denorm);
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

    blob_it.move_to_first ();
    // NOTE(review): the "rn" test below is not gated on
    // tessedit_process_rns, although the blob merge above is, and blob_it
    // walks the original (unmerged) outword.  Skipping the 'n' advances i
    // twice but each iterator only once -- confirm this is intended.
    for (i = 0;
         word->best_choice->string ()[i] != '\0';
         i++, pixrow_it.forward (), blob_it.forward ()) {

      if (word->best_choice->string ()[i] == 'm'
          || (word->best_choice->string ()[i] == 'r'
              && word->best_choice->string ()[i + 1] == 'n')) {
#ifndef SECURE_NAMES
        if (tessedit_cluster_debug) {
          tprintf ("Sample %c for adaption found in %s, index %d\n",
                   word->best_choice->string ()[i],
                   word->best_choice->string ().string (), i);
        }
#endif
        if (tessedit_matrix_match) {
          sample = clip_sample (pixrow_it.data (),
                                imlines,
                                pix_box,
                                copy_outword.flag (W_INVERSE),
                                word->best_choice->string ()[i]);

          if (sample == NULL) {  //Clip failed
#ifndef SECURE_NAMES
            tprintf ("Unable to clip sample from %s, index %d\n",
                     word->best_choice->string ().string (), i);
#endif
            if (word->best_choice->string ()[i] == 'r') {
              i++;               // Skip next character
            }

            continue;
          }
        }
        else {
          sample = new CHAR_SAMPLE (blob_it.data (),
                                    &word->denorm,
                                    word->best_choice->string ()[i]);
        }

        cluster_sample(sample, char_clusters, chars_waiting);

        if (word->best_choice->string ()[i] == 'r') {
          i++;                   // Skip next character
        }
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;
  }
}


/**********************************************************************
 * collect_characters_for_adaption
 *
 * Clip a sample for each character of the word's best choice and hand it
 * to cluster_sample().  Samples are taken when the word is adaptable and
 * has no rejects, or whenever tessedit_mm_use_rejmap is set.
 *
 * word:          the word result to take samples from.
 * char_clusters: cluster list passed through to cluster_sample().
 * chars_waiting: waiting list passed through to cluster_sample().
 *
 * Does nothing when the word's bounding box is taller than a third of the
 * page image resolution.  When tessedit_test_cluster_input is set (and
 * tessedit_mm_use_rejmap is not) no samples are collected; instead a word
 * that fails the adaptability test has its reject map set via
 * rej_word_tess_failure().
 **********************************************************************/
void collect_characters_for_adaption(WERD_RES *word,
                                     CHAR_SAMPLES_LIST *char_clusters,
                                     CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  INT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  BOX pix_box;                   // box of imlines extent
  WERD copy_outword;             // copy to denorm
  INT32 resolution = page_image.get_res ();

  if (word->word->bounding_box ().height () > resolution / 3) {
    return;
  }

  if (tessedit_demo_adaption) {
    // Make sure not set
    tessedit_display_mm.set_value (FALSE);
  }

  if ((word_adaptable (word, tessedit_cluster_adaption_mode)
       && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
    if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap) {
      return;                    // Reject map set to acceptable
    }
    /* Collect information about good matches */
    copy_outword = *(word->outword);
    copy_outword.baseline_denormalise (&word->denorm);
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

    blob_it.move_to_first ();
    for (i = 0;
         word->best_choice->string ()[i] != '\0';
         i++, pixrow_it.forward (), blob_it.forward ()) {

      // Take the character unless the non-adaption set is in use and
      // contains it; with tessedit_mm_use_rejmap, a character accepted by
      // the reject map is taken regardless.
      if (!(tessedit_mm_use_non_adaption_set
            && STRING (tessedit_non_adaption_set).contains (word->
                                                            best_choice->
                                                            string ()[i]))
          || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
#ifndef SECURE_NAMES
        if (tessedit_cluster_debug) {
          tprintf ("Sample %c for adaption found in %s, index %d\n",
                   word->best_choice->string ()[i],
                   word->best_choice->string ().string (), i);
        }
#endif
        sample = clip_sample (pixrow_it.data (),
                              imlines,
                              pix_box,
                              copy_outword.flag (W_INVERSE),
                              word->best_choice->string ()[i]);
        if (sample == NULL) {    //Clip failed
#ifndef SECURE_NAMES
          tprintf ("Unable to clip sample from %s, index %d\n",
                   word->best_choice->string ().string (), i);
#endif
          continue;
        }
        cluster_sample(sample, char_clusters, chars_waiting);
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;
  }
  else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap) {
    // Set word to all rejects
    word->reject_map.rej_word_tess_failure ();
  }
}


void cluster_sample(CHAR_SAMPLE *sample,
                    CHAR_SAMPLES_LIST *char_clusters,
                    CHAR_SAMPLE_LIST *chars_waiting) {
  CHAR_SAMPLES *best_cluster = NULL;
  CHAR_SAMPLES_IT c_it = char_clusters;
  CHAR_SAMPLE_IT cw_it = chars_waiting;
  float score;
  float best_score = MAX_INT32;

  if (c_it.empty ())
    c_it.add_to_end (new CHAR_SAMPLES (sample));
  else {
    for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
      score = c_it.data ()->match_score (sample);
      if (score < best_score) {
        best_score = score;
        best_cluster = c_it.data ();
      }
    }

    if (tessedit_cluster_debug)
      tprintf ("Sample's best score %f\n", best_score);

    if (best_score < tessedit_cluster_t1) {
      if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
        best_cluster->add_sample (sample);
        check_wait_list(chars_waiting, sample, best_cluster);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -