⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 adaptions.cpp

📁 一OCR的相关资料。希望对研究OCR的朋友有所帮助。
💻 CPP
📖 第 1 页 / 共 3 页
字号:
/**********************************************************************
 * File:        adaptions.cpp  (Formerly adaptions.c)
 * Description: Functions used to adapt to blobs already confidently
 *              identified
 * Author:      Chris Newton
 * Created:     Thu Oct  7 10:17:28 BST 1993
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#ifdef __UNIX__
#include <assert.h>
#endif
#include <ctype.h>
#include <string.h>
#include "tessbox.h"
#include "tessvars.h"
#include "memry.h"
#include "mainblk.h"
#include "charcut.h"
#include "imgs.h"
#include "scaleimg.h"
#include "reject.h"
#include "control.h"
#include "adaptions.h"
#include "stopper.h"
#include "charsample.h"
#include "matmatch.h"
#include "secname.h"

INT32 demo_word = 0;

#define EXTERN

EXTERN BOOL_VAR (tessedit_reject_ems, FALSE, "Reject all m's");
EXTERN BOOL_VAR (tessedit_reject_suspect_ems, FALSE, "Reject suspect m's");

EXTERN double_VAR (tessedit_cluster_t1, 0.20,
"t1 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t2, 0.40,
"t2 threshold for clustering samples");
EXTERN double_VAR (tessedit_cluster_t3, 0.12,
"Extra threshold for clustering samples, only keep a new sample if best score greater than this value");
EXTERN double_VAR (tessedit_cluster_accept_fraction, 0.80,
"Largest fraction of characters in cluster for it to be used for adaption");
EXTERN INT_VAR (tessedit_cluster_min_size, 3,
"Smallest number of samples in a cluster for it to be used for adaption");
EXTERN BOOL_VAR (tessedit_cluster_debug, FALSE,
"Generate and print debug information for adaption by clustering");
EXTERN BOOL_VAR (tessedit_use_best_sample, FALSE,
"Use best sample from cluster when adapting");
EXTERN BOOL_VAR (tessedit_test_cluster_input, FALSE,
"Set reject map to enable cluster input to be measured");

EXTERN BOOL_VAR (tessedit_matrix_match, TRUE, "Use matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_use_non_adaption_set, FALSE,
"Don't try to adapt to characters on this list");
EXTERN STRING_VAR (tessedit_non_adaption_set, ",.;:'~@*",
"Characters to be avoided when adapting");
EXTERN BOOL_VAR (tessedit_mm_adapt_using_prototypes, TRUE,
"Use prototypes when adapting");
EXTERN BOOL_VAR (tessedit_mm_use_prototypes, TRUE,
"Use prototypes as clusters are built");
EXTERN BOOL_VAR (tessedit_mm_use_rejmap, FALSE,
"Adapt to characters using reject map");
EXTERN BOOL_VAR (tessedit_mm_all_rejects, FALSE,
"Adapt to all characters using, matrix matcher");
EXTERN BOOL_VAR (tessedit_mm_only_match_same_char, FALSE,
"Only match samples against clusters for the same character");
EXTERN BOOL_VAR (tessedit_process_rns, FALSE, "Handle m - rn ambigs");

EXTERN BOOL_VAR (tessedit_demo_adaption, FALSE,
"Display cut images and matrix match for demo purposes");
EXTERN INT_VAR (tessedit_demo_word1, 62,
"Word number of first word to display");
EXTERN INT_VAR (tessedit_demo_word2, 64,
"Word number of second word to display");
EXTERN STRING_VAR (tessedit_demo_file, "academe",
"Name of document containing demo words");

/**
 * word_adaptable
 *
 * Decide whether a recognised word may be used for adaption.
 *
 * @param word  Word result to test; its tess_would_adapt / tess_accepted
 *              flags and its best_choice are read.
 * @param mode  Selector wrapped in a BITS16 and queried with the positions
 *              listed in enum MODES below (presumably one bit per check --
 *              confirm against BITS16::bit). 0 disables adaption outright.
 * @return      FALSE when mode is 0, when neither enabling flag
 *              (ADAPTABLE_WERD / ACCEPTABLE_WERD) yields a true word status,
 *              or when any requested veto check fails; otherwise the
 *              accumulated (non-zero) status.
 */
BOOL8 word_adaptable(  //should we adapt?
                     WERD_RES *word,
                     UINT16 mode) {
  BOOL8 status = FALSE;
  BITS16 flags(mode);

  enum MODES
  {
    ADAPTABLE_WERD,
    ACCEPTABLE_WERD,
    CHECK_DAWGS,
    CHECK_SPACES,
    CHECK_ONE_ELL_CONFLICT,
    CHECK_AMBIG_WERD
  };

  /*
  0: NO adaption
  */
  if (mode == 0) {
    return FALSE;
  }

  // Enabling flags: either one can switch the word on.
  if (flags.bit (ADAPTABLE_WERD))
    status |= word->tess_would_adapt;

  if (flags.bit (ACCEPTABLE_WERD))
    status |= word->tess_accepted;

  if (!status)                   // If not set then
    return FALSE;                // ignore other checks

  // Veto checks: each requested check can only turn the word off.
  if (flags.bit (CHECK_DAWGS) &&
    (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
    (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
    (word->best_choice->permuter () != USER_DAWG_PERM) &&
    (word->best_choice->permuter () != NUMBER_PERM))
    return FALSE;

  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE))
    return FALSE;

  if (flags.bit (CHECK_SPACES) &&
    (strchr (word->best_choice->string ().string (), ' ') != NULL))
    return FALSE;

  if (flags.bit (CHECK_AMBIG_WERD) &&
      !NoDangerousAmbig(word->best_choice->string().string(), NULL))
    return FALSE;

  return status;
}

/**
 * collect_ems_for_adaption
 *
 * Collect samples of 'm' (and, when tessedit_process_rns is set, of "rn"
 * pairs merged into a single blob) from a word and hand each one to
 * cluster_sample().
 *
 * Nothing is collected when either em-rejection variable is set, when the
 * word's bounding box is taller than a third of page_image.get_res(), when
 * the word is not adaptable under tessedit_em_adaption_mode, when its reject
 * map has any rejects, or when it holds no 'm' / eligible "rn".
 *
 * @param word           Word result supplying the blobs and best_choice text.
 * @param char_clusters  Cluster list passed through to cluster_sample().
 * @param chars_waiting  Waiting list passed through to cluster_sample().
 *
 * Fixes relative to the previous revision:
 *  - "rn" is treated as a pair only when the blobs were actually merged
 *    (tessedit_process_rns set). Previously a word such as "government"
 *    skipped a character without skipping an image, so every later sample
 *    was clipped from the wrong blob.
 *  - When a merged "rn" consumes two characters, blob_it (which walks the
 *    unmerged blob list) is advanced twice so it stays aligned with the text.
 */
void collect_ems_for_adaption(WERD_RES *word,
                              CHAR_SAMPLES_LIST *char_clusters,
                              CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  INT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  BOX pix_box;                   // box of imlines extent
  WERD copy_outword;             // copy to denorm
  PBLOB_IT copy_blob_it;
  OUTLINE_IT copy_outline_it;
  INT32 resolution = page_image.get_res ();

  if (tessedit_reject_ems || tessedit_reject_suspect_ems)
    return;                      // Do nothing

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  if (tessedit_demo_adaption)
                                 // Make sure not set
    tessedit_display_mm.set_value (FALSE);

  const STRING &text = word->best_choice->string ();

  // "rn" pairs are merged (and sampled) only when this is true.
  const BOOL8 merge_rns = tessedit_process_rns
    && strstr (text.string (), "rn") != NULL;

  if (!word_adaptable (word, tessedit_em_adaption_mode)
    || word->reject_map.reject_count () != 0
    || (strchr (text.string (), 'm') == NULL && !merge_rns))
    return;

  copy_outword = *(word->outword);
  if (merge_rns) {
    // Fold the outlines of each 'n' that follows an 'r' into the 'r' blob
    // and drop the emptied 'n' blob, so the pair is clipped as one image.
    copy_blob_it.set_to_list (copy_outword.blob_list ());
    i = 0;
    while (text[i] != '\0') {
      if (text[i] == 'r' && text[i + 1] == 'n') {
        copy_outline_it.set_to_list (copy_blob_it.data ()->out_list ());
        copy_outline_it.add_list_after (copy_blob_it.data_relative (1)->
          out_list ());
        copy_blob_it.forward ();
        delete (copy_blob_it.extract ());
        i++;
      }
      copy_blob_it.forward ();
      i++;
    }
  }

  copy_outword.baseline_denormalise (&word->denorm);
  // pixrow_list and imlines are outputs of char_clip_word (they are
  // released at the end of this function).
  char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
  pixrow_it.set_to_list (pixrow_list);
  pixrow_it.move_to_first ();

  blob_it.move_to_first ();
  for (i = 0;
    text[i] != '\0';
  i++, pixrow_it.forward (), blob_it.forward ()) {
    const BOOL8 is_rn = merge_rns && text[i] == 'r' && text[i + 1] == 'n';

    if (text[i] != 'm' && !is_rn)
      continue;

    #ifndef SECURE_NAMES
    if (tessedit_cluster_debug)
      tprintf ("Sample %c for adaption found in %s, index %d\n",
        text[i], text.string (), i);
    #endif

    if (tessedit_matrix_match) {
      sample = clip_sample (pixrow_it.data (),
        imlines,
        pix_box,
        copy_outword.flag (W_INVERSE),
        text[i]);
      if (sample == NULL) {      //Clip failed
        #ifndef SECURE_NAMES
        tprintf ("Unable to clip sample from %s, index %d\n",
          text.string (), i);
        #endif
      }
    }
    else
      sample = new CHAR_SAMPLE (blob_it.data (), &word->denorm, text[i]);

    if (sample != NULL)
      cluster_sample(sample, char_clusters, chars_waiting);

    if (is_rn) {
      // The merged pair used two characters and two original blobs but
      // only one clipped image: skip the 'n' in the text and in blob_it.
      i++;
      blob_it.forward ();
    }
  }

  delete[]imlines;               // Free array of imlines
  delete pixrow_list;
}

/**
 * collect_characters_for_adaption
 *
 * Clip a sample for each eligible character of a word and hand it to
 * cluster_sample().
 *
 * The word is processed when it is adaptable under
 * tessedit_cluster_adaption_mode with an empty reject count, or
 * unconditionally when tessedit_mm_use_rejmap is set. Words whose bounding
 * box is taller than a third of page_image.get_res() are skipped.
 *
 * With tessedit_test_cluster_input set (and tessedit_mm_use_rejmap clear) no
 * samples are collected at all: a qualifying word is left untouched and any
 * other word has its reject map set via rej_word_tess_failure().
 *
 * @param word           Word result supplying the blobs and best_choice text.
 * @param char_clusters  Cluster list passed through to cluster_sample().
 * @param chars_waiting  Waiting list passed through to cluster_sample().
 */
void collect_characters_for_adaption(WERD_RES *word,
                                     CHAR_SAMPLES_LIST *char_clusters,
                                     CHAR_SAMPLE_LIST *chars_waiting) {
  PBLOB_LIST *blobs = word->outword->blob_list ();
  PBLOB_IT blob_it(blobs);
  INT16 i;
  CHAR_SAMPLE *sample;
  PIXROW_LIST *pixrow_list;
  PIXROW_IT pixrow_it;
  IMAGELINE *imlines;            // lines of the image
  BOX pix_box;                   // box of imlines extent
  WERD copy_outword;             // copy to denorm
  INT32 resolution = page_image.get_res ();

  if (word->word->bounding_box ().height () > resolution / 3)
    return;

  if (tessedit_demo_adaption)
                                 // Make sure not set
    tessedit_display_mm.set_value (FALSE);

  if ((word_adaptable (word, tessedit_cluster_adaption_mode)
  && word->reject_map.reject_count () == 0) || tessedit_mm_use_rejmap) {
    if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
      return;                    // Reject map set to acceptable

    /* Collect information about good matches */
    const STRING &text = word->best_choice->string ();

    copy_outword = *(word->outword);
    copy_outword.baseline_denormalise (&word->denorm);
    // pixrow_list and imlines are outputs of char_clip_word (they are
    // released after the loop).
    char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);
    pixrow_it.set_to_list (pixrow_list);
    pixrow_it.move_to_first ();

    blob_it.move_to_first ();
    for (i = 0;
      text[i] != '\0';
    i++, pixrow_it.forward (), blob_it.forward ()) {
      // Take the character unless it is on the non-adaption list (when
      // that list is enabled); a reject-map acceptance overrides the list
      // when tessedit_mm_use_rejmap is set.
      if (!(tessedit_mm_use_non_adaption_set
        && STRING (tessedit_non_adaption_set).contains (text[i]))
      || (tessedit_mm_use_rejmap && word->reject_map[i].accepted ())) {
        #ifndef SECURE_NAMES
        if (tessedit_cluster_debug)
          tprintf ("Sample %c for adaption found in %s, index %d\n",
            text[i], text.string (), i);
        #endif
        sample = clip_sample (pixrow_it.data (),
          imlines,
          pix_box,
          copy_outword.flag (W_INVERSE),
          text[i]);
        if (sample == NULL) {    //Clip failed
          #ifndef SECURE_NAMES
          tprintf ("Unable to clip sample from %s, index %d\n",
            text.string (), i);
          #endif
          continue;
        }
        cluster_sample(sample, char_clusters, chars_waiting);
      }
    }
    delete[]imlines;             // Free array of imlines
    delete pixrow_list;
  }
  else if (tessedit_test_cluster_input && !tessedit_mm_use_rejmap)
    // Set word to all rejects
    word->reject_map.rej_word_tess_failure ();
}

// NOTE(review): only the opening part of cluster_sample is present in this
// part of the listing; the statements below are reproduced unchanged.
void cluster_sample(CHAR_SAMPLE *sample,
                    CHAR_SAMPLES_LIST *char_clusters,
                    CHAR_SAMPLE_LIST *chars_waiting) {
  CHAR_SAMPLES *best_cluster = NULL;
  CHAR_SAMPLES_IT c_it = char_clusters;
  CHAR_SAMPLE_IT cw_it = chars_waiting;
  float score;
  float best_score = MAX_INT32;

  if (c_it.empty ())
    c_it.add_to_end (new CHAR_SAMPLES (sample));
  else {
    for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) {
      score = c_it.data ()->match_score (sample);
      if (score < best_score) {
        best_score = score;
        best_cluster = c_it.data ();
      }
    }

    if (tessedit_cluster_debug)
      tprintf ("Sample's best score %f\n", best_score);

    if (best_score < tessedit_cluster_t1) {
      if (best_score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) {
        best_cluster->add_sample (sample);
        check_wait_list(chars_waiting, sample, best_cluster);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -