📄 fixxht.cpp

📁 一ＯＣＲ的相关资料，希望对研究ＯＣＲ的朋友有所帮助。
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
/**********************************************************************
 * File:        fixxht.cpp  (Formerly fixxht.c)
 * Description: Improve x_ht and look out for case inconsistencies
 * Author:		Phil Cheatle
 * Created:		Thu Aug  5 14:11:08 BST 1993
 *
 * (C) Copyright 1992, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#include          <string.h>
#include          <ctype.h>
#include          "varable.h"
#include          "tessvars.h"
#include          "control.h"
#include          "reject.h"
#include          "fixxht.h"
#include          "secname.h"

/* EXTERN is defined empty in this file, so it contributes nothing to the
   declarations below. */
#define EXTERN

/*
  Numeric tuning parameters. Each entry gives the variable name, its default
  value and a short description string.

  re_estimate_x_ht() in this file uses x_ht_fraction_of_caps_ht to convert
  between the two heights (x-height = caps height * fraction, caps height =
  x-height / fraction), and uses x_ht_variation and x_ht_sub_variation to scale
  the tolerance bands it derives from the caps/x-height distance.
*/
EXTERN double_VAR (x_ht_fraction_of_caps_ht, 0.7,
"Fract of cps ht est of xht");
EXTERN double_VAR (x_ht_variation, 0.35,
"Err band as fract of caps/xht dist");
EXTERN double_VAR (x_ht_sub_variation, 0.5,
"Err band as fract of caps/xht dist");

/* Boolean switches; see each description string for the intent. */
EXTERN BOOL_VAR (rej_trial_ambigs, TRUE,
"reject x-ht ambigs when under trial");
EXTERN BOOL_VAR (x_ht_conservative_ambigs, FALSE,
"Dont rely on ambigs + maxht");
EXTERN BOOL_VAR (x_ht_check_est, TRUE, "Cross check estimates");
EXTERN BOOL_VAR (x_ht_case_flip, FALSE, "Flip or reject suspect case");
EXTERN BOOL_VAR (x_ht_include_dodgy_blobs, TRUE,
"Include blobs with possible noise?");
EXTERN BOOL_VAR (x_ht_limit_flip_trials, TRUE,
"Dont do trial flips when ambigs are close to xht?");
EXTERN BOOL_VAR (rej_use_check_block_occ, TRUE,
"Analyse rejection behaviour");

/*
  Character classes, each stored as a string of member characters.
  re_estimate_x_ht() tests the recognised character of each blob for
  membership of these strings to decide which height statistics the blob
  contributes to.
*/
EXTERN STRING_VAR
(chs_non_ambig_caps_ht,
"!#$%&()/12346789?ABDEFGHIKLNQRT[]\\bdfhkl",
"Reliable ascenders");
EXTERN STRING_VAR (chs_x_ht, "acegmnopqrsuvwxyz", "X height chars");
EXTERN STRING_VAR (chs_non_ambig_x_ht, "aenqr", "reliable X height chars");
EXTERN STRING_VAR (chs_ambig_caps_x, "cCmMoO05sSuUvVwWxXzZ",
"X ht or caps ht chars");
EXTERN STRING_VAR (chs_bl_ambig_caps_x, "pPyY", " Caps or descender ambigs");

/* The following aren't used in this module but are used in applybox.c */
EXTERN STRING_VAR (chs_caps_ht,
"!#$%&()/0123456789?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]\\bdfhkl{|}",
"Ascender chars");
EXTERN STRING_VAR (chs_desc, "gjpqy", "Descender chars");
EXTERN STRING_VAR (chs_non_ambig_bl,
"!#$%&01246789?ABCDEFGHIKLMNORSTUVWXYZabcdehiklmnorstuvwxz",
"Reliable baseline chars");
EXTERN STRING_VAR (chs_odd_top, "ijt", "Chars with funny ascender region");
EXTERN STRING_VAR (chs_odd_bot, "()35JQ[]\\/{}|", "Chars with funny base");

/* The following aren't used but are defined for completeness */
EXTERN STRING_VAR (chs_bl,
"!#$%&()/01246789?ABCDEFGHIJKLMNOPRSTUVWXYZ[]\\abcdefhiklmnorstuvwxz{}",
"Baseline chars");
EXTERN STRING_VAR (chs_non_ambig_desc, "gq", "Reliable descender chars");

/*************************************************************************
 * re_estimate_x_ht()
 *
 * Walk the blobs in the word together with the text string and reject map.
 * NOTE: All evaluation is done on the baseline normalised word. This is so that
 * the BOX class can be used (integer). The reasons for this are:
 *   a) We must use the outword - i.e. the Tess result
 *   b) The outword is always converted to integer representation as that is how
 *      Tess works
 *   c) We would like to use the BOX class, because it's there - this is integer
 *      precision.
 *   d) If we de-normed the outword we would get rounding errors and would find
 *      that integers are too imprecise (x-height around 15 pixels instead of a
 *      scale of 128 in bln form).
 *   CONVINCED?
 *
 * A) Try to re-estimate x-ht and caps ht from confirmed pts in word.
* *    FOR each non reject blob *       IF char is baseline posn ambiguous *			Remove ambiguity by comparing its posn with respect to baseline. *		IF char is a confirmed x-ht char *			Add x-ht posn to confirmed_x_ht pts for word *    IF char is a confirmed caps-ht char *			Add blob_ht to caps ht pts for word * *    IF Std Dev of caps hts < 2  (AND # samples > 0) *		Use mean as caps ht estimate (Dont use median as we can expect a *			fair variation between the heights of the NON_AMBIG_CAPS_HT_CHS) *    IF Std Dev of caps hts >= 2  (AND # samples > 0) *			Suspect small caps font. *			Look for 2 clusters,	each with Std Dev < 2. *			IF 2 clusters found *			Pick the smaller median as the caps ht estimate of the smallcaps. * *    IF failed to estimate a caps ht *       Use the median caps ht if there is one, *		ELSE use the caps ht estimate of the previous word. NO!!! * * *    IF there are confirmed x-height chars *			Estimate confirmed x-height as the median value *    ELSE IF there is a confirmed caps ht *			Estimate confirmed x-height as a fraction of confirmed caps ht value *		ELSE *			Use the value for the previous word or the row value if this is the *			first word in the block. NO!!! * * B) Add in case ambiguous blobs based on confirmed x-ht/caps ht, changing case *    as necessary. Reestimate caps ht and x-ht as in A, using the extended *    clusters. 
* * C) If word contains rejects, and x-ht estimate significantly differs from *    original estimate, return TRUE so that the word can be rematched *************************************************************************/void re_estimate_x_ht(                     //improve for 1 word                      WERD_RES *word_res,  //word to do                      float *trial_x_ht    //new match value                     ) {  PBLOB_IT blob_it;  INT16 blob_ht_above_baseline;  const char *word_str;  INT16 i;  STATS all_blobs_ht (0, 300);   //every blob in word  STATS x_ht (0, 300);           //confirmed pts in wd  STATS caps_ht (0, 300);        //confirmed pts in wd  STATS case_ambig (0, 300);     //lower case ambigs  INT16 rej_blobs_count = 0;  INT16 rej_blobs_max_height = 0;  INT32 rej_blobs_max_area = 0;  float x_ht_ok_variation;  float max_blob_ht;  float marginally_above_x_ht;  BOX blob_box;                  //blob bounding box  float est_x_ht = 0.0;          //word estimate  float est_caps_ht = 0.0;       //word estimate                                 //based on hard data?  BOOL8 est_caps_ht_certain = FALSE;  BOOL8 est_x_ht_certain = FALSE;//based on hard data?  BOOL8 trial = FALSE;           //Sepeculative values?  BOOL8 no_comment = FALSE;      //No change in xht  float ambig_lc_x_est;  float ambig_uc_caps_est;  INT16 x_ht_ambigs = 0;  INT16 caps_ht_ambigs = 0;  /* Calculate default variation of blob x_ht from bln x_ht for bln word */  x_ht_ok_variation =    (bln_x_height / x_ht_fraction_of_caps_ht - bln_x_height) * x_ht_variation;  word_str = word_res->best_choice->string ().string ();  /*    Cycle blobs, allocating to one of the stats sets when possible.  
*/  blob_it.set_to_list (word_res->outword->blob_list ());  for (blob_it.mark_cycle_pt (), i = 0;  !blob_it.cycled_list (); blob_it.forward (), i++) {    if (!dodgy_blob (blob_it.data ())) {      blob_box = blob_it.data ()->bounding_box ();      blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;      all_blobs_ht.add (blob_ht_above_baseline, 1);      if (word_res->reject_map[i].rejected ()) {        rej_blobs_count++;        if (blob_box.height () > rej_blobs_max_height)          rej_blobs_max_height = blob_box.height ();        if (blob_box.area () > rej_blobs_max_area)          rej_blobs_max_area = blob_box.area ();      }      else {        if (STRING (chs_non_ambig_x_ht).contains (word_str[i]))          x_ht.add (blob_ht_above_baseline, 1);        if (STRING (chs_non_ambig_caps_ht).contains (word_str[i]))          caps_ht.add (blob_ht_above_baseline, 1);        if (STRING (chs_ambig_caps_x).contains (word_str[i])) {          case_ambig.add (blob_ht_above_baseline, 1);          if (STRING (chs_x_ht).contains (word_str[i]))            x_ht_ambigs++;          else            caps_ht_ambigs++;        }        if (STRING (chs_bl_ambig_caps_x).contains (word_str[i])) {          if (STRING (chs_x_ht).contains (word_str[i])) {            /* confirm x_height provided > 15% total height below baseline */            if ((bln_baseline_offset - blob_box.bottom ()) /              (float) blob_box.height () > 0.15)              x_ht.add (blob_ht_above_baseline, 1);          }          else {            /* confirm caps_height provided < 5% total height below baseline */            if ((bln_baseline_offset - blob_box.bottom ()) /              (float) blob_box.height () < 0.05)              caps_ht.add (blob_ht_above_baseline, 1);          }        }      }    }  }  est_caps_ht = estimate_from_stats (caps_ht);  est_x_ht = estimate_from_stats (x_ht);  est_ambigs(word_res, case_ambig, &ambig_lc_x_est, &ambig_uc_caps_est);   max_blob_ht = all_blobs_ht.ile (0.9999);  
#ifndef SECURE_NAMES  if (debug_x_ht_level >= 20) {    tprintf ("Mode20:A: %s ", word_str);    word_res->reject_map.print (debug_fp);    tprintf (" XHT:%f CAP:%f MAX:%f AMBIG X:%f CAP:%f\n",      est_x_ht, est_caps_ht, max_blob_ht,      ambig_lc_x_est, ambig_uc_caps_est);  }  #endif  if (!x_ht_conservative_ambigs &&    (ambig_lc_x_est > 0) &&    (ambig_lc_x_est == ambig_uc_caps_est) &&  (max_blob_ht > ambig_lc_x_est + x_ht_ok_variation)) {                                 //may be zero but believe xht    ambig_uc_caps_est = est_caps_ht;    #ifndef SECURE_NAMES    if (debug_x_ht_level >= 20)      tprintf ("Mode20:B: Fiddle ambig_uc_caps_est to %f\n",        ambig_lc_x_est);    #endif  }  /* Now make some estimates */  if ((est_x_ht > 0) ||    (est_caps_ht > 0) ||  ((ambig_lc_x_est > 0) && (ambig_lc_x_est != ambig_uc_caps_est))) {    /* There is some sensible data to go on so make the most of it. */    if (debug_x_ht_level >= 20)      tprintf ("Mode20:C: Sensible Data\n", ambig_lc_x_est);    if (est_x_ht > 0) {      est_x_ht_certain = TRUE;      if (est_caps_ht == 0) {        if ((ambig_uc_caps_est > ambig_lc_x_est) &&          (ambig_uc_caps_est > est_x_ht + x_ht_ok_variation))          est_caps_ht = ambig_uc_caps_est;        else          est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;      }      if (case_ambig.get_total () > 0)        improve_estimate(word_res, est_x_ht, est_caps_ht, x_ht, caps_ht);       est_caps_ht_certain = caps_ht.get_total () > 0;      #ifndef SECURE_NAMES      if (debug_x_ht_level >= 20)        tprintf ("Mode20:D: Est from xht XHT:%f CAP:%f\n",          est_x_ht, est_caps_ht);      #endif    }    else if (est_caps_ht > 0) {      est_caps_ht_certain = TRUE;      if ((ambig_lc_x_est > 0) &&        (ambig_lc_x_est < est_caps_ht - x_ht_ok_variation))        est_x_ht = ambig_lc_x_est;      else        est_x_ht = est_caps_ht * x_ht_fraction_of_caps_ht;      if (ambig_lc_x_est + ambig_uc_caps_est > 0)        improve_estimate(word_res, est_x_ht, 
est_caps_ht, x_ht, caps_ht);       est_x_ht_certain = x_ht.get_total () > 0;      #ifndef SECURE_NAMES      if (debug_x_ht_level >= 20)        tprintf ("Mode20:E: Est from caps XHT:%f CAP:%f\n",          est_x_ht, est_caps_ht);      #endif    }    else {      /* Do something based on case ambig chars alone - we have guessed that the        ambigs are lower case. */      est_x_ht = ambig_lc_x_est;      est_x_ht_certain = TRUE;      if (ambig_uc_caps_est > ambig_lc_x_est) {        est_caps_ht = ambig_uc_caps_est;        est_caps_ht_certain = TRUE;      }      else        est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;      #ifndef SECURE_NAMES      if (debug_x_ht_level >= 20)        tprintf ("Mode20:F: Est from ambigs XHT:%f CAP:%f\n",          est_x_ht, est_caps_ht);      #endif    }    /* Check for sane interpretation of evidence:      Try shifting caps ht if min certain caps ht is not significantly greater      than the estimated x ht or the max certain x ht is not significantly less      than the estimated caps ht. */    if (x_ht_check_est) {      if ((caps_ht.get_total () > 0) &&      (est_x_ht + x_ht_ok_variation >= caps_ht.ile (0.0001))) {        trial = TRUE;        est_caps_ht = est_x_ht;        est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;        #ifndef SECURE_NAMES        if (debug_x_ht_level >= 20)          tprintf ("Mode20:G: Trial XHT:%f CAP:%f\n",            est_x_ht, est_caps_ht);        #endif      }      else if ((x_ht.get_total () > 0) &&      (est_caps_ht - x_ht_ok_variation <= x_ht.ile (0.9999))) {        trial = TRUE;        est_x_ht = est_caps_ht;        est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;        #ifndef SECURE_NAMES        if (debug_x_ht_level >= 20)          tprintf ("Mode20:H: Trial XHT:%f CAP:%f\n",            est_x_ht, est_caps_ht);        #endif      }    }  }  else {    /* There is no sensible data so we're in the dark. 
*/    marginally_above_x_ht = bln_x_height +      x_ht_ok_variation * x_ht_sub_variation;    /*      If there are no rejects, or the only rejects have a narrow height, or have      a small area compared to a normal char, then estimate the x-height as the      original one. (I.e dont fiddle about if the only rejects look like      punctuation) - we use max height as mean or median will be too low if      there are only two blobs - Eg "F."    */    if (debug_x_ht_level >= 20)      tprintf ("Mode20:I: In the dark\n");    if ((rej_blobs_count == 0) ||      (rej_blobs_max_height < 0.3 * max_blob_ht) ||    (rej_blobs_max_area < 0.3 * max_blob_ht * max_blob_ht)) {      no_comment = TRUE;      if (debug_x_ht_level >= 20)        tprintf ("Mode20:J: No comment due to no rejects\n");    }    else if (x_ht_limit_flip_trials &&      ((max_blob_ht < marginally_above_x_ht) ||      ((ambig_lc_x_est > 0) &&      (ambig_lc_x_est == ambig_uc_caps_est) &&    (ambig_lc_x_est < marginally_above_x_ht)))) {      no_comment = TRUE;      if (debug_x_ht_level >= 20)        tprintf ("Mode20:K: No comment as close to xht %f < %f\n",          ambig_lc_x_est, marginally_above_x_ht);    }    else if (x_ht_conservative_ambigs && (ambig_uc_caps_est > 0)) {      trial = TRUE;      est_caps_ht = ambig_lc_x_est;      est_x_ht = x_ht_fraction_of_caps_ht * est_caps_ht;      #ifndef SECURE_NAMES      if (debug_x_ht_level >= 20)        tprintf ("Mode20:L: Trial XHT:%f CAP:%f\n",          est_x_ht, est_caps_ht);      #endif    }    /*      If the top of the word is nowhere near where we expect ascenders to be      (less than half the x_ht -> caps_ht distance) - suspect an all caps word      at the x-ht. Estimate x-ht accordingly - but only as a TRIAL!      NOTE we do NOT check location of baseline. Commas can descend as much as      real descenders so we would need to do something to make sure that any      disqualifying descenders were not at the end.    
*/    else {      if (max_blob_ht <      (bln_x_height + bln_x_height / x_ht_fraction_of_caps_ht) / 2.0) {        trial = TRUE;
12 下一页
💿 文件大小 2763 K
👤 上传用户 danlong
📂 所属分类其他书籍
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -