⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 applybox.cpp

📁 一份OCR的相关资料，希望对研究OCR的朋友有所帮助。
💻 CPP
📖 第 1 页 / 共 2 页
字号:
/**********************************************************************
 * File:        applybox.cpp  (Formerly applybox.c)
 * Description: Re segment rows according to box file data
 * Author:		Phil Cheatle
 * Created:		Wed Nov 24 09:11:23 GMT 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

/*
define SECURE_NAMES for code versions which go to UNLV to stop tessedit
including all the newdiff stuff (which contains lots of text indicating
what measures we are interested in).
*/
/* #define SECURE_NAMES done in secnames.h when necessary*/

#include "mfcpch.h"
#include          "applybox.h"
#include          <ctype.h>
#include          <errno.h>      // errno is used unconditionally in apply_boxes
#include          <string.h>
#ifdef __UNIX__
#include          <assert.h>
#endif
#include          "mainblk.h"
#include          "genblob.h"
#include          "fixxht.h"
#include          "control.h"
#include          "tessbox.h"
#include          "globals.h"
#include          "secname.h"

#define SECURE_NAMES
#ifndef SECURE_NAMES
#include          "wordstats.h"
#endif

#define EXTERN
EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
EXTERN INT_VAR (applybox_debug, 0, "Debug level");
EXTERN STRING_VAR (applybox_test_exclusions, "|",
"Chars ignored for testing");
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

/*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
 * for sensible position relative to baseline and xht. Failed boxes are
 * compensated for by duplicating other believable instances of the character.
 *
 * The box file is assumed to contain box definitions, one per line, of the
 * following format:
 *   <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
 *
 * The approach taken is to search the WHOLE page for stuff overlapping each box.
 *    - This is not too inefficient and is SAFE.
 *    - We can detect overlapping blobs as we will be attempting to put a blob
 *      from a LABELLED word into the current word.
 *    - When all the boxes have been processed we can detect any stuff which is
 *      being ignored - it is the unlabelled words left on the page.
 *
 * A box should only overlap one row.
 *
 * A warning is given if the box is on the same row as the previous box, but NOT
 * on the same row as the previous blob.
 *
 * Any OUTLINE which overlaps the box is put into the new word.
 *
 * ascender chars must ascend above xht significantly
 * xht chars must not rise above row xht significantly
 * bl chars must not descend below baseline significantly
 * descender chars must descend below baseline significantly
 *
 * ?? Certain chars are DROPPED - to limit the training data.
 *
 *************************************************************************/

/**********************************************************************
 * apply_boxes
 *
 * Open "<imagefile>.box", read it box by box and, for each box, find the
 * row it belongs to and resegment that row so the overlapping outlines
 * form a word labelled with the box character. Boxes that match no row
 * (or more than one) are counted as failures. Afterwards tidy_up() is
 * called and a summary is printed with tprintf.
 *
 * A box whose left edge lies to the left of the previous box's right edge
 * is treated as the start of a new box-file text line; this drives the
 * line/char numbers used in failure reports.
 *
 * block_list: the page's blocks; their rows and words are modified.
 **********************************************************************/
void apply_boxes(BLOCK_LIST *block_list    //real blocks
                ) {
  INT16 boxfile_lineno = 0;
  INT16 boxfile_charno = 0;
  BOX box;                       //boxfile box
  char ch[2];                    //correct ch from boxfile
  ROW *row;
  ROW *prev_row = NULL;
  INT16 prev_box_right = MAX_INT16;  //forces first box to start line 1
  INT16 block_id;
  INT16 row_id;
  INT16 box_count = 0;
  INT16 box_failures = 0;
  INT16 labels_ok;
  INT16 rows_ok;
  INT16 bad_blobs;
  INT16 tgt_char_counts[128];    //No. of box samples
  //      INT16                                   labelled_char_counts[128];      //No. of unique labelled samples
  INT16 i;
  INT16 rebalance_count = 0;
  char min_char;
  INT16 min_samples;
  INT16 final_labelled_blob_count;
  unsigned char label;           //ch[0] as a safe table index

  for (i = 0; i < 128; i++)
    tgt_char_counts[i] = 0;

  FILE* box_file;
  STRING filename = imagefile;
  filename += ".box";
  if (!(box_file = fopen (filename.string(), "r"))) {
    CANTOPENFILE.error ("read_next_box", EXIT,
      "Cant open box file %s %d",
      filename.string(), errno);
  }

  ch[1] = '\0';
  clear_any_old_text(block_list);
  while (read_next_box (box_file, &box, &ch[0])) {
    box_count++;
    // Plain char may be signed, and labels may be non-ASCII: indexing the
    // 128-entry table with ch[0] directly could read/write out of bounds.
    label = (unsigned char) ch[0];
    if (label < 128)
      tgt_char_counts[label]++;
    row = find_row_of_box (block_list, box, block_id, row_id);
    if (box.left () < prev_box_right) {
      boxfile_lineno++;
      boxfile_charno = 1;
    }
    else
      boxfile_charno++;

    if (row == NULL) {
      box_failures++;
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! box overlaps no blobs or blobs in multiple rows");
    }
    else {
      if ((box.left () >= prev_box_right) && (row != prev_row))
        report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
          "WARNING! false row break");
      box_failures += resegment_box (row, box, ch, block_id, row_id,
        boxfile_lineno, boxfile_charno);
      prev_row = row;
    }
    prev_box_right = box.right ();
  }
  fclose (box_file);             //was leaked: the stream was never closed

  tidy_up(block_list,
          labels_ok,
          rows_ok,
          bad_blobs,
          tgt_char_counts,
          rebalance_count,
          min_char,
          min_samples,
          final_labelled_blob_count);
  tprintf ("APPLY_BOXES:\n");
  tprintf ("   Boxes read from boxfile:  %6d\n", box_count);
  tprintf ("   Initially labelled blobs: %6d in %d rows\n",
    labels_ok, rows_ok);
  tprintf ("   Box failures detected:		%6d\n", box_failures);
  tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);
  tprintf ("   \"%c\" has fewest samples:%6d\n", min_char, min_samples);
  tprintf ("				Total unlabelled words:   %6d\n",
    bad_blobs);
  tprintf ("				Final labelled words:     %6d\n",
    final_labelled_blob_count);
}


/**********************************************************************
 * clear_any_old_text
 *
 * Set the text of every word in every row of every block to "".
 **********************************************************************/
void clear_any_old_text(                        //remove correct text
                        BLOCK_LIST *block_list  //real blocks
                       ) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  WERD_IT word_it;

  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      word_it.set_to_list (row_it.data ()->word_list ());
      for (word_it.mark_cycle_pt ();
      !word_it.cycled_list (); word_it.forward ()) {
        word_it.data ()->set_text ("");
      }
    }
  }
}


/**********************************************************************
 * read_next_box
 *
 * Read lines from box_file until one parses as
 *   <char> <x_min> <y_min> <x_max> <y_max>
 * Blank lines are skipped silently; malformed lines are reported with
 * tprintf and skipped.
 *
 * box_file: open box file stream.
 * box:      receives the parsed box.
 * ch:       receives the box's character (a single char is written).
 * Returns TRUE if a box was read, FALSE on end of file or read error.
 *
 * The line counter used in error messages is static, so it keeps counting
 * across calls (and across files within one process).
 **********************************************************************/
BOOL8 read_next_box(FILE* box_file,  //
                    BOX *box,
                    char *ch) {
  char buff[256];                //boxfile read buffer
  char *buffptr = buff;
  static INT16 line = 0;
  INT32 x_min;
  INT32 y_min;
  INT32 x_max;
  INT32 y_max;
  INT32 count = 0;

  // Loop on the fgets result rather than feof(): feof() only becomes true
  // after a read has failed, so the old loop re-parsed the stale buffer
  // (returning the last box twice, or reading garbage from an empty file).
  while (fgets (buff, sizeof (buff), box_file) != NULL) {
    line++;

    /* Check for blank lines in box file */
    // The cast keeps isspace() defined for negative char values.
    for (buffptr = buff; isspace ((unsigned char) *buffptr); buffptr++)
      ;
    if (*buffptr != '\0') {
      // Parse from the first non-space char: %c does not skip whitespace,
      // so parsing from buff would label an indented line with a space.
      count =
        sscanf (buffptr,
        "%c " INT32FORMAT " " INT32FORMAT " " INT32FORMAT " "
        INT32FORMAT, ch, &x_min, &y_min, &x_max, &y_max);
      if (count != 5) {
        tprintf ("Box file format error on line %i ignored\n", line);
      }
      else {
        *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
        return TRUE;             //read a box ok
      }
    }
  }
  return FALSE;                  //EOF
}


/**********************************************************************
 * find_row_of_box
 *
 * Search every block, row, word and blob whose bounding box overlaps the
 * given box, and find the row owning the outlines that major_overlap() it.
 *
 * block_list:        blocks to search.
 * box:               box from the box file.
 * block_id:          set to the 1-based index of the block containing the
 *                    matched row; 0 if nothing matched.
 * row_id_to_process: set to the 1-based index (within its block) of the
 *                    matched row; left untouched if nothing matched.
 * Returns the matched row, or NULL if no outline overlaps the box or if
 * overlapping outlines were found in more than one row.
 **********************************************************************/
ROW *find_row_of_box(                         //
                     BLOCK_LIST *block_list,  //real blocks
                     BOX box,                 //from boxfile
                     INT16 &block_id,
                     INT16 &row_id_to_process) {
  BLOCK_IT block_it(block_list);
  BLOCK *block;
  ROW_IT row_it;
  ROW *row;
  ROW *row_to_process = NULL;
  INT16 current_block_id = 0;    //1-based index of block being scanned
  INT16 row_id;
  WERD_IT word_it;
  WERD *word;
  BOOL8 polyg;
  PBLOB_IT blob_it;
  PBLOB *blob;
  OUTLINE_IT outline_it;
  OUTLINE *outline;

  /*
    Find row to process - error if box REALLY overlaps more than one row. (I.e
    it overlaps blobs in the row - not just overlaps the bounding box of the
    whole row.)
  */

  block_id = 0;
  for (block_it.mark_cycle_pt ();
  !block_it.cycled_list (); block_it.forward ()) {
    current_block_id++;
    row_id = 0;
    block = block_it.data ();
    if (block->bounding_box ().overlap (box)) {
      row_it.set_to_list (block->row_list ());
      for (row_it.mark_cycle_pt ();
      !row_it.cycled_list (); row_it.forward ()) {
        row_id++;
        row = row_it.data ();
        if (row->bounding_box ().overlap (box)) {
          word_it.set_to_list (row->word_list ());
          for (word_it.mark_cycle_pt ();
          !word_it.cycled_list (); word_it.forward ()) {
            word = word_it.data ();
            polyg = word->flag (W_POLYGON);
            if (word->bounding_box ().overlap (box)) {
              blob_it.set_to_list (word->gblob_list ());
              for (blob_it.mark_cycle_pt ();
              !blob_it.cycled_list (); blob_it.forward ()) {
                blob = blob_it.data ();
                if (gblob_bounding_box (blob, polyg).
                overlap (box)) {
                  outline_it.
                    set_to_list (gblob_out_list
                    (blob, polyg));
                  for (outline_it.mark_cycle_pt ();
                    !outline_it.cycled_list ();
                  outline_it.forward ()) {
                    outline = outline_it.data ();
                    if (goutline_bounding_box
                    (outline, polyg).major_overlap (box)) {
                      if ((row_to_process == NULL) ||
                      (row_to_process == row)) {
                        row_to_process = row;
                        // Capture the block index at match time. The
                        // scan continues over the remaining blocks, so
                        // the running counter would otherwise always
                        // end up as the total number of blocks.
                        block_id = current_block_id;
                        row_id_to_process = row_id;
                      }
                      else
                        /* RETURN ERROR Box overlaps blobs in more than one row  */
                        return NULL;
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  return row_to_process;
}


/**********************************************************************
 * resegment_box
 *
 * Move every outline in the row that major_overlap()s the box into a new
 * word labelled with ch, holding a single blob. The new word is created
 * lazily as a shallow copy of the first word that contributes an outline.
 *
 * If an overlapping outline belongs to a word that already has text, that
 * word is unlabelled and the error count is raised; once any error has
 * occurred no further outlines are moved. Blobs left without outlines and
 * words left without blobs are deleted. If any error occurred the error
 * count is returned and the new word (if any) is not added to the row.
 *
 * Otherwise the new word is appended to the row's word list and its top
 * is checked against the baseline and x-height, with a tolerance of
 * applybox_error_band; on failure the word is unlabelled and 1 is
 * returned.
 *
 * row:            row to resegment.
 * box:            box from the box file.
 * ch:             NUL-terminated label for the box.
 * block_id/row_id: used only in debug output.
 * boxfile_lineno/boxfile_charno: position of the box for failure reports.
 * Returns the number of errors detected (visible paths only).
 *
 * NOTE(review): this listing is page 1 of 2 and the function body is cut
 * off below; the remaining checks and the final return are not visible
 * here.
 **********************************************************************/
INT16 resegment_box(  //
                    ROW *row,
                    BOX box,
                    char *ch,
                    INT16 block_id,
                    INT16 row_id,
                    INT16 boxfile_lineno,
                    INT16 boxfile_charno) {
  WERD_IT word_it;
  WERD *word;
  WERD *new_word = NULL;
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  PBLOB_IT new_blob_it;
  PBLOB *blob;
  PBLOB *new_blob;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;  // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  OUTLINE *outline;
  BOX new_word_box;
  float word_x_centre;
  float baseline;
  INT16 error_count = 0;         //number of chars lost

  word_it.set_to_list (row->word_list ());
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    polyg = word->flag (W_POLYGON);
    if (word->bounding_box ().overlap (box)) {
      blob_it.set_to_list (word->gblob_list ());
      for (blob_it.mark_cycle_pt ();
      !blob_it.cycled_list (); blob_it.forward ()) {
        blob = blob_it.data ();
        if (gblob_bounding_box (blob, polyg).overlap (box)) {
          outline_it.set_to_list (gblob_out_list (blob, polyg));
          for (outline_it.mark_cycle_pt ();
          !outline_it.cycled_list (); outline_it.forward ()) {
            outline = outline_it.data ();
            if (goutline_bounding_box (outline, polyg).
            major_overlap (box)) {
              if (strlen (word->text ()) > 0) {
                if (error_count == 0) {
                  error_count = 1;
                  if (applybox_debug > 4)
                    report_failed_box (boxfile_lineno,
                      boxfile_charno,
                      box, ch,
                      "FAILURE! box overlaps blob in labelled word");
                }
                if (applybox_debug > 4)
                  tprintf
                    ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
                    block_id, row_id,
                    word_it.data ()->text ());
                word_it.data ()->set_text ("");
                //UN label it
                error_count++;
              }
              if (error_count == 0) {
                if (new_word == NULL) {
                                 /* Make a new word with a single blob */
                  new_word = word->shallow_copy ();
                  new_word->set_text (ch);
                  if (polyg)
                    new_blob = new PBLOB;
                  else
                    new_blob = (PBLOB *) new C_BLOB;
                  new_blob_it.set_to_list (new_word->
                    gblob_list ());
                  new_blob_it.add_to_end (new_blob);
                  new_outline_it.
                    set_to_list (gblob_out_list
                    (new_blob, polyg));
                }
                new_outline_it.add_to_end (outline_it.
                  extract ());
                //move blob
              }
            }
          }
                                 //no outlines in blob
          if (outline_it.empty ())
                                 //so delete blob
            delete blob_it.extract ();
        }
      }
      if (blob_it.empty ())      //no blobs in word
                                 //so delete word
          delete word_it.extract ();
    }
  }
  if (error_count > 0)
    return error_count;

  if (new_word != NULL) {
    gblob_sort_list (new_word->gblob_list (), polyg);
    word_it.add_to_end (new_word);
    new_word_box = new_word->bounding_box ();
    word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
    baseline = row->base_line (word_x_centre);

    if (STRING (chs_caps_ht).contains (ch[0]) &&
      (new_word_box.top () <
    baseline + (1 + applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! caps-ht char didn't ascend");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_top).contains (ch[0]) &&
      (new_word_box.top () <
    baseline + (1 - applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Odd top char below xht");
      new_word->set_text ("");

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -