applybox.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 892 行 · 第 1/3 页

CPP
892
字号
/**********************************************************************
 * File:        applybox.cpp  (Formerly applybox.c)
 * Description: Re segment rows according to box file data
 * Author:      Phil Cheatle
 * Created:     Wed Nov 24 09:11:23 GMT 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

/*
define SECURE_NAMES for code versions which go to UNLV to stop tessedit
including all the newdiff stuff (which contains lots of text indicating
what measures we are interested in.
*/
/* #define SECURE_NAMES done in secnames.h when necessary*/

#include "mfcpch.h"
#include "applybox.h"
#include <ctype.h>
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "boxread.h"
#include "mainblk.h"
#include "genblob.h"
#include "fixxht.h"
#include "control.h"
#include "tessbox.h"
#include "globals.h"
#include "secname.h"
#include "unichar.h"
#include "matchdefs.h"

#define SECURE_NAMES
#ifndef SECURE_NAMES
#include          "wordstats.h"
#endif

#define EXTERN

EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
EXTERN INT_VAR (applybox_debug, 5, "Debug level");
EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from");
EXTERN STRING_VAR (applybox_test_exclusions, "",
                   "Chars ignored for testing");
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

// The unicharset used during box training
static UNICHARSET unicharset_boxes;

// Debug helper: prints str followed by ':', then one "[hex]" entry per
// UTF-8 character of str (the value returned by UNICHAR::first_uni()),
// then a newline.
//   str - NUL-terminated string to dump; must not be NULL.
static void PrintString(const char* str) {
  tprintf("%s:", str);
  int step = 0;
  for (int i = 0; str[i]; i += step) {
    step = UNICHAR::utf8_step(str + i);
    // A step of 0 would never advance i; consume one byte instead so the
    // loop always terminates.
    if (step == 0)
      step = 1;
    UNICHAR ch(str + i, step);
    tprintf("[%x]", ch.first_uni());
  }
  // The format has no conversion specifier, so no extra argument is passed.
  tprintf("\n");
}

/*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
 * for sensible position relative to baseline and xht. Failed boxes are
 * compensated for by duplicating other believable instances of the character.
 *
 * The box file is assumed to contain box definitions, one per line, of the
 * following format:
 *   <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
 *
 * The approach taken is to search the WHOLE page for stuff overlapping each box.
 *    - This is not too inefficient and is SAFE.
 *    - We can detect overlapping blobs as we will be attempting to put a blob
 *      from a LABELLED word into the current word.
 *    - When all the boxes have been processed we can detect any stuff which is
 *      being ignored - it is the unlabelled words left on the page.
 *
 * A box should only overlap one row.
 *
 * A warning is given if the box is on the same row as the previous box, but NOT
 * on the same row as the previous blob.
 *
 * Any OUTLINE which overlaps the box is put into the new word.
 *
 * ascender chars must ascend above xht significantly
 * xht chars must not rise above row xht significantly
 * bl chars must not descend below baseline significantly
 * descender chars must descend below baseline significantly
 *
 * ?? Certain chars are DROPPED - to limit the training data.
* *************************************************************************/void apply_boxes(BLOCK_LIST *block_list    //real blocks                ) {  inT16 boxfile_lineno = 0;  inT16 boxfile_charno = 0;  TBOX box;                       //boxfile box  UNICHAR_ID uch_id;             //correct ch from boxfile  ROW *row;  ROW *prev_row = NULL;  inT16 prev_box_right = MAX_INT16;  inT16 block_id;  inT16 row_id;  inT16 box_count = 0;  inT16 box_failures = 0;  inT16 labels_ok;  inT16 rows_ok;  inT16 bad_blobs;  inT16 tgt_char_counts[MAX_NUM_CLASSES];    //No. of box samples  //      inT16                                   labelled_char_counts[128];      //No. of unique labelled samples  inT16 i;  inT16 rebalance_count = 0;  UNICHAR_ID min_uch_id;  inT16 min_samples;  inT16 final_labelled_blob_count;  // Clean the unichar set  unicharset_boxes.clear();  // Space character needed to represent NIL classification  unicharset_boxes.unichar_insert(" ");  for (i = 0; i < MAX_NUM_CLASSES; i++)    tgt_char_counts[i] = 0;  FILE* box_file;  STRING filename = imagefile;  filename += ".box";  if (!(box_file = fopen (filename.string(), "r"))) {    CANTOPENFILE.error ("read_next_box", EXIT,      "Cant open box file %s %d",      filename.string(), errno);  }  clear_any_old_text(block_list);  while (read_next_box(applybox_page, box_file, &box, &uch_id)) {    box_count++;    tgt_char_counts[uch_id]++;    row = find_row_of_box (block_list, box, block_id, row_id);    if (box.left () < prev_box_right) {      boxfile_lineno++;      boxfile_charno = 1;    }    else      boxfile_charno++;    if (row == NULL) {      box_failures++;      report_failed_box (boxfile_lineno, boxfile_charno, box,                         unicharset_boxes.id_to_unichar(uch_id),        "FAILURE! 
box overlaps no blobs or blobs in multiple rows");    }    else {      if ((box.left () >= prev_box_right) && (row != prev_row))        report_failed_box (boxfile_lineno, boxfile_charno, box,                           unicharset_boxes.id_to_unichar(uch_id),          "WARNING! false row break");      box_failures += resegment_box (row, box, uch_id, block_id, row_id,        boxfile_lineno, boxfile_charno);      prev_row = row;    }    prev_box_right = box.right ();  }  tidy_up(block_list,          labels_ok,          rows_ok,          bad_blobs,          tgt_char_counts,          rebalance_count,          &min_uch_id,          min_samples,          final_labelled_blob_count);  tprintf ("APPLY_BOXES:\n");  tprintf ("   Boxes read from boxfile:  %6d\n", box_count);  tprintf ("   Initially labelled blobs: %6d in %d rows\n",    labels_ok, rows_ok);  tprintf ("   Box failures detected:		%6d\n", box_failures);  tprintf ("   Duped blobs for rebalance:%6d\n", rebalance_count);  tprintf ("   \"%s\" has fewest samples:%6d\n",           unicharset_boxes.id_to_unichar(min_uch_id), min_samples);  tprintf ("				Total unlabelled words:   %6d\n",    bad_blobs);  tprintf ("				Final labelled words:     %6d\n",    final_labelled_blob_count);}void clear_any_old_text(                        //remove correct text                        BLOCK_LIST *block_list  //real blocks                       ) {  BLOCK_IT block_it(block_list);  ROW_IT row_it;  WERD_IT word_it;  for (block_it.mark_cycle_pt ();  !block_it.cycled_list (); block_it.forward ()) {    row_it.set_to_list (block_it.data ()->row_list ());    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {      word_it.set_to_list (row_it.data ()->word_list ());      for (word_it.mark_cycle_pt ();      !word_it.cycled_list (); word_it.forward ()) {        word_it.data ()->set_text ("");      }    }  }}BOOL8 read_next_box(int page,                    FILE* box_file,  //                    TBOX *box,                    
UNICHAR_ID *uch_id) {  int x_min;  int y_min;  int x_max;  int y_max;  char uch[kBoxReadBufSize];  while (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) {    if (!unicharset_boxes.contains_unichar(uch))    {      unicharset_boxes.unichar_insert(uch);      if (unicharset_boxes.size() > MAX_NUM_CLASSES) {        tprintf("Error: Size of unicharset of boxes is "                "greater than MAX_NUM_CLASSES (%d)\n",                MAX_NUM_CLASSES);        exit(1);      }    }    *uch_id = unicharset_boxes.unichar_to_id(uch);    *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));    return TRUE;             //read a box ok  }  return FALSE;                  //EOF}ROW *find_row_of_box(                         //                     BLOCK_LIST *block_list,  //real blocks                     TBOX box,                 //from boxfile                     inT16 &block_id,                     inT16 &row_id_to_process) {  BLOCK_IT block_it(block_list);  BLOCK *block;  ROW_IT row_it;  ROW *row;  ROW *row_to_process = NULL;  inT16 row_id;  WERD_IT word_it;  WERD *word;  BOOL8 polyg;  PBLOB_IT blob_it;  PBLOB *blob;  OUTLINE_IT outline_it;  OUTLINE *outline;  /*    Find row to process - error if box REALLY overlaps more than one row. (I.e    it overlaps blobs in the row - not just overlaps the bounding box of the    whole row.)  
*/  block_id = 0;  for (block_it.mark_cycle_pt ();  !block_it.cycled_list (); block_it.forward ()) {    block_id++;    row_id = 0;    block = block_it.data ();    if (block->bounding_box ().overlap (box)) {      row_it.set_to_list (block->row_list ());      for (row_it.mark_cycle_pt ();      !row_it.cycled_list (); row_it.forward ()) {        row_id++;        row = row_it.data ();        if (row->bounding_box ().overlap (box)) {          word_it.set_to_list (row->word_list ());          for (word_it.mark_cycle_pt ();          !word_it.cycled_list (); word_it.forward ()) {            word = word_it.data ();            polyg = word->flag (W_POLYGON);            if (word->bounding_box ().overlap (box)) {              blob_it.set_to_list (word->gblob_list ());              for (blob_it.mark_cycle_pt ();              !blob_it.cycled_list (); blob_it.forward ()) {                blob = blob_it.data ();

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?