📄 applybox.cpp
字号:
/**********************************************************************
 * File:        applybox.cpp  (Formerly applybox.c)
 * Description: Re segment rows according to box file data
 * Author:      Phil Cheatle
 * Created:     Wed Nov 24 09:11:23 GMT 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

/*
define SECURE_NAMES for code versions which go to UNLV to stop tessedit
including all the newdiff stuff (which contains lots of text indicating
what measures we are interested in.
*/
/* #define SECURE_NAMES done in secnames.h when necessary*/

#include "mfcpch.h"
#include "applybox.h"
#include <ctype.h>
#include <errno.h>               // errno is read in apply_boxes on every platform
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "mainblk.h"
#include "genblob.h"
#include "fixxht.h"
#include "control.h"
#include "tessbox.h"
#include "globals.h"
#include "secname.h"

#define SECURE_NAMES
#ifndef SECURE_NAMES
#include "wordstats.h"
#endif

#define EXTERN
EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
EXTERN INT_VAR (applybox_debug, 0, "Debug level");
EXTERN STRING_VAR (applybox_test_exclusions, "|",
                   "Chars ignored for testing");
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

/*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
 * for sensible position relative to baseline and xht. Failed boxes are
 * compensated for by duplicating other believable instances of the character.
 *
 * The box file is assumed to contain box definitions, one per line, of the
 * following format:
 *   <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
 *
 * The approach taken is to search the WHOLE page for stuff overlapping each box.
 *  - This is not too inefficient and is SAFE.
 *    - We can detect overlapping blobs as we will be attempting to put a blob
 *      from a LABELLED word into the current word.
 *    - When all the boxes have been processed we can detect any stuff which is
 *      being ignored - it is the unlabelled words left on the page.
 *
 * A box should only overlap one row.
 *
 * A warning is given if the box is on the same row as the previous box, but NOT
 * on the same row as the previous blob.
 *
 * Any OUTLINE which overlaps the box is put into the new word.
 *
 * ascender chars must ascend above xht significantly
 * xht chars must not rise above row xht significantly
 * bl chars must not descend below baseline significantly
 * descender chars must descend below baseline significantly
 *
 * ?? Certain chars are DROPPED - to limit the training data.
 *
 *************************************************************************/

/**
 * apply_boxes
 *
 * Open "<imagefile>.box", clear any text already attached to the words of
 * block_list, then for every box read from the file locate the row it
 * belongs to and resegment that row around the box. Finishes by calling
 * tidy_up() and printing a summary via tprintf.
 *
 * Failure to open the box file is reported through CANTOPENFILE.error with
 * the EXIT action.
 *
 * @param block_list  the page blocks to relabel
 */
void apply_boxes(BLOCK_LIST *block_list  //real blocks
                ) {
  INT16 boxfile_lineno = 0;
  INT16 boxfile_charno = 0;
  BOX box;                       //boxfile box
  char ch[2];                    //correct ch from boxfile
  ROW *row;
  ROW *prev_row = NULL;
  INT16 prev_box_right = MAX_INT16;
  INT16 block_id;
  INT16 row_id;
  INT16 box_count = 0;
  INT16 box_failures = 0;
  // Summary figures handed to tidy_up() below; zeroed so that nothing
  // indeterminate is ever passed on or printed.
  INT16 labels_ok = 0;
  INT16 rows_ok = 0;
  INT16 bad_blobs = 0;
  INT16 tgt_char_counts[128];    //No. of box samples
  //  INT16 labelled_char_counts[128];  //No. of unique labelled samples
  INT16 i;
  INT16 rebalance_count = 0;
  char min_char = '\0';
  INT16 min_samples = 0;
  INT16 final_labelled_blob_count = 0;

  for (i = 0; i < 128; i++)
    tgt_char_counts[i] = 0;

  FILE* box_file;
  STRING filename = imagefile;
  filename += ".box";
  if (!(box_file = fopen (filename.string(), "r"))) {
    CANTOPENFILE.error ("read_next_box", EXIT,
      "Cant open box file %s %d",
      filename.string(), errno);
  }

  ch[1] = '\0';
  clear_any_old_text(block_list);
  while (read_next_box (box_file, &box, &ch[0])) {
    box_count++;
    // Plain char may be signed and the label may be any byte, so go through
    // unsigned char and only count labels that fit the 128-entry table.
    unsigned char label = (unsigned char) ch[0];
    if (label < 128)
      tgt_char_counts[label]++;
    row = find_row_of_box (block_list, box, block_id, row_id);
    // A box starting left of where the previous one ended is taken to
    // start a new text line of the box file.
    if (box.left () < prev_box_right) {
      boxfile_lineno++;
      boxfile_charno = 1;
    }
    else
      boxfile_charno++;

    if (row == NULL) {
      box_failures++;
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! box overlaps no blobs or blobs in multiple rows");
    }
    else {
      if ((box.left () >= prev_box_right) && (row != prev_row))
        report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
          "WARNING! false row break");
      box_failures += resegment_box (row, box, ch, block_id, row_id,
        boxfile_lineno, boxfile_charno);
      prev_row = row;
    }
    prev_box_right = box.right ();
  }
  // All boxes consumed: release the stream (it was previously leaked).
  fclose (box_file);

  tidy_up(block_list,
          labels_ok,
          rows_ok,
          bad_blobs,
          tgt_char_counts,
          rebalance_count,
          min_char,
          min_samples,
          final_labelled_blob_count);
  tprintf ("APPLY_BOXES:\n");
  tprintf (" Boxes read from boxfile: %6d\n", box_count);
  tprintf (" Initially labelled blobs: %6d in %d rows\n",
    labels_ok, rows_ok);
  tprintf (" Box failures detected: %6d\n", box_failures);
  tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count);
  tprintf (" \"%c\" has fewest samples:%6d\n", min_char, min_samples);
  tprintf (" Total unlabelled words: %6d\n", bad_blobs);
  tprintf (" Final labelled words: %6d\n", final_labelled_blob_count);
}


/**
 * clear_any_old_text
 *
 * Set the text of every word in every row of every block to the empty
 * string.
 *
 * @param block_list  the page blocks whose words are cleared
 */
void clear_any_old_text(                        //remove correct text
                        BLOCK_LIST *block_list  //real blocks
                       ) {
  BLOCK_IT block_it(block_list);
  ROW_IT row_it;
  WERD_IT word_it;

  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    row_it.set_to_list (block_it.data ()->row_list ());
    for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) {
      word_it.set_to_list (row_it.data ()->word_list ());
      for (word_it.mark_cycle_pt ();
           !word_it.cycled_list (); word_it.forward ()) {
        word_it.data ()->set_text ("");
      }
    }
  }
}


/**
 * read_next_box
 *
 * Read lines from box_file until one parses as
 * "<char> <x_min> <y_min> <x_max> <y_max>". Blank lines are skipped
 * silently; lines that do not yield all five fields are reported with
 * tprintf and skipped.
 *
 * The line counter used in the error message is static, so it keeps
 * counting across calls (and across files).
 *
 * @param box_file  open box file to read from
 * @param box       receives the parsed box
 * @param ch        receives the single label character
 * @return TRUE if a box was read, FALSE once no further line can be read
 */
BOOL8 read_next_box(FILE* box_file,  //
                    BOX *box,
                    char *ch) {
  char buff[256];                //boxfile read buffer
  char *buffptr = buff;
  static INT16 line = 0;
  INT32 x_min;
  INT32 y_min;
  INT32 x_max;
  INT32 y_max;
  INT32 count = 0;

  // Loop on the result of fgets rather than on feof(): feof() only turns
  // true after a read has already failed, and a failed fgets leaves buff
  // unwritten, so the old loop parsed an uninitialised buffer at EOF.
  while (fgets (buff, sizeof (buff) - 1, box_file) != NULL) {
    line++;

    /* Check for blank lines in box file */
    // <ctype.h> functions need a value representable as unsigned char.
    for (buffptr = buff; isspace ((unsigned char) *buffptr); buffptr++)
      ;
    if (*buffptr != '\0') {
      count = sscanf (buff,
        "%c " INT32FORMAT " " INT32FORMAT " "
        INT32FORMAT " " INT32FORMAT,
        ch, &x_min, &y_min, &x_max, &y_max);
      if (count != 5) {
        tprintf ("Box file format error on line %i ignored\n", line);
      }
      else {
        *box = BOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max));
        return TRUE;             //read a box ok
      }
    }
  }
  return FALSE;                  //EOF
}


/**
 * find_row_of_box
 *
 * Search every block, row, word, blob and outline of block_list for
 * outlines whose bounding box has a major_overlap with box, descending a
 * level only where the bounding boxes overlap.
 *
 * @param block_list         the page blocks to search
 * @param box                box taken from the box file
 * @param block_id           set to the 1-based position of the block that
 *                           holds the returned row; 0 if no outline matched
 * @param row_id_to_process  set to the 1-based position, within its block,
 *                           of the returned row; untouched if none matched
 * @return the single row containing all matching outlines, or NULL when no
 *         outline matches or matching outlines lie in more than one row
 */
ROW *find_row_of_box(                         //
                     BLOCK_LIST *block_list,  //real blocks
                     BOX box,                 //from boxfile
                     INT16 &block_id,
                     INT16 &row_id_to_process) {
  BLOCK_IT block_it(block_list);
  BLOCK *block;
  ROW_IT row_it;
  ROW *row;
  ROW *row_to_process = NULL;
  INT16 current_block_id = 0;    //1-based index of block being scanned
  INT16 row_id;
  WERD_IT word_it;
  WERD *word;
  BOOL8 polyg;
  PBLOB_IT blob_it;
  PBLOB *blob;
  OUTLINE_IT outline_it;
  OUTLINE *outline;

  /*
    Find row to process - error if box REALLY overlaps more than one row. (I.e
    it overlaps blobs in the row - not just overlaps the bounding box of the
    whole row.)
  */

  block_id = 0;
  for (block_it.mark_cycle_pt ();
       !block_it.cycled_list (); block_it.forward ()) {
    current_block_id++;
    row_id = 0;
    block = block_it.data ();
    if (block->bounding_box ().overlap (box)) {
      row_it.set_to_list (block->row_list ());
      for (row_it.mark_cycle_pt ();
           !row_it.cycled_list (); row_it.forward ()) {
        row_id++;
        row = row_it.data ();
        if (row->bounding_box ().overlap (box)) {
          word_it.set_to_list (row->word_list ());
          for (word_it.mark_cycle_pt ();
               !word_it.cycled_list (); word_it.forward ()) {
            word = word_it.data ();
            polyg = word->flag (W_POLYGON);
            if (word->bounding_box ().overlap (box)) {
              blob_it.set_to_list (word->gblob_list ());
              for (blob_it.mark_cycle_pt ();
                   !blob_it.cycled_list (); blob_it.forward ()) {
                blob = blob_it.data ();
                if (gblob_bounding_box (blob, polyg).overlap (box)) {
                  outline_it.set_to_list (gblob_out_list (blob, polyg));
                  for (outline_it.mark_cycle_pt ();
                       !outline_it.cycled_list (); outline_it.forward ()) {
                    outline = outline_it.data ();
                    if (goutline_bounding_box (outline, polyg).
                        major_overlap (box)) {
                      if ((row_to_process == NULL) ||
                          (row_to_process == row)) {
                        row_to_process = row;
                        // Latch the block index together with the row index.
                        // The scan carries on over the remaining blocks, so
                        // reporting the running counter would hand back the
                        // total block count instead of this row's block.
                        block_id = current_block_id;
                        row_id_to_process = row_id;
                      }
                      else
                        /* RETURN ERROR Box overlaps blobs in more than one row */
                        return NULL;
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
  return row_to_process;
}


INT16 resegment_box(  //
                    ROW *row,
                    BOX box,
                    char *ch,
                    INT16 block_id,
                    INT16 row_id,
                    INT16 boxfile_lineno,
                    INT16 boxfile_charno) {
  WERD_IT word_it;
  WERD *word;
  WERD *new_word = NULL;
  BOOL8 polyg = false;
  PBLOB_IT blob_it;
  PBLOB_IT new_blob_it;
  PBLOB *blob;
  PBLOB *new_blob;
  OUTLINE_IT outline_it;
  OUTLINE_LIST dummy;            // Just to initialize new_outline_it.
  OUTLINE_IT new_outline_it = &dummy;
  OUTLINE *outline;
  BOX new_word_box;
  float word_x_centre;
  float baseline;
  INT16 error_count = 0;         //number of chars lost

  word_it.set_to_list (row->word_list ());
  for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
    word = word_it.data ();
    polyg = word->flag (W_POLYGON);
    if (word->bounding_box ().overlap (box)) {
      blob_it.set_to_list (word->gblob_list ());
      for (blob_it.mark_cycle_pt ();
           !blob_it.cycled_list (); blob_it.forward ()) {
        blob = blob_it.data ();
        if (gblob_bounding_box (blob, polyg).overlap (box)) {
          outline_it.set_to_list (gblob_out_list (blob, polyg));
          for (outline_it.mark_cycle_pt ();
               !outline_it.cycled_list (); outline_it.forward ()) {
            outline = outline_it.data ();
            if (goutline_bounding_box (outline, polyg).
                major_overlap (box)) {
              if (strlen (word->text ()) > 0) {
                if (error_count == 0) {
                  error_count = 1;
                  if (applybox_debug > 4)
                    report_failed_box (boxfile_lineno,
                      boxfile_charno,
                      box, ch,
                      "FAILURE! box overlaps blob in labelled word");
                }
                if (applybox_debug > 4)
                  tprintf ("APPLY_BOXES: ALSO ignoring corrupted char blk:%d row:%d \"%s\"\n",
                    block_id, row_id, word_it.data ()->text ());
                word_it.data ()->set_text ("");  //UN label it
                error_count++;
              }
              if (error_count == 0) {
                if (new_word == NULL) {
                  /* Make a new word with a single blob */
                  new_word = word->shallow_copy ();
                  new_word->set_text (ch);
                  if (polyg)
                    new_blob = new PBLOB;
                  else
                    new_blob = (PBLOB *) new C_BLOB;
                  new_blob_it.set_to_list (new_word->gblob_list ());
                  new_blob_it.add_to_end (new_blob);
                  new_outline_it.
                    set_to_list (gblob_out_list (new_blob, polyg));
                }
                new_outline_it.add_to_end (outline_it.extract ());  //move blob
              }
            }
          }
          //no outlines in blob
          if (outline_it.empty ())  //so delete blob
            delete blob_it.extract ();
        }
      }
      if (blob_it.empty ())      //no blobs in word
                                 //so delete word
        delete word_it.extract ();
    }
  }
  if (error_count > 0)
    return error_count;

  if (new_word != NULL) {
    gblob_sort_list (new_word->gblob_list (), polyg);
    word_it.add_to_end (new_word);
    new_word_box = new_word->bounding_box ();
    word_x_centre = (new_word_box.left () + new_word_box.right ()) / 2.0f;
    baseline = row->base_line (word_x_centre);

    if (STRING (chs_caps_ht).contains (ch[0]) &&
        (new_word_box.top () <
         baseline + (1 + applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! caps-ht char didn't ascend");
      new_word->set_text ("");
      return 1;
    }
    if (STRING (chs_odd_top).contains (ch[0]) &&
        (new_word_box.top () <
         baseline + (1 - applybox_error_band) * row->x_height ())) {
      report_failed_box (boxfile_lineno, boxfile_charno, box, ch,
        "FAILURE! Odd top char below xht");
      new_word->set_text ("");
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -