applybox.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 892 行 · 第 1/3 页
CPP
892 行
/**********************************************************************
 * File:        applybox.cpp  (Formerly applybox.c)
 * Description: Re segment rows according to box file data
 * Author:      Phil Cheatle
 * Created:     Wed Nov 24 09:11:23 GMT 1993
 *
 * (C) Copyright 1993, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

/*
define SECURE_NAMES for code versions which go to UNLV to stop tessedit
including all the newdiff stuff (which contains lots of text indicating
what measures we are interested in.
*/
/* #define SECURE_NAMES done in secnames.h when necessary*/

#include "mfcpch.h"
#include "applybox.h"
#include <ctype.h>
#include <string.h>
#ifdef __UNIX__
#include <assert.h>
#include <errno.h>
#endif
#include "boxread.h"
#include "mainblk.h"
#include "genblob.h"
#include "fixxht.h"
#include "control.h"
#include "tessbox.h"
#include "globals.h"
#include "secname.h"
#include "unichar.h"
#include "matchdefs.h"

#define SECURE_NAMES
#ifndef SECURE_NAMES
#include "wordstats.h"
#endif

#define EXTERN
EXTERN BOOL_VAR (applybox_rebalance, TRUE, "Drop dead");
EXTERN INT_VAR (applybox_debug, 5, "Debug level");
EXTERN INT_VAR (applybox_page, 0, "Page number to apply boxes from");
EXTERN STRING_VAR (applybox_test_exclusions, "",
                   "Chars ignored for testing");
EXTERN double_VAR (applybox_error_band, 0.15, "Err band as fract of xht");

// The unicharset used during box training
static UNICHARSET unicharset_boxes;

/**
 * Debug helper: prints str followed by ':' and then, for every step of
 * the string, the value returned by UNICHAR::first_uni() in hex inside
 * square brackets, terminated by a newline.
 *
 * @param str NUL-terminated string to dump; walked in increments given by
 *            UNICHAR::utf8_step().
 */
static void PrintString(const char* str) {
  tprintf("%s:", str);
  int step = 0;
  for (int i = 0; str[i]; i += step) {
    step = UNICHAR::utf8_step(str + i);
    // utf8_step() reported 0: advance a single byte so the loop always
    // makes progress instead of spinning on the same position.
    if (step == 0)
      step = 1;
    UNICHAR ch(str + i, step);
    tprintf("[%x]", ch.first_uni());
  }
  // The format has no conversion, so no extra argument is passed.
  tprintf("\n");
}

/*************************************************************************
 * The code re-assigns outlines to form words each with ONE labelled blob.
 * Noise is left in UNLABELLED words. The chars on the page are checked crudely
 * for sensible position relative to baseline and xht. Failed boxes are
 * compensated for by duplicating other believable instances of the character.
 *
 * The box file is assumed to contain box definitions, one per line, of the
 * following format:
 *   <Char> <left> <bottom> <right> <top> ... arbitrary trailing fields unused
 *
 * The approach taken is to search the WHOLE page for stuff overlapping each box.
 *  - This is not too inefficient and is SAFE.
 *    - We can detect overlapping blobs as we will be attempting to put a blob
 *      from a LABELLED word into the current word.
 *    - When all the boxes have been processed we can detect any stuff which is
 *      being ignored - it is the unlabelled words left on the page.
 *
 * A box should only overlap one row.
 *
 * A warning is given if the box is on the same row as the previous box, but NOT
 * on the same row as the previous blob.
 *
 * Any OUTLINE which overlaps the box is put into the new word.
 *
 * ascender chars must ascend above xht significantly
 * xht chars must not rise above row xht significantly
 * bl chars must not descend below baseline significantly
 * descender chars must descend below baseline significantly
 *
 * ?? Certain chars are DROPPED - to limit the training data.
* *************************************************************************/void apply_boxes(BLOCK_LIST *block_list //real blocks ) { inT16 boxfile_lineno = 0; inT16 boxfile_charno = 0; TBOX box; //boxfile box UNICHAR_ID uch_id; //correct ch from boxfile ROW *row; ROW *prev_row = NULL; inT16 prev_box_right = MAX_INT16; inT16 block_id; inT16 row_id; inT16 box_count = 0; inT16 box_failures = 0; inT16 labels_ok; inT16 rows_ok; inT16 bad_blobs; inT16 tgt_char_counts[MAX_NUM_CLASSES]; //No. of box samples // inT16 labelled_char_counts[128]; //No. of unique labelled samples inT16 i; inT16 rebalance_count = 0; UNICHAR_ID min_uch_id; inT16 min_samples; inT16 final_labelled_blob_count; // Clean the unichar set unicharset_boxes.clear(); // Space character needed to represent NIL classification unicharset_boxes.unichar_insert(" "); for (i = 0; i < MAX_NUM_CLASSES; i++) tgt_char_counts[i] = 0; FILE* box_file; STRING filename = imagefile; filename += ".box"; if (!(box_file = fopen (filename.string(), "r"))) { CANTOPENFILE.error ("read_next_box", EXIT, "Cant open box file %s %d", filename.string(), errno); } clear_any_old_text(block_list); while (read_next_box(applybox_page, box_file, &box, &uch_id)) { box_count++; tgt_char_counts[uch_id]++; row = find_row_of_box (block_list, box, block_id, row_id); if (box.left () < prev_box_right) { boxfile_lineno++; boxfile_charno = 1; } else boxfile_charno++; if (row == NULL) { box_failures++; report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "FAILURE! box overlaps no blobs or blobs in multiple rows"); } else { if ((box.left () >= prev_box_right) && (row != prev_row)) report_failed_box (boxfile_lineno, boxfile_charno, box, unicharset_boxes.id_to_unichar(uch_id), "WARNING! 
false row break"); box_failures += resegment_box (row, box, uch_id, block_id, row_id, boxfile_lineno, boxfile_charno); prev_row = row; } prev_box_right = box.right (); } tidy_up(block_list, labels_ok, rows_ok, bad_blobs, tgt_char_counts, rebalance_count, &min_uch_id, min_samples, final_labelled_blob_count); tprintf ("APPLY_BOXES:\n"); tprintf (" Boxes read from boxfile: %6d\n", box_count); tprintf (" Initially labelled blobs: %6d in %d rows\n", labels_ok, rows_ok); tprintf (" Box failures detected: %6d\n", box_failures); tprintf (" Duped blobs for rebalance:%6d\n", rebalance_count); tprintf (" \"%s\" has fewest samples:%6d\n", unicharset_boxes.id_to_unichar(min_uch_id), min_samples); tprintf (" Total unlabelled words: %6d\n", bad_blobs); tprintf (" Final labelled words: %6d\n", final_labelled_blob_count);}void clear_any_old_text( //remove correct text BLOCK_LIST *block_list //real blocks ) { BLOCK_IT block_it(block_list); ROW_IT row_it; WERD_IT word_it; for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { row_it.set_to_list (block_it.data ()->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { word_it.set_to_list (row_it.data ()->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word_it.data ()->set_text (""); } } }}BOOL8 read_next_box(int page, FILE* box_file, // TBOX *box, UNICHAR_ID *uch_id) { int x_min; int y_min; int x_max; int y_max; char uch[kBoxReadBufSize]; while (read_next_box(page, box_file, uch, &x_min, &y_min, &x_max, &y_max)) { if (!unicharset_boxes.contains_unichar(uch)) { unicharset_boxes.unichar_insert(uch); if (unicharset_boxes.size() > MAX_NUM_CLASSES) { tprintf("Error: Size of unicharset of boxes is " "greater than MAX_NUM_CLASSES (%d)\n", MAX_NUM_CLASSES); exit(1); } } *uch_id = unicharset_boxes.unichar_to_id(uch); *box = TBOX (ICOORD (x_min, y_min), ICOORD (x_max, y_max)); return TRUE; //read a box ok } return FALSE; //EOF}ROW 
*find_row_of_box( // BLOCK_LIST *block_list, //real blocks TBOX box, //from boxfile inT16 &block_id, inT16 &row_id_to_process) { BLOCK_IT block_it(block_list); BLOCK *block; ROW_IT row_it; ROW *row; ROW *row_to_process = NULL; inT16 row_id; WERD_IT word_it; WERD *word; BOOL8 polyg; PBLOB_IT blob_it; PBLOB *blob; OUTLINE_IT outline_it; OUTLINE *outline; /* Find row to process - error if box REALLY overlaps more than one row. (I.e it overlaps blobs in the row - not just overlaps the bounding box of the whole row.) */ block_id = 0; for (block_it.mark_cycle_pt (); !block_it.cycled_list (); block_it.forward ()) { block_id++; row_id = 0; block = block_it.data (); if (block->bounding_box ().overlap (box)) { row_it.set_to_list (block->row_list ()); for (row_it.mark_cycle_pt (); !row_it.cycled_list (); row_it.forward ()) { row_id++; row = row_it.data (); if (row->bounding_box ().overlap (box)) { word_it.set_to_list (row->word_list ()); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); polyg = word->flag (W_POLYGON); if (word->bounding_box ().overlap (box)) { blob_it.set_to_list (word->gblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data ();
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?