📄 output.cpp
字号:
/******************************************************************
 * File:        output.cpp  (Formerly output.c)
 * Description: Output pass
 * Author:      Phil Cheatle
 * Created:     Thu Aug  4 10:56:08 BST 1994
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#include "ocrshell.h"
#include <string.h>
#include <ctype.h>
#ifdef __UNIX__
#include <assert.h>
#include <unistd.h>
#include <errno.h>
#endif
#include "mainblk.h"
#include "tfacep.h"
#include "tessvars.h"
#include "control.h"
#include "secname.h"
#include "reject.h"
#include "docqual.h"
#include "output.h"
#include "bestfirst.h"

#define EXTERN

#define EPAPER_EXT      ".ep"
#define PAGE_YSIZE      3508
#define CTRL_INSET      '\024'   //dc4=text inset
#define CTRL_FONT       '\016'   //so=font change
#define CTRL_DEFAULT    '\017'   //si=default font
#define CTRL_SHIFT      '\022'   //dc2=x shift
#define CTRL_TAB        '\011'   //tab
#define CTRL_NEWLINE    '\012'   //newline
#define CTRL_HARDLINE   '\015'   //cr

int NO_BLOCK = 0;                //don't output block information
//the image can be a part of bigger picture and we want to have the
//original coordinates
INT16 XOFFSET = 0;
INT16 YOFFSET = 0;

EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
"Write block separators in output");
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
"Write raw stuff to name.raw");
EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
"Return ratings in IPEOCRAPI data");
EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
"Write .txt to .etx map file");
EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
"Write repetition char code");
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
EXTERN STRING_EVAR (unrecognised_char, "|",
"Output char for unidentified blobs");
EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
EXTERN INT_VAR (suspect_space_level, 100,
"Min suspect level for rejecting spaces");
EXTERN INT_VAR (suspect_short_words, 2,
"Dont Suspect dict wds longer than this");
EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
"UNLV keep 1Il chars rejected");
EXTERN double_VAR (suspect_rating_per_ch, 999.9,
"Dont touch bad rating limit");
EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");

EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
"Only reject tess failures");
EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
"Make output have exactly one word per WERD");
EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
"Dont reject ANYTHING AT ALL");
EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
"Force all rep chars the same");

FILE *txt_mapfile = NULL;        //reject map (".map" output)
FILE *unlv_file = NULL;          //UNLV text (".unlv" output)

/**********************************************************************
 * pixels_to_pts
 *
 * Convert an integer number of pixels to the nearest integer
 * number of points.
**********************************************************************/INT32 pixels_to_pts( //convert coords INT32 pixels, INT32 pix_res //resolution ) { float pts; //converted value pts = pixels * 72.0 / pix_res; return (INT32) (pts + 0.5); //round it}void output_pass( //Tess output pass //send to api PAGE_RES_IT &page_res_it, BOOL8 write_to_shm) { BLOCK_RES *block_of_last_word; INT16 block_id; BOOL8 force_eol; //During output BLOCK *nextblock; //block of next word WERD *nextword; //next word if (tessedit_write_txt_map) txt_mapfile = open_outfile (".map"); if (tessedit_write_unlv) unlv_file = open_outfile (".unlv"); page_res_it.restart_page (); block_of_last_word = NULL; while (page_res_it.word () != NULL) { check_debug_pt (page_res_it.word (), 120); if (tessedit_write_block_separators && block_of_last_word != page_res_it.block ()) { block_of_last_word = page_res_it.block (); if (block_of_last_word->block->text_region () == NULL) { if (block_of_last_word->block->poly_block () == NULL) block_id = 1; else block_id = ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())-> id_no(); } else block_id = block_of_last_word->block->text_region ()->id_no (); if (!NO_BLOCK) fprintf (textfile, "|^~tr%d\n", block_id); fprintf (txt_mapfile, "|^~tr%d\n", block_id); } force_eol = (tessedit_write_block_separators && (page_res_it.block () != page_res_it.next_block ())) || (page_res_it.next_word () == NULL); if (page_res_it.next_word () != NULL) nextword = page_res_it.next_word ()->word; else nextword = NULL; if (page_res_it.next_block () != NULL) nextblock = page_res_it.next_block ()->block; else nextblock = NULL; //regardless of tilde crunching write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol, write_to_shm); page_res_it.forward (); } if (write_to_shm) ocr_send_text(FALSE); if (tessedit_write_block_separators) { if (!NO_BLOCK) fprintf (textfile, "|^~tr\n"); fprintf (txt_mapfile, 
"|^~tr\n"); } if (tessedit_write_txt_map) { fprintf (txt_mapfile, "\n"); //because txt gets one #ifdef __UNIX__ fsync (fileno (txt_mapfile)); #endif fclose(txt_mapfile); }}/************************************************************************* * write_results() * * All recognition and rejection has now been done. Generate the following: * .txt file - giving the final best choices with NO highlighting * .raw file - giving the tesseract top choice output for each word * .map file - showing how the .txt file has been rejected in the .ep file * epchoice list - a list of one element per word, containing the text for the * epaper. Reject strings are inserted. * inset list - a list of bounding boxes of reject insets - indexed by the * reject strings in the epchoice text. *************************************************************************/void write_results( //output a word PAGE_RES_IT &page_res_it, //full info char newline_type, //type of newline BOOL8 force_eol, //override tilde crunch? 
BOOL8 write_to_shm //send to api ) { //word to do WERD_RES *word = page_res_it.word (); WERD_CHOICE *ep_choice; //ep format STRING repetition_code; const STRING *wordstr; const char *text; int i; char unrecognised = STRING (unrecognised_char)[0]; char ep_chars[32]; //Only for unlv_tilde_crunch int ep_chars_index = 0; char txt_chs[32]; //Only for unlv_tilde_crunch char map_chs[32]; //Only for unlv_tilde_crunch int txt_index = 0; static BOOL8 tilde_crunch_written = FALSE; static BOOL8 last_char_was_newline = TRUE; static BOOL8 last_char_was_tilde = FALSE; static BOOL8 empty_block = TRUE; BOOL8 need_reject = FALSE; char *ptr; //string ptr PBLOB_IT blob_it; //blobs /* if (word->best_choice->string().length() == 0) { tprintf("No output: to output\n"); } else if (word->best_choice->string()[0]==' ') { tprintf("spaceword to output\n"); } else if (word->best_choice->string()[0]=='\0') { tprintf("null to output\n"); }*/ if (word->unlv_crunch_mode != CR_NONE && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) { if ((word->unlv_crunch_mode != CR_DELETE) && (!tilde_crunch_written || ((word->unlv_crunch_mode == CR_KEEP_SPACE) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)))) { if (!word->word->flag (W_BOL) && (word->word->space () > 0) && !word->word->flag (W_FUZZY_NON) && !word->word->flag (W_FUZZY_SP)) { /* Write a space to separate from preceeding good text */ txt_chs[txt_index] = ' '; map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = ' '; last_char_was_tilde = FALSE; } need_reject = TRUE; } if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) { /* Write a reject char - mark as rejected unless zero_rejection mode */ last_char_was_tilde = TRUE; txt_chs[txt_index] = unrecognised; if (tessedit_zero_rejection || (suspect_level == 0)) { map_chs[txt_index++] = '1'; ep_chars[ep_chars_index++] = unrecognised; } else { map_chs[txt_index++] = '0'; /* The ep_choice string is a faked reject to 
allow newdiff to sync the .etx with the .txt and .map files. */ ep_chars[ep_chars_index++] = CTRL_INSET; //escape code //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; //type ep_chars[ep_chars_index++] = 2; //dummy reject ep_chars[ep_chars_index++] = 1; //dummy reject ep_chars[ep_chars_index++] = 1; } tilde_crunch_written = TRUE; last_char_was_newline = FALSE; empty_block = FALSE; } if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) { /* Add a new line output */ txt_chs[txt_index] = '\n'; map_chs[txt_index++] = '\n'; //end line ep_chars[ep_chars_index++] = newline_type; //Cos of the real newline tilde_crunch_written = FALSE; last_char_was_newline = TRUE; last_char_was_tilde = FALSE; } txt_chs[txt_index] = '\0'; map_chs[txt_index] = '\0'; //xiaofan if (tessedit_write_output && !NO_BLOCK) fprintf (textfile, "%s", txt_chs); if (tessedit_write_unlv) fprintf (unlv_file, "%s", txt_chs); if (tessedit_write_txt_map) fprintf (txt_mapfile, "%s", map_chs); //terminate string ep_chars[ep_chars_index] = '\0'; word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM); if (force_eol) empty_block = TRUE; return; } /* NORMAL PROCESSING of non tilde crunched words */ tilde_crunch_written = FALSE; if (newline_type) last_char_was_newline = TRUE; else last_char_was_newline = FALSE; empty_block = force_eol; //About to write a real word if (unlv_tilde_crunching && last_char_was_tilde && (word->word->space () == 0) && !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) && (word->best_choice->string ()[0] == ' ')) { /* Prevent adjacent tilde across words - we know that adjacent tildes within words have been removed */ ptr = (char *) word->best_choice->string ().string (); strcpy (ptr, ptr + 1); //shuffle up word->reject_map.remove_pos (0); blob_it = word->outword->blob_list (); delete blob_it.extract (); //get rid of reject blob } if (newline_type || (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) 
last_char_was_tilde = FALSE; else { if (word->reject_map.length () > 0) { if (word->best_choice->string ()[word->reject_map.length () - 1] == ' ') last_char_was_tilde = TRUE; else last_char_was_tilde = FALSE; } else if (word->word->space () > 0) last_char_was_tilde = FALSE; /* else it is unchanged as there are no output chars */ } ptr = (char *) word->best_choice->string ().string (); ASSERT_HOST (strlen (ptr) == word->reject_map.length ()); if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps) ensure_rep_chars_are_consistent(word); set_unlv_suspects(word); check_debug_pt (word, 120); if (tessedit_rejection_debug) { tprintf ("Dict word: \"%s\": %d\n", word->best_choice->string ().string (), dict_word (word->best_choice->string ().string ())); } if (tessedit_write_unlv) { write_unlv_text(word); } if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) { repetition_code = "|^~R"; repetition_code += get_rep_char (word); wordstr = &repetition_code; } else { wordstr = &(word->best_choice->string ()); if (tessedit_zero_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ text = wordstr->string (); for (i = 0; text[i] != '\0'; i++) { if (word->reject_map[i].rejected ()) word->reject_map[i].setrej_minimal_rej_accept (); } } if (tessedit_minimal_rejection) { /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */ text = wordstr->string (); for (i = 0; text[i] != '\0'; i++) { if ((text[i] != ' ') && word->reject_map[i].rejected ()) word->reject_map[i].setrej_minimal_rej_accept (); } } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -