⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 output.cpp

📁 一些OCR的相关资料,希望对研究OCR的朋友有所帮助。
💻 CPP
📖 第 1 页 / 共 3 页
字号:
/******************************************************************
 * File:        output.cpp  (Formerly output.c)
 * Description: Output pass
 * Author:					Phil Cheatle
 * Created:					Thu Aug  4 10:56:08 BST 1994
 *
 * (C) Copyright 1994, Hewlett-Packard Ltd.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 **********************************************************************/

#include "mfcpch.h"
#include          "ocrshell.h"
#include          <string.h>
#include          <ctype.h>
#ifdef __UNIX__
#include          <assert.h>
#include          <unistd.h>
#include                    <errno.h>
#endif
#include          "mainblk.h"
#include          "tfacep.h"
#include          "tessvars.h"
#include          "control.h"
#include          "secname.h"
#include          "reject.h"
#include          "docqual.h"
#include          "output.h"
#include "bestfirst.h"

//EXTERN expands to nothing in this file, so the EXTERN-prefixed variable
//macros below are used without any storage-class prefix.
//NOTE(review): presumably this makes them definitions here and extern
//declarations elsewhere - confirm against output.h.
#define EXTERN

#define EPAPER_EXT      ".ep"
#define PAGE_YSIZE      3508

//Control characters (octal) placed in the ep_choice strings.
#define CTRL_INSET      '\024'   //dc4=text inset
#define CTRL_FONT       '\016'   //so=font change
#define CTRL_DEFAULT      '\017' //si=default font
#define CTRL_SHIFT      '\022'   //dc2=x shift
#define CTRL_TAB        '\011'   //tab
#define CTRL_NEWLINE      '\012' //newline
#define CTRL_HARDLINE   '\015'   //cr

//When non-zero, the writes to textfile in this file are skipped.
int NO_BLOCK = 0;                //don't output block information
INT16 XOFFSET = 0;               //the image can be a part of bigger picture and we want to have the original coordinates
INT16 YOFFSET = 0;

//Output-control variables: (name, default value, description string).
EXTERN BOOL_EVAR (tessedit_write_block_separators, FALSE,
"Write block separators in output");
EXTERN BOOL_VAR (tessedit_write_raw_output, FALSE,
"Write raw stuff to name.raw");
EXTERN BOOL_EVAR (tessedit_write_output, TRUE, "Write text to name.txt");
EXTERN BOOL_EVAR (tessedit_write_ratings, FALSE,
"Return ratings in IPEOCRAPI data");
EXTERN BOOL_EVAR (tessedit_write_txt_map, TRUE,
"Write .txt to .etx map file");
EXTERN BOOL_EVAR (tessedit_write_rep_codes, TRUE,
"Write repetition char code");
EXTERN BOOL_EVAR (tessedit_write_unlv, FALSE, "Write .unlv output file");
EXTERN STRING_EVAR (unrecognised_char, "|",
"Output char for unidentified blobs");

//Suspect/rejection tuning variables.
EXTERN INT_EVAR (suspect_level, 99, "Suspect marker level");
EXTERN INT_VAR (suspect_space_level, 100,
"Min suspect level for rejecting spaces");
EXTERN INT_VAR (suspect_short_words, 2,
"Dont Suspect dict wds longer than this");
EXTERN BOOL_VAR (suspect_constrain_1Il, FALSE,
"UNLV keep 1Il chars rejected");
EXTERN double_VAR (suspect_rating_per_ch, 999.9,
"Dont touch bad rating limit");
EXTERN double_VAR (suspect_accept_rating, -999.9, "Accept good rating limit");

EXTERN BOOL_EVAR (tessedit_minimal_rejection, FALSE,
"Only reject tess failures");
EXTERN BOOL_VAR (tessedit_zero_rejection, FALSE, "Dont reject ANYTHING");
EXTERN BOOL_VAR (tessedit_word_for_word, FALSE,
"Make output have exactly one word per WERD");
EXTERN BOOL_VAR (tessedit_zero_kelvin_rejection, FALSE,
"Dont reject ANYTHING AT ALL");
EXTERN BOOL_VAR (tessedit_consistent_reps, TRUE,
"Force all rep chars the same");

//Opened with extension ".map" by output_pass() when tessedit_write_txt_map
//is set, and closed there at the end of the pass.
FILE *txt_mapfile = NULL;        //reject map
//Opened with extension ".unlv" by output_pass() when tessedit_write_unlv
//is set; write_results() prints the output text to it.
FILE *unlv_file = NULL;          //UNLV output file

/**********************************************************************
 * pixels_to_pts
 *
 * Convert an integer number of pixels to the nearest integer
 * number of points.
**********************************************************************/INT32 pixels_to_pts(               //convert coords                    INT32 pixels,                    INT32 pix_res  //resolution                   ) {  float pts;                     //converted value  pts = pixels * 72.0 / pix_res;  return (INT32) (pts + 0.5);    //round it}void output_pass(  //Tess output pass //send to api                 PAGE_RES_IT &page_res_it,                 BOOL8 write_to_shm) {  BLOCK_RES *block_of_last_word;  INT16 block_id;  BOOL8 force_eol;               //During output  BLOCK *nextblock;              //block of next word  WERD *nextword;                //next word  if (tessedit_write_txt_map)    txt_mapfile = open_outfile (".map");  if (tessedit_write_unlv)    unlv_file = open_outfile (".unlv");  page_res_it.restart_page ();  block_of_last_word = NULL;  while (page_res_it.word () != NULL) {    check_debug_pt (page_res_it.word (), 120);    if (tessedit_write_block_separators &&    block_of_last_word != page_res_it.block ()) {      block_of_last_word = page_res_it.block ();      if (block_of_last_word->block->text_region () == NULL) {        if (block_of_last_word->block->poly_block () == NULL)          block_id = 1;        else          block_id =            ((WEIRD_BLOCK *) block_of_last_word->block->poly_block ())->            id_no();       }      else        block_id = block_of_last_word->block->text_region ()->id_no ();      if (!NO_BLOCK)        fprintf (textfile, "|^~tr%d\n", block_id);      fprintf (txt_mapfile, "|^~tr%d\n", block_id);    }    force_eol = (tessedit_write_block_separators &&      (page_res_it.block () != page_res_it.next_block ())) ||      (page_res_it.next_word () == NULL);    if (page_res_it.next_word () != NULL)      nextword = page_res_it.next_word ()->word;    else      nextword = NULL;    if (page_res_it.next_block () != NULL)      nextblock = page_res_it.next_block ()->block;    else      nextblock = NULL;                            
     //regardless of tilde crunching    write_results (page_res_it, determine_newline_type (page_res_it.word ()->word, page_res_it.block ()->block, nextword, nextblock), force_eol,      write_to_shm);    page_res_it.forward ();  }  if (write_to_shm)    ocr_send_text(FALSE);   if (tessedit_write_block_separators) {    if (!NO_BLOCK)      fprintf (textfile, "|^~tr\n");    fprintf (txt_mapfile, "|^~tr\n");  }  if (tessedit_write_txt_map) {    fprintf (txt_mapfile, "\n"); //because txt gets one    #ifdef __UNIX__    fsync (fileno (txt_mapfile));    #endif    fclose(txt_mapfile);   }}/************************************************************************* * write_results() * * All recognition and rejection has now been done. Generate the following: *   .txt file     - giving the final best choices with NO highlighting *   .raw file     - giving the tesseract top choice output for each word *   .map file     - showing how the .txt file has been rejected in the .ep file *   epchoice list - a list of one element per word, containing the text for the *                   epaper. Reject strings are inserted. *   inset list    - a list of bounding boxes of reject insets - indexed by the *                   reject strings in the epchoice text. *************************************************************************/void write_results(                           //output a word                   PAGE_RES_IT &page_res_it,  //full info                   char newline_type,         //type of newline                   BOOL8 force_eol,           //override tilde crunch?                   
BOOL8 write_to_shm         //send to api                  ) {                                 //word to do  WERD_RES *word = page_res_it.word ();  WERD_CHOICE *ep_choice;        //ep format  STRING repetition_code;  const STRING *wordstr;  const char *text;  int i;  char unrecognised = STRING (unrecognised_char)[0];  char ep_chars[32];             //Only for unlv_tilde_crunch  int ep_chars_index = 0;  char txt_chs[32];              //Only for unlv_tilde_crunch  char map_chs[32];              //Only for unlv_tilde_crunch  int txt_index = 0;  static BOOL8 tilde_crunch_written = FALSE;  static BOOL8 last_char_was_newline = TRUE;  static BOOL8 last_char_was_tilde = FALSE;  static BOOL8 empty_block = TRUE;  BOOL8 need_reject = FALSE;  char *ptr;                     //string ptr  PBLOB_IT blob_it;              //blobs  /*	if (word->best_choice->string().length() == 0)    {      tprintf("No output: to output\n");    }    else if (word->best_choice->string()[0]==' ')    {      tprintf("spaceword to output\n");    }    else if (word->best_choice->string()[0]=='\0')    {      tprintf("null to output\n");    }*/  if (word->unlv_crunch_mode != CR_NONE  && !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {    if ((word->unlv_crunch_mode != CR_DELETE) &&      (!tilde_crunch_written ||      ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&      (word->word->space () > 0) &&      !word->word->flag (W_FUZZY_NON) &&    !word->word->flag (W_FUZZY_SP)))) {      if (!word->word->flag (W_BOL) &&        (word->word->space () > 0) &&        !word->word->flag (W_FUZZY_NON) &&      !word->word->flag (W_FUZZY_SP)) {        /* Write a space to separate from preceeding good text */        txt_chs[txt_index] = ' ';        map_chs[txt_index++] = '1';        ep_chars[ep_chars_index++] = ' ';        last_char_was_tilde = FALSE;      }      need_reject = TRUE;    }    if ((need_reject && !last_char_was_tilde) || (force_eol && empty_block)) {      /* Write a reject char - mark as rejected 
unless zero_rejection mode */      last_char_was_tilde = TRUE;      txt_chs[txt_index] = unrecognised;      if (tessedit_zero_rejection || (suspect_level == 0)) {        map_chs[txt_index++] = '1';        ep_chars[ep_chars_index++] = unrecognised;      }      else {        map_chs[txt_index++] = '0';        /*           The ep_choice string is a faked reject to allow newdiff to sync the .etx           with the .txt and .map files.         */        ep_chars[ep_chars_index++] = CTRL_INSET;        //escape code                                 //dummy reject        ep_chars[ep_chars_index++] = 1;                                 //dummy reject        ep_chars[ep_chars_index++] = 1;                                 //type        ep_chars[ep_chars_index++] = 2;                                 //dummy reject        ep_chars[ep_chars_index++] = 1;                                 //dummy reject        ep_chars[ep_chars_index++] = 1;      }      tilde_crunch_written = TRUE;      last_char_was_newline = FALSE;      empty_block = FALSE;    }    if ((word->word->flag (W_EOL) && !last_char_was_newline) || force_eol) {      /* Add a new line output */      txt_chs[txt_index] = '\n';      map_chs[txt_index++] = '\n';                                 //end line      ep_chars[ep_chars_index++] = newline_type;                                 //Cos of the real newline      tilde_crunch_written = FALSE;      last_char_was_newline = TRUE;      last_char_was_tilde = FALSE;    }    txt_chs[txt_index] = '\0';    map_chs[txt_index] = '\0';                                 //xiaofan    if (tessedit_write_output && !NO_BLOCK)      fprintf (textfile, "%s", txt_chs);    if (tessedit_write_unlv)      fprintf (unlv_file, "%s", txt_chs);    if (tessedit_write_txt_map)      fprintf (txt_mapfile, "%s", map_chs);                                 //terminate string    ep_chars[ep_chars_index] = '\0';    word->ep_choice = new WERD_CHOICE (ep_chars, 0, 0, NO_PERM);    if (force_eol)      empty_block = TRUE; 
   return;  }  /* NORMAL PROCESSING of non tilde crunched words */  tilde_crunch_written = FALSE;  if (newline_type)    last_char_was_newline = TRUE;  else    last_char_was_newline = FALSE;  empty_block = force_eol;       //About to write a real word  if (unlv_tilde_crunching &&    last_char_was_tilde &&    (word->word->space () == 0) &&    !(word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) &&  (word->best_choice->string ()[0] == ' ')) {    /* Prevent adjacent tilde across words - we know that adjacent tildes within       words have been removed */    ptr = (char *) word->best_choice->string ().string ();    strcpy (ptr, ptr + 1);       //shuffle up    word->reject_map.remove_pos (0);    blob_it = word->outword->blob_list ();    delete blob_it.extract ();   //get rid of reject blob  }  if (newline_type ||    (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))    last_char_was_tilde = FALSE;  else {    if (word->reject_map.length () > 0) {      if (word->best_choice->string ()[word->reject_map.length () - 1] ==        ' ')        last_char_was_tilde = TRUE;      else        last_char_was_tilde = FALSE;    }    else if (word->word->space () > 0)      last_char_was_tilde = FALSE;    /* else it is unchanged as there are no output chars */  }  ptr = (char *) word->best_choice->string ().string ();  ASSERT_HOST (strlen (ptr) == word->reject_map.length ());  if (word->word->flag (W_REP_CHAR) && tessedit_consistent_reps)    ensure_rep_chars_are_consistent(word);   set_unlv_suspects(word);   check_debug_pt (word, 120);  if (tessedit_rejection_debug) {    tprintf ("Dict word: \"%s\": %d\n",      word->best_choice->string ().string (),      dict_word (word->best_choice->string ().string ()));  }  if (tessedit_write_unlv) {    write_unlv_text(word);   }  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {    repetition_code = "|^~R";    repetition_code += get_rep_char (word);    wordstr = &repetition_code;  }  else {    wordstr = 
&(word->best_choice->string ());    if (tessedit_zero_rejection) {      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */      text = wordstr->string ();      for (i = 0; text[i] != '\0'; i++) {        if (word->reject_map[i].rejected ())          word->reject_map[i].setrej_minimal_rej_accept ();      }    }    if (tessedit_minimal_rejection) {      /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */      text = wordstr->string ();      for (i = 0; text[i] != '\0'; i++) {        if ((text[i] != ' ') && word->reject_map[i].rejected ())          word->reject_map[i].setrej_minimal_rej_accept ();      }    }  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -