output.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 1,274 行 · 第 1/3 页

CPP
1,274
字号
                         page_image.get_ysize () - 1 - blob_box.bottom (),                         font, (uinT8) rating,                         ptsize,                //point size                         blanks, enhancement,   //enhancement                         OCR_CDIR_LEFT_RIGHT,                         OCR_LDIR_DOWN_RIGHT,                         lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);        } else {          for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset)            ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]),                             blob_box.left (), blob_box.right (),                             page_image.get_ysize () - 1 - blob_box.top (),                             page_image.get_ysize () - 1 - blob_box.bottom (),                             font, (uinT8) rating,                             ptsize,                //point size                             blanks, enhancement,   //enhancement                             OCR_CDIR_LEFT_RIGHT,                             OCR_LDIR_DOWN_RIGHT,                             lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);        }        blanks = 0;      }    }  }  else if (tessedit_word_for_word) {    blanks = word->word->space ();    if (blanks == 0 && !word->word->flag (W_BOL))      blanks = 1;    blob_box = word->word->bounding_box ();    enhancement = 0;    if (word->italic > 0)      enhancement |= EUC_ITALIC;    if (word->bold > 0)      enhancement |= EUC_BOLD;    rating = 100;    if (word->font1_count > 2)      font = word->font1;    else if (row->font1_count > 8)      font = row->font1;    else                                 //font index      font = word->word->flag (W_DONT_CHOP) ? 
0 : 1;
    // NOTE(review): the lines down to the closing braces below are the tail of
    // a definition whose beginning lies above this excerpt; they are kept
    // token-for-token.

    lineend = word->word->flag (W_EOL);

                                 //font index
    ocr_append_char (unrecognised,
                     blob_box.left (), blob_box.right (),
                     page_image.get_ysize () - 1 - blob_box.top (),
                     page_image.get_ysize () - 1 - blob_box.bottom (),
                     font,
                     rating,                    //confidence
                     ptsize,                    //point size
                     blanks, enhancement,       //enhancement
                     OCR_CDIR_LEFT_RIGHT,
                     OCR_LDIR_DOWN_RIGHT,
                     lineend ? OCR_NL_NEWLINE : OCR_NL_NONE);
  }
}


/**********************************************************************
 * write_map
 *
 * Write a map file of 0's and 1's which associates characters from the .txt
 * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char
 * is kept.  Note that there may be reject regions in the .etx file WITHOUT
 * .txt chars being rejected.  The map file should be the same length, and
 * the same number of lines as the .txt file
 *
 * The parameterised input is because I thought I might be able to generate
 * multiple map files in a single run.  However, it didn't work because
 * newdiff needs etx files!
 *
 * NOTE: the whole function is currently compiled out by the "#if 0" below.
 **********************************************************************/

#if 0
void write_map(                //output a map file
               FILE *mapfile,  //mapfile to write to
               WERD_RES *word) {
  inT16 index;
  int status;
  STRING mapstr = "";

  if (word->best_choice->string ().length () > 0) {
    // One map digit per blank preceding the word.
    for (index = 0; index < word->word->space (); index++) {
      if (word->reject_spaces &&
        (suspect_level >= suspect_space_level) &&
        !tessedit_minimal_rejection && !tessedit_zero_rejection)
        /* Write rejected spaces to .map file ONLY. Newdiff converts these back to
        accepted spaces AFTER generating basic space stats but BEFORE using .etx */
        status = fprintf (mapfile, "0");
      else
        status = fprintf (mapfile, "1");
      if (status < 0)
        WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno);
    }

    if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) {
      // Repeated-character words always contribute exactly five '1' digits.
      for (index = 0; index < 5; index++)
        mapstr += '1';
    }
    else {
      ASSERT_HOST (word->reject_map.length () ==
        word->best_choice->string ().length ());

      // One digit per character: '1' if accepted, '0' if rejected.
      for (index = 0; index < word->reject_map.length (); index++) {
        if (word->reject_map[index].accepted ())
          mapstr += '1';
        else
          mapstr += '0';
      }
    }
    status = fprintf (mapfile, "%s", mapstr.string ());
    if (status < 0)
      WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno);
  }
  if (word->word->flag (W_EOL)) {
    status = fprintf (mapfile, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno);
  }
  status = fflush (mapfile);
  if (status != 0)
    WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);
}
#endif

/*************************************************************************
 * open_outfile()
 *
 * Open an output file for writing. The file name is the global
 * imagebasename with the given extension appended. If fopen fails,
 * CANTOPENFILE.error is raised with the file name and errno.
 * NOTE(review): the EXIT action presumably terminates the program, so the
 * NULL return would never reach the caller - confirm against the error class.
 *
 * extension: suffix appended to imagebasename.
 * Returns the stream opened in "w" mode.
 *************************************************************************/

FILE *open_outfile(  //open .map & .unlv file
                   const char *extension) {
  STRING file_name;
  FILE *outfile;

  file_name = imagebasename + extension;
  if (!(outfile = fopen (file_name.string (), "w"))) {
    CANTOPENFILE.error ("open_outfile", EXIT, "%s %d",
      file_name.string (), errno);
  }
  return outfile;
}


// write_unlv_text: writes one word to unlv_file, preceded by its blanks and
// followed by a newline at end of line. Currently compiled out by "#if 0".
#if 0
void write_unlv_text(WERD_RES *word) {
  const char *wordstr;

  // NOTE(review): the copy loop below can emit up to two bytes per input
  // byte and never checks j against the size of buff.
  char buff[512];                //string to output
  int i = 0;
  int j = 0;
  char unrecognised = STRING (unrecognised_char)[0];
  int status;
  char space_str[3];

  wordstr = word->best_choice->string ().string ();

  /* DONT need to do anything special for repeated char words - at this stage
  the repetition char has been identified and any other chars have been
  rejected.
  */

  for (; wordstr[i] != '\0'; i++) {
    // Space, '~', '^' and '|' are replaced by the "unrecognised" character.
    if ((wordstr[i] == ' ') ||
      (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|'))
      buff[j++] = unrecognised;
    else {
      if (word->reject_map[i].rejected ())
        buff[j++] = '^';         //Add suspect marker
      buff[j++] = wordstr[i];
    }
  }
  buff[j] = '\0';

  if (strlen (wordstr) > 0) {
    if (word->reject_spaces &&
      (suspect_level >= suspect_space_level) &&
      !tessedit_minimal_rejection && !tessedit_zero_rejection)
      strcpy (space_str, "^ ");  //Suspect space
    else
      strcpy (space_str, " ");   //Certain space

    for (i = 0; i < word->word->space (); i++) {
      status = fprintf (unlv_file, "%s", space_str);
      if (status < 0)
        WRITEFAILED.error ("write_unlv_text", EXIT,
          "Space Errno: %d", errno);
    }

    status = fprintf (unlv_file, "%s", buff);
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno);
  }
  if (word->word->flag (W_EOL)) {
    status = fprintf (unlv_file, "\n");
    if (status < 0)
      WRITEFAILED.error ("write_unlv_text", EXIT,
        "Newline Errno: %d", errno);
  }
  status = fflush (unlv_file);
  if (status != 0)
    WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);
}
#endif

/*************************************************************************
 * get_rep_char()
 * Return the first accepted character from the repetition string. This is the
 * character which is repeated - as determined earlier by fix_rep_char()
 *
 * If every entry of the reject map is rejected (or the map is empty), the id
 * of unrecognised_char is returned instead.
 *************************************************************************/
UNICHAR_ID get_rep_char(WERD_RES *word) {  // what char is repeated?
  int i;
  int offset;

  // Skip the leading rejected characters; offset accumulates their byte
  // lengths so that it indexes the first non-rejected character in the string.
  for (i = 0, offset = 0;
    ((i < word->reject_map.length ()) &&
    (word->reject_map[i].rejected ()));
       offset += word->best_choice->lengths()[i++]);
  if (i < word->reject_map.length ())
    return unicharset.unichar_to_id(word->best_choice->string().string()
                                    + offset,
                                    word->best_choice->lengths()[i]);
  else
    return unicharset.unichar_to_id(unrecognised_char.string());
}

// ensure_rep_chars_are_consistent: both candidate bodies below are disabled
// by "#if 0", so as compiled this function does nothing.
void ensure_rep_chars_are_consistent(WERD_RES *word) {
#if 0
  // Older single-byte variant: overwrite every byte with the repeated char.
  char rep_char = get_rep_char (word);
  char *ptr;

  ptr = (char *) word->best_choice->string ().string ();
  for (; *ptr != '\0'; ptr++) {
    if (*ptr != rep_char)
      *ptr = rep_char;
  }
#endif

#if 0
  // Unichar-based variant: rebuild the string and its lengths from rep_char.
  UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate
  int i;
  char *ptr;
  STRING consistent_string;
  STRING consistent_string_lengths;

  ptr = (char *) word->best_choice->string ().string ();
  for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) {
    consistent_string += unicharset.id_to_unichar(rep_char);
    consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char));
  }
  word->best_choice->string() = consistent_string;
  word->best_choice->lengths() = consistent_string_lengths;
#endif
}

/*************************************************************************
 * SUSPECT LEVELS
 *
 * 0 - dont reject ANYTHING
 * 1,2 - partial rejection
 * 3 - BEST
 *
 * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
 * tessedit_minimal_rejection.
*************************************************************************/void set_unlv_suspects(WERD_RES *word) {  int len = word->reject_map.length ();  int i;  int offset;  const char *ptr;  const char *lengths = word->best_choice->lengths ().string ();  float rating_per_ch;  ptr = word->best_choice->string ().string ();  if (suspect_level == 0) {    for (i = 0; i < len; i++) {      if (word->reject_map[i].rejected ())        word->reject_map[i].setrej_minimal_rej_accept ();    }    return;  }  if (suspect_level >= 3)    return;                      //Use defaults  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/  if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) >                               suspect_short_words)) {    /* Unreject alphas in dictionary words */    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {      if (word->reject_map[i].rejected () &&          unicharset.get_isalpha (ptr + offset, lengths[i]))        word->reject_map[i].setrej_minimal_rej_accept ();    }  }  rating_per_ch = word->best_choice->rating () / word->reject_map.length ();  if (rating_per_ch >= suspect_rating_per_ch)    return;                      //Dont touch bad ratings  if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {    /* Unreject any Tess Acceptable word - but NOT tess reject chs*/    for (i = 0, offset = 0; i < len; offset += lengths[i++]) {      if (word->reject_map[i].rejected () && (ptr[offset] != ' '))        word->reject_map[i].setrej_minimal_rej_accept ();    }  }  for (i = 0; i < len; i++) {    if (word->reject_map[i].rejected ()) {      if (word->reject_map[i].flag (R_DOC_REJ))        word->reject_map[i].setrej_minimal_rej_accept ();      if (word->reject_map[i].flag (R_BLOCK_REJ))        word->reject_map[i].setrej_minimal_rej_accept ();      if (word->reject_map[i].flag (R_ROW_REJ))        word->reject_map[i].setrej_minimal_rej_accept ();    }  }  if (suspect_level == 2)    return;  if (!suspect_constrain_1Il ||  
(word->reject_map.length () <= suspect_short_words)) {    for (i = 0; i < len; i++) {      if (word->reject_map[i].rejected ()) {        if ((word->reject_map[i].flag (R_1IL_CONFLICT) ||          word->reject_map[i].flag (R_POSTNN_1IL)))          word->reject_map[i].setrej_minimal_rej_accept ();        if (!suspect_constrain_1Il &&          word->reject_map[i].flag (R_MM_REJECT))          word->reject_map[i].setrej_minimal_rej_accept ();      }    }  }  if ((acceptable_word_string (word->best_choice->string ().string (),                               word->best_choice->lengths ().string ())    != AC_UNACCEPTABLE) ||  acceptable_number_string (word->best_choice->string ().string (),                            word->best_choice->lengths ().string ())) {    if (word->reject_map.length () > suspect_short_words) {      for (i = 0; i < len; i++) {        if (word->reject_map[i].rejected () &&          (!word->reject_map[i].perm_rejected () ||          word->reject_map[i].flag (R_1IL_CONFLICT) ||          word->reject_map[i].flag (R_POSTNN_1IL) ||        word->reject_map[i].flag (R_MM_REJECT))) {          word->reject_map[i].setrej_minimal_rej_accept ();        }      }    }  }}inT16 count_alphas(  //how many alphas                   const char *s,                   const char *lengths) {  int count = 0;  for (; *s != '\0'; s += *(lengths++)) {    if (unicharset.get_isalpha(s, *lengths))      count++;  }  return count;}inT16 count_alphanums(  //how many alphanums                      const char *s,                      const char *lengths) {  int count = 0;  for (; *s != '\0'; s += *(lengths++)) {    if (unicharset.get_isalpha(s, *lengths) ||        unicharset.get_isdigit(s, *lengths))      count++;  }  return count;}BOOL8 acceptable_number_string(const char *s,                               const char *lengths) {  BOOL8 prev_digit = FALSE;  if (*lengths == 1 && *s == '(')    s++;  if (*lengths == 1 &&      ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))    
s++;  for (; *s != '\0'; s += *(lengths++)) {    if (unicharset.get_isdigit (s, *lengths))      prev_digit = TRUE;    else if (prev_digit &&             (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))      prev_digit = FALSE;    else if (prev_digit && *lengths == 1 &&             (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))      return TRUE;    else if (prev_digit &&             *lengths == 1 && (*s == '%') &&             (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&             (*(s + *lengths + *(lengths + 1)) == '\0'))      return TRUE;    else      return FALSE;  }  return TRUE;}

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?