output.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 1,274 行 · 第 1/3 页
CPP
1,274 行
page_image.get_ysize () - 1 - blob_box.bottom (), font, (uinT8) rating, ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); } else { for (int suboffset = 0; suboffset < text_lengths[index]; ++suboffset) ocr_append_char (static_cast<unsigned char>(text[offset+suboffset]), blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (uinT8) rating, ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); } blanks = 0; } } } else if (tessedit_word_for_word) { blanks = word->word->space (); if (blanks == 0 && !word->word->flag (W_BOL)) blanks = 1; blob_box = word->word->bounding_box (); enhancement = 0; if (word->italic > 0) enhancement |= EUC_ITALIC; if (word->bold > 0) enhancement |= EUC_BOLD; rating = 100; if (word->font1_count > 2) font = word->font1; else if (row->font1_count > 8) font = row->font1; else //font index font = word->word->flag (W_DONT_CHOP) ? 0 : 1; lineend = word->word->flag (W_EOL); //font index ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, rating, //confidence ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); }}/********************************************************************** * write_map * * Write a map file of 0's and 1'a which associates characters from the .txt * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char * is kept. Note that there may be reject regions in the .etx file WITHOUT * .txt chars being rejected. The map file should be the same length, and * the same number of lines as the .txt file * * The paramaterised input is because I thought I might be able to generate * multiple map files in a single run. However, it didn't work because * newdiff needs etx files! **********************************************************************/#if 0void write_map( //output a map file FILE *mapfile, //mapfile to write to WERD_RES *word) { inT16 index; int status; STRING mapstr = ""; if (word->best_choice->string ().length () > 0) { for (index = 0; index < word->word->space (); index++) { if (word->reject_spaces && (suspect_level >= suspect_space_level) && !tessedit_minimal_rejection && !tessedit_zero_rejection) /* Write rejected spaces to .map file ONLY. Newdiff converts these back to accepted spaces AFTER generating basic space stats but BEFORE using .etx */ status = fprintf (mapfile, "0"); else status = fprintf (mapfile, "1"); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno); } if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) { for (index = 0; index < 5; index++) mapstr += '1'; } else { ASSERT_HOST (word->reject_map.length () == word->best_choice->string ().length ()); for (index = 0; index < word->reject_map.length (); index++) { if (word->reject_map[index].accepted ()) mapstr += '1'; else mapstr += '0'; } } status = fprintf (mapfile, "%s", mapstr.string ()); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno); } if (word->word->flag (W_EOL)) { status = fprintf (mapfile, "\n"); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno); } status = fflush (mapfile); if (status != 0) WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);}#endif/************************************************************************* * open_file() *************************************************************************/FILE *open_outfile( //open .map & .unlv file const char *extension) { STRING file_name; FILE *outfile; file_name = imagebasename + extension; if (!(outfile = fopen (file_name.string (), "w"))) { CANTOPENFILE.error ("open_outfile", EXIT, "%s %d", file_name.string (), errno); } return outfile;}#if 0void write_unlv_text(WERD_RES *word) { const char *wordstr; char buff[512]; //string to output int i = 0; int j = 0; char unrecognised = STRING (unrecognised_char)[0]; int status; char space_str[3]; wordstr = word->best_choice->string ().string (); /* DONT need to do anything special for repeated char words - at this stage the repetition char has been identified and any other chars have been rejected. */ for (; wordstr[i] != '\0'; i++) { if ((wordstr[i] == ' ') || (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|')) buff[j++] = unrecognised; else { if (word->reject_map[i].rejected ()) buff[j++] = '^'; //Add suspect marker buff[j++] = wordstr[i]; } } buff[j] = '\0'; if (strlen (wordstr) > 0) { if (word->reject_spaces && (suspect_level >= suspect_space_level) && !tessedit_minimal_rejection && !tessedit_zero_rejection) strcpy (space_str, "^ "); //Suspect space else strcpy (space_str, " "); //Certain space for (i = 0; i < word->word->space (); i++) { status = fprintf (unlv_file, "%s", space_str); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Space Errno: %d", errno); } status = fprintf (unlv_file, "%s", buff); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno); } if (word->word->flag (W_EOL)) { status = fprintf (unlv_file, "\n"); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Newline Errno: %d", errno); } status = fflush (unlv_file); if (status != 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);}#endif/************************************************************************* * get_rep_char() * Return the first accepted character from the repetition string. This is the * character which is repeated - as determined earlier by fix_rep_char() *************************************************************************/UNICHAR_ID get_rep_char(WERD_RES *word) { // what char is repeated? int i; int offset; for (i = 0, offset = 0; ((i < word->reject_map.length ()) && (word->reject_map[i].rejected ())); offset += word->best_choice->lengths()[i++]); if (i < word->reject_map.length ()) return unicharset.unichar_to_id(word->best_choice->string().string() + offset, word->best_choice->lengths()[i]); else return unicharset.unichar_to_id(unrecognised_char.string());}void ensure_rep_chars_are_consistent(WERD_RES *word) {#if 0 char rep_char = get_rep_char (word); char *ptr; ptr = (char *) word->best_choice->string ().string (); for (; *ptr != '\0'; ptr++) { if (*ptr != rep_char) *ptr = rep_char; }#endif#if 0 UNICHAR_ID rep_char = get_rep_char (word); //TODO(tkielbus) Reactivate int i; char *ptr; STRING consistent_string; STRING consistent_string_lengths; ptr = (char *) word->best_choice->string ().string (); for (i = 0; *ptr != '\0'; ptr += word->best_choice->lengths()[i++]) { consistent_string += unicharset.id_to_unichar(rep_char); consistent_string_lengths += strlen(unicharset.id_to_unichar(rep_char)); } word->best_choice->string() = consistent_string; word->best_choice->lengths() = consistent_string_lengths;#endif}/************************************************************************* * SUSPECT LEVELS * * 0 - dont reject ANYTHING * 1,2 - partial rejection * 3 - BEST * * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and * tessedit_minimal_rejection. *************************************************************************/void set_unlv_suspects(WERD_RES *word) { int len = word->reject_map.length (); int i; int offset; const char *ptr; const char *lengths = word->best_choice->lengths ().string (); float rating_per_ch; ptr = word->best_choice->string ().string (); if (suspect_level == 0) { for (i = 0; i < len; i++) { if (word->reject_map[i].rejected ()) word->reject_map[i].setrej_minimal_rej_accept (); } return; } if (suspect_level >= 3) return; //Use defaults /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ if (safe_dict_word (ptr) && (count_alphas (ptr, lengths) > suspect_short_words)) { /* Unreject alphas in dictionary words */ for (i = 0, offset = 0; i < len; offset += lengths[i++]) { if (word->reject_map[i].rejected () && unicharset.get_isalpha (ptr + offset, lengths[i])) word->reject_map[i].setrej_minimal_rej_accept (); } } rating_per_ch = word->best_choice->rating () / word->reject_map.length (); if (rating_per_ch >= suspect_rating_per_ch) return; //Dont touch bad ratings if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ for (i = 0, offset = 0; i < len; offset += lengths[i++]) { if (word->reject_map[i].rejected () && (ptr[offset] != ' ')) word->reject_map[i].setrej_minimal_rej_accept (); } } for (i = 0; i < len; i++) { if (word->reject_map[i].rejected ()) { if (word->reject_map[i].flag (R_DOC_REJ)) word->reject_map[i].setrej_minimal_rej_accept (); if (word->reject_map[i].flag (R_BLOCK_REJ)) word->reject_map[i].setrej_minimal_rej_accept (); if (word->reject_map[i].flag (R_ROW_REJ)) word->reject_map[i].setrej_minimal_rej_accept (); } } if (suspect_level == 2) return; if (!suspect_constrain_1Il || (word->reject_map.length () <= suspect_short_words)) { for (i = 0; i < len; i++) { if (word->reject_map[i].rejected ()) { if ((word->reject_map[i].flag (R_1IL_CONFLICT) || word->reject_map[i].flag (R_POSTNN_1IL))) word->reject_map[i].setrej_minimal_rej_accept (); if (!suspect_constrain_1Il && word->reject_map[i].flag (R_MM_REJECT)) word->reject_map[i].setrej_minimal_rej_accept (); } } } if ((acceptable_word_string (word->best_choice->string ().string (), word->best_choice->lengths ().string ()) != AC_UNACCEPTABLE) || acceptable_number_string (word->best_choice->string ().string (), word->best_choice->lengths ().string ())) { if (word->reject_map.length () > suspect_short_words) { for (i = 0; i < len; i++) { if (word->reject_map[i].rejected () && (!word->reject_map[i].perm_rejected () || word->reject_map[i].flag (R_1IL_CONFLICT) || word->reject_map[i].flag (R_POSTNN_1IL) || word->reject_map[i].flag (R_MM_REJECT))) { word->reject_map[i].setrej_minimal_rej_accept (); } } } }}inT16 count_alphas( //how many alphas const char *s, const char *lengths) { int count = 0; for (; *s != '\0'; s += *(lengths++)) { if (unicharset.get_isalpha(s, *lengths)) count++; } return count;}inT16 count_alphanums( //how many alphanums const char *s, const char *lengths) { int count = 0; for (; *s != '\0'; s += *(lengths++)) { if (unicharset.get_isalpha(s, *lengths) || unicharset.get_isdigit(s, *lengths)) count++; } return count;}BOOL8 acceptable_number_string(const char *s, const char *lengths) { BOOL8 prev_digit = FALSE; if (*lengths == 1 && *s == '(') s++; if (*lengths == 1 && ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-'))) s++; for (; *s != '\0'; s += *(lengths++)) { if (unicharset.get_isdigit (s, *lengths)) prev_digit = TRUE; else if (prev_digit && (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-')))) prev_digit = FALSE; else if (prev_digit && *lengths == 1 && (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')'))) return TRUE; else if (prev_digit && *lengths == 1 && (*s == '%') && (*(lengths + 1) == 1 && *(s + *lengths) == ')') && (*(s + *lengths + *(lengths + 1)) == '\0')) return TRUE; else return FALSE; } return TRUE;}
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?