📄 output.cpp
字号:
rating = text[index] == ' ' ? 100 : 0; else rating = word->reject_map[index].accepted ()? 0 : 100; if (rating > 255) rating = 255; if (word->font1_count > 2) font = word->font1; else if (row->font1_count > 8) font = row->font1; else //font index font = word->word->flag (W_DONT_CHOP) ? 0 : 1; lineend = word->word->flag (W_EOL) && index == length - 1; if (word->word->flag (W_EOL) && tessedit_zero_rejection && index < length - 1 && text[index + 1] == ' ') { for (index2 = index + 1; index2 < length && text[index2] == ' '; index2++); if (index2 == length) lineend = TRUE; } if (!tessedit_zero_rejection || text[index] != ' ' || tessedit_word_for_word) { //confidence ocr_append_char (text[index] == ' ' ? unrecognised : text[index], blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, (UINT8) rating, ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); blanks = 0; } } } else if (tessedit_word_for_word) { blanks = word->word->space (); if (blanks == 0 && !word->word->flag (W_BOL)) blanks = 1; blob_box = word->word->bounding_box (); enhancement = 0; if (word->italic > 0) enhancement |= EUC_ITALIC; if (word->bold > 0) enhancement |= EUC_BOLD; rating = 100; if (word->font1_count > 2) font = word->font1; else if (row->font1_count > 8) font = row->font1; else //font index font = word->word->flag (W_DONT_CHOP) ? 0 : 1; lineend = word->word->flag (W_EOL); //font index ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (), page_image.get_ysize () - 1 - blob_box.bottom (), font, rating, //confidence ptsize, //point size blanks, enhancement, //enhancement OCR_CDIR_LEFT_RIGHT, OCR_LDIR_DOWN_RIGHT, lineend ? OCR_NL_NEWLINE : OCR_NL_NONE); }}/********************************************************************** * write_map * * Write a map file of 0's and 1'a which associates characters from the .txt * file with those in the .etx file. 0 = .txt char was deleted. 1 = .txt char * is kept. Note that there may be reject regions in the .etx file WITHOUT * .txt chars being rejected. The map file should be the same length, and * the same number of lines as the .txt file * * The paramaterised input is because I thought I might be able to generate * multiple map files in a single run. However, it didn't work because * newdiff needs etx files! **********************************************************************/void write_map( //output a map file FILE *mapfile, //mapfile to write to WERD_RES *word) { INT16 index; int status; STRING mapstr = ""; if (word->best_choice->string ().length () > 0) { for (index = 0; index < word->word->space (); index++) { if (word->reject_spaces && (suspect_level >= suspect_space_level) && !tessedit_minimal_rejection && !tessedit_zero_rejection) /* Write rejected spaces to .map file ONLY. Newdiff converts these back to accepted spaces AFTER generating basic space stats but BEFORE using .etx */ status = fprintf (mapfile, "0"); else status = fprintf (mapfile, "1"); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Space Errno: %d", errno); } if ((word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes)) { for (index = 0; index < 5; index++) mapstr += '1'; } else { ASSERT_HOST (word->reject_map.length () == word->best_choice->string ().length ()); for (index = 0; index < word->reject_map.length (); index++) { if (word->reject_map[index].accepted ()) mapstr += '1'; else mapstr += '0'; } } status = fprintf (mapfile, "%s", mapstr.string ()); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Map str Errno: %d", errno); } if (word->word->flag (W_EOL)) { status = fprintf (mapfile, "\n"); if (status < 0) WRITEFAILED.error ("write_map", EXIT, "Newline Errno: %d", errno); } status = fflush (mapfile); if (status != 0) WRITEFAILED.error ("write_map", EXIT, "fflush Errno: %d", errno);}/************************************************************************* * open_file() *************************************************************************/FILE *open_outfile( //open .map & .unlv file const char *extension) { STRING file_name; FILE *outfile; file_name = imagebasename + extension; if (!(outfile = fopen (file_name.string (), "w"))) { CANTOPENFILE.error ("open_outfile", EXIT, "%s %d", file_name.string (), errno); } return outfile;}void write_unlv_text(WERD_RES *word) { const char *wordstr; char buff[512]; //string to output int i = 0; int j = 0; char unrecognised = STRING (unrecognised_char)[0]; int status; char space_str[3]; wordstr = word->best_choice->string ().string (); /* DONT need to do anything special for repeated char words - at this stage the repetition char has been identified and any other chars have been rejected. */ for (; wordstr[i] != '\0'; i++) { if ((wordstr[i] == ' ') || (wordstr[i] == '~') || (wordstr[i] == '^') || (wordstr[i] == '|')) buff[j++] = unrecognised; else { if (word->reject_map[i].rejected ()) buff[j++] = '^'; //Add suspect marker buff[j++] = wordstr[i]; } } buff[j] = '\0'; if (strlen (wordstr) > 0) { if (word->reject_spaces && (suspect_level >= suspect_space_level) && !tessedit_minimal_rejection && !tessedit_zero_rejection) strcpy (space_str, "^ "); //Suspect space else strcpy (space_str, " "); //Certain space for (i = 0; i < word->word->space (); i++) { status = fprintf (unlv_file, "%s", space_str); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Space Errno: %d", errno); } status = fprintf (unlv_file, "%s", buff); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Word Errno: %d", errno); } if (word->word->flag (W_EOL)) { status = fprintf (unlv_file, "\n"); if (status < 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Newline Errno: %d", errno); } status = fflush (unlv_file); if (status != 0) WRITEFAILED.error ("write_unlv_text", EXIT, "Fflush Errno: %d", errno);}/************************************************************************* * get_rep_char() * Return the first accepted character from the repetition string. This is the * character which is repeated - as determined earlier by fix_rep_char() *************************************************************************/char get_rep_char( // what char is repeated? WERD_RES *word) { int i; for (i = 0; ((i < word->reject_map.length ()) && (word->reject_map[i].rejected ())); i++); if (i < word->reject_map.length ()) return word->best_choice->string ()[i]; else return STRING (unrecognised_char)[0];}void ensure_rep_chars_are_consistent(WERD_RES *word) { char rep_char = get_rep_char (word); char *ptr; ptr = (char *) word->best_choice->string ().string (); for (; *ptr != '\0'; ptr++) { if (*ptr != rep_char) *ptr = rep_char; }}/************************************************************************* * SUSPECT LEVELS * * 0 - dont reject ANYTHING * 1,2 - partial rejection * 3 - BEST * * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and * tessedit_minimal_rejection. *************************************************************************/void set_unlv_suspects(WERD_RES *word) { int len = word->reject_map.length (); int i; const char *ptr; float rating_per_ch; ptr = word->best_choice->string ().string (); if (suspect_level == 0) { for (i = 0; i < len; i++) { if (word->reject_map[i].rejected ()) word->reject_map[i].setrej_minimal_rej_accept (); } return; } if (suspect_level >= 3) return; //Use defaults /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/ if (safe_dict_word (ptr) && (count_alphas (ptr) > suspect_short_words)) { /* Unreject alphas in dictionary words */ for (i = 0; i < len; i++) { if (word->reject_map[i].rejected () && isalpha (ptr[i])) word->reject_map[i].setrej_minimal_rej_accept (); } } rating_per_ch = word->best_choice->rating () / word->reject_map.length (); if (rating_per_ch >= suspect_rating_per_ch) return; //Dont touch bad ratings if ((word->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ for (i = 0; i < len; i++) { if (word->reject_map[i].rejected () && (ptr[i] != ' ')) word->reject_map[i].setrej_minimal_rej_accept (); } } for (i = 0; i < len; i++) { if (word->reject_map[i].rejected ()) { if (word->reject_map[i].flag (R_DOC_REJ)) word->reject_map[i].setrej_minimal_rej_accept (); if (word->reject_map[i].flag (R_BLOCK_REJ)) word->reject_map[i].setrej_minimal_rej_accept (); if (word->reject_map[i].flag (R_ROW_REJ)) word->reject_map[i].setrej_minimal_rej_accept (); } } if (suspect_level == 2) return; if (!suspect_constrain_1Il || (word->reject_map.length () <= suspect_short_words)) { for (i = 0; i < len; i++) { if (word->reject_map[i].rejected ()) { if ((word->reject_map[i].flag (R_1IL_CONFLICT) || word->reject_map[i].flag (R_POSTNN_1IL))) word->reject_map[i].setrej_minimal_rej_accept (); if (!suspect_constrain_1Il && word->reject_map[i].flag (R_MM_REJECT)) word->reject_map[i].setrej_minimal_rej_accept (); } } } if ((acceptable_word_string (word->best_choice->string ().string ()) != AC_UNACCEPTABLE) || acceptable_number_string (word->best_choice->string ().string ())) { if (word->reject_map.length () > suspect_short_words) { for (i = 0; i < len; i++) { if (word->reject_map[i].rejected () && (!word->reject_map[i].perm_rejected () || word->reject_map[i].flag (R_1IL_CONFLICT) || word->reject_map[i].flag (R_POSTNN_1IL) || word->reject_map[i].flag (R_MM_REJECT))) { word->reject_map[i].setrej_minimal_rej_accept (); } } } }}INT16 count_alphas( //how many alphas const char *s) { int count = 0; for (; *s != '\0'; s++) { if (isalpha (*s)) count++; } return count;}INT16 count_alphanums( //how many alphanums const char *s) { int count = 0; for (; *s != '\0'; s++) { if (isalnum (*s)) count++; } return count;}BOOL8 acceptable_number_string(const char *s) { BOOL8 prev_digit = FALSE; if (*s == '(') s++; if ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')) s++; for (; *s != '\0'; s++) { if (isdigit (*s)) prev_digit = TRUE; else if (prev_digit && ((*s == '.') || (*s == ',') || (*s == '-'))) prev_digit = FALSE; else if (prev_digit && (*(s + 1) == '\0') && ((*s == '%') || (*s == ')'))) return TRUE; else if (prev_digit && (*s == '%') && (*(s + 1) == ')') && (*(s + 2) == '\0')) return TRUE; else return FALSE; } return TRUE;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -