output.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 1,274 行 · 第 1/3 页
CPP
1,274 行
TRUE, FALSE, rawfile);
  if (tessedit_write_txt_map)
    write_map(txt_mapfile, word);
  ep_choice = make_epaper_choice (word, newline_type);
  word->ep_choice = ep_choice;
#endif
  character_count += word->best_choice->lengths ().length ();
  word_count++;
}


/**********************************************************************
 * make_epaper_choice
 *
 * Construct the epaper text string for a word, using the reject map to
 * determine whether each blob should be rejected.
 *
 * NOTE: the whole function is compiled out by the surrounding #if 0.
 *
 * word          word result; reads reject_map, best_choice, outword,
 *               word (for space() and flags) and denorm.
 * newline_type  if non-zero, appended to the string as its last character.
 *
 * Returns a WERD_CHOICE allocated with new from the assembled string.
 * NOTE(review): ownership of the returned object is presumably the
 * caller's - confirm against the call site.
 *
 * Layout of the assembled string:
 *   - word->word->space() leading blanks (only if the blob list is
 *     non-empty);
 *   - either "|^~R" followed by the repeated character (when the word has
 *     W_REP_CHAR set and tessedit_write_rep_codes is on), or one character
 *     per accepted blob and one make_reject() escape per run of adjacent
 *     rejected blobs;
 *   - newline_type, if non-zero.
 *
 * NOTE(review): word_string is a fixed MAX_PATH buffer and none of the
 * writes below check the remaining space.
 **********************************************************************/

#if 0
WERD_CHOICE *make_epaper_choice(                   //convert one word
                                WERD_RES *word,    //word to do
                                char newline_type  //type of newline
                               ) {
  inT16 index = 0;               //to string
  inT16 blobindex;               //to word
  inT16 prevright = 0;           //right of previous blob
  inT16 nextleft;                //left of next blob
  PBLOB *blob;
  TBOX inset_box;                //bounding box
  PBLOB_IT blob_it;              //blob iterator
  char word_string[MAX_PATH];    //converted string
  BOOL8 force_total_reject;
  char unrecognised = STRING (unrecognised_char)[0];

  blob_it.set_to_list (word->outword->blob_list ());

  // The reject map is indexed in parallel with the best-choice string.
  ASSERT_HOST (word->reject_map.length () ==
    word->best_choice->string ().length ());
  /*
    tprintf( "\"%s\" -> length: %d; blobcount: %d (%d)\n",
        word->best_choice->string().string(),
          word->best_choice->string().length(),
            blob_it.length(),
          blob_count( word->outword ) );
  */

  // An empty best choice means every blob is folded into a single reject;
  // otherwise there must be exactly one blob per character.
  if (word->best_choice->string ().length () == 0)
    force_total_reject = TRUE;
  else {
    force_total_reject = FALSE;
    ASSERT_HOST (blob_it.length () ==
      word->best_choice->string ().length ());
  }
  if (!blob_it.empty ()) {
    for (index = 0; index < word->word->space (); index++)
      word_string[index] = ' ';  //leading blanks
  }
  /* Why does this generate leading blanks regardless of whether the
  word_choice string is empty, when write_cooked_text only generates leading
  blanks when the string is NOT empty???. */

  if (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes) {
    // Repeated-character word: emit the rep-code marker and the character.
    strcpy (word_string + index, "|^~R");
    index += 4;
    strcpy(word_string + index, unicharset.id_to_unichar(get_rep_char (word)));
    index += strlen(unicharset.id_to_unichar(get_rep_char (word)));
  }
  else {
    if (!blob_it.empty ())
      prevright = blob_it.data ()->bounding_box ().left ();
    //actually first left
    for (blobindex = 0, blob_it.mark_cycle_pt ();
    !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {
      blob = blob_it.data ();
      if (word->reject_map[blobindex].accepted ()) {
        if (word->best_choice->string ()[blobindex] == ' ')
                                 //but not rejected!!
          word_string[index++] = unrecognised;
        else
          word_string[index++] =
            word->best_choice->string ()[blobindex];
      }
      else {                     // start reject
        inset_box = blob->bounding_box ();
        /* Extend reject box to include rejected neighbours */
        // blobindex and blob_it advance together here, so the outer loop
        // resumes after the last blob swallowed by this reject.
        while (!blob_it.at_last () &&
          (force_total_reject ||
        (word->reject_map[blobindex + 1].rejected ()))) {
          blobindex++;
          blob = blob_it.forward ();
                                 //get total box
          inset_box += blob->bounding_box ();
        }
        // With no following blob, the reject's own right edge is used as
        // the left edge of the "next" character.
        if (blob_it.at_last ())
          nextleft = inset_box.right ();
        else
          nextleft = blob_it.data_relative (1)->bounding_box ().left ();

        //       tprintf("Making reject from (%d,%d)->(%d,%d)\n",
        //          inset_box.left(),inset_box.bottom(),
        //          inset_box.right(),inset_box.top());

        index += make_reject (&inset_box, prevright, nextleft,
          &word->denorm, &word_string[index]);
      }
      prevright = blob->bounding_box ().right ();
    }
  }
  if (newline_type)
                                 //end line
    word_string[index++] = newline_type;
  word_string[index] = '\0';     //terminate string
  // NOTE(review): strlen() yields size_t but is printed with %d and
  // compared against the signed index.
  if (strlen (word_string) != index) {
    tprintf ("ASSERT ABOUT TO FAIL: %s, index %d len %d\n",
      word_string, index, strlen (word_string));
  }
                                 //don't pass any zeros
  ASSERT_HOST (strlen (word_string) == index);
  return new WERD_CHOICE (word_string, 0, 0, NO_PERM);
}
#endif

/**********************************************************************
 * make_reject
 *
 * Add the escape code to the string for the reject.
**********************************************************************/inT16make_reject ( //make reject codeTBOX * inset_box, //bounding boxinT16 prevright, //previous charinT16 nextleft, //next charDENORM * denorm, //de-normalizerchar word_string[] //output string) { inT16 index; //to string inT16 xpos; //start of inset inT16 ypos; inT16 width; //size of inset inT16 height; inT16 left_offset; //shift form prev char inT16 right_offset; //shift to next char inT16 baseline_offset; //shift from baseline inT16 inset_index = 0; //number of inset inT16 min_chars; //min width estimate inT16 max_chars; //max width estimate float x_centre; //centre of box index = 0; x_centre = (inset_box->left () + inset_box->right ()) / 2.0; left_offset = (inT16) (denorm->x (inset_box->left ()) - denorm->x (prevright)); right_offset = (inT16) (denorm->x (nextleft) - denorm->x (inset_box->right ())); xpos = (inT16) floor (denorm->x (inset_box->left ())); width = (inT16) ceil (denorm->x (inset_box->right ())) - xpos; ypos = (inT16) floor (denorm->y (inset_box->bottom (), x_centre)); height = (inT16) ceil (denorm->y (inset_box->top (), x_centre)) - ypos; baseline_offset = ypos - (inT16) denorm->y (bln_baseline_offset, x_centre); //escape code word_string[index++] = CTRL_INSET; min_chars = (inT16) ceil (0.27 * width / denorm->row ()->x_height ()); max_chars = (inT16) floor (1.8 * width / denorm->row ()->x_height ()); /* Ensure min_chars and max_chars are in the range 0..254. This ensures that we can add 1 to them to avoid putting \0 in a string, and still not exceed the max value in a byte. */ if (min_chars < 0) min_chars = 0; if (min_chars > 254) min_chars = 254; if (max_chars < min_chars) max_chars = min_chars; if (max_chars > 254) max_chars = 254; //min chars word_string[index++] = min_chars + 1; //max chars word_string[index++] = max_chars + 1; word_string[index++] = 2; //type? 
//store index word_string[index++] = inset_index / 255 + 1; word_string[index++] = inset_index % 255 + 1; return index; //size of string}/********************************************************************** * determine_newline_type * * Find whether we have a wrapping or hard newline. * Return FALSE if not at end of line. **********************************************************************/char determine_newline_type( //test line ends WERD *word, //word to do BLOCK *block, //current block WERD *next_word, //next word BLOCK *next_block //block of next word ) { inT16 end_gap; //to right edge inT16 width; //of next word TBOX word_box; //bounding TBOX next_box; //next word TBOX block_box; //block bounding if (!word->flag (W_EOL)) return FALSE; //not end of line if (next_word == NULL || next_block == NULL || block != next_block) return CTRL_NEWLINE; if (next_word->space () > 0) return CTRL_HARDLINE; //it is tabbed word_box = word->bounding_box (); next_box = next_word->bounding_box (); block_box = block->bounding_box (); //gap to eol end_gap = block_box.right () - word_box.right (); end_gap -= (inT32) block->space (); width = next_box.right () - next_box.left (); // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n", // block_box.right(),word_box.right(),end_gap, // next_box.right(),next_box.left(),width, // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE); return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;}/********************************************************************** * write_cooked_text * * Write the cooked text (with bold for pass2 and underline for reject) * to the given file. 
**********************************************************************/

/*
 * NOTE: the whole function is compiled out by the surrounding #if 0.
 *
 * word        word being written; supplies the leading space count, the
 *             W_EOL flag and (when NO_BLOCK is set) the bounding box.
 * text        text to write; blanks in it are replaced by the first
 *             character of unrecognised_char.
 * acceptable  if false the text is wrapped in UNDERLINE_ON/UNDERLINE_OFF.
 * pass2       if true the text is wrapped in BOLD_ON/BOLD_OFF.
 * fp          destination; when it is stdout the text is only reported
 *             through tprintf and nothing is written to the stream.
 *
 * Failed writes are reported by calling WRITEFAILED.error(..., EXIT, ...).
 * The statics newaline, havespace and old_segs carry state from one call
 * to the next.
 *
 * NOTE(review): buff is 512 bytes and the copy from text is unchecked.
 * NOTE(review): word is dereferenced before the "word &&" test in the
 * NO_BLOCK branch.
 */
#if 0
void write_cooked_text(                     //write output
                       WERD *word,          //word to do
                       const STRING &text,  //text to write
                       BOOL8 acceptable,    //good stuff
                       BOOL8 pass2,         //done on pass2
                       FILE *fp             //file to write
                      ) {
  inT16 index;                   //blank counter
  int status;
  static int newaline = 1;
  static int havespace = 0;
  char buff[512];
  const char *wordstr = text.string ();
  int i = 0;
  char unrecognised = STRING (unrecognised_char)[0];
  static int old_segs = 0;
  TBOX mybox;

  // Copy the text, substituting the unrecognised-character marker for
  // every blank.
  for (i = 0; wordstr[i] != '\0'; i++) {
    if (wordstr[i] == ' ')
      buff[i] = unrecognised;
    else
      buff[i] = wordstr[i];
  }
  buff[i] = '\0';

  // stdout: report through tprintf, including the change in num_popped
  // since the previous call, and skip the file output entirely.
  if (fp == stdout) {
    tprintf ("Cooked=%s, %d segs, acceptable=%d",
      buff, num_popped - old_segs, acceptable);
    old_segs = num_popped;
    return;
  }

  if (text.length () > 0) {
    // Leading blanks, one per unit of word->space().
    for (index = 0; index < word->space (); index++) {
      status = fprintf (fp, " ");
      havespace = 1;
      if (status < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
          "Space Errno: %d", errno);
    }
    if (pass2) {
      status = fprintf (fp, BOLD_ON);
      if (status < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
          "Bold Errno: %d", errno);
    }
    if (!acceptable) {
      status = fprintf (fp, UNDERLINE_ON);
      if (status < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
          "Underline Errno: %d", errno);
    }

    //xiaofan
    // Optionally prefix the word with its bounding box, with the y
    // coordinates measured from page_image.get_ysize(). A separating blank
    // is written first at the start of a line or when no blank was just
    // written. These writes are not checked for failure.
    if (NO_BLOCK && word && strlen (buff)) {
      mybox = word->bounding_box ();
      if (newaline || !havespace) {
        fprintf (fp, " ");
        newaline = 0;
      }
      fprintf (fp, "(%d," INT32FORMAT ",%d," INT32FORMAT ")",
        XOFFSET + mybox.left (),
        YOFFSET + page_image.get_ysize () - mybox.top (),
        XOFFSET + mybox.right (),
        YOFFSET + page_image.get_ysize () - mybox.bottom ());
      havespace = 0;
    }

    status = fprintf (fp, "%s", buff);
    if (status < 0)
      WRITEFAILED.error ("write_cooked_text", EXIT,
        "Word Errno: %d", errno);
    if (pass2) {
      status = fprintf (fp, BOLD_OFF);
      if (status < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
          "Bold off Errno: %d", errno);
    }
    if (!acceptable) {
      status = fprintf (fp, UNDERLINE_OFF);
      if (status < 0)
        WRITEFAILED.error ("write_cooked_text", EXIT,
          "Underline off Errno: %d", errno);
    }
  }
  // The newline is written for an end-of-line word even when text is empty.
  if (word->flag (W_EOL)) {
    status = fprintf (fp, "\n");
    newaline = 1;
    if (status < 0)
      WRITEFAILED.error ("write_cooked_text", EXIT,
        "Newline Errno: %d", errno);
  }
  status = fflush (fp);
  if (status != 0)
    WRITEFAILED.error ("write_cooked_text", EXIT, "Fflush Errno: %d", errno);
}
#endif

/**********************************************************************
 * write_shm_text
 *
 * Write the cooked text to the shared memory for the api.
 **********************************************************************/

void write_shm_text(                    //write output
                    WERD_RES *word,     //word to do
                    BLOCK *block,       //block it is from
                    ROW_RES *row,       //row it is from
                    const STRING &text, //text to write
                    const STRING &text_lengths
                   ) {
  inT32 index;                   //char counter
  inT32 index2;                  //char counter
  inT32 length;                  //chars in word
  inT32 ptsize;                  //font size
  inT8 blanks;                   //blanks in word
  uinT8 enhancement;             //bold etc
  uinT8 font;                    //font index
  char unrecognised = STRING (unrecognised_char)[0];
  PBLOB *blob;
  TBOX blob_box;                 //bounding box
  PBLOB_IT blob_it;              //blob iterator
  WERD copy_outword;             // copy to denorm
  uinT32 rating;                 //of char
  BOOL8 lineend;                 //end of line
  int offset;
  int offset2;

                                 //point size
  ptsize = pixels_to_pts ((inT32) (row->row->x_height () + row->row->ascenders () - row->row->descenders ()), 300);
  if (word->word->flag (W_BOL) && ocr_char_space () < 128 && ocr_send_text (TRUE) != OKAY)
    return;                      //release failed
  copy_outword = *(word->outword);
  copy_outword.baseline_denormalise (&word->denorm);
  blob_it.set_to_list (copy_outword.blob_list ());
  length = text_lengths.length ();

  if (length > 0) {
    blanks = word->word->space ();
    if (blanks == 0 && tessedit_word_for_word && !word->word->flag (W_BOL))
      blanks = 1;
    for (index = 0, offset = 0; index < length;
         offset += text_lengths[index++], blob_it.forward ()) {
      blob = blob_it.data ();
      blob_box = blob->bounding_box ();

      enhancement = 0;
      if (word->italic > 0 || (word->italic == 0 && row->italic >
0)) enhancement |= EUC_ITALIC; if (word->bold > 0 || (word->bold == 0 && row->bold > 0)) enhancement |= EUC_BOLD; if (tessedit_write_ratings) rating = (uinT32) (-word->best_choice->certainty () / 0.035); else if (tessedit_zero_rejection) rating = text[offset] == ' ' ? 100 : 0; else rating = word->reject_map[index].accepted ()? 0 : 100; if (rating > 255) rating = 255; if (word->font1_count > 2) font = word->font1; else if (row->font1_count > 8) font = row->font1; else //font index font = word->word->flag (W_DONT_CHOP) ? 0 : 1; lineend = word->word->flag (W_EOL) && index == length - 1; if (word->word->flag (W_EOL) && tessedit_zero_rejection && index < length - 1 && text[index + text_lengths[index]] == ' ') { for (index2 = index + 1, offset2 = offset + text_lengths[index]; index2 < length && text[offset2] == ' '; offset2 += text_lengths[index2++]); if (index2 == length) lineend = TRUE; } if (!tessedit_zero_rejection || text[offset] != ' ' || tessedit_word_for_word) { //confidence if (text[offset] == ' ') { ocr_append_char (unrecognised, blob_box.left (), blob_box.right (), page_image.get_ysize () - 1 - blob_box.top (),
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?