control.cpp
来自「一个google的OCR源码」· C++ 代码 · 共 1,843 行 · 第 1/5 页
CPP
1,843 行
} // tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n", // word_str, word_len, total, maxch ); free_mem(rep_ch); word->reject_map.initialise (word_len); for (i = 0, offset = 0; i < word_len; offset += word->best_choice->lengths()[i++]) { if (strncmp(word_str + offset, maxch, word->best_choice->lengths()[i]) != 0) //rej unrecognised blobs word->reject_map[i].setrej_bad_repetition (); } word->done = TRUE;}// TODO(tkielbus) Decide between keeping this behavior here or modifying the// training data.// Utility function for fix_quotes// Return true if the next character in the string (given the UTF8 length in// bytes) is a quote character.static int is_simple_quote(const char* signed_str, int length) { const unsigned char* str = reinterpret_cast<const unsigned char*>(signed_str); //standard 1 byte quotes return (length == 1 && (*str == '\'' || *str == '`')) || //utf8 3 bytes curved quotes (length == 3 && ((*str == 0xe2 && *(str + 1) == 0x80 && *(str + 2) == 0x98) || (*str == 0xe2 && *(str + 1) == 0x80 && *(str + 2) == 0x99)));}/********************************************************************** * fix_quotes * * Change pairs of quotes to double quotes. **********************************************************************/void fix_quotes( //make double quotes WERD_CHOICE *choice, //choice to fix WERD *word, //word to do //char choices BLOB_CHOICE_LIST_CLIST *blob_choices) { char *str = (char *) choice->string().string();//string ptr int i; int offset; //blobs PBLOB_IT blob_it = word->blob_list (); //choices BLOB_CHOICE_LIST_C_IT choice_it = blob_choices; BLOB_CHOICE_IT it1; //first choices BLOB_CHOICE_IT it2; //second choices for (i = 0, offset = 0; str[offset] != '\0'; offset += choice->lengths()[i++], blob_it.forward (), choice_it.forward ()) { if (str[offset + choice->lengths()[i]] != '\0' && is_simple_quote(str + offset, choice->lengths()[i]) && is_simple_quote(str + offset + choice->lengths()[i], choice->lengths()[i + 1]) && unicharset.contains_unichar("\"")) { str[offset] = '"'; //turn to double strcpy (str + offset + 1, str + offset + choice->lengths()[i] + choice->lengths()[i + 1]); //shuffle up choice->lengths()[i] = 1; strcpy ((char*) choice->lengths().string() + i + 1, choice->lengths().string() + i + 2); merge_blobs (blob_it.data (), blob_it.data_relative (1)); blob_it.forward (); delete blob_it.extract (); //get rid of spare it1.set_to_list (choice_it.data ()); it2.set_to_list (choice_it.data_relative (1)); if (it1.data ()->certainty () < it2.data ()->certainty ()) { choice_it.forward (); //get rid of spare delete choice_it.extract (); } else { //get rid of spare delete choice_it.extract (); choice_it.forward (); } } }}/********************************************************************** * fix_hyphens * * Change pairs of hyphens to a single hyphen if the bounding boxes touch * Typically a long dash which has been segmented. **********************************************************************/void fix_hyphens( //crunch double hyphens WERD_CHOICE *choice, //choice to fix WERD *word, //word to do //char choices BLOB_CHOICE_LIST_CLIST *blob_choices) { char *str = (char *) choice->string().string();//string ptr int i; int offset; //blobs PBLOB_IT blob_it = word->blob_list (); //choices BLOB_CHOICE_LIST_C_IT choice_it = blob_choices; BLOB_CHOICE_IT it1; //first choices BLOB_CHOICE_IT it2; //second choices for (i = 0, offset = 0; str[offset] != '\0'; offset += choice->lengths()[i++], blob_it.forward (), choice_it.forward ()) { if ((str[offset] == '-' || str[offset] == '~') && (str[offset + choice->lengths()[i]] == '-' || str[offset + choice->lengths()[i]] == '~') && (blob_it.data ()->bounding_box ().right () >= blob_it.data_relative (1)->bounding_box ().left ())) { str[offset] = '-'; //turn to single hyphen strcpy (str + offset + choice->lengths()[i], str + offset + choice->lengths()[i] + choice->lengths()[i + 1]); //shuffle up strcpy ((char*) choice->lengths().string() + i + 1, choice->lengths().string() + i + 2); merge_blobs (blob_it.data (), blob_it.data_relative (1)); blob_it.forward (); delete blob_it.extract (); //get rid of spare it1.set_to_list (choice_it.data ()); it2.set_to_list (choice_it.data_relative (1)); if (it1.data ()->certainty () < it2.data ()->certainty ()) { choice_it.forward (); //get rid of spare delete choice_it.extract (); } else { //get rid of spare delete choice_it.extract (); choice_it.forward (); } } }}/********************************************************************** * merge_blobs * * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted. **********************************************************************/void merge_blobs( //combine 2 blobs PBLOB *blob1, //dest blob PBLOB *blob2 //source blob ) { OUTLINE_IT outline_it = blob1->out_list (); //iterator outline_it.move_to_last (); //go to end //do it outline_it.add_list_after (blob2->out_list ());}/********************************************************************** * choice_dump_tester * * Matcher tester function which generates .chc file entries. * Called via test_segment_pass2 for every blob tested by tess in a word. * (But only for words for which a correct segmentation could be found.) **********************************************************************/void choice_dump_tester( //dump chars in word PBLOB *, //blob DENORM *, //de-normaliser BOOL8 correct, //ly segmented char *text, //correct text inT32 count, //chars in text BLOB_CHOICE_LIST *ratings //list of results ) { STRING choice_file_name; BLOB_CHOICE *blob_choice; BLOB_CHOICE_IT it; char source_chars[20]; char correct_char[3]; if (choice_file == NULL) { choice_file_name = imagebasename + ".chc"; if (!(choice_file = fopen (choice_file_name.string (), "w"))) { CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d", choice_file_name.string (), errno); } } if ((count == 0) || (text == NULL) || (text[0] == '\0')) { strcpy (source_chars, "$$"); strcpy (correct_char, "$$"); } else { strncpy(source_chars, text, count); source_chars[count] = '\0'; if (correct) { correct_char[0] = text[0]; correct_char[1] = '\0'; } else { strcpy (correct_char, "$$"); } } fprintf (choice_file, "%s\t%s", source_chars, correct_char); it.set_to_list (ratings); for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { blob_choice = it.data (); fprintf (choice_file, "\t%s\t%f\t%f", blob_choice->unichar (), blob_choice->rating (), blob_choice->certainty ()); } fprintf (choice_file, "\n");}/************************************************************************* * make_bln_copy() * * Generate a baseline normalised copy of the source word. The copy is done so * that whatever format the original word is in, a polygonal bln version is * generated as output. *************************************************************************/WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) { WERD *result; // if (wordit_linearc && !src_word->flag(W_POLYGON)) // { // larc_word = src_word->larc_copy( row->x_height() ); // result = larc_word->poly_copy( row->x_height() ); // delete larc_word; // } // else result = src_word->poly_copy (row->x_height ()); // if (tessedit_draw_words) // { // if ( la_win == NO_WINDOW ) // create_la_win(); // result->plot( la_win ); // } result->baseline_normalise_x (row, x_height, denorm); return result;}ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s, const char *lengths) { int i = 0; int offset = 0; int leading_punct_count; int upper_count = 0; int hyphen_pos = -1; ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; if (strlen (lengths) > 20) return word_type; /* Single Leading punctuation char*/ if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset]))) offset += lengths[i++]; leading_punct_count = i; /* Initial cap */ while ((s[offset] != '\0') && unicharset.get_isupper(s + offset, lengths[i])) { offset += lengths[i++]; upper_count++; } if (upper_count > 1) word_type = AC_UPPER_CASE; else { /* Lower case word, possibly with an initial cap */ while ((s[offset] != '\0') && unicharset.get_islower (s + offset, lengths[i])) { offset += lengths[i++]; } if (i - leading_punct_count < quality_min_initial_alphas_reqd) goto not_a_word; /* Allow a single hyphen in a lower case word - dont trust upper case - I've seen several cases of "H" -> "I-I" */ if (lengths[i] == 1 && s[offset] == '-') { hyphen_pos = i; offset += lengths[i++]; if (s[offset] != '\0') { while ((s[offset] != '\0') && unicharset.get_islower(s + offset, lengths[i])) { offset += lengths[i++]; } if (i < hyphen_pos + 3) goto not_a_word; } } else { /* Allow "'s" in NON hyphenated lower case words */ if (lengths[i] == 1 && (s[offset] == '\'') && lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) { offset += lengths[i++]; offset += lengths[i++]; } } if (upper_count > 0) word_type = AC_INITIAL_CAP; else word_type = AC_LOWER_CASE; } /* Up to two different, constrained trailing punctuation chars */ if (lengths[i] == 1 && (s[offset] != '\0') && (STRING (chs_trailing_punct1).contains (s[offset]))) offset += lengths[i++]; if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 && (s[offset - lengths[i - 1]] != s[offset]) && (STRING (chs_trailing_punct2).contains (s[offset]))) offset += lengths[i++]; if (s[offset] != '\0') word_type = AC_UNACCEPTABLE; not_a_word: if (word_type == AC_UNACCEPTABLE) { /* Look for abbreviation string */ i = 0; offset = 0; if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) { word_type = AC_UC_ABBREV; while ((s[offset] != '\0') && unicharset.get_isupper(s + offset, lengths[i]) && (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { offset += lengths[i++]; offset += lengths[i++]; } } else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) { word_type = AC_LC_ABBREV; while ((s[offset] != '\0') && unicharset.get_islower(s + offset, lengths[i]) && (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) { offset += lengths[i++]; offset += lengths[i++]; } } if (s[offset] != '\0') word_type = AC_UNACCEPTABLE; } return word_type;}/* DEBUGGING ROUTINE */BOOL8 check_debug_pt(WERD_RES *word, int location) { BOOL8 show_map_detail = FALSE; inT16 i; #ifndef SECURE_NAMES if (!test_pt)
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?