📄 control.cpp
字号:
WERD *bln_word; //baseline norm copy //detailed results BLOB_CHOICE_LIST_CLIST blob_choices; set_global_subsubloc_code(SUBSUBLOC_OTHER); if (matcher_fp != NULL) { word_answer = (char *) word->word->text (); if (word_answer != NULL && word_answer[0] == '\0') word_answer = NULL; } matcher_pass = 0; bln_word = make_bln_copy (word->word, row, x_height, &word->denorm); set_global_subsubloc_code(SUBSUBLOC_TESS); if (tessedit_training_tess) word->best_choice = correct_segment_pass2 (bln_word, &word->denorm, tess_default_matcher, tess_training_tester, word->raw_choice, &blob_choices, word->outword); else if (tessedit_dump_choices) word->best_choice = test_segment_pass2 (bln_word, &word->denorm, tess_default_matcher, choice_dump_tester, word->raw_choice, &blob_choices, word->outword); // else if (tessedit_training_wiseowl) // best_choice=correct_segment_pass2( word, &denorm, // tess_default_matcher,wo_learn, // raw_choice,&blob_choices,outword); // else if (tessedit_matcher_is_wiseowl) // best_choice=tess_segment_pass2( word, &denorm, wo_classify, // raw_choice, &blob_choices, outword); else { word->best_choice = tess_segment_pass2 (bln_word, &word->denorm, tess_default_matcher, word->raw_choice, &blob_choices, word->outword); } set_global_subsubloc_code(SUBSUBLOC_OTHER); /* Test for TESS screw up on word. Recog_word has already ensured that the choice list, outword blob lists and best_choice string are the same length. A TESS screw up is indicated by a blank filled or 0 length string. */ if ((word->best_choice->string ().length () == 0) || (strspn (word->best_choice->string ().string (), " ") == word->best_choice->string ().length ())) { word->tess_failed = TRUE; word->reject_map.initialise (word->best_choice->string ().length ()); word->reject_map.rej_word_tess_failure (); // tprintf("Empty word produced\n"); } else { if ((word->best_choice->string ().length () != word->outword->blob_list ()->length ()) || (word->best_choice->string ().length () != blob_choices.length ())) { tprintf ("ASSERT FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", word->best_choice->string ().string (), word->best_choice->string ().length (), word->outword->blob_list ()->length (), blob_choices.length ()); } ASSERT_HOST (word->best_choice->string ().length () == word->outword->blob_list ()->length ()); ASSERT_HOST (word->best_choice->string ().length () == blob_choices.length ()); word->tess_failed = FALSE; if (word->word->flag (W_REP_CHAR)) { fix_rep_char(word); } else { fix_quotes ((char *) word->best_choice->string ().string (), word->outword, &blob_choices); if (tessedit_fix_hyphens) fix_hyphens ((char *) word->best_choice->string ().string (), word->outword, &blob_choices); /* Dont trust fix_quotes! - though I think I've fixed the bug */ if ((word->best_choice->string ().length () != word->outword->blob_list ()->length ()) || (word->best_choice->string ().length () != blob_choices.length ())) { #ifndef SECURE_NAMES tprintf ("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d; #Blobs=%d; #Choices=%d\n", word->best_choice->string ().string (), word->best_choice->string ().length (), word->outword->blob_list ()->length (), blob_choices.length ()); #endif } ASSERT_HOST (word->best_choice->string ().length () == word->outword->blob_list ()->length ()); ASSERT_HOST (word->best_choice->string ().length () == blob_choices.length ()); word->tess_accepted = tess_acceptable_word (word->best_choice, word->raw_choice); make_reject_map (word, &blob_choices, row, 2); } } blob_choices.deep_clear (); delete bln_word; assert (word->raw_choice != NULL);}/************************************************************************* * fix_rep_char() * The word is a repeated char. Find the repeated char character. Make a reject * string which rejects any char other than the voted char. Set the word to done * to stop rematching it. * *************************************************************************/void fix_rep_char( //Repeated char word WERD_RES *word //word to do ) { struct REP_CH { char ch; int count; }; REP_CH *rep_ch; //array of char counts int word_len; int rep_ch_count = 0; //how many unique chs const char *word_str; //the repeated chs int i, j; int total = 0; int max = 0; char maxch = ' '; //Most common char word_str = word->best_choice->string ().string (); word_len = strlen (word_str); rep_ch = (REP_CH *) alloc_mem (word_len * sizeof (REP_CH)); for (i = 0; i < word_len; i++) { for (j = 0; j < rep_ch_count && rep_ch[j].ch != word_str[i]; j++); if (j < rep_ch_count) rep_ch[j].count++; else { rep_ch[rep_ch_count].ch = word_str[i]; rep_ch[rep_ch_count].count = 1; rep_ch_count++; } } for (j = 0; j < rep_ch_count; j++) { total += rep_ch[j].count; if ((rep_ch[j].count > max) && (rep_ch[j].ch != ' ')) { max = rep_ch[j].count; maxch = rep_ch[j].ch; } } // tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n", // word_str, word_len, total, maxch ); free_mem(rep_ch); word->reject_map.initialise (word_len); for (i = 0; i < word_len; i++) { if (word_str[i] != maxch) //rej unrecognised blobs word->reject_map[i].setrej_bad_repetition (); } word->done = TRUE;}/********************************************************************** * fix_quotes * * Change pairs of quotes to double quotes. **********************************************************************/void fix_quotes( //make double quotes char *string, //string to fix WERD *word, //word to do //char choices BLOB_CHOICE_LIST_CLIST *blob_choices) { char *ptr; //string ptr //blobs PBLOB_IT blob_it = word->blob_list (); //choices BLOB_CHOICE_LIST_C_IT choice_it = blob_choices; BLOB_CHOICE_IT it1; //first choices BLOB_CHOICE_IT it2; //second choices for (ptr = string; *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) { if ((*ptr == '\'' || *ptr == '`') && (*(ptr + 1) == '\'' || *(ptr + 1) == '`')) { *ptr = '"'; //turn to double strcpy (ptr + 1, ptr + 2); //shuffle up merge_blobs (blob_it.data (), blob_it.data_relative (1)); blob_it.forward (); delete blob_it.extract (); //get rid of spare it1.set_to_list (choice_it.data ()); it2.set_to_list (choice_it.data_relative (1)); if (it1.data ()->certainty () < it2.data ()->certainty ()) { choice_it.forward (); //get rid of spare delete choice_it.extract (); } else { //get rid of spare delete choice_it.extract (); choice_it.forward (); } } }}/********************************************************************** * fix_hyphens * * Change pairs of hyphens to a single hyphen if the bounding boxes touch * Typically a long dash which has been segmented. **********************************************************************/void fix_hyphens( //crunch double hyphens char *string, //string to fix WERD *word, //word to do //char choices BLOB_CHOICE_LIST_CLIST *blob_choices) { char *ptr; //string ptr //blobs PBLOB_IT blob_it = word->blob_list (); //choices BLOB_CHOICE_LIST_C_IT choice_it = blob_choices; BLOB_CHOICE_IT it1; //first choices BLOB_CHOICE_IT it2; //second choices for (ptr = string; *ptr != '\0'; ptr++, blob_it.forward (), choice_it.forward ()) { if ((*ptr == '-' || *ptr == '~') && (*(ptr + 1) == '-' || *(ptr + 1) == '~') && (blob_it.data ()->bounding_box ().right () >= blob_it.data_relative (1)->bounding_box ().left ())) { *ptr = '-'; //turn to single hyphen strcpy (ptr + 1, ptr + 2); //shuffle up merge_blobs (blob_it.data (), blob_it.data_relative (1)); blob_it.forward (); delete blob_it.extract (); //get rid of spare it1.set_to_list (choice_it.data ()); it2.set_to_list (choice_it.data_relative (1)); if (it1.data ()->certainty () < it2.data ()->certainty ()) { choice_it.forward (); //get rid of spare delete choice_it.extract (); } else { //get rid of spare delete choice_it.extract (); choice_it.forward (); } } }}/********************************************************************** * merge_blobs * * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted. **********************************************************************/void merge_blobs( //combine 2 blobs PBLOB *blob1, //dest blob PBLOB *blob2 //source blob ) { OUTLINE_IT outline_it = blob1->out_list (); //iterator outline_it.move_to_last (); //go to end //do it outline_it.add_list_after (blob2->out_list ());}/********************************************************************** * choice_dump_tester * * Matcher tester function which generates .chc file entries. * Called via test_segment_pass2 for every blob tested by tess in a word. * (But only for words for which a correct segmentation could be found.) **********************************************************************/void choice_dump_tester( //dump chars in word PBLOB *, //blob DENORM *, //de-normaliser BOOL8 correct, //ly segmented char *text, //correct text INT32 count, //chars in text BLOB_CHOICE_LIST *ratings //list of results ) { STRING choice_file_name; BLOB_CHOICE *blob_choice; BLOB_CHOICE_IT it; char source_chars[20]; char correct_char[3]; if (choice_file == NULL) { choice_file_name = imagebasename + ".chc"; if (!(choice_file = fopen (choice_file_name.string (), "w"))) { CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d", choice_file_name.string (), errno); } } if ((count == 0) || (text == NULL) || (text[0] == '\0')) { strcpy (source_chars, "$$"); strcpy (correct_char, "$$"); } else { strncpy(source_chars, text, count); source_chars[count] = '\0'; if (correct) { correct_char[0] = text[0]; correct_char[1] = '\0'; } else { strcpy (correct_char, "$$"); } } fprintf (choice_file, "%s\t%s", source_chars, correct_char); it.set_to_list (ratings); for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) { blob_choice = it.data (); if ((blob_choice->char_class () >= '!') && (blob_choice->char_class () <= '~')) fprintf (choice_file, "\t%c\t%f\t%f", blob_choice->char_class (), blob_choice->rating (), blob_choice->certainty ()); } fprintf (choice_file, "\n");}/************************************************************************* * make_bln_copy() * * Generate a baseline normalised copy of the source word. The copy is done so * that whatever format the original word is in, a polygonal bln version is * generated as output. *************************************************************************/WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) { WERD *result; // if (wordit_linearc && !src_word->flag(W_POLYGON)) // { // larc_word = src_word->larc_copy( row->x_height() ); // result = larc_word->poly_copy( row->x_height() ); // delete larc_word; // } // else result = src_word->poly_copy (row->x_height ()); // if (tessedit_draw_words) // { // if ( la_win == NO_WINDOW ) // create_la_win(); // result->plot( la_win ); // } result->baseline_normalise_x (row, x_height, denorm); return result;}ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s) { int i = 0; int leading_punct_count; int upper_count = 0; int hyphen_pos = -1; ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE; if (strlen (s) > 20) return word_type; /* Single Leading punctuation char*/ if ((s[i] != '\0') && (STRING (chs_leading_punct).contains (s[i]))) i++; leading_punct_count = i; /* Initial cap */ while (isupper (s[i])) { i++; upper_count++; } if (upper_count > 1) word_type = AC_UPPER_CASE; else { /* Lower case word, possibly with an initial cap */ while (islower (s[i])) { i++; } if (i - leading_punct_count < quality_min_initial_alphas_reqd) goto not_a_word; /* Allow a single hyphen in a lower case word - dont trust upper case - I've seen several cases of "H" -> "I-I" */ if (s[i] == '-') { hyphen_pos = i++; if (s[i] != '\0') { while (islower (s[i])) { i++; } if (i < hyphen_pos + 3) goto not_a_word; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -