📄 reject.cpp
字号:
check_debug_pt (word, 10); if (tessedit_rejection_debug) { tprintf ("Permuter Type = %d\n", word->best_choice->permuter ()); tprintf ("Certainty: %f Rating: %f\n", word->best_choice->certainty (), word->best_choice->rating ()); tprintf ("Dict word: %d\n", dict_word (word->best_choice->string ().string ())); } /* Un-reject any rejected characters if NN permits */ if (tessedit_use_nn && (pass == 2) && word->reject_map.recoverable_rejects ()) nn_recover_rejects(word, row); flip_hyphens(word); check_debug_pt (word, 20);}void reject_blanks(WERD_RES *word) { INT16 i; for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { if (word->best_choice->string ()[i] == ' ') //rej unrecognised blobs word->reject_map[i].setrej_tess_failure (); }}void reject_I_1_L(WERD_RES *word) { INT16 i; for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { if (STRING (conflict_set_I_l_1). contains (word->best_choice->string ()[i])) { //rej 1Il conflict word->reject_map[i].setrej_1Il_conflict (); } }}void reject_poor_matches( //detailed results WERD_RES *word, BLOB_CHOICE_LIST_CLIST *blob_choices) { float threshold; INT16 i = 0; //super iterator BLOB_CHOICE_LIST_C_IT list_it = blob_choices; BLOB_CHOICE_IT choice_it; //real iterator #ifndef SECURE_NAMES if (strlen (word->best_choice->string ().string ()) != list_it.length ()) { tprintf ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n", word->best_choice->string ().string (), strlen (word->best_choice->string ().string ()), list_it.length (), word->outword->blob_list ()->length ()); } #endif ASSERT_HOST (strlen (word->best_choice->string ().string ()) == list_it.length ()); ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ()); threshold = compute_reject_threshold (blob_choices); for (list_it.mark_cycle_pt (); !list_it.cycled_list (); list_it.forward (), i++) { /* NB - only compares the threshold against the TOP choice char in the choices list for a blob !! - the selected one may be below the threshold */ choice_it.set_to_list (list_it.data ()); if ((word->best_choice->string ()[i] == ' ') || (choice_it.length () == 0)) //rej unrecognised blobs word->reject_map[i].setrej_tess_failure (); else if (choice_it.data ()->certainty () < threshold) //rej poor score blob word->reject_map[i].setrej_poor_match (); }}/********************************************************************** * compute_reject_threshold * * Set a rejection threshold for this word. * Initially this is a trivial function which looks for the largest * gap in the certainty value. **********************************************************************/float compute_reject_threshold( //compute threshold //detailed results BLOB_CHOICE_LIST_CLIST *blob_choices) { INT16 index; //to ratings INT16 blob_count; //no of blobs in word INT16 ok_blob_count = 0; //non TESS rej blobs in word float *ratings; //array of confidences float threshold; //rejection threshold float bestgap; //biggest gap float gapstart; //bottom of gap //super iterator BLOB_CHOICE_LIST_C_IT list_it = blob_choices; BLOB_CHOICE_IT choice_it; //real iterator blob_count = blob_choices->length (); ratings = (float *) alloc_mem (blob_count * sizeof (float)); for (list_it.mark_cycle_pt (), index = 0; !list_it.cycled_list (); list_it.forward (), index++) { choice_it.set_to_list (list_it.data ()); if (choice_it.length () > 0) { ratings[ok_blob_count] = choice_it.data ()->certainty (); //get in an array // tprintf("Rating[%d]=%c %g %g\n", // index,choice_it.data()->char_class(), // choice_it.data()->rating(),choice_it.data()->certainty()); ok_blob_count++; } } ASSERT_HOST (index == blob_count); qsort (ratings, ok_blob_count, sizeof (float), sort_floats); //sort them bestgap = 0; gapstart = ratings[0] - 1; //all reject if none better if (ok_blob_count >= 3) { for (index = 0; index < ok_blob_count - 1; index++) { if (ratings[index + 1] - ratings[index] > bestgap) { bestgap = ratings[index + 1] - ratings[index]; //find biggest gapstart = ratings[index]; } } } threshold = gapstart + bestgap / 2; // tprintf("First=%g, last=%g, gap=%g, threshold=%g\n", // ratings[0],ratings[index],bestgap,threshold); free_mem(ratings); return threshold;}/********************************************************************** * sort_floats * * qsort function to sort 2 floats. **********************************************************************/int sort_floats( //qsort function const void *arg1, //ptrs to floats const void *arg2) { float diff; //difference diff = *((float *) arg1) - *((float *) arg2); if (diff > 0) return 1; else if (diff < 0) return -1; else return 0;}/************************************************************************* * reject_edge_blobs() * * If the word is perilously close to the edge of the image, reject those blobs * in the word which are too close to the edge as they could be clipped. *************************************************************************/void reject_edge_blobs(WERD_RES *word) { BOX word_box = word->word->bounding_box (); BOX blob_box; PBLOB_IT blob_it = word->outword->blob_list (); //blobs int blobindex = 0; float centre; if ((word_box.left () < tessedit_image_border) || (word_box.bottom () < tessedit_image_border) || (word_box.right () + tessedit_image_border > page_image.get_xsize () - 1) || (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) { ASSERT_HOST (word->reject_map.length () == blob_it.length ()); for (blobindex = 0, blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blobindex++, blob_it.forward ()) { blob_box = blob_it.data ()->bounding_box (); centre = (blob_box.left () + blob_box.right ()) / 2.0; if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) || (word->denorm.y (blob_box.bottom (), centre) < tessedit_image_border) || (word->denorm.x (blob_box.right ()) + tessedit_image_border > page_image.get_xsize () - 1) || (word->denorm.y (blob_box.top (), centre) + tessedit_image_border > page_image.get_ysize () - 1)) { word->reject_map[blobindex].setrej_edge_char (); //close to edge } } }}/********************************************************************** * one_ell_conflict() * * Identify words where there is a potential I/l/1 error. * - A bundle of contextual heuristics! **********************************************************************/BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) { const char *word; INT16 word_len; //its length INT16 first_alphanum_idx; INT16 i; BOOL8 non_conflict_set_char; //non conf set a/n? BOOL8 conflict = FALSE; BOOL8 allow_1s; ACCEPTABLE_WERD_TYPE word_type; BOOL8 dict_perm_type; BOOL8 dict_word_ok; int dict_word_type; word = word_res->best_choice->string ().string (); word_len = strlen (word); /* If there are no occurrences of the conflict set characters then the word is OK. */ if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL) return FALSE; /* There is a conflict if there are NO other (confirmed) alphanumerics apart from those in the conflict set. */ for (i = 0, non_conflict_set_char = FALSE; (i < word_len) && !non_conflict_set_char; i++) non_conflict_set_char = isalnum (word[i]) && !STRING (conflict_set_I_l_1).contains (word[i]); if (!non_conflict_set_char) { if (update_map) reject_I_1_L(word_res); return TRUE; } /* If the word is accepted by a dawg permuter, and the first alpha character is "I" or "l", check to see if the alternative is also a dawg word. If it is, then there is a potential error otherwise the word is ok. */ dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) || (word_res->best_choice->permuter () == USER_DAWG_PERM) || (rej_trust_doc_dawg && (word_res->best_choice->permuter () == DOC_DAWG_PERM)) || (word_res->best_choice->permuter () == FREQ_DAWG_PERM); dict_word_type = dict_word (word); dict_word_ok = (dict_word_type > 0) && (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM)); if ((rej_1Il_use_dict_word && dict_word_ok) || (rej_1Il_trust_permuter_type && dict_perm_type) || (dict_perm_type && dict_word_ok)) { first_alphanum_idx = first_alphanum_pos (word); if (word[first_alphanum_idx] == 'I') { word_res->best_choice->string ()[first_alphanum_idx] = 'l'; if (safe_dict_word (word) > 0) { word_res->best_choice->string ()[first_alphanum_idx] = 'I'; if (update_map) word_res->reject_map[first_alphanum_idx]. setrej_1Il_conflict(); return TRUE; } else { word_res->best_choice->string ()[first_alphanum_idx] = 'I'; return FALSE; } } if (word[first_alphanum_idx] == 'l') { word_res->best_choice->string ()[first_alphanum_idx] = 'I'; if (safe_dict_word (word) > 0) { word_res->best_choice->string ()[first_alphanum_idx] = 'l'; if (update_map) word_res->reject_map[first_alphanum_idx]. setrej_1Il_conflict(); return TRUE; } else { word_res->best_choice->string ()[first_alphanum_idx] = 'l'; return FALSE; } } return FALSE; } /* NEW 1Il code. The old code relied on permuter types too much. In fact, tess will use TOP_CHOICE permute for good things like "palette". In this code the string is examined independently to see if it looks like a well formed word. */ /* REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a dictionary word. */ first_alphanum_idx = first_alphanum_pos (word); if (word[first_alphanum_idx] == 'l') { word_res->best_choice->string ()[first_alphanum_idx] = 'I'; if (safe_dict_word (word) > 0) return FALSE; else word_res->best_choice->string ()[first_alphanum_idx] = 'l'; } else if (word[first_alphanum_idx] == 'I') { word_res->best_choice->string ()[first_alphanum_idx] = 'l'; if (safe_dict_word (word) > 0) return FALSE; else word_res->best_choice->string ()[first_alphanum_idx] = 'I'; } /* For strings containing digits: If there are no alphas OR the numeric permuter liked the word, reject any non 1 conflict chs Else reject all conflict chs */ if (word_contains_non_1_digit (word)) { allow_1s = (alpha_count (word) == 0) || (word_res->best_choice->permuter () == NUMBER_PERM); conflict = FALSE; for (i = 0; i < word_len; i++) { if ((!allow_1s || (word[i] != '1')) && STRING (conflict_set_I_l_1).contains (word[i])) { if (update_map) word_res->reject_map[i].setrej_1Il_conflict (); conflict = TRUE; } } return conflict; } /* For anything else. See if it conforms to an acceptable word type. If so, treat accordingly. */ word_type = acceptable_word_string (word); if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) { first_alphanum_idx = first_alphanum_pos (word); if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_idx])) { if (update_map) word_res->reject_map[first_alphanum_idx].setrej_1Il_conflict (); return TRUE; } else return FALSE; } else if (word_type == AC_UPPER_CASE) { return FALSE; } else { if (update_map) reject_I_1_L(word_res); return TRUE; }}INT16 first_alphanum_pos(const char *word) { INT16 i; for (i = 0; word[i] != '\0'; i++) { if (isalnum (word[i])) return i; } return -1;}INT16 alpha_count(const char *word) { INT16 i; INT16 count = 0; for (i = 0; word[i] != '\0'; i++) { if (isalpha (word[i])) count++; } return count;}BOOL8 word_contains_non_1_digit(const char *word) { INT16 i; for (i = 0; word[i] != '\0'; i++) { if (isdigit (word[i]) && word[i] != '1') return TRUE; } return FALSE;}BOOL8 test_ambig_word( //test for ambiguity WERD_RES *word) { BOOL8 ambig = FALSE; if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) || (word->best_choice->permuter () == FREQ_DAWG_PERM) || (word->best_choice->permuter () == USER_DAWG_PERM)) { ambig = !NoDangerousAmbig(word->best_choice->string().string(), NULL); } return ambig;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -