⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 reject.cpp

📁 一份OCR的相关资料，希望对研究OCR的朋友有所帮助。
💻 CPP
📖 第 1 页 / 共 4 页
字号:
  check_debug_pt (word, 10);  if (tessedit_rejection_debug) {    tprintf ("Permuter Type = %d\n", word->best_choice->permuter ());    tprintf ("Certainty: %f     Rating: %f\n",      word->best_choice->certainty (), word->best_choice->rating ());    tprintf ("Dict word: %d\n",      dict_word (word->best_choice->string ().string ()));  }  /* Un-reject any rejected characters if NN permits */  if (tessedit_use_nn && (pass == 2) &&    word->reject_map.recoverable_rejects ())    nn_recover_rejects(word, row);  flip_hyphens(word);  check_debug_pt (word, 20);}void reject_blanks(WERD_RES *word) {  INT16 i;  for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {    if (word->best_choice->string ()[i] == ' ')                                 //rej unrecognised blobs      word->reject_map[i].setrej_tess_failure ();  }}void reject_I_1_L(WERD_RES *word) {  INT16 i;  for (i = 0; word->best_choice->string ()[i] != '\0'; i++) {    if (STRING (conflict_set_I_l_1).    contains (word->best_choice->string ()[i])) {                                 //rej 1Il conflict      word->reject_map[i].setrej_1Il_conflict ();    }  }}void reject_poor_matches(  //detailed results                         WERD_RES *word,                         BLOB_CHOICE_LIST_CLIST *blob_choices) {  float threshold;  INT16 i = 0;                                 //super iterator  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;  BLOB_CHOICE_IT choice_it;      //real iterator  #ifndef SECURE_NAMES  if (strlen (word->best_choice->string ().string ()) != list_it.length ()) {    tprintf      ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",      word->best_choice->string ().string (),      strlen (word->best_choice->string ().string ()), list_it.length (),      word->outword->blob_list ()->length ());  }  #endif  ASSERT_HOST (strlen (word->best_choice->string ().string ()) ==    list_it.length ());  ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());  threshold = 
compute_reject_threshold (blob_choices);  for (list_it.mark_cycle_pt ();  !list_it.cycled_list (); list_it.forward (), i++) {    /* NB - only compares the threshold against the TOP choice char in the      choices list for a blob !! - the selected one may be below the threshold */    choice_it.set_to_list (list_it.data ());    if ((word->best_choice->string ()[i] == ' ') ||      (choice_it.length () == 0))                                 //rej unrecognised blobs      word->reject_map[i].setrej_tess_failure ();    else if (choice_it.data ()->certainty () < threshold)                                 //rej poor score blob      word->reject_map[i].setrej_poor_match ();  }}/********************************************************************** * compute_reject_threshold * * Set a rejection threshold for this word. * Initially this is a trivial function which looks for the largest * gap in the certainty value. **********************************************************************/float compute_reject_threshold(  //compute threshold //detailed results                               BLOB_CHOICE_LIST_CLIST *blob_choices) {  INT16 index;                   //to ratings  INT16 blob_count;              //no of blobs in word  INT16 ok_blob_count = 0;       //non TESS rej blobs in word  float *ratings;                //array of confidences  float threshold;               //rejection threshold  float bestgap;                 //biggest gap  float gapstart;                //bottom of gap                                 //super iterator  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;  BLOB_CHOICE_IT choice_it;      //real iterator  blob_count = blob_choices->length ();  ratings = (float *) alloc_mem (blob_count * sizeof (float));  for (list_it.mark_cycle_pt (), index = 0;  !list_it.cycled_list (); list_it.forward (), index++) {    choice_it.set_to_list (list_it.data ());    if (choice_it.length () > 0) {      ratings[ok_blob_count] = choice_it.data ()->certainty ();      //get in an 
array      //                 tprintf("Rating[%d]=%c %g %g\n",      //                         index,choice_it.data()->char_class(),      //                         choice_it.data()->rating(),choice_it.data()->certainty());      ok_blob_count++;    }  }  ASSERT_HOST (index == blob_count);  qsort (ratings, ok_blob_count, sizeof (float), sort_floats);  //sort them  bestgap = 0;  gapstart = ratings[0] - 1;     //all reject if none better  if (ok_blob_count >= 3) {    for (index = 0; index < ok_blob_count - 1; index++) {      if (ratings[index + 1] - ratings[index] > bestgap) {        bestgap = ratings[index + 1] - ratings[index];        //find biggest        gapstart = ratings[index];      }    }  }  threshold = gapstart + bestgap / 2;  //      tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",  //              ratings[0],ratings[index],bestgap,threshold);  free_mem(ratings);  return threshold;}/********************************************************************** * sort_floats * * qsort function to sort 2 floats. **********************************************************************/int sort_floats(                   //qsort function                const void *arg1,  //ptrs to floats                const void *arg2) {  float diff;                    //difference  diff = *((float *) arg1) - *((float *) arg2);  if (diff > 0)    return 1;  else if (diff < 0)    return -1;  else    return 0;}/************************************************************************* * reject_edge_blobs() * * If the word is perilously close to the edge of the image, reject those blobs * in the word which are too close to the edge as they could be clipped. 
*************************************************************************/void reject_edge_blobs(WERD_RES *word) {  BOX word_box = word->word->bounding_box ();  BOX blob_box;  PBLOB_IT blob_it = word->outword->blob_list ();  //blobs  int blobindex = 0;  float centre;  if ((word_box.left () < tessedit_image_border) ||    (word_box.bottom () < tessedit_image_border) ||    (word_box.right () + tessedit_image_border >    page_image.get_xsize () - 1) ||  (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {    ASSERT_HOST (word->reject_map.length () == blob_it.length ());    for (blobindex = 0, blob_it.mark_cycle_pt ();    !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {      blob_box = blob_it.data ()->bounding_box ();      centre = (blob_box.left () + blob_box.right ()) / 2.0;      if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||        (word->denorm.y (blob_box.bottom (), centre) <        tessedit_image_border) ||        (word->denorm.x (blob_box.right ()) + tessedit_image_border >        page_image.get_xsize () - 1) ||        (word->denorm.y (blob_box.top (), centre)      + tessedit_image_border > page_image.get_ysize () - 1)) {        word->reject_map[blobindex].setrej_edge_char ();        //close to edge      }    }  }}/********************************************************************** * one_ell_conflict() * * Identify words where there is a potential I/l/1 error. * - A bundle of contextual heuristics! **********************************************************************/BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {  const char *word;  INT16 word_len;                //its length  INT16 first_alphanum_idx;  INT16 i;  BOOL8 non_conflict_set_char;   //non conf set a/n?  
BOOL8 conflict = FALSE;  BOOL8 allow_1s;  ACCEPTABLE_WERD_TYPE word_type;  BOOL8 dict_perm_type;  BOOL8 dict_word_ok;  int dict_word_type;  word = word_res->best_choice->string ().string ();  word_len = strlen (word);  /*    If there are no occurrences of the conflict set characters then the word    is OK.  */  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)    return FALSE;  /*    There is a conflict if there are NO other (confirmed) alphanumerics apart    from those in the conflict set.  */  for (i = 0, non_conflict_set_char = FALSE;    (i < word_len) && !non_conflict_set_char; i++)  non_conflict_set_char = isalnum (word[i]) &&      !STRING (conflict_set_I_l_1).contains (word[i]);  if (!non_conflict_set_char) {    if (update_map)      reject_I_1_L(word_res);    return TRUE;  }  /*    If the word is accepted by a dawg permuter, and the first alpha character    is "I" or "l", check to see if the alternative is also a dawg word. If it    is, then there is a potential error otherwise the word is ok.  */  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||    (word_res->best_choice->permuter () == USER_DAWG_PERM) ||    (rej_trust_doc_dawg &&    (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||    (word_res->best_choice->permuter () == FREQ_DAWG_PERM);  dict_word_type = dict_word (word);  dict_word_ok = (dict_word_type > 0) &&    (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));  if ((rej_1Il_use_dict_word && dict_word_ok) ||    (rej_1Il_trust_permuter_type && dict_perm_type) ||  (dict_perm_type && dict_word_ok)) {    first_alphanum_idx = first_alphanum_pos (word);    if (word[first_alphanum_idx] == 'I') {      word_res->best_choice->string ()[first_alphanum_idx] = 'l';      if (safe_dict_word (word) > 0) {        word_res->best_choice->string ()[first_alphanum_idx] = 'I';        if (update_map)          word_res->reject_map[first_alphanum_idx].            
setrej_1Il_conflict();        return TRUE;      }      else {        word_res->best_choice->string ()[first_alphanum_idx] = 'I';        return FALSE;      }    }    if (word[first_alphanum_idx] == 'l') {      word_res->best_choice->string ()[first_alphanum_idx] = 'I';      if (safe_dict_word (word) > 0) {        word_res->best_choice->string ()[first_alphanum_idx] = 'l';        if (update_map)          word_res->reject_map[first_alphanum_idx].            setrej_1Il_conflict();        return TRUE;      }      else {        word_res->best_choice->string ()[first_alphanum_idx] = 'l';        return FALSE;      }    }    return FALSE;  }  /*    NEW 1Il code. The old code relied on permuter types too much. In fact,    tess will use TOP_CHOICE permute for good things like "palette".    In this code the string is examined independently to see if it looks like    a well formed word.  */  /*    REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a    dictionary word.  */  first_alphanum_idx = first_alphanum_pos (word);  if (word[first_alphanum_idx] == 'l') {    word_res->best_choice->string ()[first_alphanum_idx] = 'I';    if (safe_dict_word (word) > 0)      return FALSE;    else      word_res->best_choice->string ()[first_alphanum_idx] = 'l';  }  else if (word[first_alphanum_idx] == 'I') {    word_res->best_choice->string ()[first_alphanum_idx] = 'l';    if (safe_dict_word (word) > 0)      return FALSE;    else      word_res->best_choice->string ()[first_alphanum_idx] = 'I';  }  /*    For strings containing digits:      If there are no alphas OR the numeric permuter liked the word,        reject any non 1 conflict chs      Else reject all conflict chs  */  if (word_contains_non_1_digit (word)) {    allow_1s = (alpha_count (word) == 0) ||      (word_res->best_choice->permuter () == NUMBER_PERM);    conflict = FALSE;    for (i = 0; i < word_len; i++) {      if ((!allow_1s || (word[i] != '1')) &&      STRING (conflict_set_I_l_1).contains (word[i])) {        if 
(update_map)          word_res->reject_map[i].setrej_1Il_conflict ();        conflict = TRUE;      }    }    return conflict;  }  /*    For anything else. See if it conforms to an acceptable word type. If so,    treat accordingly.  */  word_type = acceptable_word_string (word);  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {    first_alphanum_idx = first_alphanum_pos (word);    if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_idx])) {      if (update_map)        word_res->reject_map[first_alphanum_idx].setrej_1Il_conflict ();      return TRUE;    }    else      return FALSE;  }  else if (word_type == AC_UPPER_CASE) {    return FALSE;  }  else {    if (update_map)      reject_I_1_L(word_res);    return TRUE;  }}INT16 first_alphanum_pos(const char *word) {  INT16 i;  for (i = 0; word[i] != '\0'; i++) {    if (isalnum (word[i]))      return i;  }  return -1;}INT16 alpha_count(const char *word) {  INT16 i;  INT16 count = 0;  for (i = 0; word[i] != '\0'; i++) {    if (isalpha (word[i]))      count++;  }  return count;}BOOL8 word_contains_non_1_digit(const char *word) {  INT16 i;  for (i = 0; word[i] != '\0'; i++) {    if (isdigit (word[i]) && word[i] != '1')      return TRUE;  }  return FALSE;}BOOL8 test_ambig_word(  //test for ambiguity                      WERD_RES *word) {  BOOL8 ambig = FALSE;  if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||    (word->best_choice->permuter () == FREQ_DAWG_PERM) ||  (word->best_choice->permuter () == USER_DAWG_PERM)) {    ambig = !NoDangerousAmbig(word->best_choice->string().string(), NULL);  }  return ambig;}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -