reject.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 1,776 行 · 第 1/4 页

CPP
1,776
字号
       offset += word->best_choice->lengths ()[i], i += 1) {    if (word->best_choice->string ()[offset] == ' ')                                 //rej unrecognised blobs      word->reject_map[i].setrej_tess_failure ();  }}void reject_I_1_L(WERD_RES *word) {  inT16 i;  inT16 offset;  for (i = 0, offset = 0; word->best_choice->string ()[offset] != '\0';       offset += word->best_choice->lengths ()[i], i += 1) {    if (STRING (conflict_set_I_l_1).    contains (word->best_choice->string ()[offset])) {                                 //rej 1Il conflict      word->reject_map[i].setrej_1Il_conflict ();    }  }}void reject_poor_matches(  //detailed results                         WERD_RES *word,                         BLOB_CHOICE_LIST_CLIST *blob_choices) {  float threshold;  inT16 i = 0;  inT16 offset = 0;                                 //super iterator  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;  BLOB_CHOICE_IT choice_it;      //real iterator  #ifndef SECURE_NAMES  if (strlen (word->best_choice->lengths ().string ()) != list_it.length ()) {    tprintf      ("ASSERT FAIL string:\"%s\"; strlen=%d; choices len=%d; blob len=%d\n",      word->best_choice->string ().string (),      strlen (word->best_choice->lengths ().string ()), list_it.length (),      word->outword->blob_list ()->length ());  }  #endif  ASSERT_HOST (strlen (word->best_choice->lengths ().string ()) ==    list_it.length ());  ASSERT_HOST (word->outword->blob_list ()->length () == list_it.length ());  threshold = compute_reject_threshold (blob_choices);  for (list_it.mark_cycle_pt ();  !list_it.cycled_list (); list_it.forward (), i++,           offset += word->best_choice->lengths ()[i]) {    /* NB - only compares the threshold against the TOP choice char in the      choices list for a blob !! - the selected one may be below the threshold */    choice_it.set_to_list (list_it.data ());    if ((word->best_choice->string ()[offset] == ' ') ||      (choice_it.length () == 0))                                 //rej unrecognised blobs      word->reject_map[i].setrej_tess_failure ();    else if (choice_it.data ()->certainty () < threshold)                                 //rej poor score blob      word->reject_map[i].setrej_poor_match ();  }}/********************************************************************** * compute_reject_threshold * * Set a rejection threshold for this word. * Initially this is a trivial function which looks for the largest * gap in the certainty value. **********************************************************************/float compute_reject_threshold(  //compute threshold //detailed results                               BLOB_CHOICE_LIST_CLIST *blob_choices) {  inT16 index;                   //to ratings  inT16 blob_count;              //no of blobs in word  inT16 ok_blob_count = 0;       //non TESS rej blobs in word  float *ratings;                //array of confidences  float threshold;               //rejection threshold  float bestgap;                 //biggest gap  float gapstart;                //bottom of gap                                 //super iterator  BLOB_CHOICE_LIST_C_IT list_it = blob_choices;  BLOB_CHOICE_IT choice_it;      //real iterator  blob_count = blob_choices->length ();  ratings = (float *) alloc_mem (blob_count * sizeof (float));  for (list_it.mark_cycle_pt (), index = 0;  !list_it.cycled_list (); list_it.forward (), index++) {    choice_it.set_to_list (list_it.data ());    if (choice_it.length () > 0) {      ratings[ok_blob_count] = choice_it.data ()->certainty ();      //get in an array      //                 tprintf("Rating[%d]=%c %g %g\n",      //                         index,choice_it.data()->char_class(),      //                         choice_it.data()->rating(),choice_it.data()->certainty());      ok_blob_count++;    }  }  ASSERT_HOST (index == blob_count);  qsort (ratings, ok_blob_count, sizeof (float), sort_floats);  //sort them  bestgap = 0;  gapstart = ratings[0] - 1;     //all reject if none better  if (ok_blob_count >= 3) {    for (index = 0; index < ok_blob_count - 1; index++) {      if (ratings[index + 1] - ratings[index] > bestgap) {        bestgap = ratings[index + 1] - ratings[index];        //find biggest        gapstart = ratings[index];      }    }  }  threshold = gapstart + bestgap / 2;  //      tprintf("First=%g, last=%g, gap=%g, threshold=%g\n",  //              ratings[0],ratings[index],bestgap,threshold);  free_mem(ratings);  return threshold;}/********************************************************************** * sort_floats * * qsort function to sort 2 floats. **********************************************************************/int sort_floats(                   //qsort function                const void *arg1,  //ptrs to floats                const void *arg2) {  float diff;                    //difference  diff = *((float *) arg1) - *((float *) arg2);  if (diff > 0)    return 1;  else if (diff < 0)    return -1;  else    return 0;}/************************************************************************* * reject_edge_blobs() * * If the word is perilously close to the edge of the image, reject those blobs * in the word which are too close to the edge as they could be clipped. *************************************************************************/void reject_edge_blobs(WERD_RES *word) {  TBOX word_box = word->word->bounding_box ();  TBOX blob_box;  PBLOB_IT blob_it = word->outword->blob_list ();  //blobs  int blobindex = 0;  float centre;  if ((word_box.left () < tessedit_image_border) ||    (word_box.bottom () < tessedit_image_border) ||    (word_box.right () + tessedit_image_border >    page_image.get_xsize () - 1) ||  (word_box.top () + tessedit_image_border > page_image.get_ysize () - 1)) {    ASSERT_HOST (word->reject_map.length () == blob_it.length ());    for (blobindex = 0, blob_it.mark_cycle_pt ();    !blob_it.cycled_list (); blobindex++, blob_it.forward ()) {      blob_box = blob_it.data ()->bounding_box ();      centre = (blob_box.left () + blob_box.right ()) / 2.0;      if ((word->denorm.x (blob_box.left ()) < tessedit_image_border) ||        (word->denorm.y (blob_box.bottom (), centre) <        tessedit_image_border) ||        (word->denorm.x (blob_box.right ()) + tessedit_image_border >        page_image.get_xsize () - 1) ||        (word->denorm.y (blob_box.top (), centre)      + tessedit_image_border > page_image.get_ysize () - 1)) {        word->reject_map[blobindex].setrej_edge_char ();        //close to edge      }    }  }}/********************************************************************** * one_ell_conflict() * * Identify words where there is a potential I/l/1 error. * - A bundle of contextual heuristics! **********************************************************************/BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map) {  const char *word;  const char *lengths;  inT16 word_len;                //its length  inT16 first_alphanum_index_;  inT16 first_alphanum_offset_;  inT16 i;  inT16 offset;  BOOL8 non_conflict_set_char;   //non conf set a/n?  BOOL8 conflict = FALSE;  BOOL8 allow_1s;  ACCEPTABLE_WERD_TYPE word_type;  BOOL8 dict_perm_type;  BOOL8 dict_word_ok;  int dict_word_type;  word = word_res->best_choice->string ().string ();  lengths = word_res->best_choice->lengths().string();  word_len = strlen (lengths);  /*    If there are no occurrences of the conflict set characters then the word    is OK.  */  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)    return FALSE;  /*    There is a conflict if there are NO other (confirmed) alphanumerics apart    from those in the conflict set.  */  for (i = 0, offset = 0, non_conflict_set_char = FALSE;       (i < word_len) && !non_conflict_set_char; offset += lengths[i++])    non_conflict_set_char =        (unicharset.get_isalpha(word + offset, lengths[i]) ||         unicharset.get_isdigit(word + offset, lengths[i])) &&        !STRING (conflict_set_I_l_1).contains (word[offset]);  if (!non_conflict_set_char) {    if (update_map)      reject_I_1_L(word_res);    return TRUE;  }  /*    If the word is accepted by a dawg permuter, and the first alpha character    is "I" or "l", check to see if the alternative is also a dawg word. If it    is, then there is a potential error otherwise the word is ok.  */  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||    (word_res->best_choice->permuter () == USER_DAWG_PERM) ||    (rej_trust_doc_dawg &&    (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||    (word_res->best_choice->permuter () == FREQ_DAWG_PERM);  dict_word_type = dict_word (word);  dict_word_ok = (dict_word_type > 0) &&    (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));  if ((rej_1Il_use_dict_word && dict_word_ok) ||    (rej_1Il_trust_permuter_type && dict_perm_type) ||  (dict_perm_type && dict_word_ok)) {    first_alphanum_index_ = first_alphanum_index (word, lengths);    first_alphanum_offset_ = first_alphanum_offset (word, lengths);    if (lengths[first_alphanum_index_] == 1 &&        word[first_alphanum_offset_] == 'I') {      word_res->best_choice->string ()[first_alphanum_offset_] = 'l';      if (safe_dict_word (word) > 0) {        word_res->best_choice->string ()[first_alphanum_offset_] = 'I';        if (update_map)          word_res->reject_map[first_alphanum_index_].            setrej_1Il_conflict();        return TRUE;      }      else {        word_res->best_choice->string ()[first_alphanum_offset_] = 'I';        return FALSE;      }    }    if (lengths[first_alphanum_index_] == 1 &&        word[first_alphanum_offset_] == 'l') {      word_res->best_choice->string ()[first_alphanum_offset_] = 'I';      if (safe_dict_word (word) > 0) {        word_res->best_choice->string ()[first_alphanum_offset_] = 'l';        if (update_map)          word_res->reject_map[first_alphanum_index_].            setrej_1Il_conflict();        return TRUE;      }      else {        word_res->best_choice->string ()[first_alphanum_offset_] = 'l';        return FALSE;      }    }    return FALSE;  }  /*    NEW 1Il code. The old code relied on permuter types too much. In fact,    tess will use TOP_CHOICE permute for good things like "palette".    In this code the string is examined independently to see if it looks like    a well formed word.  */  /*    REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a    dictionary word.  */  first_alphanum_index_ = first_alphanum_index (word, lengths);  first_alphanum_offset_ = first_alphanum_offset (word, lengths);  if (lengths[first_alphanum_index_] == 1 &&      word[first_alphanum_offset_] == 'l') {    word_res->best_choice->string ()[first_alphanum_offset_] = 'I';    if (safe_dict_word (word) > 0)      return FALSE;    else      word_res->best_choice->string ()[first_alphanum_offset_] = 'l';  }  else if (lengths[first_alphanum_index_] == 1 &&           word[first_alphanum_offset_] == 'I') {    word_res->best_choice->string ()[first_alphanum_offset_] = 'l';    if (safe_dict_word (word) > 0)      return FALSE;    else      word_res->best_choice->string ()[first_alphanum_offset_] = 'I';  }  /*    For strings containing digits:      If there are no alphas OR the numeric permuter liked the word,        reject any non 1 conflict chs      Else reject all conflict chs  */  if (word_contains_non_1_digit (word, lengths)) {    allow_1s = (alpha_count (word, lengths) == 0) ||      (word_res->best_choice->permuter () == NUMBER_PERM);    inT16 offset;    conflict = FALSE;    for (i = 0, offset = 0; word[offset] != '\0';         offset += word_res->best_choice->lengths ()[i++]) {      if ((!allow_1s || (word[offset] != '1')) &&      STRING (conflict_set_I_l_1).contains (word[offset])) {        if (update_map)          word_res->reject_map[i].setrej_1Il_conflict ();        conflict = TRUE;      }    }    return conflict;  }  /*    For anything else. See if it conforms to an acceptable word type. If so,    treat accordingly.  */  word_type = acceptable_word_string (word, lengths);  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {    first_alphanum_index_ = first_alphanum_index (word, lengths);    first_alphanum_offset_ = first_alphanum_offset (word, lengths);    if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {      if (update_map)        word_res->reject_map[first_alphanum_index_].            setrej_1Il_conflict ();      return TRUE;    }    else      return FALSE;  }  else if (word_type == AC_UPPER_CASE) {    return FALSE;  }  else {    if (update_map)      reject_I_1_L(word_res);    return TRUE;  }}inT16 first_alphanum_index(const char *word,                           const char *word_lengths) {  inT16 i;  inT16 offset;  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {    if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||        unicharset.get_isdigit(word + offset, word_lengths[i]))      return i;  }  return -1;}inT16 first_alphanum_offset(const char *word,                            const char *word_lengths) {  inT16 i;  inT16 offset;  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {    if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||        unicharset.get_isdigit(word + offset, word_lengths[i]))      return offset;  }  return -1;}inT16 alpha_count(const char *word,                  const char *word_lengths) {  inT16 i;  inT16 offset;  inT16 count = 0;  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {    if (unicharset.get_isalpha (word + offset, word_lengths[i]))      count++;  }  return count;}BOOL8 word_contains_non_1_digit(const char *word,                                const char *word_lengths) {  inT16 i;  inT16 offset;  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {    if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&        (word_lengths[i] != 1 || word[offset] != '1'))      return TRUE;  }  return FALSE;}BOOL8 test_ambig_word(  //test for ambiguity                      WERD_RES *word) {  BOOL8 ambig = FALSE;  if ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||    (word->best_choice->permuter () == FREQ_DAWG_PERM) ||  (word->best_choice->permuter () == USER_DAWG_PERM)) {    ambig = !NoDangerousAmbig(word->best_choice->string().string(),                              word->best_choice->lengths().string(),                              NULL);  }  return ambig;}/************************************************************************* * ambig_word() * * This is a recursive routine which tests the dictionary for all combinations * of conflict set alternatives for characters in a given word. *************************************************************************/BOOL8 ambig_word(                     //original word

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?