📄 reject.cpp

📁 一ＯＣＲ的相关资料。．希望对研究ＯＣＲ的朋友有所帮助．
💻 CPP
📖 第 1 页 / 共 4 页
字号:
/************************************************************************* * ambig_word() * * This is a recursive routine which tests the dictionary for all combinations * of conflict set alternatives for characters in a given word. *************************************************************************/BOOL8 ambig_word(                     //original word                 const char *start_word,                 char *temp_word,     //alterable copy                 INT16 test_char_pos  //idx to char to alter                ) {  const char *ambigs;            //Ambiguities for char  if (*(temp_word + test_char_pos) == '\0') {    if (safe_dict_word (temp_word)) {      if (strcmp (start_word, temp_word) == 0)        return FALSE;      else        return TRUE;    }    else      return FALSE;  }  else {    ambigs = char_ambiguities (*(temp_word + test_char_pos));    if (ambigs == NULL)      return ambig_word (start_word, temp_word, test_char_pos + 1);    else {      while (*ambigs != '\0') {        *(temp_word + test_char_pos) = *ambigs++;        //test next ambiguity        if (ambig_word (start_word, temp_word, test_char_pos + 1))          return TRUE;      }      return FALSE;    }  }}/************************************************************************* * char_ambiguities() * * Return a pointer to a string containing the full conflict set of characters * which includes the specified character, if there is one. If the specified * character is not a member of a conflict set, return NULL. * (NOTE that a character is assumed to be a member of only ONE conflict set.) *************************************************************************/const char *char_ambiguities(char c) {  static STRING_CLIST conflict_sets;  static BOOL8 read_conflict_sets = FALSE;  STRING_C_IT cs_it(&conflict_sets);  const char *cs;  STRING cs_file_name;  FILE *cs_file;  char buff[1024];  if (!read_conflict_sets) {    cs_file_name = datadir + "confsets";    if (!(cs_file = fopen (cs_file_name.string (), "r"))) {      CANTOPENFILE.error ("char_ambiguities", EXIT, "%s %d",        cs_file_name.string (), errno);    }    while (fscanf (cs_file, "%s", buff) == 1) {      cs_it.add_after_then_move (new STRING (buff));    }    read_conflict_sets = TRUE;    cs_it.move_to_first ();    if (tessedit_rejection_debug) {      for (cs_it.mark_cycle_pt ();      !cs_it.cycled_list (); cs_it.forward ()) {        tprintf ("\"%s\"\n", cs_it.data ()->string ());      }    }  }  cs_it.move_to_first ();  for (cs_it.mark_cycle_pt (); !cs_it.cycled_list (); cs_it.forward ()) {    cs = cs_it.data ()->string ();    if (strchr (cs, c) != NULL)      return cs;  }  return NULL;}#ifndef EMBEDDEDvoid test_ambigs(const char *word) {  char orig_word[80];  char temp_word[80];  if (strlen (word) > 80)    tprintf ("Ridiculously long word \"%s\"\n", word);  else {    strcpy(orig_word, word);    while (strlen (orig_word) > 0) {      strcpy(temp_word, orig_word);      #ifndef SECURE_NAMES      if (ambig_word (orig_word, temp_word, 0))        tprintf ("Ambiguity \"%s\" -> \"%s\"\n", orig_word, temp_word);      else        tprintf ("NO Ambiguities for  \"%s\"\n", orig_word);      tprintf ("Next Word > ");      #endif      scanf ("%s", orig_word);    }  }}#endif/************************************************************************* * nn_recover_rejects() * Generate the nn_reject_map - a copy of the current reject map, but dont * reject previously rejected chars if the NN matcher agrees with the best * choice. *************************************************************************/void nn_recover_rejects(WERD_RES *word, ROW *row) {                                 //copy for debug  REJMAP old_map = word->reject_map;  /*    NOTE THAT THIS IS RELATIVELY INEFFICIENT AS THE WHOLE OF THE WERD IS    MATCHED BY THE NN MATCHER. IF COULD EASILY BE RESTRICTED TO JUST THE    REJECT CHARACTERS  (Though initial use is when words are total rejects    anyway).  */  set_global_subsubloc_code(SUBSUBLOC_NN);  nn_match_word(word, row);  if (no_unrej_1Il)    dont_allow_1Il(word);  if (no_unrej_dubious_chars)    dont_allow_dubious_chars(word);  if (rej_mostly_reject_mode == 1)    reject_mostly_rejects(word);  /*    IF there are no unrejected alphanumerics AND      The word is not an acceptable single non alphanum char word  AND      The word is not an acceptable repeated non alphanum char word    THEN Reject whole word  */  if (no_unrej_no_alphanum_wds &&    (count_alphanums (word) < 1) &&    !((word->best_choice->string ().length () == 1) &&    STRING (ok_single_ch_non_alphanum_wds).contains (word->best_choice->    string ()[0]))    && !repeated_nonalphanum_wd (word, row))    word->reject_map.rej_word_no_alphanums ();  #ifndef SECURE_NAMES  if (nn_debug) {    tprintf ("\nTess: \"%s\" MAP ", word->best_choice->string ().string ());    old_map.print (stdout);    tprintf ("->");    word->reject_map.print (stdout);    tprintf ("\n");  }  #endif  set_global_subsubloc_code(SUBSUBLOC_OTHER);}void nn_match_word(  //Match a word                   WERD_RES *word,                   ROW *row) {  PIXROW_LIST *pixrow_list;  PIXROW_IT pixrow_it;  IMAGELINE *imlines;            //lines of the image  BOX pix_box;                   //box of imlines extent#ifndef GRAPHICS_DISABLED  WINDOW win = NULL;#endif  IMAGE clip_image;  IMAGE scaled_image;  float baseline_pos;  INT16 net_image_size;  INT16 clip_image_size;  WERD copy_outword;             // copy to denorm  INT16 i;  const char *word_string;  BOOL8 word_in_dict;            //Tess wd in dict  BOOL8 checked_dict_word;       //Tess wd definitely in dict  BOOL8 sensible_word;           //OK char string  BOOL8 centre;                  //Not at word end       chs  BOOL8 good_quality_word;  INT16 char_quality;  INT16 accepted_char_quality;  INT16 conf_level;              //0:REJECT  //1:DODGY ACCEPT  //2:DICT ACCEPT  //3:CLEAR ACCEPT  INT16 first_alphanum_idx;  word_string = word->best_choice->string ().string ();  first_alphanum_idx = first_alphanum_pos (word_string);  word_in_dict = ((word->best_choice->permuter () == SYSTEM_DAWG_PERM) ||    (word->best_choice->permuter () == FREQ_DAWG_PERM) ||    (word->best_choice->permuter () == USER_DAWG_PERM));  checked_dict_word = word_in_dict && (safe_dict_word (word_string) > 0);  sensible_word = acceptable_word_string (word_string) != AC_UNACCEPTABLE;  word_char_quality(word, row, &char_quality, &accepted_char_quality);  good_quality_word = word->best_choice->string ().length () == char_quality;  #ifndef SECURE_NAMES  if (nn_reject_debug) {    tprintf ("Dict: %c   Checked Dict: %c   Sensible: %c   Quality: %c\n",      word_in_dict ? 'T' : 'F',      checked_dict_word ? 'T' : 'F',      sensible_word ? 'T' : 'F', good_quality_word ? 'T' : 'F');  }  #endif  if (word->best_choice->string ().length () !=  word->outword->blob_list ()->length ()) {    #ifndef SECURE_NAMES    tprintf ("nn_match_word ASSERT FAIL String:\"%s\";  #Blobs=%d\n",      word->best_choice->string ().string (),      word->outword->blob_list ()->length ());    #endif    err_exit();  }  copy_outword = *(word->outword);  copy_outword.baseline_denormalise (&word->denorm);  /*    For each character, generate and match a new image, containing JUST the    character we have clipped, centered in the image, on a white background.    Note that we MUST have a square image so that we can scale it uniformly in    x and y.  We base the size on x_height as this can be found fairly reliably.  */  net_image_size = (net_image_width > net_image_height) ?    net_image_width : net_image_height;  clip_image_size = (INT16) floor (0.5 +    net_image_size * word->x_height /    net_image_x_height);  if ((clip_image_size <= 1) || (net_image_size <= 1)) {    return;  }  /*    Get the image of the word and the pix positions of each char  */  char_clip_word(&copy_outword, page_image, pixrow_list, imlines, pix_box);#ifndef GRAPHICS_DISABLED  if (show_char_clipping) {    win = display_clip_image (&copy_outword, page_image,      pixrow_list, pix_box);  }#endif  pixrow_it.set_to_list (pixrow_list);  pixrow_it.move_to_first ();  for (pixrow_it.mark_cycle_pt (), i = 0;  !pixrow_it.cycled_list (); pixrow_it.forward (), i++) {    if (pixrow_it.data ()->      bad_box (page_image.get_xsize (), page_image.get_ysize ()))      continue;    clip_image.create (clip_image_size, clip_image_size, 1);    //make bin imge    if (!copy_outword.flag (W_INVERSE))      invert_image(&clip_image);  //white background for black on white    pixrow_it.data ()->char_clip_image (imlines, pix_box, row,      clip_image, baseline_pos);    if (copy_outword.flag (W_INVERSE))      invert_image(&clip_image);  //invert white on black for scaling &NN    scaled_image.create (net_image_size, net_image_size, 1);    scale_image(clip_image, scaled_image);    baseline_pos *= net_image_size / clip_image_size;    //scale with im    centre = !pixrow_it.at_first () && !pixrow_it.at_last ();    conf_level = nn_match_char (scaled_image, baseline_pos,      word_in_dict, checked_dict_word,      sensible_word, centre,      good_quality_word, word_string[i]);    if (word->reject_map[i].recoverable ()) {      if ((i == first_alphanum_idx) &&      ((word_string[i] == 'I') || (word_string[i] == 'i'))) {        if (conf_level >= nn_conf_initial_i_level)          word->reject_map[i].setrej_nn_accept ();        //un-reject char      }      else if (conf_level > 0)                                 //un-reject char        word->reject_map[i].setrej_nn_accept ();    }#ifndef GRAPHICS_DISABLED    if (show_char_clipping)      display_images(clip_image, scaled_image);#endif    clip_image.destroy ();    scaled_image.destroy ();  }  delete[]imlines;               // Free array of imlines  delete pixrow_list;#ifndef GRAPHICS_DISABLED  if (show_char_clipping) {    destroy_window(win);  }#endif}/************************************************************************* * nn_match_char() * Call Neural Net matcher to match a single character, given a scaled, * square image *************************************************************************/INT16 nn_match_char(                          //of character                    IMAGE &scaled_image,                    float baseline_pos,       //rel to scaled_image                    BOOL8 dict_word,          //part of dict wd?                    BOOL8 checked_dict_word,  //part of dict wd?                    BOOL8 sensible_word,      //part acceptable str?                    BOOL8 centre,             //not at word ends?                    BOOL8 good_quality_word,  //initial segmentation                    char tess_ch              //confirm this?                   ) {  INT16 conf_level;              //0..2  INT32 row;  INT32 col;  INT32 y_size = scaled_image.get_ysize ();  INT32 start_y = y_size - (y_size - net_image_height) / 2 - 1;  INT32 end_y = start_y - net_image_height + 1;  IMAGELINE imline;  float *input_vector;  float *input_vec_ptr;  char top;  float top_score;  char next;  float next_score;  INT16 input_nodes = (net_image_height * net_image_width) + net_bl_nodes;  INT16 j;  input_vector = (float *) alloc_mem (input_nodes * sizeof (float));  input_vec_ptr = input_vector;  invert_image(&scaled_image);  //cos nns work better  for (row = start_y; row >= end_y; row--) {    scaled_image.fast_get_line (0, row, net_image_width, &imline);    for (col = 0; col < net_image_width; col++)      *input_vec_ptr++ = imline.pixels[col];  }  /*    The bit map presented to the net may be shorter than the image, so shift    the coord to be relative to the bitmap portion.  */  baseline_pos -= (y_size - net_image_height) / 2.0;  /*    Baseline pos is 0 if below bitmap, 1 if above and in proportion otherwise.    This is represented to the net as a set of bl_nodes, an initial proportion    of which are set to 1.0, indicating the level of the baseline. The    remainder are 0.0  */  if (baseline_pos < 0)    baseline_pos = 0;  else if (baseline_pos >= net_image_height)    baseline_pos = net_image_height + 1;  else    baseline_pos = baseline_pos + 1;  baseline_pos = baseline_pos / (net_image_height + 1);  if (net_bl_nodes > 0) {    baseline_pos *= 1.7;         //Use a wider range    if (net_bl_nodes > 1) {      /* Multi-node baseline representation */      for (j = 0; j < net_bl_nodes; j++) {        if (baseline_pos > ((float) j / net_bl_nodes))          *input_vec_ptr++ = 1.0;        else          *input_vec_ptr++ = 0.0;      }    }    else {      /* Single node baseline */      *input_vec_ptr++ = baseline_pos;    }  }  callnet(input_vector, &top, &top_score, &next, &next_score);  conf_level = evaluate_net_match (top, top_score, next, next_score,    tess_ch, dict_word, checked_dict_word,    sensible_word, centre, good_quality_word);  #ifndef SECURE_NAMES  if (nn_reject_debug) {    tprintf ("top:\"%c\" %4.2f   next:\"%c\" %4.2f  TESS:\"%c\" Conf: %d\n",      top, top_score, next, next_score, tess_ch, conf_level);  }  #endif  free_mem(input_vector);  return conf_level;}INT16 evaluate_net_match(char top,                         float top_score,                         char next,                         float next_score,                         char tess_ch,                         BOOL8 dict_word,
💿 文件大小 2763 K
👤 上传用户 danlong
📂 所属分类其他书籍
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -