control.cpp

来自「一个google的OCR源码」· C++ 代码 · 共 1,843 行 · 第 1/5 页
CPP
1,843 行
  }  //      tprintf( "REPEATED CHAR %s len=%d total=%d choice=%c\n",  //                        word_str, word_len, total, maxch );  free_mem(rep_ch);  word->reject_map.initialise (word_len);  for (i = 0, offset = 0; i < word_len;       offset += word->best_choice->lengths()[i++]) {    if (strncmp(word_str + offset, maxch,                word->best_choice->lengths()[i]) != 0)                                 //rej unrecognised blobs      word->reject_map[i].setrej_bad_repetition ();  }  word->done = TRUE;}// TODO(tkielbus) Decide between keeping this behavior here or modifying the// training data.// Utility function for fix_quotes// Return true if the next character in the string (given the UTF8 length in// bytes) is a quote character.static int is_simple_quote(const char* signed_str, int length) {  const unsigned char* str = reinterpret_cast<const unsigned char*>(signed_str);   //standard 1 byte quotes  return (length == 1 && (*str == '\'' || *str == '`')) ||      //utf8 3 bytes curved quotes      (length == 3 && ((*str == 0xe2 &&                        *(str + 1) == 0x80 &&                        *(str + 2) == 0x98) ||                       (*str == 0xe2 &&                        *(str + 1) == 0x80 &&                        *(str + 2) == 0x99)));}/********************************************************************** * fix_quotes * * Change pairs of quotes to double quotes. **********************************************************************/void fix_quotes(               //make double quotes                WERD_CHOICE *choice,  //choice to fix                WERD *word,    //word to do //char choices                BLOB_CHOICE_LIST_CLIST *blob_choices) {  char *str = (char *) choice->string().string();//string ptr  int i;  int offset;                                 //blobs  PBLOB_IT blob_it = word->blob_list ();                                 //choices  BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;  BLOB_CHOICE_IT it1;            //first choices  BLOB_CHOICE_IT it2;            //second choices  for (i = 0, offset = 0; str[offset] != '\0';       offset += choice->lengths()[i++],           blob_it.forward (), choice_it.forward ()) {    if (str[offset + choice->lengths()[i]] != '\0' &&        is_simple_quote(str + offset, choice->lengths()[i]) &&        is_simple_quote(str + offset + choice->lengths()[i],                        choice->lengths()[i + 1]) &&        unicharset.contains_unichar("\"")) {      str[offset] = '"';                //turn to double      strcpy (str + offset + 1,              str + offset + choice->lengths()[i] +              choice->lengths()[i + 1]); //shuffle up      choice->lengths()[i] = 1;      strcpy ((char*) choice->lengths().string() + i + 1,              choice->lengths().string() + i + 2);      merge_blobs (blob_it.data (), blob_it.data_relative (1));      blob_it.forward ();      delete blob_it.extract (); //get rid of spare      it1.set_to_list (choice_it.data ());      it2.set_to_list (choice_it.data_relative (1));      if (it1.data ()->certainty () < it2.data ()->certainty ()) {        choice_it.forward ();                                 //get rid of spare        delete choice_it.extract ();      }      else {                                 //get rid of spare        delete choice_it.extract ();        choice_it.forward ();      }    }  }}/********************************************************************** * fix_hyphens * * Change pairs of hyphens to a single hyphen if the bounding boxes touch * Typically a long dash which has been segmented. **********************************************************************/void fix_hyphens(               //crunch double hyphens                 WERD_CHOICE *choice,  //choice to fix                 WERD *word,    //word to do //char choices                 BLOB_CHOICE_LIST_CLIST *blob_choices) {  char *str = (char *) choice->string().string();//string ptr  int i;  int offset;                                 //blobs  PBLOB_IT blob_it = word->blob_list ();                                 //choices  BLOB_CHOICE_LIST_C_IT choice_it = blob_choices;  BLOB_CHOICE_IT it1;            //first choices  BLOB_CHOICE_IT it2;            //second choices  for (i = 0, offset = 0; str[offset] != '\0';  offset += choice->lengths()[i++],           blob_it.forward (), choice_it.forward ()) {    if ((str[offset] == '-' || str[offset] == '~') &&      (str[offset + choice->lengths()[i]] == '-' ||       str[offset + choice->lengths()[i]] == '~') &&      (blob_it.data ()->bounding_box ().right () >=    blob_it.data_relative (1)->bounding_box ().left ())) {      str[offset] = '-';                //turn to single hyphen      strcpy (str + offset + choice->lengths()[i],              str + offset + choice->lengths()[i] +              choice->lengths()[i + 1]); //shuffle up      strcpy ((char*) choice->lengths().string() + i + 1,              choice->lengths().string() + i + 2);      merge_blobs (blob_it.data (), blob_it.data_relative (1));      blob_it.forward ();      delete blob_it.extract (); //get rid of spare      it1.set_to_list (choice_it.data ());      it2.set_to_list (choice_it.data_relative (1));      if (it1.data ()->certainty () < it2.data ()->certainty ()) {        choice_it.forward ();                                 //get rid of spare        delete choice_it.extract ();      }      else {                                 //get rid of spare        delete choice_it.extract ();        choice_it.forward ();      }    }  }}/********************************************************************** * merge_blobs * * Add the outlines from blob2 to blob1. Blob2 is emptied but not deleted. **********************************************************************/void merge_blobs(               //combine 2 blobs                 PBLOB *blob1,  //dest blob                 PBLOB *blob2   //source blob                ) {  OUTLINE_IT outline_it = blob1->out_list ();  //iterator  outline_it.move_to_last ();    //go to end                                 //do it  outline_it.add_list_after (blob2->out_list ());}/********************************************************************** * choice_dump_tester * * Matcher tester function which generates .chc file entries. * Called via test_segment_pass2 for every blob tested by tess in a word. * (But only for words for which a correct segmentation could be found.) **********************************************************************/void choice_dump_tester(                           //dump chars in word                        PBLOB *,                   //blob                        DENORM *,                  //de-normaliser                        BOOL8 correct,             //ly segmented                        char *text,                //correct text                        inT32 count,               //chars in text                        BLOB_CHOICE_LIST *ratings  //list of results                       ) {  STRING choice_file_name;  BLOB_CHOICE *blob_choice;  BLOB_CHOICE_IT it;  char source_chars[20];  char correct_char[3];  if (choice_file == NULL) {    choice_file_name = imagebasename + ".chc";    if (!(choice_file = fopen (choice_file_name.string (), "w"))) {      CANTOPENFILE.error ("choice_dump_tester", EXIT, "%s %d",        choice_file_name.string (), errno);    }  }  if ((count == 0) || (text == NULL) || (text[0] == '\0')) {    strcpy (source_chars, "$$");    strcpy (correct_char, "$$");  }  else {    strncpy(source_chars, text, count);    source_chars[count] = '\0';    if (correct) {      correct_char[0] = text[0];      correct_char[1] = '\0';    }    else {      strcpy (correct_char, "$$");    }  }  fprintf (choice_file, "%s\t%s", source_chars, correct_char);  it.set_to_list (ratings);  for (it.mark_cycle_pt (); !it.cycled_list (); it.forward ()) {    blob_choice = it.data ();    fprintf (choice_file, "\t%s\t%f\t%f",             blob_choice->unichar (),             blob_choice->rating (), blob_choice->certainty ());  }  fprintf (choice_file, "\n");}/************************************************************************* * make_bln_copy() * * Generate a baseline normalised copy of the source word. The copy is done so * that whatever format the original word is in, a polygonal bln version is * generated as output. *************************************************************************/WERD *make_bln_copy(WERD *src_word, ROW *row, float x_height, DENORM *denorm) {  WERD *result;  //      if (wordit_linearc && !src_word->flag(W_POLYGON))  //      {  //              larc_word = src_word->larc_copy( row->x_height() );  //              result = larc_word->poly_copy( row->x_height() );  //              delete larc_word;  //      }  // else  result = src_word->poly_copy (row->x_height ());  //      if (tessedit_draw_words)  //      {  //              if ( la_win == NO_WINDOW )  //                      create_la_win();  //              result->plot( la_win );  //      }  result->baseline_normalise_x (row, x_height, denorm);  return result;}ACCEPTABLE_WERD_TYPE acceptable_word_string(const char *s,                                            const char *lengths) {  int i = 0;  int offset = 0;  int leading_punct_count;  int upper_count = 0;  int hyphen_pos = -1;  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;  if (strlen (lengths) > 20)    return word_type;  /* Single Leading punctuation char*/  if ((s[offset] != '\0') && (STRING (chs_leading_punct).contains (s[offset])))    offset += lengths[i++];  leading_punct_count = i;  /* Initial cap */  while ((s[offset] != '\0') &&         unicharset.get_isupper(s + offset, lengths[i])) {    offset += lengths[i++];    upper_count++;  }  if (upper_count > 1)    word_type = AC_UPPER_CASE;  else {    /* Lower case word, possibly with an initial cap */    while ((s[offset] != '\0') &&           unicharset.get_islower (s + offset, lengths[i])) {      offset += lengths[i++];    }    if (i - leading_punct_count < quality_min_initial_alphas_reqd)      goto not_a_word;    /*    Allow a single hyphen in a lower case word    - dont trust upper case - I've seen several cases of "H" -> "I-I"    */    if (lengths[i] == 1 && s[offset] == '-') {      hyphen_pos = i;      offset += lengths[i++];      if (s[offset] != '\0') {        while ((s[offset] != '\0') &&               unicharset.get_islower(s + offset, lengths[i])) {          offset += lengths[i++];        }        if (i < hyphen_pos + 3)          goto not_a_word;      }    }    else {      /* Allow "'s" in NON hyphenated lower case words */      if (lengths[i] == 1 && (s[offset] == '\'') &&          lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {        offset += lengths[i++];        offset += lengths[i++];      }    }    if (upper_count > 0)      word_type = AC_INITIAL_CAP;    else      word_type = AC_LOWER_CASE;  }  /* Up to two different, constrained trailing punctuation chars */  if (lengths[i] == 1 && (s[offset] != '\0') &&      (STRING (chs_trailing_punct1).contains (s[offset])))    offset += lengths[i++];  if (lengths[i] == 1 && (s[offset] != '\0') && i > 0 &&    (s[offset - lengths[i - 1]] != s[offset]) &&      (STRING (chs_trailing_punct2).contains (s[offset])))    offset += lengths[i++];  if (s[offset] != '\0')    word_type = AC_UNACCEPTABLE;  not_a_word:  if (word_type == AC_UNACCEPTABLE) {    /* Look for abbreviation string */    i = 0;    offset = 0;    if (s[0] != '\0' && unicharset.get_isupper (s, lengths[0])) {      word_type = AC_UC_ABBREV;      while ((s[offset] != '\0') &&             unicharset.get_isupper(s + offset, lengths[i]) &&             (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {        offset += lengths[i++];        offset += lengths[i++];      }    }    else if (s[0] != '\0' && unicharset.get_islower (s, lengths[0])) {      word_type = AC_LC_ABBREV;      while ((s[offset] != '\0') &&             unicharset.get_islower(s + offset, lengths[i]) &&             (lengths[i + 1] == 1 && s[offset + lengths[i]] == '.')) {        offset += lengths[i++];        offset += lengths[i++];      }    }    if (s[offset] != '\0')      word_type = AC_UNACCEPTABLE;  }  return word_type;}/* DEBUGGING ROUTINE */BOOL8 check_debug_pt(WERD_RES *word, int location) {  BOOL8 show_map_detail = FALSE;  inT16 i;  #ifndef SECURE_NAMES  if (!test_pt)
control.cpp - 源码说明

本页面展示了「一个google的OCR源码」中的 control.cpp 源码文件，采用 C++ 编程语言编写，共 1,843 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与google相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?