📄 adaptions.cpp
字号:
#ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample added to an existing cluster\n"); #endif } else { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample dropped, good match to an existing cluster\n"); #endif } } else if (best_score > tessedit_cluster_t2) { c_it.add_to_end (new CHAR_SAMPLES (sample)); #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("New cluster created for this sample\n"); #endif } else { cw_it.add_to_end (sample); if (tessedit_cluster_debug) tprintf ("Sample added to the wait list\n"); } }}void check_wait_list(CHAR_SAMPLE_LIST *chars_waiting, CHAR_SAMPLE *sample, CHAR_SAMPLES *best_cluster) { CHAR_SAMPLE *wait_sample; CHAR_SAMPLE *test_sample = sample; CHAR_SAMPLE_IT cw_it = chars_waiting; CHAR_SAMPLE_LIST add_list; //Samples added to best cluster CHAR_SAMPLE_IT add_it = &add_list; float score; add_list.clear (); if (!cw_it.empty ()) { do { if (!add_list.empty ()) { add_it.forward (); test_sample = add_it.extract (); best_cluster->add_sample (test_sample); } for (cw_it.mark_cycle_pt (); !cw_it.cycled_list (); cw_it.forward ()) { wait_sample = cw_it.data (); if (tessedit_mm_use_prototypes) score = best_cluster->match_score (wait_sample); else score = sample->match_sample (wait_sample, FALSE); if (score < tessedit_cluster_t1) { if (score > tessedit_cluster_t3 || tessedit_mm_use_prototypes) { add_it.add_after_stay_put (cw_it.extract ()); #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Wait sample added to an existing cluster\n"); #endif } else { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Wait sample dropped, good match to an existing cluster\n"); #endif } } } } while (!add_list.empty ()); }}void complete_clustering(CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { CHAR_SAMPLES *best_cluster; CHAR_SAMPLES_IT c_it = char_clusters; CHAR_SAMPLE_IT cw_it = chars_waiting; CHAR_SAMPLE *sample; INT32 total_sample_count = 0; while (!cw_it.empty ()) { cw_it.move_to_first (); sample = cw_it.extract (); best_cluster = new CHAR_SAMPLES (sample); c_it.add_to_end (best_cluster); check_wait_list(chars_waiting, sample, best_cluster); } for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { c_it.data ()->assign_to_char (); if (tessedit_use_best_sample) c_it.data ()->find_best_sample (); else if (tessedit_mm_adapt_using_prototypes) c_it.data ()->build_prototype (); if (tessedit_cluster_debug) total_sample_count += c_it.data ()->n_samples (); } #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Clustering completed, %d samples in all\n", total_sample_count); #endif#ifndef GRAPHICS_DISABLED if (tessedit_demo_adaption) display_cluster_prototypes(char_clusters);#endif}void adapt_to_good_ems(WERD_RES *word, CHAR_SAMPLES_LIST *char_clusters, CHAR_SAMPLE_LIST *chars_waiting) { PBLOB_LIST *blobs = word->outword->blob_list (); PBLOB_IT blob_it(blobs); INT16 i; CHAR_SAMPLE *sample; CHAR_SAMPLES_IT c_it = char_clusters; CHAR_SAMPLE_IT cw_it = chars_waiting; float score; float best_score; char best_char; CHAR_SAMPLES *best_cluster; PIXROW_LIST *pixrow_list; PIXROW_IT pixrow_it; IMAGELINE *imlines; // lines of the image BOX pix_box; // box of imlines // extent WERD copy_outword; // copy to denorm BOX b_box; PBLOB_IT copy_blob_it; OUTLINE_IT copy_outline_it; PIXROW *pixrow = NULL; static INT32 word_number = 0;#ifndef GRAPHICS_DISABLED WINDOW demo_win = NULL;#endif INT32 resolution = page_image.get_res (); if (word->word->bounding_box ().height () > resolution / 3) return; word_number++; if (strchr (word->best_choice->string ().string (), 'm') == NULL && (tessedit_process_rns && strstr (word->best_choice->string ().string (), "rn") == NULL)) return; if (tessedit_reject_ems) reject_all_ems(word); else if (tessedit_reject_suspect_ems) reject_suspect_ems(word); else { if (char_clusters->length () == 0) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("No clusters to use for em adaption\n"); #endif return; } if (!cw_it.empty ()) { complete_clustering(char_clusters, chars_waiting); print_em_stats(char_clusters, chars_waiting); } if ((!word_adaptable (word, tessedit_em_adaption_mode) || word->reject_map.reject_count () != 0) && (strchr (word->best_choice->string ().string (), 'm') != NULL || (tessedit_process_rns && strstr (word->best_choice->string ().string (), "rn") != NULL))) { if (tessedit_process_rns && strstr (word->best_choice->string ().string (), "rn") != NULL) { copy_outword = *(word->outword); copy_blob_it.set_to_list (copy_outword.blob_list ()); i = 0; while (word->best_choice->string ()[i] != '\0') { if (word->best_choice->string ()[i] == 'r' && word->best_choice->string ()[i + 1] == 'n') { copy_outline_it.set_to_list (copy_blob_it.data ()-> out_list ()); copy_outline_it.add_list_after (copy_blob_it. data_relative (1)-> out_list ()); copy_blob_it.forward (); delete (copy_blob_it.extract ()); i++; } copy_blob_it.forward (); i++; } } else copy_outword = *(word->outword); copy_outword.baseline_denormalise (&word->denorm); copy_blob_it.set_to_list (copy_outword.blob_list ()); char_clip_word(©_outword, page_image, pixrow_list, imlines, pix_box); pixrow_it.set_to_list (pixrow_list); pixrow_it.move_to_first (); // For debugging only b_box = copy_outword.bounding_box (); pixrow = pixrow_it.data (); blob_it.move_to_first (); copy_blob_it.move_to_first (); for (i = 0; word->best_choice->string ()[i] != '\0'; i++, pixrow_it.forward (), blob_it.forward (), copy_blob_it.forward ()) { if ((word->best_choice->string ()[i] == 'm' || (word->best_choice->string ()[i] == 'r' && word->best_choice->string ()[i + 1] == 'n')) && !word->reject_map[i].perm_rejected ()) { if (tessedit_cluster_debug) tprintf ("Sample %c to check found in %s, index %d\n", word->best_choice->string ()[i], word->best_choice->string ().string (), i); if (tessedit_demo_adaption) tprintf ("Sample %c to check found in %s (%d), index %d\n", word->best_choice->string ()[i], word->best_choice->string ().string (), word_number, i); if (tessedit_matrix_match) { BOX copy_box = copy_blob_it.data ()->bounding_box (); sample = clip_sample (pixrow_it.data (), imlines, pix_box, copy_outword.flag (W_INVERSE), word->best_choice->string ()[i]); //Clip failed if (sample == NULL) { tprintf ("Unable to clip sample from %s, index %d\n", word->best_choice->string ().string (), i); #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (no sample)\n"); #endif word->reject_map[i].setrej_mm_reject (); if (word->best_choice->string ()[i] == 'r') { word->reject_map[i + 1].setrej_mm_reject (); i++; } continue; } } else sample = new CHAR_SAMPLE (blob_it.data (), &word->denorm, word->best_choice-> string ()[i]); best_score = MAX_INT32; best_char = '\0'; best_cluster = NULL; for (c_it.mark_cycle_pt (); !c_it.cycled_list (); c_it.forward ()) { if (c_it.data ()->character () != '\0') { score = c_it.data ()->match_score (sample); if (score < best_score) { best_cluster = c_it.data (); best_score = score; best_char = c_it.data ()->character (); } } } if (best_score > tessedit_cluster_t1) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (score %f)\n", best_score); if (tessedit_demo_adaption) tprintf ("Sample rejected (score %f)\n", best_score); #endif word->reject_map[i].setrej_mm_reject (); if (word->best_choice->string ()[i] == 'r') word->reject_map[i + 1].setrej_mm_reject (); } else { if (word->best_choice->string ()[i] == best_char) { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample accepted (score %f)\n", best_score); if (tessedit_demo_adaption) tprintf ("Sample accepted (score %f)\n", best_score); #endif word->reject_map[i].setrej_mm_accept (); if (word->best_choice->string ()[i] == 'r') word->reject_map[i + 1].setrej_mm_accept (); } else { #ifndef SECURE_NAMES if (tessedit_cluster_debug) tprintf ("Sample rejected (char %c, score %f)\n", best_char, best_score); if (tessedit_demo_adaption) tprintf ("Sample rejected (char %c, score %f)\n", best_char, best_score); #endif word->reject_map[i].setrej_mm_reject (); if (word->best_choice->string ()[i] == 'r') word->reject_map[i + 1].setrej_mm_reject (); } } if (tessedit_demo_adaption) { if (strcmp (imagebasename.string (), tessedit_demo_file.string ()) != 0 || word_number == tessedit_demo_word1 || word_number == tessedit_demo_word2) {#ifndef GRAPHICS_DISABLED demo_win = display_clip_image(©_outword, page_image, pixrow_list, pix_box);#endif demo_word = word_number; best_cluster->match_score (sample); demo_word = 0; } } if (word->best_choice->string ()[i] == 'r') i++; // Skip next character } } delete[]imlines; // Free array of imlines delete pixrow_list; } }}void adapt_to_good_samples(WERD_RES *word,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -