📄 fixspace.cpp
字号:
tprintf ("FP fixspace working on \"%s\"\n", word_res->best_choice->string ().string ()); } #endif gblob_sort_list ((PBLOB_LIST *) word_res->word->rej_cblob_list (), FALSE); sub_word_list_it.add_after_stay_put (word_res_it.extract ()); fix_noisy_space_list(sub_word_list, row); new_length = sub_word_list.length (); word_res_it.add_list_before (&sub_word_list); for (; (!word_res_it.at_last () && (new_length > 1)); new_length--) { word_res_it.forward (); }}void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row) { INT16 best_score; WERD_RES_IT best_perm_it(&best_perm); WERD_RES_LIST current_perm; WERD_RES_IT current_perm_it(¤t_perm); WERD_RES *old_word_res; WERD_RES *new_word_res; INT16 current_score; BOOL8 improved = FALSE; //default score best_score = fp_eval_word_spacing (best_perm); dump_words (best_perm, best_score, 1, improved); new_word_res = new WERD_RES; old_word_res = best_perm_it.data (); //Kludge to force deep copy old_word_res->combination = TRUE; *new_word_res = *old_word_res; //deep copy //Undo kludge old_word_res->combination = FALSE; //Undo kludge new_word_res->combination = FALSE; current_perm_it.add_to_end (new_word_res); break_noisiest_blob_word(current_perm); while ((best_score != PERFECT_WERDS) && !current_perm.empty ()) { match_current_words(current_perm, row); current_score = fp_eval_word_spacing (current_perm); dump_words (current_perm, current_score, 2, improved); if (current_score > best_score) { best_perm.clear (); best_perm.deep_copy (¤t_perm); best_score = current_score; improved = TRUE; } if (current_score < PERFECT_WERDS) break_noisiest_blob_word(current_perm); } dump_words (best_perm, best_score, 3, improved);}/************************************************************************* * break_noisiest_blob_word() * Find the word with the blob which looks like the worst noise. * Break the word into two, deleting the noise blob. *************************************************************************/void break_noisiest_blob_word(WERD_RES_LIST &words) { WERD_RES_IT word_it(&words); WERD_RES_IT worst_word_it; float worst_noise_score = 9999; int worst_blob_index = -1; //noisiest blb of noisiest wd int blob_index; //of wds noisiest blb float noise_score; //of wds noisiest blb WERD_RES *word_res; C_BLOB_IT blob_it; C_BLOB_IT rej_cblob_it; C_BLOB_LIST new_blob_list; C_BLOB_IT new_blob_it; C_BLOB_IT new_rej_cblob_it; WERD *new_word; INT16 start_of_noise_blob; INT16 i; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { blob_index = worst_noise_blob (word_it.data (), &noise_score); if ((blob_index > -1) && (worst_noise_score > noise_score)) { worst_noise_score = noise_score; worst_blob_index = blob_index; worst_word_it = word_it; } } if (worst_blob_index < 0) { words.clear (); //signal termination return; } /* Now split the worst_word_it */ word_res = worst_word_it.data (); /* Move blobs before noise blob to a new bloblist */ new_blob_it.set_to_list (&new_blob_list); blob_it.set_to_list (word_res->word->cblob_list ()); for (i = 0; i < worst_blob_index; i++, blob_it.forward ()) { new_blob_it.add_after_then_move (blob_it.extract ()); } start_of_noise_blob = blob_it.data ()->bounding_box ().left (); delete blob_it.extract (); //throw out noise blb new_word = new WERD (&new_blob_list, word_res->word); new_word->set_flag (W_EOL, FALSE); word_res->word->set_flag (W_BOL, FALSE); word_res->word->set_blanks (1);//After break new_rej_cblob_it.set_to_list (new_word->rej_cblob_list ()); rej_cblob_it.set_to_list (word_res->word->rej_cblob_list ()); for (; (!rej_cblob_it.empty () && (rej_cblob_it.data ()->bounding_box ().left () < start_of_noise_blob)); rej_cblob_it.forward ()) { new_rej_cblob_it.add_after_then_move (rej_cblob_it.extract ()); } worst_word_it.add_before_then_move (new WERD_RES (new_word)); word_res->done = FALSE; if (word_res->outword != NULL) { delete word_res->outword; delete word_res->best_choice; delete word_res->raw_choice; word_res->outword = NULL; word_res->best_choice = NULL; word_res->raw_choice = NULL; }}INT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score) { PBLOB_IT blob_it; INT16 blob_count; float noise_score[512]; int i; int min_noise_blob; //1st contender int max_noise_blob; //last contender int non_noise_count; int worst_noise_blob; //Worst blob float small_limit = bln_x_height * fixsp_small_outlines_size; float non_noise_limit = bln_x_height * 0.8; blob_it.set_to_list (word_res->outword->blob_list ()); //normalised blob_count = blob_it.length (); ASSERT_HOST (blob_count <= 512); if (blob_count < 5) return -1; //too short to split /* Get the noise scores for all blobs */ #ifndef SECURE_NAMES if (debug_fix_space_level > 5) tprintf ("FP fixspace Noise metrics for \"%s\": ", word_res->best_choice->string ().string ()); #endif for (i = 0; i < blob_count; i++, blob_it.forward ()) { if (word_res->reject_map[i].accepted ()) noise_score[i] = non_noise_limit; else noise_score[i] = blob_noise_score (blob_it.data ()); if (debug_fix_space_level > 5) tprintf ("%1.1f ", noise_score[i]); } if (debug_fix_space_level > 5) tprintf ("\n"); /* Now find the worst one which is far enough away from the end of the word */ non_noise_count = 0; for (i = 0; (i < blob_count) && (non_noise_count < fixsp_non_noise_limit); i++) { if (noise_score[i] >= non_noise_limit) non_noise_count++; } if (non_noise_count < fixsp_non_noise_limit) return -1; min_noise_blob = i; non_noise_count = 0; for (i = blob_count - 1; (i >= 0) && (non_noise_count < fixsp_non_noise_limit); i--) { if (noise_score[i] >= non_noise_limit) non_noise_count++; } if (non_noise_count < fixsp_non_noise_limit) return -1; max_noise_blob = i; if (min_noise_blob > max_noise_blob) return -1; *worst_noise_score = small_limit; worst_noise_blob = -1; for (i = min_noise_blob; i <= max_noise_blob; i++) { if (noise_score[i] < *worst_noise_score) { worst_noise_blob = i; *worst_noise_score = noise_score[i]; } } return worst_noise_blob;}float blob_noise_score(PBLOB *blob) { OUTLINE_IT outline_it; BOX box; //BB of outline INT16 outline_count = 0; INT16 max_dimension; INT16 largest_outline_dimension = 0; outline_it.set_to_list (blob->out_list ()); for (outline_it.mark_cycle_pt (); !outline_it.cycled_list (); outline_it.forward ()) { outline_count++; box = outline_it.data ()->bounding_box (); if (box.height () > box.width ()) max_dimension = box.height (); else max_dimension = box.width (); if (largest_outline_dimension < max_dimension) largest_outline_dimension = max_dimension; } if (fixsp_noise_score_fixing) { if (outline_count > 5) //penalise LOTS of blobs largest_outline_dimension *= 2; box = blob->bounding_box (); if ((box.bottom () > bln_baseline_offset * 4) || (box.top () < bln_baseline_offset / 2)) //Lax blob is if high or low largest_outline_dimension /= 2; } return largest_outline_dimension;}void fixspace_dbg(WERD_RES *word) { BOX box = word->word->bounding_box (); BOOL8 show_map_detail = FALSE; INT16 i; box.print (); #ifndef SECURE_NAMES tprintf (" \"%s\" ", word->best_choice->string ().string ()); tprintf ("Blob count: %d (word); %d/%d (outword)\n", word->word->gblob_list ()->length (), word->outword->gblob_list ()->length (), word->outword->rej_blob_list ()->length ()); word->reject_map.print (debug_fp); tprintf ("\n"); if (show_map_detail) { tprintf ("\"%s\"\n", word->best_choice->string ().string ()); for (i = 0; word->best_choice->string ()[i] != '\0'; i++) { tprintf ("**** \"%c\" ****\n", word->best_choice->string ()[i]); word->reject_map[i].full_print (debug_fp); } } tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE"); tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE"); #endif}/************************************************************************* * fp_eval_word_spacing() * Evaluation function for fixed pitch word lists. * * Basically, count the number of "nice" characters - those which are in tess * acceptable words or in dict words and are not rejected. * Penalise any potential noise chars *************************************************************************/INT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list) { WERD_RES_IT word_it(&word_res_list); WERD_RES *word; PBLOB_IT blob_it; INT16 word_length; INT16 score = 0; INT16 i; const char *chs; float small_limit = bln_x_height * fixsp_small_outlines_size; if (!fixsp_fp_eval) return (eval_word_spacing (word_res_list)); for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); word_length = word->reject_map.length (); chs = word->best_choice->string ().string (); if ((word->done || word->tess_accepted) || (word->best_choice->permuter () == SYSTEM_DAWG_PERM) || (word->best_choice->permuter () == FREQ_DAWG_PERM) || (word->best_choice->permuter () == USER_DAWG_PERM) || (safe_dict_word (chs) > 0)) { blob_it.set_to_list (word->outword->blob_list ()); for (i = 0; i < word_length; i++, blob_it.forward ()) { if ((chs[i] == ' ') || (blob_noise_score (blob_it.data ()) < small_limit)) score -= 1; //penalise possibly erroneous non-space else if (word->reject_map[i].accepted ()) score++; } } } if (score < 0) score = 0; return score;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -