📄 tordmain.cpp
字号:
if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; #ifndef SECURE_NAMES if (testing_on) { tprintf ("Blob at (%d,%d) -> (%d,%d), ols=%d, tc=%d, bldiff=%g\n", blob_box.left (), blob_box.bottom (), blob_box.right (), blob_box.top (), blob->out_list ()->length (), trans_count, blob_box.bottom () - row->base_line (blob_box.left ())); } #endif } } #ifndef SECURE_NAMES if (textord_noise_debug) { tprintf ("Row ending at (%d,%g):", blob_box.right (), row->base_line (blob_box.right ())); tprintf (" R=%g, dc=%d, nc=%d, %s\n", norm_count > 0 ? (float) dot_count / norm_count : 9999, dot_count, norm_count, dot_count > norm_count * textord_noise_normratio && dot_count > 2 ? "REJECTED" : "ACCEPTED"); } #endif return super_norm_count < textord_noise_sncount && dot_count > norm_count * textord_noise_rowratio && dot_count > 2;}/********************************************************************** * clean_noise_from_words * * Move blobs of words from rows of garbage into the reject blobs list. **********************************************************************/void clean_noise_from_words( //remove empties ROW *row //row to clean ) { BOX blob_box; //bounding box INT8 *word_dud; //was it chucked C_BLOB *blob; //current blob C_OUTLINE *outline; //current outline WERD *word; //current word INT32 blob_size; //biggest size INT32 trans_count; //no of transitions INT32 trans_threshold; //noise tolerance INT32 dot_count; //small objects INT32 norm_count; //normal objects INT32 dud_words; //number discarded INT32 ok_words; //number remaining INT32 word_index; //current word //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator C_OUTLINE_IT out_it; //outline iterator ok_words = word_it.length (); if (ok_words == 0) return; word_dud = (INT8 *) alloc_mem (ok_words * sizeof (INT8)); dud_words = 0; ok_words = 0; word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word dot_count = 0; norm_count = 0; //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); if (!word->flag (W_DONT_CHOP)) { //get outlines out_it.set_to_list (blob->out_list ()); for (out_it.mark_cycle_pt (); !out_it.cycled_list (); out_it.forward ()) { outline = out_it.data (); blob_box = outline->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box. height(); if (blob_size < textord_noise_sizelimit * row->x_height ()) dot_count++; //count smal outlines if (!outline->child ()->empty () && blob_box.height () < (1 + textord_noise_syfract) * row->x_height () && blob_box.height () > (1 - textord_noise_syfract) * row->x_height () && blob_box.width () < (1 + textord_noise_sxfract) * row->x_height () && blob_box.width () > (1 - textord_noise_sxfract) * row->x_height ()) norm_count++; //count smal outlines } } else norm_count++; blob_box = blob->bounding_box (); blob_size = blob_box.width () > blob_box.height ()? blob_box.width () : blob_box.height (); if (blob_size >= textord_noise_sizelimit * row->x_height () && blob_size < row->x_height () * 2) { trans_threshold = blob_size / textord_noise_sizefraction; trans_count = blob->count_transitions (trans_threshold); if (trans_count < textord_noise_translimit) norm_count++; } else if (blob_box.height () > row->x_height () * 2 && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } if (dot_count > 2) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; } else word_dud[word_index] = 0; if (word_dud[word_index] == 2) dud_words++; else ok_words++; word_index++; } word_index = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || word_dud[word_index] == 1 && dud_words > ok_words) { word = word_it.data (); //current word //rejected blobs blob_it.set_to_list (word->rej_cblob_list ()); //move from blobs blob_it.add_list_after (word->cblob_list ()); } word_index++; } free_mem(word_dud);}/********************************************************************** * tweak_row_baseline * * Shift baseline to fit the blobs more accurately where they are * close enough. **********************************************************************/void tweak_row_baseline( //remove empties ROW *row //row to clean ) { BOX blob_box; //bounding box C_BLOB *blob; //current blob WERD *word; //current word INT32 blob_count; //no of blobs INT32 src_index; //source segment INT32 dest_index; //destination segment INT32 *xstarts; //spline segments double *coeffs; //spline coeffs float ydiff; //baseline error float x_centre; //centre of blob //words of row WERD_IT word_it = row->word_list (); C_BLOB_IT blob_it; //blob iterator blob_count = 0; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //get total blobs blob_count += word->cblob_list ()->length (); } if (blob_count == 0) return; xstarts = (INT32 *) alloc_mem ((blob_count + row->baseline.segments + 1) * sizeof (INT32)); coeffs = (double *) alloc_mem ((blob_count + row->baseline.segments) * 3 * sizeof (double)); src_index = 0; dest_index = 0; xstarts[0] = row->baseline.xcoords[0]; for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { word = word_it.data (); //current word //blobs in word blob_it.set_to_list (word->cblob_list ()); for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) { blob = blob_it.data (); blob_box = blob->bounding_box (); x_centre = (blob_box.left () + blob_box.right ()) / 2.0; ydiff = blob_box.bottom () - row->base_line (x_centre); if (ydiff < 0) ydiff = -ydiff / row->x_height (); else ydiff = ydiff / row->x_height (); if (ydiff < textord_blshift_maxshift && blob_box.height () / row->x_height () > textord_blshift_xfraction) { if (xstarts[dest_index] >= x_centre) xstarts[dest_index] = blob_box.left (); coeffs[dest_index * 3] = 0; coeffs[dest_index * 3 + 1] = 0; coeffs[dest_index * 3 + 2] = blob_box.bottom (); //shift it dest_index++; xstarts[dest_index] = blob_box.right () + 1; } else { if (xstarts[dest_index] <= x_centre) { while (row->baseline.xcoords[src_index + 1] <= x_centre && src_index < row->baseline.segments - 1) { if (row->baseline.xcoords[src_index + 1] > xstarts[dest_index]) { coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; dest_index++; xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; } src_index++; } coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; dest_index++; xstarts[dest_index] = row->baseline.xcoords[src_index + 1]; } } } } while (src_index < row->baseline.segments && row->baseline.xcoords[src_index + 1] <= xstarts[dest_index]) src_index++; while (src_index < row->baseline.segments) { coeffs[dest_index * 3] = row->baseline.quadratics[src_index].a; coeffs[dest_index * 3 + 1] = row->baseline.quadratics[src_index].b; coeffs[dest_index * 3 + 2] = row->baseline.quadratics[src_index].c; dest_index++; src_index++; xstarts[dest_index] = row->baseline.xcoords[src_index]; } //turn to spline row->baseline = QSPLINE (dest_index, xstarts, coeffs); free_mem(xstarts); free_mem(coeffs);}/********************************************************************** * blob_y_order * * Sort function to sort blobs in y from page top. **********************************************************************/INT32 blob_y_order( //sort function void *item1, //items to compare void *item2) { //converted ptr BLOBNBOX *blob1 = *(BLOBNBOX **) item1; //converted ptr BLOBNBOX *blob2 = *(BLOBNBOX **) item2; if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ()) return -1; else if (blob1->bounding_box ().bottom () < blob2->bounding_box ().bottom ()) return 1; else { if (blob1->bounding_box ().left () < blob2->bounding_box ().left ()) return -1; else if (blob1->bounding_box ().left () > blob2->bounding_box ().left ()) return 1; else return 0; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -