/* File: split.c */
/* (Code-viewer page control, not part of the source: "Font size:") */
bow_test_set_files_use_basename)); if (index != -1) bow_verbosify (bow_quiet, "WARNING: Repeated file basename `%s'\n", bow_int2str (map, si)); bow_str2int (map2, bow_basename (bow_int2str (map, si), bow_test_set_files_use_basename)); } bow_int4str_free (map); map = map2; } for (di = 0; di < docs->length; di++) { cdoc = bow_array_entry_at_index (docs, di); if (bow_test_set_files_use_basename) filename = bow_basename (cdoc->filename, bow_test_set_files_use_basename); else filename = cdoc->filename; if (bow_str2int_no_add (map, filename) != -1) { /* This filename is in the map; tag this cdoc. */ if (cdoc->type != bow_doc_untagged) bow_verbosify (bow_quiet, "Duplicate tags requested for %s. " "Using first tag.\n", filename); else { cdoc->type = type; files_count++; } } } switch (type) { case bow_doc_train: type_name = "train"; break; case bow_doc_test: type_name = "test"; break; case bow_doc_unlabeled: type_name = "unlabeled"; break; case bow_doc_ignore: type_name = "ignore"; break; case bow_doc_validation: type_name = "validation"; break; default: bow_error ("No implementation for this type."); } bow_verbosify (bow_progress, "Using %s, placed %d documents in the %s set\n", test_files_filename, files_count, type_name); bow_int4str_free (map); return;}/* Postprocess the tags on documents by setting untagged documents to train or test, depending on context. 
*/voidbow_set_doc_types_of_remaining (bow_array *docs, int type){ int di; bow_cdoc *cdoc; char *type_name = NULL; int num_found = 0; for (di = 0; di < docs->length; di++) { cdoc = bow_array_entry_at_index (docs, di); if (cdoc->type == bow_doc_untagged) { cdoc->type = type; num_found++; } } switch (type) { case bow_doc_train: type_name = "train"; break; case bow_doc_test: type_name = "test"; break; case bow_doc_unlabeled: type_name = "unlabeled"; break; case bow_doc_ignore: type_name = "ignore"; break; case bow_doc_validation: type_name = "validation"; break; default: bow_error ("No implementation for this type."); } bow_verbosify (bow_progress, "Placed remaining %d documents in the %s set:\n", num_found, type_name);}/* Use the command line arguments to create the appropriate train/test split */voidbow_set_doc_types (bow_array *docs, int num_classes, bow_int4str *classnames){ /* int num_docs; */ int ti; int num_types; int num_remaining = 0; /* note it is important that ignore comes first, so we can ignore them when doing even prior random splits later */ struct { bow_files_source_type source; float fraction; int number; char *filename; bow_split_fancy_count *fancy_counts; bow_files_source_type doc_type; } types[] = {{bow_ignore_files_source, bow_ignore_fraction, bow_ignore_number, bow_ignore_filename, bow_ignore_fancy_counts, bow_doc_ignore}, {bow_test_files_source, bow_test_fraction, bow_test_number, bow_test_filename, bow_test_fancy_counts, bow_doc_test}, {bow_train_files_source, bow_train_fraction, bow_train_number, bow_train_filename, bow_train_fancy_counts, bow_doc_train}, {bow_unlabeled_files_source, bow_unlabeled_fraction, bow_unlabeled_number, bow_unlabeled_filename, bow_unlabeled_fancy_counts, bow_doc_unlabeled}, {bow_validation_files_source, bow_validation_fraction, bow_validation_number, bow_validation_filename, bow_validation_fancy_counts, bow_doc_validation}, {0,0,0,0,0,0}}; /* First set all files to be untagged. 
*/ bow_set_all_docs_untagged (docs); /* count the number of document types */ for (num_types = 0; types[num_types].source; num_types++); /* First, set document types based on input files */ for (ti = 0; ti < num_types; ti++) { if (types[ti].source == bow_files_source_file) bow_set_docs_to_type (docs, types[ti].filename, types[ti].doc_type); } /* Second, set document types based on specific numbers per class */ for (ti = 0; ti < num_types; ti++) { if (types[ti].source == bow_files_source_num_per_class) { int ci; int *class_nums; class_nums = bow_malloc (sizeof (int) * num_classes); for (ci = 0; ci < num_classes; ci ++) class_nums[ci] = types[ti].number; bow_set_doc_types_randomly_by_count_per_class (docs, num_classes, classnames, class_nums, types[ti].doc_type, bow_doc_untagged); bow_free(class_nums); } if (types[ti].source == bow_files_source_num_per_class_remaining) { int ci; int *class_nums; bow_error ("prc suffix not yet implemented."); class_nums = bow_malloc (sizeof (int) * num_classes); for (ci = 0; ci < num_classes; ci ++) class_nums[ci] = types[ti].number; bow_set_doc_types_randomly_by_count_per_class (docs, num_classes, classnames, class_nums, types[ti].doc_type, bow_doc_untagged); bow_free(class_nums); } else if (types[ti].source == bow_files_source_fancy_counts) { int *counts = bow_malloc (sizeof (int) * num_classes); int ci; bow_split_fancy_count *class_count; assert (classnames); for (ci = 0; ci < num_classes; ci++) counts[ci] = -1; for (class_count = types[ti].fancy_counts; class_count->class_name; class_count++) { int class = -1; for (ci = 0; ci < num_classes; ci++) { const char *name = bow_int2str (classnames, ci); if (!strcmp(class_count->class_name, name)) { class = ci; break; } } if (class == -1) bow_error ("Unknown class %s.\n", class_count->class_name); counts[class] = class_count->num_docs; } for (ci = 0; ci < num_classes; ci++) if (counts[ci] == -1) bow_error("Under-specified class counts"); bow_set_doc_types_randomly_by_count_per_class 
(docs, num_classes, classnames, counts, types[ti].doc_type, bow_doc_untagged); bow_free(counts); } } /* Third, do any random class-proportioned splits to set document types */ for (ti = 0; ti < num_types; ti++) { if (types[ti].source == bow_files_source_fraction) { bow_set_doc_types_randomly_by_fraction (docs, num_classes, classnames, types[ti].fraction, types[ti].doc_type, bow_doc_untagged); } else if (types[ti].source == bow_files_source_fraction_remaining) { bow_set_doc_types_randomly_by_fraction_remaining (docs, num_classes, classnames, types[ti].fraction, types[ti].doc_type); } else if (types[ti].source == bow_files_source_number) { bow_set_doc_types_randomly_by_count (docs, num_classes, classnames, types[ti].number, types[ti].doc_type, 0, bow_doc_untagged); } else if (types[ti].source == bow_files_source_number_remaining) { bow_set_doc_types_randomly_by_count (docs, num_classes, classnames, types[ti].number, types[ti].doc_type, 1, bow_doc_untagged); } } /* Set remaining untagged docs if appropriate */ for (ti = 0; ti < num_types; ti++) { if (types[ti].source == bow_files_source_remaining) { assert (num_remaining == 0); num_remaining = 1; bow_set_doc_types_of_remaining (docs, types[ti].doc_type); } } /* Now that the training documents are fixed, check if any document types feed from them */ for (ti = 0; ti < num_types; ti++) { if (types[ti].source == bow_files_source_fraction_train) { bow_set_doc_types_randomly_by_fraction (docs, num_classes, classnames, types[ti].fraction, types[ti].doc_type, bow_doc_train); } else if (types[ti].source == bow_files_source_number_train) { bow_set_doc_types_randomly_by_count (docs, num_classes, classnames, types[ti].number, types[ti].doc_type, 0, bow_doc_train); } }}/* Use the command line arguments to create the appropriate train/test split */voidbow_set_doc_types_for_barrel (bow_barrel *barrel){ bow_set_doc_types (barrel->cdocs, bow_barrel_num_classes (barrel), barrel->classnames);}#define BOW_DOC_IS_X(X) \int bow_doc_is_ ## 
X (bow_doc *doc) { return (doc->type == bow_doc_ ## X); }BOW_DOC_IS_X(train)BOW_DOC_IS_X(test)BOW_DOC_IS_X(unlabeled)BOW_DOC_IS_X(untagged)BOW_DOC_IS_X(validation)BOW_DOC_IS_X(ignore)BOW_DOC_IS_X(pool)BOW_DOC_IS_X(waiting)/* return 1 for all training and unlabeled docs */intbow_cdoc_is_train_or_unlabeled (bow_cdoc *cdoc){ return ((cdoc->type == bow_doc_unlabeled) || (cdoc->type == bow_doc_train));}void _register_split_args () __attribute__ ((constructor));void _register_split_args (){ static int done = 0; if (done) return; bow_argp_add_child (&bow_split_argp_child); done = 1;}
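/* Code-viewer keyboard shortcut help (page residue, not part of split.c):
     Copy code:            Ctrl + C
     Search code:          Ctrl + F
     Full-screen mode:     F11
     Toggle theme:         Ctrl + Shift + D
     Show shortcuts:       ?
     Increase font size:   Ctrl + =
     Decrease font size:   Ctrl + -
 */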