📄 split.c
字号:
static struct argp_child bow_split_argp_child ={ &bow_split_argp, /* This child's argp structure */ 0, /* flags for child */ 0, /* optional header in help message */ 0 /* arbitrary group number for ordering */};/* Mark all documents in the array DOCS to be of type TAG. */voidbow_tag_docs (bow_array *docs, int tag){ int i; bow_cdoc *doc; for (i = 0; i < docs->length ; i++) { doc = bow_array_entry_at_index (docs, i); doc->type = tag; }}/* Change documents in the array DOCS of type TAG1 to be of type TAG2. Returns the number of tags changed. */intbow_tag_change_tags (bow_array *docs, int tag1, int tag2){ int i; bow_cdoc *doc; int changed = 0; for (i = 0; i < docs->length ; i++) { doc = bow_array_entry_at_index (docs, i); if (doc->type == tag1) { doc->type = tag2; changed++; } } return changed;}/* Mark all documents in the array DOCS to be of type BOW_DOC_UNTAGGED. */voidbow_set_all_docs_untagged (bow_array *docs){ int i; bow_cdoc *doc; for (i = 0; i < docs->length ; i++) { doc = bow_array_entry_at_index (docs, i); doc->type = bow_doc_untagged; }}/* Randomly select some SOURCE_TAG documents and label them with tag indicated by TAG. The number of documents from each class are determined by the array NUM_PER_CLASS. 
*/
/* Details: NUM_PER_CLASS must hold NUM_CLASSES entries, and
   CLASSNAMES maps a class index to the name used in messages.  An
   entry of NUM_PER_CLASS that asks for more documents than the class
   has with type SOURCE_TAG is lowered, in the caller's array, to the
   number available, and the shortfall is reported through
   bow_verbosify().  Documents are chosen by drawing random indices
   into DOCS until every per-class quota is met; bow_error() is called
   if that takes more than 1000 * DOCS->LENGTH draws.  Nothing is
   tagged or printed when the quotas sum to zero.  TAG must be one of
   the types named in the switch at the end, otherwise bow_error() is
   called after the tagging has been done. */
void
bow_set_doc_types_randomly_by_count_per_class (bow_array *docs,
                                               int num_classes,
                                               bow_int4str *classnames,
                                               int *num_per_class,
                                               int tag,
                                               int source_tag)
{
  int ci, di;
  bow_cdoc *cdoc = NULL;
  int *num_available_per_class;
  int *local_num_per_class;
  int total_num_to_tag;
  int num_tagged;
  int num_loops;
  char *type_name = NULL;

  /* Seed the random number generator if it hasn't been already */
  bow_random_set_seed ();

  /* Count the documents in each class that are eligible for
     selection, i.e. those whose type is SOURCE_TAG.  The selection
     loop below only ever picks SOURCE_TAG documents, so this is the
     population the quotas must be checked against. */
  num_available_per_class = alloca (num_classes * sizeof (int));
  for (ci = 0; ci < num_classes; ci++)
    num_available_per_class[ci] = 0;
  for (di = 0; di < docs->length; di++)
    {
      cdoc = bow_array_entry_at_index (docs, di);
      if (cdoc->type == source_tag)
        {
          /* The class is used as an index into the per-class arrays. */
          assert (cdoc->class < num_classes);
          num_available_per_class[cdoc->class]++;
        }
    }

  /* If asked to tag more than are available, simply tag fewer. */
  for (ci = 0; ci < num_classes; ci++)
    if (num_per_class[ci] > num_available_per_class[ci])
      {
        bow_verbosify (bow_quiet,
                       "Asked for %d documents of class %s; "
                       "only %d available\n",
                       num_per_class[ci], bow_int2str (classnames, ci),
                       num_available_per_class[ci]);
        num_per_class[ci] = num_available_per_class[ci];
      }

  /* Create a local array of the number of taggings to perform in
     each class, which we will change by decrementing it as we tag. */
  local_num_per_class = alloca (num_classes * sizeof (int));
  total_num_to_tag = 0;
  for (ci = 0; ci < num_classes; ci++)
    {
      local_num_per_class[ci] = num_per_class[ci];
      total_num_to_tag += num_per_class[ci];
    }

  if (total_num_to_tag == 0)
    return;

  /* Now loop until we have tagged TOTAL_NUM_TO_TAG documents.  Each
     draw that lands on an ineligible document, or on a class whose
     quota is already full, is simply discarded. */
  for (num_tagged = 0, num_loops = 0;
       num_tagged < total_num_to_tag;
       num_loops++)
    {
      di = random () % docs->length;
      cdoc = bow_array_entry_at_index (docs, di);
      assert (cdoc);
      if (cdoc->type == source_tag
          && local_num_per_class[cdoc->class] > 0)
        {
          cdoc->type = tag;
          num_tagged++;
          local_num_per_class[cdoc->class]--;
          assert (local_num_per_class[cdoc->class] >= 0);
        }
      if (num_loops > docs->length * 1000)
        bow_error ("Random number generator could not find enough "
                   "model document indices with balanced classes");
    }

  /* Pick the human-readable name of TAG for the summary message. */
  switch (tag)
    {
    case bow_doc_train:
      type_name = "train";
      break;
    case bow_doc_test:
      type_name = "test";
      break;
    case bow_doc_unlabeled:
      type_name = "unlabeled";
      break;
    case bow_doc_ignore:
      type_name = "ignore";
      break;
    case bow_doc_validation:
      type_name = "validation";
      break;
    default:
      bow_error ("No implementation for this type.");
    }

  bow_verbosify (bow_progress,
                 "Randomly selected %d documents for the %s set:\n",
                 total_num_to_tag, type_name);
  for (ci = 0; ci < num_classes; ci++)
    bow_verbosify (bow_progress, " %5d : %s\n",
                   num_per_class[ci], bow_int2str (classnames, ci));
}

/* Randomly select NUM untagged documents and label them with tag
   indicated by TAG.  If TAKE_PROPORTION_FROM_REMAINING is non-zero,
   then the number of documents from each class are determined by
   attempting to match the proportion of classes among the so-far
   untagged documents; otherwise, it attempts to match the proportion
   of the non-ignore documents.
*/
/* Details: the per-class quota is the proportional share of NUM
   rounded down, and the documents lost to rounding are then handed
   out one at a time to the classes in index order, skipping any class
   whose quota already equals its document count.  If NUM is larger
   than the number of documents counted, the quotas stop growing once
   every class is full, so fewer than NUM documents are requested.
   The actual selection, which draws from documents of type
   SOURCE_TAG, is done by
   bow_set_doc_types_randomly_by_count_per_class().  Does nothing when
   NUM is not positive or when no document is counted. */
void
bow_set_doc_types_randomly_by_count (bow_array *docs,
                                     int num_classes,
                                     bow_int4str *classnames,
                                     int num,
                                     int tag,
                                     int take_proportion_from_remaining,
                                     int source_tag)
{
  int ci, di;
  bow_cdoc *cdoc;
  int *num_per_class;
  int *num_docs_per_class;
  int total_num_docs = 0;
  int total;
  int num_added_this_pass;

  if (num <= 0)
    return;

  num_per_class = alloca (num_classes * sizeof (int));
  num_docs_per_class = alloca (num_classes * sizeof (int));

  /* Find out the number of documents in each class.  Ignored
     documents never count; already-tagged documents count only when
     the proportions are not being taken from the remaining ones. */
  for (ci = 0; ci < num_classes; ci++)
    num_docs_per_class[ci] = 0;
  for (di = 0; di < docs->length; di++)
    {
      cdoc = bow_array_entry_at_index (docs, di);
      if (cdoc->type == bow_doc_ignore
          || (take_proportion_from_remaining
              && cdoc->type != bow_doc_untagged))
        continue;
      assert (cdoc->class < num_classes);
      num_docs_per_class[cdoc->class]++;
      total_num_docs++;
    }

  /* With nothing counted there is no proportion to match, and the
     division below would be by zero. */
  if (total_num_docs == 0)
    return;

  /* Initialize the array NUM_PER_CLASS, indicating how many documents
     per class should be tagged.  Integer arithmetic gives the exact
     rounded-down share, so the shares can never sum to more than
     NUM. */
  total = 0;
  for (ci = 0; ci < num_classes; ci++)
    {
      num_per_class[ci] =
        (int) (((long long) num * num_docs_per_class[ci])
               / total_num_docs);
      /* Note that NUM_PER_CLASS[CI] may be zero here.  Don't just
         arbitrarily set it zero to 1, because NUM_DOCS_PER_CLASS[CI]
         may also be zero. */
      total += num_per_class[ci];
    }
  assert (total <= num);

  /* Add more to take care of round-off error.  CI cycles over the
     classes; a full cycle that finds no class with room left means
     the request cannot be met, so stop rather than spin forever. */
  ci = 0;
  num_added_this_pass = 0;
  while (total < num)
    {
      if (num_per_class[ci] < num_docs_per_class[ci])
        {
          num_per_class[ci]++;
          total++;
          num_added_this_pass++;
        }
      ci++;
      if (ci == num_classes)
        {
          if (num_added_this_pass == 0)
            break;
          ci = 0;
          num_added_this_pass = 0;
        }
    }

  /* Do it. */
  bow_set_doc_types_randomly_by_count_per_class (docs, num_classes,
                                                 classnames,
                                                 num_per_class,
                                                 tag, source_tag);
}

/* Randomly select a FRACTION of the untagged documents and label them
   with tag indicated by TYPE.  The number of documents from each class
   are determined by attempting to match the proportion of classes
   among the untagged documents.
*/
void
bow_set_doc_types_randomly_by_fraction_remaining (bow_array *docs,
                                                  int num_classes,
                                                  bow_int4str *classnames,
                                                  double fraction,
                                                  int type)
{
  int di, untagged_doc_count = 0;
  bow_cdoc *cdoc;

  assert (fraction <= 1.0);

  for (di = 0; di < docs->length; di++)
    {
      cdoc = bow_array_entry_at_index (docs, di);
      if (cdoc->type == bow_doc_untagged)
        untagged_doc_count++;
    }

  /* The requested count is the product truncated toward zero.  The
     selection draws from, and takes its class proportions from, the
     untagged documents. */
  bow_set_doc_types_randomly_by_count (docs, num_classes, classnames,
                                       (int) (fraction
                                              * untagged_doc_count),
                                       type, 1, bow_doc_untagged);
}

/* Randomly select a FRACTION of the non-ignore documents and label
   them with tag indicated by TYPE.  The documents are drawn from those
   whose type is SOURCE_TAG.  The number of documents from each class
   are determined by attempting to match the proportion of classes
   among the non-ignore documents. */
void
bow_set_doc_types_randomly_by_fraction (bow_array *docs,
                                        int num_classes,
                                        bow_int4str *classnames,
                                        double fraction,
                                        int type,
                                        int source_tag)
{
  int di, non_ignore_doc_count = 0;
  bow_cdoc *cdoc;

  for (di = 0; di < docs->length; di++)
    {
      cdoc = bow_array_entry_at_index (docs, di);
      if (cdoc->type != bow_doc_ignore)
        non_ignore_doc_count++;
    }

  /* The requested count is the product truncated toward zero. */
  bow_set_doc_types_randomly_by_count (docs, num_classes, classnames,
                                       (int) (fraction
                                              * non_ignore_doc_count),
                                       type, 0, source_tag);
}

/* Setting file tags with lists of filenames. */

/* If opts.c:bow_test_set_files_use_basename is non-zero, ignore the
   directory names in the filenames read from TEST_FILES_FILENAMES in
   bow_test_set_file().

   Returns a pointer into STR at the start of its last NUM_COMPONENTS
   '/'-separated components.  STR itself is returned when
   NUM_COMPONENTS is zero or STR is empty.  When STR has fewer
   separators than requested, the result is STR with at most one
   leading '/' skipped.  STR must not end in '/'. */
static inline const char *
bow_basename (const char *str, int num_components)
{
  int i;

  if (num_components == 0)
    return str;

  /* An empty string has nothing to strip, and the trailing-slash
     check below would otherwise read the byte before the string. */
  if (str[0] == '\0')
    return str;

  i = strlen (str) - 1;
  assert (str[i] != '/');

  /* Walk backwards, counting separators, until enough components
     have been passed or the first character is reached. */
  while (i > 0)
    {
      if (str[i] == '/')
        {
          num_components--;
          if (num_components == 0)
            break;
        }
      i--;
    }

  /* Step past the separator we stopped on so the result starts with
     a component name. */
  if (str[i] == '/')
    i++;

  return &(str[i]);
}

/* Set all the cdoc's named in TEST_FILES_FILENAME to type indicated by
   TYPE.  Raises error if any of the files already have a
   non-"untagged" type.  BARREL should be a doc barrel, not a class
   barrel.
*/voidbow_set_docs_to_type (bow_array *docs, const char *test_files_filename, int type){ bow_int4str *map; bow_cdoc *cdoc; int di; int files_count = 0; const char *filename; char *type_name = NULL; map = bow_int4str_new_from_string_file (test_files_filename); if (bow_test_set_files_use_basename) { /* Convert the filename strings in map to only the basenames of the files. */ int si, index; bow_int4str *map2 = bow_int4str_new (0); for (si = 0; si < map->str_array_length; si++) { index = bow_str2int_no_add (map2, bow_basename(bow_int2str (map, si),
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -