📄 split.c
字号:
/* Splitting the documents into training and test sets. *//* Copyright (C) 1997, 1998, 1999 Andrew McCallum Written by: Sean Slattery <jslttery@cs.cmu.edu> This file is part of the Bag-Of-Words Library, `libbow'. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation, version 2. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA */#include <argp.h>#include <bow/libbow.h>#include <math.h>#include <time.h>#include <sys/time.h>#include <unistd.h>/* Different ways of specifying how to do the split */typedef enum { bow_files_source_file = 10, /* get docs for a type from a file */ bow_files_source_fraction, /* do a random fraction split of docs */ bow_files_source_fraction_remaining, /* random fraction of untagged docs */ bow_files_source_fraction_train, /* a random fraction of the training docs */ bow_files_source_number, /* do a random number split of docs */ bow_files_source_number_remaining, /* ditto, but proportions from untagged */ bow_files_source_number_train, /* take a random number of training docs */ bow_files_source_remaining, /* use remaining docs for a file type */ bow_files_source_num_per_class, /* pick a random number from each class */ bow_files_source_num_per_class_remaining, /* ditto, but prop from untagged */ bow_files_source_fancy_counts /* pick the random number specified for each class */} bow_files_source_type;/* A structure for maintining information about fancy counts */typedef struct _bow_split_fancy_count { char *class_name; int num_docs;} bow_split_fancy_count;/* How documents of each type are being selected */static bow_files_source_type bow_test_files_source = bow_files_source_number;static bow_files_source_type bow_train_files_source = bow_files_source_remaining;static bow_files_source_type bow_unlabeled_files_source = bow_files_source_number;static bow_files_source_type bow_ignore_files_source = bow_files_source_number;static bow_files_source_type bow_validation_files_source = bow_files_source_number;/* The fraction used for selecting each type */static float bow_test_fraction;static float bow_train_fraction;static float bow_unlabeled_fraction;static float bow_ignore_fraction;static float bow_validation_fraction;/* The number for selecting each type for both num_per_class and number */static int bow_test_number = 0;static int bow_train_number;static int bow_unlabeled_number = 0;static int bow_ignore_number = 0;static int bow_validation_number = 0;/* The filename containing lists of documents for each type */static char *bow_test_filename;static char *bow_train_filename;static char *bow_unlabeled_filename;static char *bow_ignore_filename;static char *bow_validation_filename;/* The numbers to select from each class for fancy counts for each type*/static bow_split_fancy_count *bow_test_fancy_counts;static bow_split_fancy_count *bow_train_fancy_counts;static bow_split_fancy_count *bow_unlabeled_fancy_counts;static bow_split_fancy_count *bow_ignore_fancy_counts;static bow_split_fancy_count *bow_validation_fancy_counts;/* When using files to set the test/train split, compare filenames by using this many directory components as basename only, not their complete filenames. */int bow_test_set_files_use_basename = 0;enum { TEST_SOURCE = 5000, TRAIN_SOURCE, UNLABELED_SOURCE, IGNORE_SOURCE, VALIDATION_SOURCE, SET_TEST_FILES_KEY, SET_TEST_FILES_USE_BASENAME_KEY};static struct argp_option bow_split_options[] ={ {0, 0, 0, 0, "Splitting options:", 10}, {"test-set", TEST_SOURCE, "SOURCE", 0, "How to select the testing documents. " "A number between 0 and 1 inclusive " "with a decimal point indicates a random fraction of all documents. " "The number of documents selected from each class is determined " "by attempting to match the proportions of the non-ignore documents. " "A number with no decimal point indicates the number of documents " "to select randomly. " "Alternatively, a suffix of `pc' indicates the number of documents " "per-class to tag. " "The suffix 't' for a number or proportion indicates to tag documents from the " "pool of training documents, not the untagged documents. " "`remaining' selects all documents that remain untagged at the end. " "Anything else is interpreted as a filename listing documents to select. " "Default is `0.0'."}, /* The following text was removed from above: "by attempting to match the proportions of the non-ignore documents; " "however, if the number is followed by an `r' (for `remaining'), " "then it attempts to match the proportions of the thus-far " "untagged documents instead. " "A number with no decimal point indicates the number of documents " "to select randomly. " "A suffix of `r' can be used similarly to above. " "(The above selection methods are actually run last, which is " "important since this effects the meaning of `remaining'.) " We should document the "fancy counts" method here in this comment: */ {"train-set", TRAIN_SOURCE, "SOURCE", 0, "How to select the training documents. Same format as --test-set. Default is " "`remaining'."}, {"unlabeled-set", UNLABELED_SOURCE, "SOURCE", 0, "How to select the unlabeled documents. Same format as --test-set. " "Default is `0'."}, {"ignore-set", IGNORE_SOURCE, "SOURCE", 0, "How to select the ignored documents. Same format as --test-set. Default is " "`0'."}, {"validation-set", VALIDATION_SOURCE, "SOURCE", 0, "How to select the validation documents. Same format as --test-set. Default is " "`0'."}, {"test-percentage", 'p', "P", OPTION_HIDDEN, "Use P percent of the indexed documents as test data. Default is 30."}, {"set-test-files", SET_TEST_FILES_KEY, "FILENAME", OPTION_HIDDEN, "Instead of splitting the data among test/train randomly (using the " "-p option), use the indexed files named in the contents of FILENAME " "for testing, and all the others in the model for training. FILENAME " "should contain a list of file paths (with path identical to the path " "used in indexing), each path separated by a newline."}, {"testing-files", SET_TEST_FILES_KEY, "FILENAME", OPTION_ALIAS | OPTION_HIDDEN}, {"set-files-use-basename", SET_TEST_FILES_USE_BASENAME_KEY, "N", OPTION_ARG_OPTIONAL, "When using files to specify doc types, compare only the last N " "components the doc's pathname. That is use the filename and " "the last N-1 directory names. If N is not specified, it defaults to 1."}, {"testing-files-use-basename", SET_TEST_FILES_USE_BASENAME_KEY, "N", OPTION_ALIAS | OPTION_HIDDEN | OPTION_ARG_OPTIONAL}, {0,0}};static error_tbow_split_parse_opt (int key, char *arg, struct argp_state *state){ bow_files_source_type *files_source; float *fraction; int *number; char **filename; bow_split_fancy_count **fancy_counts; int length; switch (key) { case TEST_SOURCE: files_source = &bow_test_files_source; fraction = &bow_test_fraction; number = &bow_test_number; filename = &bow_test_filename; fancy_counts = &bow_test_fancy_counts; break; case TRAIN_SOURCE: files_source = &bow_train_files_source; fraction = &bow_train_fraction; number = &bow_train_number; filename = &bow_train_filename; fancy_counts = &bow_train_fancy_counts; break; case UNLABELED_SOURCE: files_source = &bow_unlabeled_files_source; fraction = &bow_unlabeled_fraction; number = &bow_unlabeled_number; filename = &bow_unlabeled_filename; fancy_counts = &bow_unlabeled_fancy_counts; break; case IGNORE_SOURCE: files_source = &bow_ignore_files_source; fraction = &bow_ignore_fraction; number = &bow_ignore_number; filename = &bow_ignore_filename; fancy_counts = &bow_ignore_fancy_counts; break; case VALIDATION_SOURCE: files_source = &bow_validation_files_source; fraction = &bow_validation_fraction; number = &bow_validation_number; filename = &bow_validation_filename; fancy_counts = &bow_validation_fancy_counts; break; case 'p': bow_test_files_source = bow_files_source_fraction; bow_test_fraction = atof (arg) / 100.0; return 0; break; case SET_TEST_FILES_KEY: bow_test_files_source = bow_files_source_file; bow_test_filename = arg; return 0; break; case SET_TEST_FILES_USE_BASENAME_KEY: if (arg) bow_test_set_files_use_basename = atoi (arg); else bow_test_set_files_use_basename = 1; return 0; break; default: return ARGP_ERR_UNKNOWN; } assert (key == TEST_SOURCE || key == TRAIN_SOURCE || key == UNLABELED_SOURCE || key == IGNORE_SOURCE || key == VALIDATION_SOURCE); length = strlen(arg); /* Now parse the split option */ if (!strcmp(arg, "remaining")) *files_source = bow_files_source_remaining; else if (length == strspn(arg, "0123456789")) { *files_source = bow_files_source_number; *number = atoi(arg); } else if (length == strspn(arg, ".0123456789") && (strchr(arg, '.') == strrchr(arg, '.'))) { *files_source = bow_files_source_fraction; *fraction = atof(arg); assert (*fraction >= 0 && *fraction <= 1); } else if (length == strspn(arg, "0123456789r") && length > 1 && strchr(arg, 'r') == arg + length - 1) { *files_source = bow_files_source_number_remaining; *number = atoi(arg); } else if (length > 2 && strchr(arg, 'r') == arg + length - 1 && strspn(arg, ".0123456789r")) { char buf[length]; *files_source = bow_files_source_fraction_remaining; memcpy (buf, arg, length-1); buf[length-1] = '\0'; *fraction = atof(buf); } else if (length == strspn(arg, "0123456789t") && length > 1 && strchr(arg, 't') == arg + length - 1) { *files_source = bow_files_source_number_train; *number = atoi(arg); } else if (length > 2 && strchr(arg, 't') == arg + length - 1 && strspn(arg, ".0123456789t")) { char buf[length]; *files_source = bow_files_source_fraction_train; memcpy (buf, arg, length-1); buf[length-1] = '\0'; *fraction = atof(buf); } else if (length > 2 && strchr(arg, 'p') == arg + length - 2 && strchr(arg, 'c') == arg + length - 1 && strspn(arg, "0123456789pc")) { *files_source = bow_files_source_num_per_class; *number = atoi(arg); } else if (length > 2 && strchr(arg, 'p') == arg + length - 3 && strchr(arg, 'c') == arg + length - 2 && strchr(arg, 'r') == arg + length - 1 && strspn(arg, "0123456789pcr")) { bow_error ("`pcr' not yet supported"); *files_source = bow_files_source_num_per_class_remaining; *number = atoi(arg); } else if (length > 2 && arg[0] == '[' && arg[length-1] == ']') { char *charp; int num_entries; int x; *files_source = bow_files_source_fancy_counts; /* see how many classes we'll need to malloc for */ for (charp=arg, num_entries = 0; *charp != '\0'; charp++) { if (*charp == ',') num_entries++; } num_entries = (num_entries + 1) / 2; /* malloc and initialize the space to store the counts */ assert (num_entries > 0); *fancy_counts = malloc (sizeof (bow_split_fancy_count) * (num_entries + 1)); (*fancy_counts)[num_entries].class_name = NULL; /* extract the class num arg pairs */ arg++; for (charp = strtok(arg, ","), x=0; x < num_entries; x++, charp = strtok(NULL, ",")) { (*fancy_counts)[x].class_name = charp; (*fancy_counts)[x].num_docs = atoi(strtok(NULL, ",")); } assert(NULL == strtok(NULL, ",")); } else { *files_source = bow_files_source_file; *filename = arg; } return 0;}static const struct argp bow_split_argp ={ bow_split_options, bow_split_parse_opt};
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -