📄 svm_base.c
字号:
/* Copyright (C) 1999 Greg Schohn - gcs@jprc.com */

/* "main" file for all of the svm related code - any svm stuff should
 * pass through some function here */

#include <bow/svm.h>

#if !HAVE_SQRTF
#define sqrtf sqrt
#endif

/* Accessors that reinterpret the storage of the `normalizer' / `prior'
 * fields of a barrel's first two cdoc entries as ints.
 * NOTE(review): this is pointer-cast type punning; presumably those fields
 * are used to stash per-barrel metadata - confirm against the cdoc layout. */
#define BARREL_GET_MAX_NSV(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,0))->normalizer)))
#define BARREL_GET_NCLASSES(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,0))->prior)))
#define BARREL_GET_NMETA_DOCS(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,1))->normalizer)))

/* Keys identifying each long option in svm_options / svm_parse_opt. */
#define KERNEL_TYPE 14001
#define WEIGHT_TYPE 14002
#define COST_TYPE 14003
#define EA_TYPE 14004
#define BSIZE_TYPE 14005
#define VOTE_TYPE 14006
#define CACHE_SIZE_ARG 14007
#define QUICK_SCORE 14008
#define DF_COUNTS_ARG 14009
#define REMOVE_MISCLASS_TYPE 14010
#define TF_TRANSFORM_TYPE 14011
#define USE_SMO_ARG 14012
#define CNAME_ARG 14013
#define LNAME_ARG 14014
#define DO_ACTIVE_LEARNING 14015
#define ACTIVE_LEARNING_CHUNK_SIZE_ARG 14016
#define AL_TEST_IN_TRAIN_ARG 14017
#define AL_BASELINE 14018
#define START_AT_ARG 14019
#define RANDOM_SEED_ARG 14020
#define SUPPRESS_SCORE_MAT_ARG 14021
#define INITIAL_AL_TSET_ARG 14022
#define TRANSDUCE_CLASS_ARG 14023
#define TRANS_CSTAR_ARG 14024
#define TRANS_NPOS_ARG 14025
#define SVM_BASENAME_ARG 14026
#define AL_WITH_TRANS_ARG 14027
#define TRANS_IGNORE_BIAS_ARG 14028
#define TRANS_HYP_REFRESH_ARG 14029
#define TRANS_SMART_VALS_ARG 14030

#define AGAINST_ALL 0
#define PAIRWISE 1

#define REMOVE_BOUND 1
#define REMOVE_WRONG 2

static int weight_type=RAW;        /* 0=raw_freq, 1=tfidf, 2=infogain */
static int tf_transform_type=RAW;  /* 0=raw, 1=log, 2?... */
static int vote_type=0;
static int cache_size=4000037;
static int quick_scoring=1;
static int do_active_learning=0;
static int test_in_train=0;
static int suppress_score_mat=0;
static int al_pick_random=0;
static int model_starting_no=0;

/* here's a C hack - it uses the actual of the enum to do the shift
 * make sure when passing arguments, you know what the actuals are */
static int transduce_class=(1 << bow_doc_unlabeled);
static int transduce_class_overriding=0; /* gets set to 1 when args are
                                          * passed to override */
static char *svml_basename=NULL;
FILE *svml_test_file=NULL;

#ifdef HAVE_LOQO
int svm_use_smo=0;
#else
int svm_use_smo=1;
#endif

double svm_epsilon_a=1E-12;        /* for alpha's & their bounds */
double svm_epsilon_crit=INIT_KKT;  /* for critical KT points */
double svm_C=1000.0;               /* maximum cost */
int svm_bsize=4;                   /* sizeof working set */
int svm_kernel_type=0;             /* 0=linear */
int svm_remove_misclassified=0;
int svm_weight_style;
int svm_nkc_calls;
int svm_trans_npos;
int svm_trans_nobias=0;
int svm_trans_hyp_refresh=40;
int svm_trans_smart_vals=1;
double svm_trans_cstar=200;
int svm_init_al_tset=8;
int svm_al_qsize;
int svm_al_do_trans=0;
int svm_random_seed=0;             /* for al - gets filled in with time */
int svm_verbosity=0;

/* for tfidf scoring - they could (should?) be made into options... */
static int df_transform=LOG;
static int df_counts=bow_tfidf_occurrences;

/* these are dangerous optimizations for svm_score... - but they save a lot of time... */
/* dangerous because they waste a lot of memory (about the size of the original barrel)
 * & if the vpc barrel gets played with, then its all wrong & there's no totally
 * error proof way to do that without checking all of the barrel, which i don't do. */
struct model_bucket {
  bow_wv **docs;
  float **oweights;   /* original weights (after norm & tf scaling)
                         note - this only matters when tf_transform is set &
                         some weight_per_model scheme is used */
  /* note - these are regular vectors instead of wv's to save time
   * (O(# qwv features) instead of O((# qwv features) + (# of features)) */
  union {
    float **sub_model;  /* weights for submodels */
    float *barrel;      /* weights for the whole barrel */
  } word_weights;
  double *bvect;
  int **indices;
  int *sizes;           /* length of each array */
  double **weights;
  double **W;
  int **yvect;
  bow_barrel *barrel;
  int ndocs;
  int nmodels;
};

/* One initializer per member, in declaration order: ten pointer-valued
 * members (the union counts as one) followed by the two counters. */
static struct model_bucket model_cache = {
  NULL, NULL, {NULL}, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0, 0
};

double dprod(bow_wv *wv1, bow_wv *wv2);
double kernel_poly(bow_wv *wv1, bow_wv *wv2);
double kernel_rbf(bow_wv *wv1, bow_wv *wv2);
double kernel_sig(bow_wv *wv1, bow_wv *wv2);

/* by default use the dot product as the kernel */
static double (*kernel)(bow_wv *, bow_wv *) = dprod;

/* Command-line options specific to SVMs */
static struct argp_option svm_options[] = {
  {0,0,0,0,
   "Support Vector Machine options, --method=svm:", 50},
  {"svm-active-learning-baseline", AL_BASELINE, "", 0,
   "Incrementally add documents to the training set at random."},
  {"svm-test-in-train", AL_TEST_IN_TRAIN_ARG, 0, 0,
   "do active learning testing inside of the training... a hack "
   "around making code 10 times more complicated."},
  {"svm-al-transduce", AL_WITH_TRANS_ARG, 0, 0,
   "do transduction over the unlabeled data during active learning."},
  {"svm-bsize", BSIZE_TYPE, "", 0,
   "maximum size to construct the subproblems."},
  {"svm-cache-size", CACHE_SIZE_ARG, "", 0,
   "Number of kernel evaluations to cache."},
  {"svm-cost", COST_TYPE, "", 0,
   "cost to bound the lagrange multipliers by (default 1000)."},
  {"svm-df-counts", DF_COUNTS_ARG, "", 0,
   "Set df_counts (0=occurrences, 1=words)."},
  {"svm-active-learning", DO_ACTIVE_LEARNING, "", 0,
   "Use active learning to query the labels & incrementally (by arg_size) build the barrels."},
  {"svm-epsilon_a", EA_TYPE, "", 0,
   "tolerance for the bounds of the lagrange multipliers (default 1E-12)."},
  {"svm-kernel", KERNEL_TYPE, "", 0,
   "type of kernel to use (0=linear, 1=polynomial, 2=gaussian, 3=sigmoid, 4=fisher kernel)."},
  {"svm-al_init_tsetsize", INITIAL_AL_TSET_ARG, "", 0,
   "Number of random documents to start with in active learning."},
  {"svm-quick-scoring", QUICK_SCORE, 0, 0,
   "Turn quick scoring on."},
  {"svm-rseed", RANDOM_SEED_ARG, "", 0,
   "what random seed should be used in the test-in-train splits"},
  {"svm-remove-misclassified", REMOVE_MISCLASS_TYPE, "", 0,
   "Remove all of the misclassified examples and retrain (default none (0), 1=bound, 2=wrong)."},
  {"svm-start-at", START_AT_ARG, "", 0,
   "which model should be the first generated."},
  {"svm-suppress-score-matrix", SUPPRESS_SCORE_MAT_ARG, 0, 0,
   "Do not print the scores of each test document at each AL iteration."},
  {"svml-basename", SVM_BASENAME_ARG, "", OPTION_HIDDEN, ""},
  {"svm-tf-transform", TF_TRANSFORM_TYPE, "", 0,
   "0=raw, 1=log..."},
  {"svm-transduce-class", TRANSDUCE_CLASS_ARG, "", 0,
   "override default class(es) (int) to do transduction with "
   "(default bow_doc_unlabeled)."},
  {"svm-trans-cost", TRANS_CSTAR_ARG, "", 0,
   "value to assign to C* (default 200)."},
  {"svm-trans-hyp-refresh", TRANS_HYP_REFRESH_ARG, "", 0,
   "how often the hyperplane should be recomputed during transduction. "
   "Only applies to SMO. (default 40)"},
  {"svm-trans-nobias", TRANS_IGNORE_BIAS_ARG, 0, 0,
   "Do not use a bias when marking unlabeled documents. Use a "
   "threshold of 0 to determine labels instead of some threshold to "
   "mark a certain number of documents for each class."},
  {"svm-trans-npos", TRANS_NPOS_ARG, "", 0,
   "number of unlabeled documents to label as positive "
   "(default: proportional to number of labeled positive docs)."},
  {"svm-trans-smart-vals", TRANS_SMART_VALS_ARG, "", 0,
   "use previous problem's as a starting point for the next. (default true)"},
  {"svm-use-smo", USE_SMO_ARG, "", 0,
#ifdef HAVE_LOQO
   "default 0 (don't use SMO)"
#else
   "default 1 (use SMO) - PR_LOQO not compiled"
#endif
  },
  {"svm-vote", VOTE_TYPE, "", 0,
   "Type of voting to use (0=singular, 1=pairwise; default 0)."},
  {"svm-weight", WEIGHT_TYPE, "", 0,
   "type of function to use to set the weights of the documents' words "
   "(0=raw_frequency, 1=tfidf, 2=infogain)."},
  {0, 0}
};

/* Parameters for the non-linear kernels; only the member matching the
 * kernel selected in svm_parse_opt is filled in. */
union kern_param {
  struct {
    double const_co;
    double lin_co;
    double degree;
  } poly;
  struct {
    double gamma;
  } rbf;
  struct {
    double const_co;
    double lin_co;
  } sig;
};

union kern_param kparm;

/* argp parser callback for the SVM options.
 *
 *   key   - option key (one of the *_TYPE / *_ARG constants above)
 *   arg   - the option's argument string for options declared with one
 *   state - argp parser state (unused here)
 *
 * Stores the parsed value in the matching file-scope/global variable.
 * Returns 0 on success and ARGP_ERR_UNKNOWN both for keys this parser does
 * not handle and for values that fail validation (after printing a message
 * to stderr). */
error_t svm_parse_opt (int key, char *arg, struct argp_state *state) {
  switch (key) {
  case START_AT_ARG:
    model_starting_no = atoi(arg);
    break;

  case KERNEL_TYPE:
    svm_kernel_type = atoi (arg);
    /* reject negatives too - they would otherwise leave `kernel' pointing
     * at whatever was selected before while svm_kernel_type is bogus */
    if ((svm_kernel_type < 0) || (svm_kernel_type > 4)) {
      fprintf(stderr, "Invalid value for -k, value must be between 0, 1, 2, 3, or 4.\n");
      return ARGP_ERR_UNKNOWN;
    }
    switch (svm_kernel_type) {
    case 0:
      kernel = dprod;
      break;
    case 1:
      kparm.poly.const_co = 1.0;
      kparm.poly.lin_co = 1.0;
      kparm.poly.degree = 4.0;
      kernel = kernel_poly;
      break;
    case 2:
      kparm.rbf.gamma = 1.0;
      kernel = kernel_rbf;
      break;
    case 3:
      kparm.sig.lin_co = 1.0;
      kparm.sig.const_co = 0.0;
      kernel = kernel_sig;
      break;
    case 4:
      kernel = svm_kernel_fisher;
      break;
    default:
      /* unreachable after the range check; a label needs a statement */
      break;
    }
    break;

  case AL_TEST_IN_TRAIN_ARG:
    test_in_train = 1;
    break;

  case AL_WITH_TRANS_ARG:
    svm_al_do_trans = 1;
    break;

  case BSIZE_TYPE:
    svm_bsize = atoi(arg);
    if (svm_bsize < 2) {
      fprintf(stderr, "Invalid value for -b, value must be at least 2.\n");
      return ARGP_ERR_UNKNOWN;
    }
    /* round up to the next multiple of 4 */
    svm_bsize = ((svm_bsize+3)/4)*4;
    break;

  case CACHE_SIZE_ARG:
    cache_size = atoi(arg);
    if (cache_size < 2) {
      fprintf(stderr, "Invalid value for --cache_size, value must be greater than 1\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;

  case COST_TYPE:
    svm_C = atof(arg);
    break;

  case DF_COUNTS_ARG:
    /* `key' is reused as scratch space for the parsed value */
    key = atoi(arg);
    if (key == 0) {
      df_counts = bow_tfidf_occurrences;
    } else if (key == 1) {
      df_counts = bow_tfidf_words;
    } else {
      return ARGP_ERR_UNKNOWN;
    }
    break;

  case EA_TYPE:
    svm_epsilon_a = atof(arg);
    break;

  case AL_BASELINE:
    test_in_train = 1;
    al_pick_random = 1;
    /* fallthrough - the baseline is active learning with random picks,
     * so it shares the query-size handling below */
  case DO_ACTIVE_LEARNING:
    do_active_learning = 1;
    svm_al_qsize = atoi(arg);
    if (svm_al_qsize < 0) {
      fprintf(stderr, "Bogus AL-query size\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;

  case INITIAL_AL_TSET_ARG:
    svm_init_al_tset = atoi(arg);
    break;

  case REMOVE_MISCLASS_TYPE:
    svm_remove_misclassified = atoi(arg);
    break;

  case RANDOM_SEED_ARG:
    svm_random_seed = atoi(arg);
    break;

  case QUICK_SCORE:
    quick_scoring = 1;
    break;

  case SUPPRESS_SCORE_MAT_ARG:
    suppress_score_mat = 1;
    break;

  case SVM_BASENAME_ARG:
    svml_basename = arg;
    break;

  case TF_TRANSFORM_TYPE:
    tf_transform_type = atoi(arg);
    if ((tf_transform_type < 0) || (tf_transform_type > 1)) {
      fprintf(stderr, "Invalid value for tf_transform_type, value must be 0 or 1\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;

  case TRANSDUCE_CLASS_ARG:
    {
      /* largest shift count that keeps (1 << a) a positive int
       * (assumes 8-bit bytes) */
      const int max_shift = (int) (sizeof (transduce_class) * 8) - 2;
      int a = atoi(arg);

      if (a == bow_doc_train) {
        fprintf(stderr,"Cannot do transduction on training set, ignoring \"%s\" option\n",arg);
      } else if (a > max_shift) {
        fprintf(stderr, "Invalid value for --svm-transduce-class, value must be at most %d\n",
                max_shift);
        return ARGP_ERR_UNKNOWN;
      } else {
        /* the first override discards the built-in default class mask;
         * later occurrences of the option accumulate into the new mask */
        if (!transduce_class_overriding) {
          transduce_class_overriding = 1;
          transduce_class = 0;
        }
        /* < 0 turns transduction off */
        if (a > 0) {
          transduce_class |= (1 << a);
        }
      }
    }
    break;

  case TRANS_HYP_REFRESH_ARG:
    svm_trans_hyp_refresh = atoi(arg);
    if (svm_trans_hyp_refresh < 1) {
      fprintf(stderr, "svm_trans_hyp_refresh (hyperplane refresh rate)"
              " must be greater than 0\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;

  case TRANS_IGNORE_BIAS_ARG:
    svm_trans_nobias = 1;
    break;

  case TRANS_NPOS_ARG:
    svm_trans_npos = atoi(arg);
    if (svm_trans_npos < 1) {
      fprintf(stderr, "svm_trans_npos should be greater than 0.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;

  case TRANS_CSTAR_ARG:
    svm_trans_cstar = atof(arg);
    break;

  case TRANS_SMART_VALS_ARG:
    svm_trans_smart_vals = atoi(arg);
    break;

  case USE_SMO_ARG:
    svm_use_smo = atoi(arg);
    /* the epsilon is used is 2x as big as it would be in the loqo method */
    /* NOTE(review): this halves svm_epsilon_crit every time the option is
     * given with 1, but not when SMO is merely the compiled-in default -
     * confirm that is intended. */
    if (svm_use_smo == 1) {
      svm_epsilon_crit /= 2;
    }
#ifndef HAVE_LOQO
    if (svm_use_smo != 1) {
      fprintf(stderr,"Cannot switch from SMO, no other solvers were built,\n"
              "rebuild libbow with pr_loqo to use another algorithm.\n");
    }
#endif
    break;

  case VOTE_TYPE:
    vote_type = atoi(arg);
    if ((vote_type < 0) || (vote_type > 1)) {
      fprintf(stderr, "Invalid value for --vote, value must be 0 for linear or 1 for pairwise.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;

  case WEIGHT_TYPE:
    weight_type = atoi(arg);
    if ((weight_type < 0) || (weight_type > 3)) {
      fprintf(stderr, "Invalid value for -w, value must be 0, 1, 2, or 3.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;

  default:
    return ARGP_ERR_UNKNOWN;
  }

  return 0;
}

/* argp descriptor tying the option table to its parser */
static const struct argp svm_argp = { svm_options, svm_parse_opt };

static struct argp_child svm_argp_child = {
  &svm_argp,  /* This child's argp structure */
  0,          /* flags for child */
  0,          /* optional header in help message */
  0           /* arbitrary group number for ordering */
};

/* Randomly reorders docs[] and yvect[] in step with each other.
 *
 *   permute_table - out: ndocs entries; initialised to the identity and then
 *                   swapped alongside the data, so entry i ends up holding
 *                   the original index of the element now at position i
 *   docs, yvect   - in/out: parallel arrays of ndocs elements
 *   ndocs         - number of elements (nothing happens when it is 0)
 *
 * Uses random(), so the result depends on the current random() state.
 * NOTE(review): every position is swapped with an index drawn from the
 * whole range, which is not a uniform shuffle; left as is because changing
 * it would change the permutations produced for a given seed. */
void svm_permute_data(int *permute_table, bow_wv **docs, int *yvect, int ndocs) {
  int i, j;

  for (i=0; i<ndocs; i++) {
    permute_table[i] = i;
  }

  for (i=0; i<ndocs; i++) {
    bow_wv *d;
    int y;

    j = random() % ndocs;

    d = docs[j];
    docs[j] = docs[i];
    docs[i] = d;

    y = yvect[j];
    yvect[j] = yvect[i];
    yvect[i] = y;

    y = permute_table[j];
    permute_table[j] = permute_table[i];
    permute_table[i] = y;
  }
}

/* NOTE(review): the body of this function continues beyond the visible
 * portion of the file; the visible part is left untouched. */
void svm_unpermute_data(int *permute_table, bow_wv **docs, int *yvect, int ndocs) {
  int i, j;

  for (i=0; i<ndocs; ) {
    bow_wv *d;
    int y;

    j = permute_table[i];
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -