
📄 svm_base.c

📁 Code from the book by machine-learning author Tom Mitchell
💻 C
📖 Page 1 of 5
/* Copyright (C) 1999 Greg Schohn - gcs@jprc.com */

/* "main" file for all of the svm related code - any svm stuff should
 * pass through some function here */

#include <bow/svm.h>

#if !HAVE_SQRTF
#define sqrtf sqrt
#endif

#define BARREL_GET_MAX_NSV(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,0))->normalizer)))
#define BARREL_GET_NCLASSES(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,0))->prior)))
#define BARREL_GET_NMETA_DOCS(barrel) (*((int *) &((GET_CDOC_ARRAY_EL(barrel,1))->normalizer)))

#define KERNEL_TYPE                    14001
#define WEIGHT_TYPE                    14002
#define COST_TYPE                      14003
#define EA_TYPE                        14004
#define BSIZE_TYPE                     14005
#define VOTE_TYPE                      14006
#define CACHE_SIZE_ARG                 14007
#define QUICK_SCORE                    14008
#define DF_COUNTS_ARG                  14009
#define REMOVE_MISCLASS_TYPE           14010
#define TF_TRANSFORM_TYPE              14011
#define USE_SMO_ARG                    14012
#define CNAME_ARG                      14013
#define LNAME_ARG                      14014
#define DO_ACTIVE_LEARNING             14015
#define ACTIVE_LEARNING_CHUNK_SIZE_ARG 14016
#define AL_TEST_IN_TRAIN_ARG           14017
#define AL_BASELINE                    14018
#define START_AT_ARG                   14019
#define RANDOM_SEED_ARG                14020
#define SUPPRESS_SCORE_MAT_ARG         14021
#define INITIAL_AL_TSET_ARG            14022
#define TRANSDUCE_CLASS_ARG            14023
#define TRANS_CSTAR_ARG                14024
#define TRANS_NPOS_ARG                 14025
#define SVM_BASENAME_ARG               14026
#define AL_WITH_TRANS_ARG              14027
#define TRANS_IGNORE_BIAS_ARG          14028
#define TRANS_HYP_REFRESH_ARG          14029
#define TRANS_SMART_VALS_ARG           14030

#define AGAINST_ALL 0
#define PAIRWISE    1

#define REMOVE_BOUND 1
#define REMOVE_WRONG 2

static int weight_type=RAW;        /* 0=raw_freq, 1=tfidf, 2=infogain */
static int tf_transform_type=RAW;  /* 0=raw, 1=log, 2?... */
static int vote_type=0;
static int cache_size=4000037;
static int quick_scoring=1;
static int do_active_learning=0;
static int test_in_train=0;
static int suppress_score_mat=0;
static int al_pick_random=0;
static int model_starting_no=0;

/* here's a C hack - it uses the actual value of the enum to do the shift;
 * make sure when passing arguments, you know what the actual values are */
static int transduce_class=(1 << bow_doc_unlabeled);
static int transduce_class_overriding=0; /* gets set to 1 when args are
                                          * passed to override */

static char *svml_basename=NULL;
FILE *svml_test_file=NULL;

#ifdef HAVE_LOQO
int svm_use_smo=0;
#else
int svm_use_smo=1;
#endif

double svm_epsilon_a=1E-12;       /* for alpha's & their bounds */
double svm_epsilon_crit=INIT_KKT; /* for critical KT points */
double svm_C=1000.0;              /* maximum cost */
int svm_bsize=4;                  /* size of working set */
int svm_kernel_type=0;            /* 0=linear */
int svm_remove_misclassified=0;
int svm_weight_style;
int svm_nkc_calls;
int svm_trans_npos;
int svm_trans_nobias=0;
int svm_trans_hyp_refresh=40;
int svm_trans_smart_vals=1;
double svm_trans_cstar=200;
int svm_init_al_tset=8;
int svm_al_qsize;
int svm_al_do_trans=0;
int svm_random_seed=0; /* for al - gets filled in with time */
int svm_verbosity=0;

/* for tfidf scoring - they could (should?) be made into options... */
static int df_transform=LOG;
static int df_counts=bow_tfidf_occurrences;

/* these are dangerous optimizations for svm_score... - but they save a lot of time... */
/* dangerous because they waste a lot of memory (about the size of the original barrel)
 * & if the vpc barrel gets played with, then it's all wrong & there's no totally
 * error proof way to do that without checking all of the barrel, which i don't do. */
struct model_bucket {
  bow_wv    **docs;
  float     **oweights;  /* original weights (after norm & tf scaling)
                            note - this only matters when tf_transform is set &
                            some weight_per_model scheme is used */
  /* note - these are regular vectors instead of wv's to save time
   * (O(# qwv features) instead of O((# qwv features) + (# of features)) */
  union {
    float **sub_model;  /* weights for submodels */
    float  *barrel;     /* weights for the whole barrel */
  } word_weights;
  double     *bvect;
  int       **indices;
  int        *sizes;    /* length of each array */
  double    **weights;
  double    **W;
  int       **yvect;
  bow_barrel *barrel;
  int         ndocs;
  int         nmodels;
};

static struct model_bucket model_cache = {NULL, NULL, {NULL}, NULL, NULL, NULL,
                                          NULL, NULL, NULL, 0, 0};

double dprod(bow_wv *wv1, bow_wv *wv2);
double kernel_poly(bow_wv *wv1, bow_wv *wv2);
double kernel_rbf(bow_wv *wv1, bow_wv *wv2);
double kernel_sig(bow_wv *wv1, bow_wv *wv2);

/* by default use the dot product as the kernel */
static double (*kernel)(bow_wv *, bow_wv *) = dprod;

/* Command-line options specific to SVMs */
static struct argp_option svm_options[] = {
  {0,0,0,0,
   "Support Vector Machine options, --method=svm:", 50},
  {"svm-active-learning-baseline", AL_BASELINE, "", 0,
   "Incrementally add documents to the training set at random."},
  {"svm-test-in-train", AL_TEST_IN_TRAIN_ARG, 0, 0,
   "do active learning testing inside of the training...  a hack "
   "around making code 10 times more complicated."},
  {"svm-al-transduce", AL_WITH_TRANS_ARG, 0, 0,
   "do transduction over the unlabeled data during active learning."},
  {"svm-bsize", BSIZE_TYPE, "", 0,
   "maximum size to construct the subproblems."},
  {"svm-cache-size", CACHE_SIZE_ARG, "", 0,
   "Number of kernel evaluations to cache."},
  {"svm-cost", COST_TYPE, "", 0,
   "cost to bound the lagrange multipliers by (default 1000)."},
  {"svm-df-counts", DF_COUNTS_ARG, "", 0,
   "Set df_counts (0=occurrences, 1=words)."},
  {"svm-active-learning", DO_ACTIVE_LEARNING, "", 0,
   "Use active learning to query the labels & incrementally (by arg_size) build the barrels."},
  {"svm-epsilon_a", EA_TYPE, "", 0,
   "tolerance for the bounds of the lagrange multipliers (default 0.0001)."},
  {"svm-kernel", KERNEL_TYPE, "", 0,
   "type of kernel to use (0=linear, 1=polynomial, 2=gaussian, 3=sigmoid, 4=fisher kernel)."},
  {"svm-al_init_tsetsize", INITIAL_AL_TSET_ARG, "", 0,
   "Number of random documents to start with in active learning."},
  {"svm-quick-scoring", QUICK_SCORE, 0, 0,
   "Turn quick scoring on."},
  {"svm-rseed", RANDOM_SEED_ARG, "", 0,
   "what random seed should be used in the test-in-train splits"},
  {"svm-remove-misclassified", REMOVE_MISCLASS_TYPE, "", 0,
   "Remove all of the misclassified examples and retrain (default none (0), 1=bound, 2=wrong)."},
  {"svm-start-at", START_AT_ARG, "", 0,
   "which model should be the first generated."},
  {"svm-suppress-score-matrix", SUPPRESS_SCORE_MAT_ARG, 0, 0,
   "Do not print the scores of each test document at each AL iteration."},
  {"svml-basename", SVM_BASENAME_ARG, "", OPTION_HIDDEN, ""},
  {"svm-tf-transform", TF_TRANSFORM_TYPE, "", 0,
   "0=raw, 1=log..."},
  {"svm-transduce-class", TRANSDUCE_CLASS_ARG, "", 0,
   "override default class(es) (int) to do transduction with "
   "(default bow_doc_unlabeled)."},
  {"svm-trans-cost", TRANS_CSTAR_ARG, "", 0,
   "value to assign to C* (default 200)."},
  {"svm-trans-hyp-refresh", TRANS_HYP_REFRESH_ARG, "", 0,
   "how often the hyperplane should be recomputed during transduction.  "
   "Only applies to SMO.  (default 40)"},
  {"svm-trans-nobias", TRANS_IGNORE_BIAS_ARG, 0, 0,
   "Do not use a bias when marking unlabeled documents.  Use a "
   "threshold of 0 to determine labels instead of some threshold to "
   "mark a certain number of documents for each class."},
  {"svm-trans-npos", TRANS_NPOS_ARG, "", 0,
   "number of unlabeled documents to label as positive "
   "(default: proportional to number of labeled positive docs)."},
  {"svm-trans-smart-vals", TRANS_SMART_VALS_ARG, "", 0,
   "use the previous problem's values as a starting point for the next. (default true)"},
  {"svm-use-smo", USE_SMO_ARG, "", 0,
#ifdef HAVE_LOQO
   "default 0 (don't use SMO)"
#else
   "default 1 (use SMO) - PR_LOQO not compiled"
#endif
  },
  {"svm-vote", VOTE_TYPE, "", 0,
   "Type of voting to use (0=singular, 1=pairwise; default 0)."},
  {"svm-weight", WEIGHT_TYPE, "", 0,
   "type of function to use to set the weights of the documents' words "
   "(0=raw_frequency, 1=tfidf, 2=infogain)."},
  {0, 0}
};

union kern_param {
  struct {
    double const_co;
    double lin_co;
    double degree;
  } poly;
  struct {
    double gamma;
  } rbf;
  struct {
    double const_co;
    double lin_co;
  } sig;
};

union kern_param kparm;

error_t svm_parse_opt (int key, char *arg, struct argp_state *state) {
  switch (key) {
  case START_AT_ARG:
    model_starting_no = atoi(arg);
    break;
  case KERNEL_TYPE:
    svm_kernel_type = atoi (arg);
    if (svm_kernel_type > 4) {
      fprintf(stderr, "Invalid value for -k, value must be one of 0, 1, 2, 3, or 4.\n");
      return ARGP_ERR_UNKNOWN;
    }
    switch (svm_kernel_type) {
    case 0:
      kernel = dprod;
      break;
    case 1:
      kparm.poly.const_co = 1.0;
      kparm.poly.lin_co = 1.0;
      kparm.poly.degree = 4.0;
      kernel = kernel_poly;
      break;
    case 2:
      kparm.rbf.gamma = 1.0;
      kernel = kernel_rbf;
      break;
    case 3:
      kparm.sig.lin_co = 1.0;
      kparm.sig.const_co = 0.0;
      kernel = kernel_sig;
      break;
    case 4:
      kernel = svm_kernel_fisher;
      break;
    default:
      break;
    }
    break;
  case AL_TEST_IN_TRAIN_ARG:
    test_in_train = 1;
    break;
  case AL_WITH_TRANS_ARG:
    svm_al_do_trans = 1;
    break;
  case BSIZE_TYPE:
    svm_bsize = atoi(arg);
    if (svm_bsize < 2) {
      fprintf(stderr, "Invalid value for -b, value must be at least 2.\n");
      return ARGP_ERR_UNKNOWN;
    }
    svm_bsize = ((svm_bsize+3)/4)*4;
    break;
  case CACHE_SIZE_ARG:
    cache_size = atoi(arg);
    if (cache_size < 2) {
      fprintf(stderr, "Invalid value for --cache_size, value must be greater than 1\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case COST_TYPE:
    svm_C = atof(arg);
    break;
  case DF_COUNTS_ARG:
    key = atoi(arg);
    if (key == 0) {
      df_counts = bow_tfidf_occurrences;
    } else if (key == 1) {
      df_counts = bow_tfidf_words;
    } else {
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case EA_TYPE:
    svm_epsilon_a = atof(arg);
    break;
  case AL_BASELINE:
    test_in_train = 1;
    al_pick_random = 1;
    /* fall through */
  case DO_ACTIVE_LEARNING:
    do_active_learning = 1;
    svm_al_qsize = atoi(arg);
    if (svm_al_qsize < 0) {
      fprintf(stderr, "Bogus AL-query size\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case INITIAL_AL_TSET_ARG:
    svm_init_al_tset = atoi(arg);
    break;
  case REMOVE_MISCLASS_TYPE:
    svm_remove_misclassified = atoi(arg);
    break;
  case RANDOM_SEED_ARG:
    svm_random_seed = atoi(arg);
    break;
  case QUICK_SCORE:
    quick_scoring = 1;
    break;
  case SUPPRESS_SCORE_MAT_ARG:
    suppress_score_mat = 1;
    break;
  case SVM_BASENAME_ARG:
    svml_basename = arg;
    break;
  case TF_TRANSFORM_TYPE:
    tf_transform_type = atoi(arg);
    if ((tf_transform_type < 0) || (tf_transform_type > 1)) {
      fprintf(stderr, "Invalid value for tf_transform_type, value must be 0 or 1\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case TRANSDUCE_CLASS_ARG:
    {
      int a;
      a = atoi(arg);
      if (a == bow_doc_train) {
        fprintf(stderr,"Cannot do transduction on training set, ignoring \"%s\" option\n",arg);
      } else {
        if (!transduce_class) {
          transduce_class_overriding = 1;
          transduce_class = 0;
        }
        /* < 0 turns transduction off */
        if (a > 0) {
          transduce_class |= (1 << a);
        }
      }
    }
    break;
  case TRANS_HYP_REFRESH_ARG:
    svm_trans_hyp_refresh = atoi(arg);
    if (svm_trans_hyp_refresh < 1) {
      fprintf(stderr, "svm_trans_hyp_refresh (hyperplane refresh rate)"
              " must be greater than 0\n");
    }
    break;
  case TRANS_IGNORE_BIAS_ARG:
    svm_trans_nobias = 1;
    break;
  case TRANS_NPOS_ARG:
    svm_trans_npos = atoi(arg);
    if (svm_trans_npos < 1) {
      fprintf(stderr, "svm_trans_npos should be greater than 0.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case TRANS_CSTAR_ARG:
    svm_trans_cstar = atof(arg);
    break;
  case TRANS_SMART_VALS_ARG:
    svm_trans_smart_vals = atoi(arg);
    break;
  case USE_SMO_ARG:
    svm_use_smo = atoi(arg);
    /* the epsilon used is 2x as big as it would be in the loqo method */
    if (svm_use_smo == 1) {
      svm_epsilon_crit /= 2;
    }
#ifndef HAVE_LOQO
    if (svm_use_smo != 1) {
      fprintf(stderr,"Cannot switch from SMO, no other solvers were built,\n"
              "rebuild libbow with pr_loqo to use another algorithm.\n");
    }
#endif
    break;
  case VOTE_TYPE:
    vote_type = atoi(arg);
    if ((vote_type < 0) || (vote_type > 1)) {
      fprintf(stderr, "Invalid value for --vote, value must be 0 for linear or 1 for pairwise.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  case WEIGHT_TYPE:
    weight_type = atoi(arg);
    if ((weight_type < 0) || (weight_type > 3)) {
      fprintf(stderr, "Invalid value for -w, value must be 0, 1, 2, or 3.\n");
      return ARGP_ERR_UNKNOWN;
    }
    break;
  default:
    return ARGP_ERR_UNKNOWN;
  }
  return 0;
}

static const struct argp svm_argp = { svm_options, svm_parse_opt };

static struct argp_child svm_argp_child = {
  &svm_argp,    /* This child's argp structure */
  0,            /* flags for child */
  0,            /* optional header in help message */
  0             /* arbitrary group number for ordering */
};

void svm_permute_data(int *permute_table, bow_wv **docs, int *yvect, int ndocs) {
  int i, j;
  for (i=0; i<ndocs; i++) {
    permute_table[i] = i;
  }
  for (i=0; i<ndocs; i++) {
    bow_wv *d;
    int y;

    j = random() % ndocs;

    d = docs[j];
    docs[j] = docs[i];
    docs[i] = d;

    y = yvect[j];
    yvect[j] = yvect[i];
    yvect[i] = y;

    y = permute_table[j];
    permute_table[j] = permute_table[i];
    permute_table[i] = y;
  }
}

void svm_unpermute_data(int *permute_table, bow_wv **docs, int *yvect, int ndocs) {
  int i, j;
  for (i=0; i<ndocs; ) {
    bow_wv *d;
    int     y;

    j = permute_table[i];
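The kernel functions declared above (dprod, kernel_poly, kernel_rbf, kernel_sig) are only prototyped on this page; their bodies come later in the file (this is page 1 of 5). Judging from the fields of union kern_param and the defaults assigned in svm_parse_opt, the standard formulations would look like the sketch below. This is an illustrative guess written over dense float arrays rather than bow_wv sparse word vectors, not the code from the later pages.

#include <math.h>

/* Sketch (assumption, not the definitions from the later pages): the usual
 * kernels matching the fields of union kern_param, written over dense float
 * vectors of length n instead of bow_wv sparse word vectors. */
static double dense_dprod(const float *x, const float *y, int n) {
  double s = 0.0;
  int i;
  for (i = 0; i < n; i++)
    s += (double) x[i] * y[i];
  return s;
}

/* polynomial: (lin_co*<x,y> + const_co)^degree  (defaults 1.0, 1.0, 4.0) */
static double dense_kernel_poly(const float *x, const float *y, int n,
                                double lin_co, double const_co, double degree) {
  return pow(lin_co * dense_dprod(x, y, n) + const_co, degree);
}

/* gaussian (RBF): exp(-gamma*||x-y||^2)  (default gamma 1.0) */
static double dense_kernel_rbf(const float *x, const float *y, int n, double gamma) {
  double d2 = 0.0, d;
  int i;
  for (i = 0; i < n; i++) {
    d = (double) x[i] - y[i];
    d2 += d * d;
  }
  return exp(-gamma * d2);
}

/* sigmoid: tanh(lin_co*<x,y> + const_co)  (defaults 1.0, 0.0) */
static double dense_kernel_sig(const float *x, const float *y, int n,
                               double lin_co, double const_co) {
  return tanh(lin_co * dense_dprod(x, y, n) + const_co);
}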
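transduce_class is a bit mask indexed by the bow document-type enum: the default (1 << bow_doc_unlabeled) selects only unlabeled documents for transduction, and each --svm-transduce-class argument ORs in another bit. A hypothetical membership test (doc_type standing in for whatever per-document tag the rest of the code consults) would be:

/* Hypothetical helper: would a document whose type tag is doc_type be
 * selected for transduction under the transduce_class bit mask above? */
static int doc_is_transduction_candidate(int doc_type) {
  return (transduce_class & (1 << doc_type)) != 0;
}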
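svm_permute_data shuffles the documents and labels in place while recording in permute_table[p] the original index of the item that ends up at position p; svm_unpermute_data, cut off above by the page break, undoes that shuffle. The small self-contained demo below round-trips a plain int array through the same scheme, inverting the shuffle out of place; it is a sketch of the idea, not the in-place inverse that continues on the next page.

#include <stdio.h>
#include <stdlib.h>

/* Demo (assumption): round trip through the same permute-table scheme as
 * svm_permute_data, on a plain int array.  permute_table[p] holds the
 * original index of the item the shuffle left at position p, so the
 * out-of-place inverse is restored[permute_table[p]] = shuffled[p]. */
int main(void) {
  int data[8], permute_table[8], restored[8];
  int n = 8, i, j, tmp;

  for (i = 0; i < n; i++) {
    data[i] = 100 + i;
    permute_table[i] = i;
  }

  /* same shuffle as svm_permute_data, minus the document/label arrays */
  for (i = 0; i < n; i++) {
    j = rand() % n;
    tmp = data[j]; data[j] = data[i]; data[i] = tmp;
    tmp = permute_table[j]; permute_table[j] = permute_table[i]; permute_table[i] = tmp;
  }

  /* undo the shuffle using the recorded table */
  for (i = 0; i < n; i++)
    restored[permute_table[i]] = data[i];

  for (i = 0; i < n; i++)
    printf("%d ", restored[i]);   /* prints 100 101 ... 107 in order */
  printf("\n");
  return 0;
}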
