📄 mysvmclassifier.cpp
字号:
while((!space_or_null((int)line[pos])) && line[pos]) pos++;
if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
/* it is the query id */
(*queryid)=(long)wnum;
}
else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
/* it is the slack id */
if(wnum > 0)
(*slackid)=(long)wnum;
else {
perror ("Slack-id must be greater or equal to 1!!!\n");
printf("LINE: %s\n",line);
exit (1);
}
}
else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
/* it is the example-dependent cost factor */
(*costfactor)=(double)weight;
}
else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
/* it is a regular feature */
if(wnum<=0) {
perror ("Feature numbers must be larger or equal to 1!!!\n");
printf("LINE: %s\n",line);
exit (1);
}
if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
perror ("Features must be in increasing order!!!\n");
printf("LINE: %s\n",line);
exit (1);
}
(words[wpos]).wnum=wnum;
(words[wpos]).weight=(FVAL)weight;
wpos++;
}
else {
perror ("Cannot parse feature/value pair!!!\n");
printf("'%s' in LINE: %s\n",featurepair,line);
exit (1);
}
}
(words[wpos]).wnum=0;
(*numwords)=wpos+1;
return(1);
}
double* CMySVMClassifier::read_alphas(char *alphafile,long totdoc)
/* Reads the alpha vector from a file as written by the
   write_alphas function.

   alphafile: path of the text file holding whitespace-separated
              floating point values
   totdoc:    number of values that must be read

   Returns a buffer of totdoc doubles obtained from my_malloc.
   Terminates the process with exit(1) if the file cannot be opened
   or if fewer than totdoc values could be parsed. */
{
  FILE *fl;
  double *alpha;
  long dnum;

  if ((fl = fopen (alphafile, "r")) == NULL)
  { perror (alphafile); exit (1); }

  alpha = (double *)my_malloc(sizeof(double)*totdoc);
  if(verbosity>=1) {
    printf("Reading alphas..."); fflush(stdout);
  }

  /* The bound is tested BEFORE fscanf is called, so a file with more
     than totdoc values can never make fscanf store into alpha[totdoc].
     The result is compared against 1 because fscanf returns EOF (a
     non-zero value) at end of input, which must not count as a
     successfully read alpha. */
  dnum=0;
  while((dnum<totdoc) && (fscanf(fl,"%lf",&alpha[dnum]) == 1)) {
    dnum++;
  }
  if(dnum != totdoc)
  { perror ("\nNot enough values in alpha file!"); exit (1); }
  fclose(fl);

  if(verbosity>=1) {
    printf("done\n"); fflush(stdout);
  }
  return(alpha);
}
void CMySVMClassifier::nol_ll(char *file, long int *nol, long int *wol, long int *ll)
/* Grep through file and count number of lines, maximum number of
   spaces per line, and longest line.

   file: path of the file to scan
   nol:  receives 1 plus the number of '\n' characters in the file
   wol:  receives the largest per-line count of characters for which
         space_or_null() is non-zero (the terminating '\n' included)
   ll:   receives the length of the longest line, counting its '\n'

   Terminates the process with exit(1) if the file cannot be opened. */
{
  FILE *fl;
  int ic;
  long current_length,current_wol;

  if ((fl = fopen (file, "r")) == NULL)
  { perror (file); exit (1); }

  current_length=0;
  current_wol=0;
  (*ll)=0;
  (*nol)=1;
  (*wol)=0;
  while((ic=getc(fl)) != EOF) {
    current_length++;
    /* getc yields the byte as an unsigned char value; it is passed on
       unchanged instead of being narrowed to char first, so bytes
       >= 0x80 do not turn into negative arguments. */
    if(space_or_null(ic)) {
      current_wol++;
    }
    if(ic == '\n') {
      (*nol)++;
      if(current_length>(*ll)) {
        (*ll)=current_length;
      }
      if(current_wol>(*wol)) {
        (*wol)=current_wol;
      }
      current_length=0;
      current_wol=0;
    }
  }
  /* A last line that is not terminated by '\n' never reaches the
     bookkeeping above; account for it here so that the reported
     maxima cover every line of the file. */
  if(current_length>0) {
    if(current_length>(*ll)) {
      (*ll)=current_length;
    }
    if(current_wol>(*wol)) {
      (*wol)=current_wol;
    }
  }
  fclose(fl);
}
long CMySVMClassifier::minl(long int a, long int b)
/* Returns the smaller of the two arguments (b when they are equal). */
{
  return (a < b) ? a : b;
}
long CMySVMClassifier::maxl(long int a, long int b)
/* Returns the larger of the two arguments (b when they are equal). */
{
  return (a > b) ? a : b;
}
long CMySVMClassifier::get_runtime(void)
/* Returns the current clock() reading converted to hundredths of a
   second and truncated to long. */
{
  const clock_t ticks = clock();
  const double hundredths = (double)ticks*100.0/(double)CLOCKS_PER_SEC;
  return (long)hundredths;
}
//# ifdef MICROSOFT
int CMySVMClassifier::isnan(double a)
/* Forwards to _isnan() and returns its result unchanged.
   NOTE(review): _isnan is presumably the Microsoft CRT routine, given
   the commented-out MICROSOFT guard around this definition - confirm
   before building with another toolchain. */
{
  const int nan_flag = _isnan(a);
  return nan_flag;
}
//# endif
int CMySVMClassifier::space_or_null(int c)
/* Returns 1 if c is the NUL character, otherwise the result of
   isspace() for c (non-zero for whitespace, 0 for anything else). */
{
  if (c==0)
    return 1;
  /* Callers hand in plain char values widened to int, which are
     negative for bytes >= 0x80 where char is signed. isspace() is only
     defined for EOF and values representable as unsigned char, so the
     argument is normalised first. */
  return isspace(static_cast<unsigned char>(c));
}
void* CMySVMClassifier::my_malloc(size_t size)
/* malloc() wrapper that never returns a null pointer: when the
   allocation fails it reports the error with perror() and terminates
   the process with exit(1). */
{
  void *block = (void *)malloc(size);
  if(block == NULL) {
    perror ("Out of memory!\n");
    exit (1);
  }
  return block;
}
void CMySVMClassifier::copyright_notice(void)
/* Prints the copyright and usage notice to stdout. */
{
  static const char *const notice_parts[] = {
    "\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n",
    "This software is available for non-commercial use only. It must not\n",
    "be modified and distributed without prior permission of the author.\n",
    "The author is not responsible for implications from the use of this\n",
    "software.\n\n"
  };
  const size_t part_count = sizeof(notice_parts)/sizeof(notice_parts[0]);

  /* Emit the fragments in order; together they form the same text the
     notice has always printed. */
  for(size_t k=0;k<part_count;k++) {
    printf("%s",notice_parts[k]);
  }
}
//SVM LEARNING FUNCTIONS
void CMySVMClassifier::svm_learn_classification(DOC **docs, double *classes, long int
totdoc, long int totwords,
LEARN_PARM *learn_parm,
KERNEL_PARM *kernel_parm,
KERNEL_CACHE *kernel_cache,
MODEL *model,
double *alpha)
/* docs: Training vectors (x-part) */
/* class: Training labels (y-part, zero if test example for
transduction) */
/* totdoc: Number of examples in docs/label */
/* totwords: Number of features (i.e. highest feature index) */
/* learn_parm: Learning paramenters */
/* kernel_parm: Kernel paramenters */
/* kernel_cache:Initialized Cache of size totdoc, if using a kernel.
NULL if linear.*/
/* model: Returns learning result (assumed empty before called) */
/* alpha: Start values for the alpha variables or NULL
pointer. The new alpha values are returned after
optimization if not NULL. Array must be of size totdoc. */
{
long *inconsistent,i,*label;
long inconsistentnum;
long misclassified,upsupvecnum;
double loss,model_length,example_length;
double maxdiff,*lin,*a,*c;
long runtime_start,runtime_end;
long iterations;
long *unlabeled,transduction;
long heldout;
long loo_count=0,loo_count_pos=0,loo_count_neg=0,trainpos=0,trainneg=0;
long loocomputed=0,runtime_start_loo=0,runtime_start_xa=0;
double heldout_c=0,r_delta_sq=0,r_delta,r_delta_avg;
long *index,*index2dnum;
double *weights;
CFLOAT *aicache; /* buffer to keep one row of hessian */
double *xi_fullset; /* buffer for storing xi on full sample in loo */
double *a_fullset; /* buffer for storing alpha on full sample in loo */
TIMING timing_profile;
SHRINK_STATE shrink_state;
runtime_start=get_runtime();
timing_profile.time_kernel=0;
timing_profile.time_opti=0;
timing_profile.time_shrink=0;
timing_profile.time_update=0;
timing_profile.time_model=0;
timing_profile.time_check=0;
timing_profile.time_select=0;
kernel_cache_statistic=0;
learn_parm->totwords=totwords;
/* make sure -n value is reasonable */
if((learn_parm->svm_newvarsinqp < 2)
|| (learn_parm->svm_newvarsinqp > learn_parm->svm_maxqpsize)) {
learn_parm->svm_newvarsinqp=learn_parm->svm_maxqpsize;
}
init_shrink_state(&shrink_state,totdoc,(long)MAXSHRINK);
label = (long *)my_malloc(sizeof(long)*totdoc);
inconsistent = (long *)my_malloc(sizeof(long)*totdoc);
unlabeled = (long *)my_malloc(sizeof(long)*totdoc);
c = (double *)my_malloc(sizeof(double)*totdoc);
a = (double *)my_malloc(sizeof(double)*totdoc);
a_fullset = (double *)my_malloc(sizeof(double)*totdoc);
xi_fullset = (double *)my_malloc(sizeof(double)*totdoc);
lin = (double *)my_malloc(sizeof(double)*totdoc);
learn_parm->svm_cost = (double *)my_malloc(sizeof(double)*totdoc);
model->supvec = (DOC **)my_malloc(sizeof(DOC *)*(totdoc+2));
model->alpha = (double *)my_malloc(sizeof(double)*(totdoc+2));
model->index = (long *)my_malloc(sizeof(long)*(totdoc+2));
model->at_upper_bound=0;
model->b=0;
model->supvec[0]=0; /* element 0 reserved and empty for now */
model->alpha[0]=0;
model->lin_weights=NULL;
model->totwords=totwords;
model->totdoc=totdoc;
model->kernel_parm=(*kernel_parm);
model->sv_num=1;
model->loo_error=-1;
model->loo_recall=-1;
model->loo_precision=-1;
model->xa_error=-1;
model->xa_recall=-1;
model->xa_precision=-1;
inconsistentnum=0;
transduction=0;
r_delta=estimate_r_delta(docs,totdoc,kernel_parm);
r_delta_sq=r_delta*r_delta;
r_delta_avg=estimate_r_delta_average(docs,totdoc,kernel_parm);
if(learn_parm->svm_c == 0.0) { /* default value for C */
learn_parm->svm_c=1.0/(r_delta_avg*r_delta_avg);
if(verbosity>=1)
printf("Setting default regularization parameter C=%.4f\n",
learn_parm->svm_c);
}
learn_parm->eps=-1.0; /* equivalent regression epsilon for
classification */
for(i=0;i<totdoc;i++) { /* various inits */
docs[i]->docnum=i;
inconsistent[i]=0;
a[i]=0;
lin[i]=0;
c[i]=0.0;
unlabeled[i]=0;
if(classes[i] == 0) {
unlabeled[i]=1;
label[i]=0;
transduction=1;
}
if(classes[i] > 0) {
learn_parm->svm_cost[i]=learn_parm->svm_c*learn_parm->svm_costratio*
docs[i]->costfactor;
label[i]=1;
trainpos++;
}
else if(classes[i] < 0) {
learn_parm->svm_cost[i]=learn_parm->svm_c*docs[i]->costfactor;
label[i]=-1;
trainneg++;
}
else {
learn_parm->svm_cost[i]=0;
}
}
if(verbosity>=2) {
printf("%ld positive, %ld negative, and %ld unlabeled examples.\n",trainpos,trainneg,totdoc-trainpos-trainneg); fflush(stdout);
}
/* caching makes no sense for linear kernel */
if(kernel_parm->kernel_type == LINEAR) {
kernel_cache = NULL;
}
/* compute starting state for initial alpha values */
if(alpha) {
if(verbosity>=1) {
printf("Computing starting state..."); fflush(stdout);
}
index = (long *)my_malloc(sizeof(long)*totdoc);
index2dnum = (long *)my_malloc(sizeof(long)*(totdoc+11));
weights=(double *)my_malloc(sizeof(double)*(totwords+1));
aicache = (CFLOAT *)my_malloc(sizeof(CFLOAT)*totdoc);
for(i=0;i<totdoc;i++) { /* create full index and clip alphas */
index[i]=1;
alpha[i]=fabs(alpha[i]);
if(alpha[i]<0) alpha[i]=0;
if(alpha[i]>learn_parm->svm_cost[i]) alpha[i]=learn_parm->svm_cost[i];
}
if(kernel_parm->kernel_type != LINEAR) {
for(i=0;i<totdoc;i++) /* fill kernel cache with unbounded SV */
if((alpha[i]>0) && (alpha[i]<learn_parm->svm_cost[i])
&& (kernel_cache_space_available(kernel_cache)))
cache_kernel_row(kernel_cache,docs,i,kernel_parm);
for(i=0;i<totdoc;i++) /* fill rest of kernel cache with bounded SV */
if((alpha[i]==learn_parm->svm_cost[i])
&& (kernel_cache_space_available(kernel_cache)))
cache_kernel_row(kernel_cache,docs,i,kernel_parm);
}
(void)compute_index(index,totdoc,index2dnum);
update_linear_component(docs,label,index2dnum,alpha,a,index2dnum,totdoc,
totwords,kernel_parm,kernel_cache,lin,aicache,
weights);
(void)calculate_svm_model(docs,label,unlabeled,lin,alpha,a,c,
learn_parm,index2dnum,index2dnum,model);
for(i=0;i<totdoc;i++) { /* copy initial alphas */
a[i]=alpha[i];
}
free(index);
free(index2dnum);
free(weights);
free(aicache);
if(verbosity>=1) {
printf("done.\n"); fflush(stdout);
}
}
if(transduction) {
learn_parm->svm_iter_to_shrink=99999999;
if(verbosity >= 1)
printf("\nDeactivating Shrinking due to an incompatibility with the transductive \nlearner in the current version.\n\n");
}
if(transduction && learn_parm->compute_loo) {
learn_parm->compute_loo=0;
if(verbosity >= 1)
printf("\nCannot compute leave-one-out estimates for transductive learner.\n\n");
}
if(learn_parm->remove_inconsistent && learn_parm->compute_loo) {
learn_parm->compute_loo=0;
printf("\nCannot compute leave-one-out estimates when removing inconsistent examples.\n\n");
}
if(learn_parm->compute_loo && ((trainpos == 1) || (trainneg == 1))) {
learn_parm->compute_loo=0;
printf("\nCannot compute leave-one-out with only one example in one class.\n\n");
}
if(verbosity==1) {
printf("Optimizing"); fflush(stdout);
}
/* train the svm */
iterations=optimize_to_convergence(docs,label,totdoc,totwords,learn_parm,
kernel_parm,kernel_cache,&shrink_state,model,
inconsistent,unlabeled,a,lin,
c,&timing_profile,
&maxdiff,(long)-1,
(long)1);
if(verbosity>=1) {
if(verbosity==1) printf("done. (%ld iterations)\n",iterations);
misclassified=0;
for(i=0;(i<totdoc);i++) { /* get final statistic */
if((lin[i]-model->b)*(double)label[i] <= 0.0)
misclassified++;
}
printf("Optimization finished (%ld misclassified, maxdiff=%.5f).\n",
misclassified,maxdiff);
runtime_end=get_runtime();
if(verbosity>=2) {
printf("Runtime in cpu-seconds: %.2f (%.2f%% for kernel/%.2f%% for optimizer/%.2f%% for final/%.2f%% for update/%.2f%% for model/%.2f%% for check/%.2f%% for select)\n",
((float)runtime_end-(float)runtime_start)/100.0,
(100.0*timing_profile.time_kernel)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_opti)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_shrink)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_update)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_model)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_check)/(float)(runtime_end-runtime_start),
(100.0*timing_profile.time_select)/(float)(runtime_end-runtime_start));
}
else {
printf("Runtime in cpu-seconds: %.2f\n",
(runtime_end-runtime_start)/100.0);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -