📄 svm_learn.c
  /* excerpt resumes mid-function: refresh lin[] after the alphas changed */
    for(jj=0;(j=active2dnum[jj])>=0;jj++) {
      lin[j]+=sprod_ns(weights,docs[j].words);
    }
  }
  else {                            /* general case */
    for(jj=0;(i=working2dnum[jj])>=0;jj++) {
      if(a[i] != a_old[i]) {
        get_kernel_row(kernel_cache,docs,i,totdoc,active2dnum,aicache,
                       kernel_parm);
        for(ii=0;(j=active2dnum[ii])>=0;ii++) {
          tec=aicache[j];
          lin[j]+=(((a[i]*tec)-(a_old[i]*tec))*(double)label[i]);
        }
      }
    }
  }
}

long incorporate_unlabeled_examples(MODEL *model, long int *label,
                                    long int *inconsistent,
                                    long int *unlabeled,
                                    double *a, double *lin,
                                    long int totdoc, double *selcrit,
                                    long int *select, long int *key,
                                    long int transductcycle,
                                    KERNEL_PARM *kernel_parm,
                                    LEARN_PARM *learn_parm)
{
  long i,j,k,j1,j2,j3,j4,unsupaddnum1=0,unsupaddnum2=0;
  long pos,neg,upos,uneg,orgpos,orgneg,nolabel,newpos,newneg,allunlab;
  double dist,model_length,posratio,negratio;
  long check_every=2;
  double loss;
  static double switchsens=0.0,switchsensorg=0.0;
  double umin,umax,sumalpha;
  long imin=0,imax=0;
  static long switchnum=0;

  switchsens/=1.2;

  /* assumes that lin[] is up to date -> no inactive vars */

  orgpos=0;
  orgneg=0;
  newpos=0;
  newneg=0;
  nolabel=0;
  allunlab=0;
  for(i=0;i<totdoc;i++) {
    if(!unlabeled[i]) {
      if(label[i] > 0) {
        orgpos++;
      }
      else {
        orgneg++;
      }
    }
    else {
      allunlab++;
      if(unlabeled[i]) {
        if(label[i] > 0) {
          newpos++;
        }
        else if(label[i] < 0) {
          newneg++;
        }
      }
    }
    if(label[i]==0) {
      nolabel++;
    }
  }

  if(learn_parm->transduction_posratio >= 0) {
    posratio=learn_parm->transduction_posratio;
  }
  else {
    posratio=(double)orgpos/(double)(orgpos+orgneg); /* use ratio of pos/neg */
  }                                                  /* in training data     */
  negratio=1.0-posratio;

  learn_parm->svm_costratio=1.0;                     /* global */
  if(posratio>0) {
    learn_parm->svm_costratio_unlab=negratio/posratio;
  }
  else {
    learn_parm->svm_costratio_unlab=1.0;
  }

  pos=0;
  neg=0;
  upos=0;
  uneg=0;
  for(i=0;i<totdoc;i++) {
    dist=(lin[i]-model->b);  /* 'distance' from hyperplane */
    if(dist>0) {
      pos++;
    }
    else {
      neg++;
    }
    if(unlabeled[i]) {
      if(dist>0) {
        upos++;
      }
      else {
        uneg++;
      }
    }
    if((!unlabeled[i])
       && (a[i]>(learn_parm->svm_cost[i]-learn_parm->epsilon_a))) {
      /* printf("Ubounded %ld (class %ld, unlabeled %ld)\n",
                i,label[i],unlabeled[i]); */
    }
  }
  if(verbosity>=2) {
    printf("POS=%ld, ORGPOS=%ld, ORGNEG=%ld\n",pos,orgpos,orgneg);
    printf("POS=%ld, NEWPOS=%ld, NEWNEG=%ld\n",pos,newpos,newneg);
    printf("pos ratio = %f (%f).\n",(double)(upos)/(double)(allunlab),posratio);
    fflush(stdout);
  }

  /* First transduction cycle: give the unlabeled examples tentative labels
     based on their distance from the current hyperplane, keeping the target
     fraction of positives. */
  if(transductcycle == 0) {
    j1=0;
    j2=0;
    j4=0;
    for(i=0;i<totdoc;i++) {
      dist=(lin[i]-model->b);  /* 'distance' from hyperplane */
      if((label[i]==0) && (unlabeled[i])) {
        selcrit[j4]=dist;
        key[j4]=i;
        j4++;
      }
    }
    unsupaddnum1=0;
    unsupaddnum2=0;
    select_top_n(selcrit,j4,select,(long)(allunlab*posratio+0.5));
    for(k=0;(k<(long)(allunlab*posratio+0.5));k++) {
      i=key[select[k]];
      label[i]=1;
      unsupaddnum1++;
      j1++;
    }
    for(i=0;i<totdoc;i++) {
      if((label[i]==0) && (unlabeled[i])) {
        label[i]=-1;
        j2++;
        unsupaddnum2++;
      }
    }
    for(i=0;i<totdoc;i++) {  /* set upper bounds on vars */
      if(unlabeled[i]) {
        if(label[i] == 1) {
          learn_parm->svm_cost[i]=learn_parm->svm_c*
            learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
        }
        else if(label[i] == -1) {
          learn_parm->svm_cost[i]=learn_parm->svm_c*
            learn_parm->svm_unlabbound;
        }
      }
    }
    if(verbosity>=1) {
      /* printf("costratio %lf, costratio_unlab %lf, unlabbound %lf\n",
         learn_parm->svm_costratio,learn_parm->svm_costratio_unlab,
         learn_parm->svm_unlabbound); */
      printf("Classifying unlabeled data as %ld POS / %ld NEG.\n",
             unsupaddnum1,unsupaddnum2);
      fflush(stdout);
    }
    if(verbosity >= 1)
      printf("Retraining.");
    if(verbosity >= 2) printf("\n");
    return((long)3);
  }

  /* Every check_every-th cycle: commit the label switches that were marked
     in the previous cycle (unlabeled[i]==2 -> positive, ==3 -> negative). */
  if((transductcycle % check_every) == 0) {
    if(verbosity >= 1)
      printf("Retraining.");
    if(verbosity >= 2) printf("\n");
    j1=0;
    j2=0;
    unsupaddnum1=0;
    unsupaddnum2=0;
    for(i=0;i<totdoc;i++) {
      if((unlabeled[i] == 2)) {
        unlabeled[i]=1;
        label[i]=1;
        j1++;
        unsupaddnum1++;
      }
      else if((unlabeled[i] == 3)) {
        unlabeled[i]=1;
        label[i]=-1;
        j2++;
        unsupaddnum2++;
      }
    }
    for(i=0;i<totdoc;i++) {  /* set upper bounds on vars */
      if(unlabeled[i]) {
        if(label[i] == 1) {
          learn_parm->svm_cost[i]=learn_parm->svm_c*
            learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
        }
        else if(label[i] == -1) {
          learn_parm->svm_cost[i]=learn_parm->svm_c*
            learn_parm->svm_unlabbound;
        }
      }
    }
    if(verbosity>=2) {
      /* printf("costratio %lf, costratio_unlab %lf, unlabbound %lf\n",
         learn_parm->svm_costratio,learn_parm->svm_costratio_unlab,
         learn_parm->svm_unlabbound); */
      printf("%ld positive -> Added %ld POS / %ld NEG unlabeled examples.\n",
             upos,unsupaddnum1,unsupaddnum2);
      fflush(stdout);
    }
    if(learn_parm->svm_unlabbound == 1) {
      learn_parm->epsilon_crit=0.001; /* do the last run right */
    }
    else {
      learn_parm->epsilon_crit=0.01;  /* otherwise, no need to be so picky */
    }
    return((long)3);
  }
  /* Intermediate cycle: look for a positive/negative pair of unlabeled
     examples whose labels appear inconsistent and mark them for switching. */
  else if(((transductcycle % check_every) < check_every)) {
    model_length=0;
    sumalpha=0;
    loss=0;
    for(i=0;i<totdoc;i++) {
      model_length+=a[i]*label[i]*lin[i];
      sumalpha+=a[i];
      dist=(lin[i]-model->b);  /* 'distance' from hyperplane */
      if((label[i]*dist)<(1.0-learn_parm->epsilon_crit)) {
        loss+=(1.0-(label[i]*dist))*learn_parm->svm_cost[i];
      }
    }
    model_length=sqrt(model_length);
    if(verbosity>=2) {
      printf("Model-length = %f (%f), loss = %f, objective = %f\n",
             model_length,sumalpha,loss,loss+0.5*model_length*model_length);
      fflush(stdout);
    }
    j1=0;
    j2=0;
    j3=0;
    j4=0;
    unsupaddnum1=0;
    unsupaddnum2=0;
    umin=99999;
    umax=-99999;
    j4=1;
    while(j4) {
      umin=99999;
      umax=-99999;
      for(i=0;(i<totdoc);i++) {
        dist=(lin[i]-model->b);
        if((label[i]>0) && (unlabeled[i]) && (!inconsistent[i])
           && (dist<umin)) {
          umin=dist;
          imin=i;
        }
        if((label[i]<0) && (unlabeled[i]) && (!inconsistent[i])
           && (dist>umax)) {
          umax=dist;
          imax=i;
        }
      }
      if((umin < (umax+switchsens-1E-4))) {
        j1++;
        j2++;
        unsupaddnum1++;
        unlabeled[imin]=3;
        inconsistent[imin]=1;
        unsupaddnum2++;
        unlabeled[imax]=2;
        inconsistent[imax]=1;
      }
      else
        j4=0;
      j4=0;  /* note: cleared unconditionally, so the loop runs only once */
    }
    for(j=0;(j<totdoc);j++) {
      if(unlabeled[j] && (!inconsistent[j])) {
        if(label[j]>0) {
          unlabeled[j]=2;
        }
        else if(label[j]<0) {
          unlabeled[j]=3;
        }
        /* inconsistent[j]=1; */
        j3++;
      }
    }
    switchnum+=unsupaddnum1+unsupaddnum2;

    /* stop and print out current margin
       printf("switchnum %ld %ld\n",switchnum,kernel_parm->poly_degree);
       if(switchnum == 2*kernel_parm->poly_degree) {
         learn_parm->svm_unlabbound=1;
       }
    */

    if((!unsupaddnum1) && (!unsupaddnum2)) {
      if((learn_parm->svm_unlabbound>=1) && ((newpos+newneg) == allunlab)) {
        for(j=0;(j<totdoc);j++) {
          inconsistent[j]=0;
          if(unlabeled[j]) unlabeled[j]=1;
        }
        write_prediction(learn_parm->predfile,model,lin,a,unlabeled,label,
                         totdoc,learn_parm);
        if(verbosity>=1)
          printf("Number of switches: %ld\n",switchnum);
        return((long)0);
      }
      switchsens=switchsensorg;
      learn_parm->svm_unlabbound*=1.5;
      if(learn_parm->svm_unlabbound>1) {
        learn_parm->svm_unlabbound=1;
      }
      model->at_upper_bound=0; /* since upper bound increased */
      if(verbosity>=1)
        printf("Increasing influence of unlabeled examples to %f%% .",
               learn_parm->svm_unlabbound*100.0);
    }
    else if(verbosity>=1) {
      printf("%ld positive -> Switching labels of %ld POS / %ld NEG unlabeled examples.",
             upos,unsupaddnum1,unsupaddnum2);
      fflush(stdout);
    }

    if(verbosity >= 2) printf("\n");

    learn_parm->epsilon_crit=0.5; /* don't need to be so picky */

    for(i=0;i<totdoc;i++) {  /* set upper bounds on vars */
      if(unlabeled[i]) {
        if(label[i] == 1) {
          learn_parm->svm_cost[i]=learn_parm->svm_c*
            learn_parm->svm_costratio_unlab*learn_parm->svm_unlabbound;
        }
        else if(label[i] == -1) {
          learn_parm->svm_cost[i]=learn_parm->svm_c*
            learn_parm->svm_unlabbound;
        }
      }
    }
    return((long)2);
  }
  return((long)0);
}

/*************************** Working set selection ***************************/

long select_next_qp_subproblem_grad(long int *label,
                                    long int *unlabeled,
                                    double *a, double *lin,
                                    double *c, long int totdoc,
                                    long int qp_size,
                                    LEARN_PARM *learn_parm,
                                    long int *inconsistent,
                                    long int *active2dnum,
                                    long int *working2dnum,
                                    double *selcrit,
                                    long int *select,
                                    KERNEL_CACHE *kernel_cache,
                                    long int *key, long int *chosen)
     /* Use the feasible direction approach to select the */
     /* next qp-subproblem (see section 'Selecting a good */
     /* working set')                                      */
{
  long choosenum,i,j,k,activedoc,inum;
  double s;

  for(inum=0;working2dnum[inum]>=0;inum++); /* find end of index */
  choosenum=0;
  activedoc=0;
  for(i=0;(j=active2dnum[i])>=0;i++) {
    s=-label[j];
    if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
       && (!((a[j]>=(learn_parm->svm_cost[j]-learn_parm->epsilon_a))
             && (s>0)))
       && (!inconsistent[j])
       && (label[j])
       && (!chosen[j])) {
      selcrit[activedoc]=(double)label[j]*
        (learn_parm->eps-(double)label[j]*c[j]+(double)label[j]*lin[j]);
      /* selcrit[activedoc]=(double)label[j]*(-1.0+(double)label[j]*lin[j]); */
      /* selcrit[activedoc]=lin[j]-(double)label[j]; */
      key[activedoc]=j;
      activedoc++;
    }
  }
  select_top_n(selcrit,activedoc,select,(long)(qp_size/2));
  for(k=0;(choosenum<(qp_size/2)) && (k<(qp_size/2)) && (k<activedoc);k++) {
    i=key[select[k]];
    chosen[i]=1;
    working2dnum[inum+choosenum]=i;
    choosenum+=1;
    kernel_cache_touch(kernel_cache,i); /* make sure it does not get kicked */
                                        /* out of cache                     */
  }

  activedoc=0;
  for(i=0;(j=active2dnum[i])>=0;i++) {
    s=label[j];
    if((!((a[j]<=(0+learn_parm->epsilon_a)) && (s<0)))
       && (!((a[j]>=(learn_parm->svm_cost[j]-learn
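/* ------------------------------------------------------------------------ */
/* Note: both incorporate_unlabeled_examples() and                           */
/* select_next_qp_subproblem_grad() above delegate to select_top_n(), which  */
/* is not part of this excerpt.  The sketch below is NOT the SVM-light       */
/* routine; it is a minimal illustration of the contract the call sites      */
/* appear to assume, namely that the indices of the n largest selcrit[]      */
/* values are written to select[].  The name select_top_n_sketch and its     */
/* implementation are hypothetical.  An insertion-style update like this     */
/* costs O(range*n), which is small next to the kernel evaluations done in   */
/* each optimization iteration.                                              */

#include <stdio.h>

static void select_top_n_sketch(double *selcrit, long range, long *select,
                                long n)
{
  long i,j,k;

  /* Build a sorted prefix from the first min(n, range) candidates. */
  for(i=0;(i<n) && (i<range);i++) {
    select[i]=i;
    for(j=i;(j>0) && (selcrit[select[j]]>selcrit[select[j-1]]);j--) {
      k=select[j]; select[j]=select[j-1]; select[j-1]=k;
    }
  }
  /* Scan the rest, replacing the current minimum whenever a larger value
     shows up, and re-sorting the short prefix by insertion. */
  for(i=n;i<range;i++) {
    if((n>0) && (selcrit[i]>selcrit[select[n-1]])) {
      select[n-1]=i;
      for(j=n-1;(j>0) && (selcrit[select[j]]>selcrit[select[j-1]]);j--) {
        k=select[j]; select[j]=select[j-1]; select[j-1]=k;
      }
    }
  }
}

/* Tiny demonstration: the three largest criteria sit at indices 4, 1 and 3. */
int main(void)
{
  double crit[]={0.2, 1.5, -0.3, 0.9, 2.1};
  long pick[3];

  select_top_n_sketch(crit,5,pick,3);
  printf("top indices: %ld %ld %ld\n",pick[0],pick[1],pick[2]); /* 4 1 3 */
  return 0;
}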