📄 svm_common.c
字号:
/*
 * Allocate a new DOC via my_malloc() and fill in its fields from the
 * arguments.
 *
 * The feature vector pointer is stored as given (it is not copied), so
 * the returned DOC refers to the same SVECTOR the caller passed in.
 */
DOC *create_example(long docnum, long queryid, long slackid,
                    double costfactor, SVECTOR *fvec)
{
    DOC *doc = (DOC *)my_malloc(sizeof(DOC));

    doc->fvec = fvec;
    doc->costfactor = costfactor;
    doc->slackid = slackid;
    doc->queryid = queryid;
    doc->docnum = docnum;

    return doc;
}
/*
 * Release a DOC.
 *
 * A NULL example is ignored.  When deep is non-zero and the DOC carries
 * a feature vector, that vector is handed to free_svector() before the
 * DOC itself is freed; otherwise only the DOC is freed.
 */
void free_example(DOC *example, long deep)
{
    if (!example)
        return;

    if (deep && example->fvec)
        free_svector(example->fvec);

    free(example);
}
/*
 * Write a model to the text file `modelfile`.
 *
 * Layout: a version line, the kernel parameters, the highest feature
 * index, the number of training documents, a count line, the threshold
 * b, and then one line per feature vector of every support vector
 * (indices 1 .. sv_num-1).  Each of those lines holds alpha*factor, the
 * wnum:weight pairs and '#' followed by the vector's userdefined string.
 *
 * The process exits with status 1 if the file cannot be opened or if
 * the output cannot be written completely.
 */
void write_model(char *modelfile, MODEL *model)
{
    FILE *modelfl;
    long j, i, sv_num;
    SVECTOR *v;
    int write_failed;

    if (verbosity >= 1) {
        printf("Writing model file..."); fflush(stdout);
    }
    if ((modelfl = fopen(modelfile, "w")) == NULL) {
        perror(modelfile);
        exit(1);
    }

    fprintf(modelfl,"SVM-light Version %s\n",VERSION);
    fprintf(modelfl,"%ld # kernel type\n",
            model->kernel_parm.kernel_type);
    fprintf(modelfl,"%ld # kernel parameter -d \n",
            model->kernel_parm.poly_degree);
    fprintf(modelfl,"%.8g # kernel parameter -g \n",
            model->kernel_parm.rbf_gamma);
    fprintf(modelfl,"%.8g # kernel parameter -s \n",
            model->kernel_parm.coef_lin);
    fprintf(modelfl,"%.8g # kernel parameter -r \n",
            model->kernel_parm.coef_const);
    fprintf(modelfl,"%s# kernel parameter -u \n",model->kernel_parm.custom);
    fprintf(modelfl,"%ld # highest feature index \n",model->totwords);
    fprintf(modelfl,"%ld # number of training documents \n",model->totdoc);

    /* The count written to the file is "lines that follow, plus one":
       start at 1 and add one for every vector in every support
       vector's linked list. */
    sv_num = 1;
    for (i = 1; i < model->sv_num; i++) {
        for (v = model->supvec[i]->fvec; v; v = v->next)
            sv_num++;
    }
    fprintf(modelfl,"%ld # number of support vectors plus 1 \n",sv_num);
    fprintf(modelfl,"%.8g # threshold b, each following line is a SV (starting with alpha*y)\n",model->b);

    for (i = 1; i < model->sv_num; i++) {
        for (v = model->supvec[i]->fvec; v; v = v->next) {
            fprintf(modelfl,"%.32g ",model->alpha[i]*v->factor);
            /* the word list is terminated by an entry with wnum == 0 */
            for (j = 0; (v->words[j]).wnum; j++) {
                fprintf(modelfl,"%ld:%.8g ",
                        (long)(v->words[j]).wnum,
                        (double)(v->words[j]).weight);
            }
            fprintf(modelfl,"#%s\n",v->userdefined);
            /* NOTE: this could be made more efficient by summing the
               alpha's of identical vectors before writing them to the
               file. */
        }
    }

    /* Buffered output can fail late (e.g. disk full); the failure may
       only show up in the stream's error flag or when fclose() flushes.
       Do not report success for a truncated model file. */
    write_failed = ferror(modelfl);
    if (fclose(modelfl) != 0)
        write_failed = 1;
    if (write_failed) {
        perror(modelfile);
        exit(1);
    }

    if (verbosity >= 1) {
        printf("done\n");
    }
}
/*
 * Read a model from the text file `modelfile` (the layout produced by
 * write_model) and return it as a freshly allocated MODEL.
 *
 * The file is first scanned with nol_ll() to size the line and word
 * buffers.  Support vectors are stored at indices 1 .. sv_num-1; slot 0
 * of supvec/alpha is set to NULL/0.  index and lin_weights are set to
 * NULL.
 *
 * The process exits with status 1 if the file cannot be opened, the
 * version string does not match VERSION, the header cannot be parsed,
 * or a support-vector line is missing or malformed.
 */
MODEL *read_model(char *modelfile)
{
    FILE *modelfl;
    long i, queryid, slackid;
    double costfactor;
    long max_sv, max_words, ll, wpos;
    char *line, *comment;
    WORD *words;
    char version_buffer[100];
    MODEL *model;

    if (verbosity >= 1) {
        printf("Reading model..."); fflush(stdout);
    }

    nol_ll(modelfile,&max_sv,&max_words,&ll); /* scan size of model file */
    max_words += 2;
    ll += 2;

    words = (WORD *)my_malloc(sizeof(WORD)*(max_words+10));
    line = (char *)my_malloc(sizeof(char)*ll);
    model = (MODEL *)my_malloc(sizeof(MODEL));

    if ((modelfl = fopen (modelfile, "r")) == NULL)
    { perror (modelfile); exit (1); }

    /* The field width keeps an over-long version token from overrunning
       version_buffer; a failed conversion is treated as a mismatch
       instead of comparing an uninitialised buffer. */
    version_buffer[0] = 0;
    if ((fscanf(modelfl,"SVM-light Version %99s\n",version_buffer) != 1) ||
        strcmp(version_buffer,VERSION)) {
        perror ("Version of model-file does not match version of svm_classify!");
        exit (1);
    }

    /* Each header line is "<value> # comment".  Every value must be
       converted, otherwise later code would run on uninitialised
       fields (most importantly sv_num, which sizes the arrays below).
       NOTE(review): the %[^#] conversion into kernel_parm.custom is
       unbounded; the size of that field is not visible here - confirm
       and add a field width. */
    if ((fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.kernel_type) != 1) ||
        (fscanf(modelfl,"%ld%*[^\n]\n", &model->kernel_parm.poly_degree) != 1) ||
        (fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.rbf_gamma) != 1) ||
        (fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_lin) != 1) ||
        (fscanf(modelfl,"%lf%*[^\n]\n", &model->kernel_parm.coef_const) != 1) ||
        (fscanf(modelfl,"%[^#]%*[^\n]\n", model->kernel_parm.custom) != 1) ||
        (fscanf(modelfl,"%ld%*[^\n]\n", &model->totwords) != 1) ||
        (fscanf(modelfl,"%ld%*[^\n]\n", &model->totdoc) != 1) ||
        (fscanf(modelfl,"%ld%*[^\n]\n", &model->sv_num) != 1) ||
        (fscanf(modelfl,"%lf%*[^\n]\n", &model->b) != 1) ||
        (model->sv_num < 1)) {
        printf("\nParsing error while reading header of model file!\n");
        exit(1);
    }

    model->supvec = (DOC **)my_malloc(sizeof(DOC *)*model->sv_num);
    model->alpha = (double *)my_malloc(sizeof(double)*model->sv_num);
    model->index = NULL;
    model->lin_weights = NULL;

    /* Slot 0 is never filled by the loop below; give it defined values
       (the same ones copy_model uses). */
    model->supvec[0] = NULL;
    model->alpha[0] = 0;

    for (i = 1; i < model->sv_num; i++) {
        if (!fgets(line,(int)ll,modelfl)) {
            /* fewer lines than the header announced */
            printf("\nUnexpected end of model file while reading SV %ld!\n", i);
            exit(1);
        }
        if (!parse_document(line,words,&(model->alpha[i]),&queryid,&slackid,
                            &costfactor,&wpos,max_words,&comment)) {
            printf("\nParsing error while reading model file in SV %ld!\n%s",
                   i,line);
            exit(1);
        }
        model->supvec[i] = create_example(-1,
                                          0,0,
                                          0.0,
                                          create_svector(words,comment,1.0));
    }

    fclose(modelfl);
    free(line);
    free(words);

    if (verbosity >= 1) {
        fprintf(stdout, "OK. (%d support vectors read)\n",(int)(model->sv_num-1));
    }
    return(model);
}
/*
 * Return a newly allocated copy of `model`.
 *
 * All scalar fields are copied by struct assignment.  The supvec and
 * alpha arrays are re-allocated; each support vector (indices
 * 1 .. sv_num-1) is rebuilt with create_example() around a
 * copy_svector() of its feature vector, with slackid set to 0.
 * Slot 0 is set to NULL / 0.  The index is not copied (set to NULL).
 * lin_weights is duplicated (totwords+1 entries) only when the source
 * has one; otherwise the copy keeps the source's NULL value.
 */
MODEL *copy_model(MODEL *model)
{
    MODEL *clone;
    long k;

    clone = (MODEL *)my_malloc(sizeof(MODEL));
    *clone = *model;

    clone->supvec = (DOC **)my_malloc(sizeof(DOC *) * model->sv_num);
    clone->alpha = (double *)my_malloc(sizeof(double) * model->sv_num);
    clone->index = NULL; /* index is not copied */

    clone->supvec[0] = NULL;
    clone->alpha[0] = 0;

    for (k = 1; k < model->sv_num; k++) {
        DOC *src = model->supvec[k];

        clone->alpha[k] = model->alpha[k];
        clone->supvec[k] = create_example(src->docnum,
                                          src->queryid, 0,
                                          src->costfactor,
                                          copy_svector(src->fvec));
    }

    if (model->lin_weights) {
        clone->lin_weights =
            (double *)my_malloc(sizeof(double) * (model->totwords + 1));
        for (k = 0; k < model->totwords + 1; k++)
            clone->lin_weights[k] = model->lin_weights[k];
    }

    return clone;
}
/*
 * Release a MODEL and the arrays it owns.
 *
 * When deep is non-zero, every support vector at indices 1 .. sv_num-1
 * is released with free_example(..., 1) before the supvec array itself
 * is freed.  alpha, index and lin_weights are freed as well; NULL
 * members are harmless because free(NULL) does nothing.
 * `model` itself must not be NULL.
 */
void free_model(MODEL *model, int deep)
{
    if (deep && model->supvec) {
        long k;

        for (k = 1; k < model->sv_num; k++)
            free_example(model->supvec[k], 1);
    }

    free(model->supvec);
    free(model->alpha);
    free(model->index);
    free(model->lin_weights);
    free(model);
}
/*
 * Read all examples from `docfile` into memory.
 *
 * The file is first scanned with nol_ll() to size the buffers, then
 * read line by line.  Lines whose first character is '#' are skipped;
 * every other line is parsed with parse_document().
 *
 * Outputs:
 *   *docs      newly allocated array of DOC pointers, one per example
 *   *label     newly allocated array with the target value per example
 *   *totwords  highest feature number seen in the file
 *   *totdoc    number of examples read
 *
 * The process exits with status 1 if the file cannot be opened, a line
 * cannot be parsed, a feature number exceeds MAXFEATNUM, or more
 * examples turn up than the initial scan accounted for.
 */
void read_documents(char *docfile, DOC ***docs, double **label,
                    long int *totwords, long int *totdoc)
{
    char *line,*comment;
    WORD *words;
    long dnum=0,wpos,dpos=0,dneg=0,dunlab=0,queryid,slackid,max_docs;
    long max_words_doc, ll;
    double doc_label,costfactor;
    FILE *docfl;

    if(verbosity>=1) {
        printf("Scanning examples..."); fflush(stdout);
    }
    nol_ll(docfile,&max_docs,&max_words_doc,&ll); /* scan size of input file */
    max_words_doc+=2;
    ll+=2;
    max_docs+=2;
    if(verbosity>=1) {
        printf("done\n"); fflush(stdout);
    }

    (*docs) = (DOC **)my_malloc(sizeof(DOC *)*max_docs);    /* feature vectors */
    (*label) = (double *)my_malloc(sizeof(double)*max_docs); /* target values */
    line = (char *)my_malloc(sizeof(char)*ll);

    if ((docfl = fopen (docfile, "r")) == NULL)
    { perror (docfile); exit (1); }

    words = (WORD *)my_malloc(sizeof(WORD)*(max_words_doc+10));
    if(verbosity>=1) {
        printf("Reading examples into memory..."); fflush(stdout);
    }

    dnum=0;
    (*totwords)=0;
    /* fgets() returns NULL at end of file, so no separate feof() test
       is needed. */
    while(fgets(line,(int)ll,docfl)) {
        if(line[0] == '#') continue;  /* line contains comments */
        if(!parse_document(line,words,&doc_label,&queryid,&slackid,&costfactor,
                           &wpos,max_words_doc,&comment)) {
            printf("\nParsing error in line %ld!\n%s",dnum,line);
            exit(1);
        }
        /* The arrays were sized from the initial scan.  If this pass
           yields more records (e.g. a line longer than the scanned
           maximum gets split by fgets, or the file changed in between),
           stop instead of writing past the end of the arrays. */
        if(dnum >= max_docs) {
            printf("\nMore examples found than counted while scanning the file!\n");
            exit(1);
        }
        (*label)[dnum]=doc_label;
        /* printf("docnum=%ld: Class=%f ",dnum,doc_label); */
        if(doc_label > 0) dpos++;
        if (doc_label < 0) dneg++;
        if (doc_label == 0) dunlab++;
        /* wpos counts the terminator entry too, so the last real
           feature (the largest, features are ordered) is at wpos-2. */
        if((wpos>1) && ((words[wpos-2]).wnum>(*totwords)))
            (*totwords)=(words[wpos-2]).wnum;
        if((*totwords) > MAXFEATNUM) {
            printf("\nMaximum feature number exceeds limit defined in MAXFEATNUM!\n");
            printf("LINE: %s\n",line);
            exit(1);
        }
        (*docs)[dnum] = create_example(dnum,queryid,slackid,costfactor,
                                       create_svector(words,comment,1.0));
        /* printf("\nNorm=%f\n",((*docs)[dnum]->fvec)->twonorm_sq);  */
        dnum++;
        if(verbosity>=1) {
            if((dnum % 100) == 0) {
                printf("%ld..",dnum); fflush(stdout);
            }
        }
    }

    fclose(docfl);
    free(line);
    free(words);
    if(verbosity>=1) {
        fprintf(stdout, "OK. (%ld examples read)\n", dnum);
    }
    (*totdoc)=dnum;
}
/*
 * Parse one example line of the form
 *     <label> [qid:<n>] [sid:<n>] [cost:<x>] <feature>:<value> ... [# comment]
 *
 * `line` is modified in place: the first '#' and any newline are
 * replaced by NUL.  *comment is set to the text after the first '#',
 * or to the empty string at the end of the line if there is none.
 *
 * Outputs:
 *   words       feature/value pairs, terminated by an entry with
 *               wnum == 0; at most max_words_doc pairs are stored, so
 *               the array needs room for max_words_doc+1 entries
 *   *label      the target value
 *   *queryid    value of qid:, 0 if absent
 *   *slackid    value of sid:, 0 if absent
 *   *costfactor value of cost:, 1 if absent
 *   *numwords   number of entries written to words, terminator included
 *
 * Returns 1 on success and 0 if the line contains no token or does not
 * start with a number.  Malformed feature tokens (bad syntax, feature
 * number < 1, features not strictly increasing, slack id < 1, or a
 * first token containing ':') terminate the process with status 1.
 */
int parse_document(char *line, WORD *words, double *label,
                   long *queryid, long *slackid, double *costfactor,
                   long int *numwords, long int max_words_doc,
                   char **comment)
{
    register long wpos,pos;
    long wnum;
    double weight;
    int numread;
    char featurepair[1000],junk[1000];

    (*queryid)=0;
    (*slackid)=0;
    (*costfactor)=1;

    pos=0;
    (*comment)=NULL;
    while(line[pos] ) {      /* cut off comments */
        if((line[pos] == '#') && (!(*comment))) {
            line[pos]=0;
            (*comment)=&(line[pos+1]);
        }
        if(line[pos] == '\n') { /* strip the newline */
            line[pos]=0;
        }
        pos++;
    }
    if(!(*comment)) (*comment)=&(line[pos]);
    /* printf("Comment: '%s'\n",(*comment)); */

    wpos=0;
    /* check, that line starts with target value or zero, but not with
       feature pair */
    /* %999s: never store more than featurepair can hold */
    if(sscanf(line,"%999s",featurepair) == EOF) return(0);
    pos=0;
    while((featurepair[pos] != ':') && featurepair[pos]) pos++;
    if(featurepair[pos] == ':') {
        perror ("Line must start with label or 0!!!\n");
        printf("LINE: %s\n",line);
        exit (1);
    }
    /* read the target value; a first token that is not a number is a
       parse failure rather than an uninitialised label */
    if(sscanf(line,"%lf",label) != 1) return(0);

    /* skip over the label token; characters go through unsigned char
       so the whitespace test never sees a negative value */
    pos=0;
    while(space_or_null((int)(unsigned char)line[pos])) pos++;
    while((!space_or_null((int)(unsigned char)line[pos])) && line[pos]) pos++;

    while(((numread=sscanf(line+pos,"%999s",featurepair)) != EOF) &&
          (numread > 0) &&
          (wpos<max_words_doc)) {
        /* printf("%s\n",featurepair); */
        /* advance pos past the token just read */
        while(space_or_null((int)(unsigned char)line[pos])) pos++;
        while((!space_or_null((int)(unsigned char)line[pos])) && line[pos]) pos++;
        /* in each pattern the trailing %s catches leftover characters:
           a token only matches if nothing follows the expected fields.
           junk cannot overflow, featurepair is shorter than its size. */
        if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1) {
            /* it is the query id */
            (*queryid)=(long)wnum;
        }
        else if(sscanf(featurepair,"sid:%ld%s",&wnum,junk)==1) {
            /* it is the slack id */
            if(wnum > 0)
                (*slackid)=(long)wnum;
            else {
                perror ("Slack-id must be greater or equal to 1!!!\n");
                printf("LINE: %s\n",line);
                exit (1);
            }
        }
        else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1) {
            /* it is the example-dependent cost factor */
            (*costfactor)=(double)weight;
        }
        else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2) {
            /* it is a regular feature */
            if(wnum<=0) {
                perror ("Feature numbers must be larger or equal to 1!!!\n");
                printf("LINE: %s\n",line);
                exit (1);
            }
            if((wpos>0) && ((words[wpos-1]).wnum >= wnum)) {
                perror ("Features must be in increasing order!!!\n");
                printf("LINE: %s\n",line);
                exit (1);
            }
            (words[wpos]).wnum=wnum;
            (words[wpos]).weight=(FVAL)weight;
            wpos++;
        }
        else {
            perror ("Cannot parse feature/value pair!!!\n");
            printf("'%s' in LINE: %s\n",featurepair,line);
            exit (1);
        }
    }
    (words[wpos]).wnum=0;   /* terminator entry */
    (*numwords)=wpos+1;
    return(1);
}
/*
 * Read `totdoc` alpha values from `alphafile` (one number per entry,
 * whitespace separated) into a newly allocated array and return it.
 *
 * The process exits with status 1 if the file cannot be opened or
 * holds fewer than totdoc parsable values.  Values beyond the first
 * totdoc are ignored.
 */
double *read_alphas(char *alphafile,long totdoc)
{
    FILE *fl;
    double *alpha;
    long dnum;

    if ((fl = fopen (alphafile, "r")) == NULL)
    { perror (alphafile); exit (1); }

    alpha = (double *)my_malloc(sizeof(double)*totdoc);
    if(verbosity>=1) {
        printf("Reading alphas..."); fflush(stdout);
    }

    /* Check the bound BEFORE fscanf stores into alpha[dnum], and count
       an entry only when a value was actually converted (fscanf returns
       EOF, which is non-zero, at end of input). */
    dnum=0;
    while((dnum<totdoc) && (fscanf(fl,"%lf\n",&alpha[dnum]) == 1)) {
        dnum++;
    }
    if(dnum != totdoc)
    { perror ("\nNot enough values in alpha file!"); exit (1); }
    fclose(fl);

    if(verbosity>=1) {
        printf("done\n"); fflush(stdout);
    }
    return(alpha);
}
/*
 * Scan `file` once and report its dimensions:
 *   *nol  number of newline characters plus one
 *   *wol  largest number of whitespace characters found on one line
 *   *ll   length of the longest line (newline included)
 *
 * A final line that is not terminated by a newline is included in
 * *wol and *ll.  The process exits with status 1 if the file cannot
 * be opened.
 */
void nol_ll(char *file, long int *nol, long int *wol, long int *ll)
{
    FILE *fl;
    int ic;
    long current_length,current_wol;

    if ((fl = fopen (file, "r")) == NULL)
    { perror (file); exit (1); }

    current_length=0;
    current_wol=0;
    (*ll)=0;
    (*nol)=1;
    (*wol)=0;

    while((ic=getc(fl)) != EOF) {
        current_length++;
        /* getc() yields the byte as an unsigned char value; pass it on
           unchanged so the whitespace test never sees a negative
           number. */
        if(space_or_null(ic)) {
            current_wol++;
        }
        if(ic == '\n') {
            (*nol)++;
            if(current_length>(*ll)) {
                (*ll)=current_length;
            }
            if(current_wol>(*wol)) {
                (*wol)=current_wol;
            }
            current_length=0;
            current_wol=0;
        }
    }

    /* The last line may lack a trailing newline; account for it too,
       otherwise buffers sized from these values are too small for it.
       After a newline-terminated last line both counters are 0 and
       this changes nothing. */
    if(current_length>(*ll)) {
        (*ll)=current_length;
    }
    if(current_wol>(*wol)) {
        (*wol)=current_wol;
    }

    fclose(fl);
}
/* Return the smaller of two long values (b when they are equal). */
long minl(long int a, long int b)
{
    return (a < b) ? a : b;
}
/* Return the larger of two long values (b when they are equal). */
long maxl(long int a, long int b)
{
    return (a > b) ? a : b;
}
/*
 * Return the value of clock() converted to hundredths of a second
 * (clock ticks * 100 / CLOCKS_PER_SEC, truncated to long).
 */
long get_runtime(void)
{
    double ticks = (double)clock();

    return (long)(ticks * 100.0 / (double)CLOCKS_PER_SEC);
}
/* Compiled only when _MSC_VER is defined: provides an isnan() function
   that forwards to _isnan() and returns its result. */
# ifdef _MSC_VER
int isnan(double a)
{
return(_isnan(a));
}
# endif
/*
 * Return non-zero if c is the NUL character (returns 1) or a
 * whitespace character according to isspace(), and 0 otherwise.
 *
 * Callers pass plain char values, which may be negative for bytes
 * above 127.  isspace() is only defined for unsigned char values and
 * EOF, so c is converted to unsigned char first.
 */
int space_or_null(int c)
{
    if (c == 0)
        return 1;
    return isspace((unsigned char)c);
}
/*
 * malloc() wrapper that never returns NULL: if the allocation fails,
 * a message is printed with perror() and the process exits with
 * status 1.
 *
 * A request for 0 bytes is served as a 1-byte allocation.  malloc(0)
 * is allowed to return NULL, which would otherwise be misreported as
 * running out of memory.
 */
void *my_malloc(size_t size)
{
    void *ptr;

    ptr = malloc(size ? size : 1);
    if (!ptr) {
        perror ("Out of memory!\n");
        exit (1);
    }
    return ptr;
}
/* Print the copyright and licence notice to standard output. */
void copyright_notice(void)
{
    fputs("\nCopyright: Thorsten Joachims, thorsten@joachims.org\n\n"
          "This software is available for non-commercial use only. It must not\n"
          "be modified and distributed without prior permission of the author.\n"
          "The author is not responsible for implications from the use of this\n"
          "software.\n\n",
          stdout);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -