📄 doclist.cpp
字号:
#include "stdafx.h"
#pragma warning( disable : 4786 )
#include "assert.h"
#include "doclist.h"
// Allocates `size` bytes with malloc() and returns the (never NULL) pointer.
// If the allocation fails, an error is reported with perror() and the
// process terminates with exit status 1.
// The returned memory must be released with free().
void *my_malloc(size_t size)
{
    // malloc(0) is allowed to return NULL without being out of memory;
    // ask for one byte instead so an empty request is not treated as fatal.
    void *ptr = malloc(size ? size : 1);
    if (!ptr) {
        perror ("Out of memory!\n");
        exit (1);
    }
    return ptr;
}
// Computes the inner product of two sparse vectors.
// Each vector is an array of WORDITEMs terminated by an item whose wnum is 0.
// The merge below only matches equal wnum values when both arrays are sorted
// by increasing wnum. If either vector is empty (its first item has wnum == 0)
// the function returns 0.
float sprod_ss(WORDITEM *a, WORDITEM *b)
{
    // `register` was dropped here: the storage class is no longer valid in C++17.
    float sum = 0;
    WORDITEM *ai = a;
    WORDITEM *bj = b;
    while (ai->wnum && bj->wnum)
    {
        if (ai->wnum > bj->wnum)
        {
            // b is behind: advance it towards a's current feature
            bj++;
        }
        else if (ai->wnum < bj->wnum)
        {
            // a is behind: advance it towards b's current feature
            ai++;
        }
        else
        {
            // same feature number in both vectors: contributes to the product
            sum += ai->weight * bj->weight;
            ai++;
            bj++;
        }
    }
    return sum;
}
// Scans `file` once and reports three sizing statistics:
//   nol - number of lines
//   wol - maximum number of whitespace characters on any line
//         (the terminating newline is included in the count)
//   ll  - length of the longest line (terminating newline included)
// A final line that is not terminated by '\n' is counted as if the newline
// were present, so the results do not depend on a trailing newline.
// If the file cannot be opened the error is reported with perror() and the
// process exits with status 1. A newline is written to standard output
// before returning.
void nol_ll(char *file, long& nol, long& wol, long& ll)
{
    FILE *fl;
    int ic;
    long current_length = 0;
    long current_wol = 0;

    if ((fl = fopen (file, "r")) == NULL)
    {
        perror (file);
        exit (1);
    }

    ll = 0;
    nol = 0;
    wol = 0;
    while ((ic = getc(fl)) != EOF)
    {
        current_length++;
        // getc() yields the byte as an unsigned char value, which is the
        // only kind of argument isspace() is defined for.
        if (isspace(ic))
        {
            current_wol++;
        }
        if (ic == '\n')
        {
            nol++;
            if (current_length > ll)
            {
                ll = current_length;
            }
            if (current_wol > wol)
            {
                wol = current_wol;
            }
            current_length = 0;
            current_wol = 0;
        }
    }

    // Unterminated last line: account for it as if it ended with '\n',
    // otherwise callers sizing buffers from these numbers come up short.
    if (current_length > 0)
    {
        nol++;
        if (current_length + 1 > ll)
        {
            ll = current_length + 1;
        }
        if (current_wol + 1 > wol)
        {
            wol = current_wol + 1;
        }
    }

    std::cout << std::endl;
    fclose(fl);
}
// Finds the next whitespace-delimited token in `line` at or after `pos`.
// On success stores the token's offset in `start` and its length in `len`,
// leaves `pos` just past the token and returns true. Returns false when only
// whitespace (or nothing) remains.
static bool doclist_next_token(const char *line, unsigned long& pos, unsigned long& start, unsigned long& len)
{
    while (line[pos] && isspace((unsigned char)line[pos])) pos++;
    if (!line[pos])
        return false;
    start = pos;
    while (line[pos] && !isspace((unsigned char)line[pos])) pos++;
    len = pos - start;
    return true;
}

// Parses one text line of the form
//     [id] [label] (qid:<n> | cost:<x> | <feature>:<weight>)...  [# comment]
// into `doc`.
//
//   iParseFlag - PARSE_ID_FLAG: the first token is the document id;
//                PARSE_LABEL_FLAG: the next token is copied into `label`.
//   line       - NUL-terminated line; it is modified in place (the text from
//                the first '#' on is cut off).
//   doc        - receives queryid, costfactor, content, dim_content, DocId and
//                twonorm_sq. doc.content must already point to storage for at
//                least max_words+1 items (features plus the wnum==0 terminator).
//   label      - destination for the label token; only written when
//                PARSE_LABEL_FLAG is set. The caller must supply a buffer
//                large enough for the token (its size is not known here).
//   numwords   - receives the number of stored items including the terminator.
//   max_words  - maximum number of features to store; further tokens on the
//                line are ignored.
//
// Returns 1 if at least one feature was stored, 0 otherwise. When the id or
// label token is missing the function returns 0 immediately and leaves
// numwords and the remaining fields of doc untouched.
// Malformed input (unparsable pair, feature number < 1, features not strictly
// increasing, over-long token) is reported and the process exits with status 1.
// Without PARSE_ID_FLAG the document id is taken from a function-local
// counter that starts at 1 and is incremented per parsed line.
int parse_document(int iParseFlag,char *line, DOC& doc, char *label, long& numwords,long max_words)
{
    static long lLineIndex = 1;
    int ret = 1;
    long wpos = 0;
    unsigned long pos = 0, start = 0, len = 0;
    long wnum;
    double weight;
    char chDocId[256];
    char featurepair[MAX_DOC_FEATURE],junk[MAX_DOC_FEATURE];

    doc.queryid=0;
    doc.costfactor=1;

    // cut off comments
    char *comment = strchr(line, '#');
    if (comment)
        *comment = 0;

    if (iParseFlag & PARSE_ID_FLAG)
    {
        if (!doclist_next_token(line, pos, start, len))
            return 0;
        // refuse ids that would not fit instead of overrunning chDocId
        if (len >= sizeof(chDocId))
        {
            printf("Line: %s\n",line);
            perror ("Document id is too long!\n");
            exit (1);
        }
        memcpy(chDocId, line+start, len);
        chDocId[len] = 0;
    }

    if (iParseFlag & PARSE_LABEL_FLAG)
    {
        if (!doclist_next_token(line, pos, start, len))
            return 0;
        memcpy(label, line+start, len);
        label[len] = 0;
    }

    while ((wpos<max_words) && doclist_next_token(line, pos, start, len))
    {
        // featurepair and junk have the same size, so a token that fits into
        // featurepair can never overflow junk in the sscanf calls below
        if (len >= sizeof(featurepair))
        {
            printf("Line: %s\n",line);
            perror ("Feature/value pair is too long!\n");
            exit (1);
        }
        memcpy(featurepair, line+start, len);
        featurepair[len] = 0;

        // the trailing %s catches garbage after the number: a clean match
        // converts exactly the expected number of fields
        if(sscanf(featurepair,"qid:%ld%s",&wnum,junk)==1)
        {
            //it is the query id
            doc.queryid=(long)wnum;
        }
        else if(sscanf(featurepair,"cost:%lf%s",&weight,junk)==1)
        {
            //it is the example-dependent cost factor
            doc.costfactor=(double)weight;
        }
        else if(sscanf(featurepair,"%ld:%lf%s",&wnum,&weight,junk)==2)
        {
            //it is a regular feature
            // wnum 0 is reserved as the end-of-vector marker
            if(wnum<=0)
            {
                printf("Line: %s\n",line);
                perror ("Feature numbers must be larger or equal to 1!\n");
                exit (1);
            }
            if((wpos>0) && ((doc.content[wpos-1]).wnum >= wnum))
            {
                printf("Line: %s\n",line);
                perror ("Features must be in increasing order!\n");
                exit (1);
            }
            doc.content[wpos].wnum=wnum;
            doc.content[wpos].weight=(float)weight;
            wpos++;
        }
        else
        {
            printf("'%s' in LINE: %s\n",featurepair,line);
            perror ("Cannot parse feature/value pair!\n");
            exit (1);
        }
    }

    if (!wpos)
        ret = 0;

    // terminate the sparse vector
    (doc.content[wpos]).wnum=0;
    numwords = wpos+1;
    doc.dim_content = wpos;

    if (iParseFlag & PARSE_ID_FLAG)
    {
        doc.DocId = atol(chDocId);
    }
    else
    {
        doc.DocId = lLineIndex++;
    }

    doc.twonorm_sq=sprod_ss(doc.content,doc.content);
    return ret;
}
// Loads the document vectors stored in `strFile`, one document per line in
// the format understood by parse_document() with PARSE_ID_FLAG.
//
// The file is first scanned with nol_ll() to size the buffers, then read line
// by line. Blank lines and lines starting with '#' are skipped. Documents
// without any feature are counted as empty and dropped. For every remaining
// document the function fills docs[], mapDocId_Pos, setWordId and vSDoc and
// keeps the largest feature number seen in MaxWordsCount (these names are not
// declared in this function; presumably they are CDocList members).
// Progress and a summary are written to standard output.
void CDocList::ReadVector( string strFile )
{
    long lLinelen;
    long max_docs,max_words;

    cout << "Traversing the vector file to get parameter info..." ;
    nol_ll( (char *)strFile.c_str(), max_docs, max_words, lLinelen); // scan size of input file
    cout << "done." << endl;

    // One spare slot: a last line without a trailing newline may not be
    // included in the line count, and it also keeps the request non-zero for
    // an empty file.
    docs = (DOC *)my_malloc(sizeof(DOC)*(max_docs+1)); // feature vectors

    DOC doc;
    long lIndex=0,wpos;
    char doc_label[MAX_LABEL_LEN];
    long empty_line = 0;

    cout << "Scanning examples...\n";
    ifstream ifile(strFile.c_str());

    // scratch vector reused for every line; copied into docs[] once parsed
    doc.content = (WORDITEM *)my_malloc(sizeof(WORDITEM)*(max_words+10));
    SDoc sDoc;
    MaxWordsCount = 0;

    // Reading into a std::string removes the dependency on a pre-computed
    // maximum line length; the loop ends at end of file, not at a blank line.
    string sLine;
    while( getline(ifile, sLine) )
    {
        // blank line or line consisting of a comment: nothing to parse
        if(sLine.empty() || sLine[0] == '#') continue;

        // parse_document() edits the line in place, so hand it the string's
        // own writable storage
        int iParseResult = parse_document(PARSE_ID_FLAG, &sLine[0], doc, doc_label, wpos,max_words);

        //the document is empty, it is removed
        if (!iParseResult)
        {
            empty_line++;
            continue;
        }

        //if wpos=1,the document is empty
        assert(wpos>1);

        // features are in increasing order, so the last real item
        // (index wpos-2, before the terminator) carries the largest number
        if((wpos>1) && ((doc.content[wpos-2]).wnum > MaxWordsCount))
            MaxWordsCount = (doc.content[wpos-2]).wnum;

        //dimcontent: number of valid terms, not including the item with wnum=0
        docs[lIndex].dim_content = wpos-1;
        docs[lIndex].queryid = doc.queryid;
        docs[lIndex].costfactor = doc.costfactor;
        docs[lIndex].content = (WORDITEM *)my_malloc(sizeof(WORDITEM)*(wpos));
        docs[lIndex].DocId = doc.DocId;
        docs[lIndex].twonorm_sq=doc.twonorm_sq;
        mapDocId_Pos[doc.DocId] = lIndex;

        for(long i=0;i<wpos;i++)
        {
            docs[lIndex].content[i]=doc.content[i];
            // the last item is the wnum==0 terminator, not a word id
            if (i!=wpos-1)
                setWordId.insert((docs[lIndex].content[i]).wnum);
        }

        sDoc.lDocId = doc.DocId;
        vSDoc.push_back(sDoc);
        lIndex++;
        printf("%ld\r",lIndex);
    }

    cout << lIndex+empty_line << " examples read, " << empty_line << " are empty and removed, " << lIndex << " documents are left." << endl;
    ifile.close();
    free(doc.content);
}
// Parses a single document line (document id first, see parse_document with
// PARSE_ID_FLAG) into `test_doc`.
//
// test_doc.content is allocated here with my_malloc() on every call, also
// when parsing yields no features; releasing it with free() is left to the
// caller. `sLine` itself is not modified.
// Returns the result of parse_document(): 1 if at least one feature was read,
// 0 otherwise.
int ReadDoc( string& sLine, DOC& test_doc )
{
    // parse_document() may store MAX_DOC_FEATURE features and then writes the
    // wnum==0 terminator behind them, hence the extra item.
    test_doc.content = (WORDITEM *)my_malloc(sizeof(WORDITEM)*(MAX_DOC_FEATURE+1));

    char test_doc_label[MAX_LABEL_LEN];
    long lFeatureTotal;

    // parse_document() cuts comments by writing into the line, so give it a
    // private writable copy instead of the storage behind c_str(). The
    // explicit '\0' guarantees a terminated, non-empty buffer.
    string sBuffer(sLine);
    sBuffer.push_back('\0');

    int iParseResult = parse_document(PARSE_ID_FLAG, &sBuffer[0], test_doc, test_doc_label, lFeatureTotal,MAX_DOC_FEATURE);
    return iParseResult;
}
// Reads the category file `strFile`; each line has the form
//     <document id> TAB <category id>
// Every category id found on a well-formed line is added to setCat. If the
// document id is known to mapDocId_Pos, the category set of the matching
// vSDoc entry is replaced by a set holding just this category.
// Lines without a TAB and lines naming an unknown document id do not touch
// any document.
// NOTE(review): a document listed on several lines keeps only the category of
// its last line; confirm whether multi-label accumulation was intended.
void CDocList::ReadDocList( string strFile)
{
    ifstream ifile((char*)strFile.c_str());
    cout << "Reading category information...";

    string sLine;
    while (getline( ifile, sLine))
    {
        size_t pos = sLine.find('\t');
        // no separator: there is no category field to read
        if (pos == string::npos)
            continue;

        string sDocId = sLine.substr( 0, pos );
        string sCatId = sLine.substr( pos+1 );
        int iDocId = atoi( sDocId.c_str() );
        int iCat = atoi( sCatId.c_str() );

        setCat.insert( iCat );
        //vecCat.push_back( iCat );

        // operator[] would silently create an entry mapping an unknown id to
        // position 0 and so overwrite the first document's categories;
        // check for the id first.
        if ( mapDocId_Pos.count( iDocId ) == 0 )
            continue;

        long lPos = mapDocId_Pos[ iDocId ];
        if ( lPos < 0 || (size_t)lPos >= vSDoc.size() )
            continue;

        set<int> setDocCat;
        setDocCat.insert( iCat );
        //mapDoc2SetLabel[ iDocId ] = setDocCat;
        //vTarget.push_back( setDocCat );
        vSDoc[ lPos ].setDocCat = setDocCat;
    }

    ifile.close();
    cout << endl;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -