📄 similarity.cpp
字号:
#include "similarity.h"
#include "math.h"
// Lines read from the file given to loadbiaodian(); caltf() ignores any token
// that equals one of these entries. ("biaodian" is presumably pinyin for
// "punctuation" - confirm against the data file.)
vector<string> Biaodian;
// Lines read from the file given to loadStopword(); caltf() does not index
// tokens that equal one of these entries.
vector<string> Stopword;
// Vocabulary shared by all texts: caltf() appends each newly seen word once,
// and the word's index in this vector is the value stored in SWORD::wnum.
vector<string> WordArray;
// Per-text (word index, weight) lists that Compute_sim() fills through caltf()
// for its first and second argument respectively.
vector<SWORD> TextArray1;
vector<SWORD> TextArray2;
// Dot product of two arrays of length n, skipping every position where
// vec_s holds a value that is not strictly positive.
//
// vecn   first operand, must provide at least n readable elements
// vec_s  second operand, must provide at least n readable elements;
//        entries <= 0 contribute nothing to the result
// n      number of elements to visit; n <= 0 yields 0
//
// Returns the accumulated sum of vecn[m] * vec_s[m] over the visited positions.
double sprod_ns(double *vecn,double *vec_s,int n)
{
    // Note: the former 'register' storage class was dropped; it is no longer
    // valid in C++17 and never affected the result.
    double sum = 0;
    for (int m = 0; m < n; ++m) {
        if (vec_s[m] > 0)
            sum += vecn[m] * vec_s[m];
    }
    return sum;
}
// Reads the file named by Filename line by line and appends every line to
// the global Biaodian list.
// Returns 0 on success, or 1 (after printing a message) if the file cannot
// be opened.
int loadbiaodian(string Filename)
{
    ifstream input(Filename.c_str());
    if (input.fail())
    {
        cout << "Can not open " << Filename << endl;
        return 1;
    }

    for (string line; getline(input, line); )
        Biaodian.push_back(line);

    return 0;
}
// Reads the file named by Filename line by line and appends every line to
// the global Stopword list.
// Returns 0 on success, or 1 (after printing a message) if the file cannot
// be opened.
int loadStopword(string Filename)
{
    ifstream input(Filename.c_str());
    if (input.fail())
    {
        cout << "Can not open " << Filename << endl;
        return 1;
    }

    for (string line; getline(input, line); )
        Stopword.push_back(line);

    return 0;
}
// Tokenises `text` and records per-word weights for it in TextArray.
//
// The text is first passed through SSPS() (defined elsewhere; presumably a
// word segmenter that writes a separator-delimited copy of the text - confirm)
// and the result is split with strtok(). For every token that is not listed in
// Biaodian the token counter is incremented; tokens longer than two bytes that
// are not listed in Stopword are printed to stdout, registered in the global
// WordArray if new, and counted in TextArray (wnum = index into WordArray,
// weight = occurrence count). Finally every weight in TextArray is divided by
// the token counter.
//
// text       input text
// TextArray  receives the (word index, weight) entries; the final division is
//            applied to ALL entries, so callers should pass an empty vector
//
// Always returns 0.
int caltf(string text,vector<SWORD> &TextArray)
{
    // Delimiters for strtok: whitespace, ASCII digits and a few punctuation marks.
    const char *seps = "\t\r\n 0 1 2 3 4 5 6 7 8 9 . , ? : & ";

    // Scratch buffer for SSPS output. The 3x factor is kept from the original
    // sizing; one extra byte guarantees room for a terminator even when text
    // is empty. vector releases the memory on every exit path.
    vector<char> buf_out(3 * text.size() + 1, '\0');
    SSPS(const_cast<char*>(text.c_str()), &buf_out[0]);

    int wordnum = 0;   // number of tokens that are not in Biaodian
    SWORD tmp;
    for (char *token = strtok(&buf_out[0], seps);
         token != NULL;
         token = strtok(NULL, seps))
    {
        if (find(Biaodian, token) >= 0)
            continue;
        wordnum++;

        // Tokens of at most two bytes and stop words are counted but not indexed.
        if (strlen(token) <= 2)
            continue;
        if (find(Stopword, token) >= 0)
            continue;

        cout<<token<<endl;

        // Look up the word's position in the global vocabulary.
        int pos = find(WordArray, token);
        if (pos >= 0)
        {
            int position = find(TextArray, pos);
            if (position >= 0)
            {
                TextArray[position].weight++;
            }
            else
            {
                tmp.wnum = pos;
                tmp.weight = 1;
                TextArray.push_back(tmp);
            }
        }
        else
        {
            // First occurrence anywhere: extend the vocabulary and this text.
            WordArray.push_back(token);
            tmp.wnum = WordArray.size()-1;
            tmp.weight = 1;
            TextArray.push_back(tmp);
        }
    }

    // Turn counts into relative frequencies; skipped when no token was counted
    // so that pre-existing entries are never divided by zero.
    if (wordnum > 0)
    {
        const int size = TextArray.size();
        for (int i = 0; i < size; i++)
        {
            TextArray[i].weight = TextArray[i].weight/wordnum;
        }
    }
    return 0;
}
// Computes the cosine similarity between the word-frequency vectors of two texts.
//
// Both texts are run through caltf(), which fills the global TextArray1 /
// TextArray2 and may extend the global WordArray. The sparse entries are then
// expanded into dense vectors indexed by word number, and the dot product
// (via sprod_ns) is divided by the product of the two Euclidean norms.
//
// text1, text2  the texts to compare
//
// Returns the similarity; when either vector has zero length the undivided
// dot product is returned instead.
double Compute_sim(string text1,string text2)
{
    // Start from empty term lists so that an earlier call cannot leak its
    // entries (and their already-normalised weights) into this result.
    TextArray1.clear();
    TextArray2.clear();
    caltf(text1,TextArray1);
    caltf(text2,TextArray2);

    const int nsize = WordArray.size();

    // Dense, zero-filled frequency vectors; the +1 keeps them non-empty so
    // &v[0] is valid even when the vocabulary is empty. Using vector means
    // the storage is released automatically on return.
    vector<double> textfreq1(nsize + 1, 0.0);
    vector<double> textfreq2(nsize + 1, 0.0);

    const int size1 = TextArray1.size();
    for (int i = 0; i < size1; i++)
    {
        textfreq1[TextArray1[i].wnum] = TextArray1[i].weight;
    }
    const int size2 = TextArray2.size();
    for (int i = 0; i < size2; i++)
    {
        textfreq2[TextArray2[i].wnum] = TextArray2[i].weight;
    }

    // Euclidean norms of both vectors.
    double textlen1 = 0;
    double textlen2 = 0;
    for (int i = 0; i < nsize; i++)
    {
        textlen1 += textfreq1[i]*textfreq1[i];
        textlen2 += textfreq2[i]*textfreq2[i];
    }
    textlen1 = sqrt(textlen1);
    textlen2 = sqrt(textlen2);

    double similarity = sprod_ns(&textfreq1[0], &textfreq2[0], nsize);
    if (textlen1*textlen2 != 0)
        similarity = similarity/(textlen1*textlen2);
    return similarity;
}
// Linear search for `word` in `Array`.
// Returns the zero-based index of the first element equal to `word`,
// or -1 when no element matches.
int find(vector<string> Array,string word)
{
    int index = 0;
    for (vector<string>::const_iterator it = Array.begin(); it != Array.end(); ++it, ++index)
    {
        if (*it == word)
            return index;
    }
    return -1;
}
int find(vector<SWORD> textarray,int pos)
{
int size;
size = textarray.size();
for(int i = 0;i<size;i++)
{
if(textarray[i].wnum == pos)
{
return i;
}
}
return -1;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -