📄 method.cpp
字号:
// Method.cpp: implementation of the CMethod class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include <io.h>
#include <stdio.h>
//#include "DMBGT.h"
#include "Method.h"
#include <algorithm>
// Smoothing parameter added to per-sense counts when CountPara() computes Psk.
#define LAM 0.000001 // smoothing parameter
// Smoothing parameter used for P_LAM (CountPara) and Pvs (IntravTree).
#define LAMDA 0.1 // smoothing parameter
// Vocabulary size; multiplies LAMDA in the P_LAM and Pvs denominators.
#define NN 53335 // total number of words in the vocabulary
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Constructs the trainer.
// strTmp starts as a single blank; ReadSenseNo() compares each word name it
// reads against strTmp to decide when a new word ID is needed.
CMethod::CMethod()
    : strTmp(" ")
{
    // The sense dictionary is not loaded here (the ReadSenseNo() call was
    // disabled); ReadSentence() calls ReadSenseNo() itself.
}
// Destructor: pops up a message box reporting that training has finished.
CMethod::~CMethod()
{
    static const char finishedText[] = "train is over!";
    AfxMessageBox(finishedText);
}
// Loads the sense dictionary "Cup-Dic简表.txt" into vectWordList.
//
// Each record consists of four whitespace-separated fields: word name, a
// second field (read but unused), sense ID, and a fourth field (read but
// unused). Consecutive records with the same word name share one wordID;
// IDs start at 1 and increase each time the name changes. All statistics
// fields of the new entry are zeroed. After loading, the list is sorted
// (ordering is whatever CWord's operator< defines - declared elsewhere) so
// that ReadSentence() can binary-search it.
//
// If the file cannot be opened the read loop simply does not run and the
// list is left as it was.
void CMethod::ReadSenseNo()
{
    fstream finSen("Cup-Dic简表.txt",ios::in);
    string str_Name;
    string str_gender;
    string SenseID;
    string p;
    int Count = 0;

    // Forget the name remembered by a previous call. Without this, a rerun
    // whose first record equals the last name of the previous run would not
    // advance Count. " " can never equal a name because operator>> never
    // yields whitespace.
    strTmp = " ";

    while(finSen>>str_Name>>str_gender>>SenseID>>p)
    {
        if(str_Name != strTmp)
        {
            // A new word starts here: give it the next ID.
            Count++;
            strTmp = str_Name;
        }

        WordList.word.wordName = str_Name;
        WordList.word.senseID = SenseID;
        WordList.wordID = Count;
        WordList.wordCount = 0;
        WordList.Context_T = 0;
        WordList.Psk = 0;
        WordList.P_LAM = 0;
        vectWordList.push_back(WordList); // append the entry to the word list
    }

    sort(vectWordList.begin(),vectWordList.end());
}
void CMethod::ReadSentence()
{
fstream finSentence("hebing-修改.txt",ios::in);
string sentence;
vector<CWord>::iterator pPointer;
CWord ToSearch;
ReadSenseNo();
vector<int> vectTmp;
PPP p;
PPP pBefore;
PPP pAfter;
while(getline(finSentence, sentence))
{
//保存一个句子的所有词ID
SplitLine(sentence);
int i, j = vectword.size();
for(i = 0; i < j; i ++) //查出词汇的ID号码,并统计词频记录在表中
{
if(vectword[i].senseID == "-1")
{
vectWorID.push_back(-1);
pPointer = 0;
vectMid.push_back(pPointer);
}
else
{
ToSearch.word = vectword[i];
pPointer = lower_bound(vectWordList.begin(),vectWordList.end(),ToSearch);
if(pPointer != vectWordList.end()&&pPointer->word == vectword[i])
{
pPointer->wordCount++;
vectWorID.push_back(pPointer->wordID); //记录每个词的ID
vectMid.push_back(pPointer); //记录每个词在向量中的位置
}
else
{
pPointer = 0;
vectWorID.push_back(-1);
vectMid.push_back(pPointer);
} //找不到的词汇不处理
}
}
int a, b = vectword.size(); //
for(a = 0; a < b; a++)
{
p = vectMid[a];
if(p > 0)
{
pBefore = p - 1;
pAfter = p + 1;
if(!(pBefore->wordID != p->wordID && pAfter->wordID != p->wordID)) //如果无歧义,不考虑上下文
{
vectTmp = vectWorID; //保存一个句子的所有词ID
vectTmp.erase(vectTmp.begin()+a); //保存上下文词的ID
int h, c = vectTmp.size();
for(h = 0; h < c; h++)
{
CreateTree(p->Context_T,vectTmp[h]);
}
}
}
}
vectword.clear(); //处理完一句,清除该向量
vectMid.clear();
vectWorID.clear();
}
CountPara();
CountParaII();
vectWordList.clear();
}
// Splits one corpus line into tokens and appends them to vectword.
//
// A token has the form "name/tag" and is terminated by a single blank (or by
// the end of the line). For each token the text before the '/' becomes
// wordName and the text between the '/' and the terminating blank becomes
// senseID (kept as a string).
//
// The '/' is searched for starting at the second character of each token, so
// a token whose first character is '/' keeps that character as its name.
// Searching restarts at the next token rather than inside the previous tag,
// so a '/' inside a tag cannot produce bogus extra words, and scanning stops
// after a final token that has no trailing blank.
void CMethod::SplitLine(string sentence)
{
    Word word;
    string::size_type tokenBegin = 0;
    string::size_type searchFrom = 1;
    string::size_type slashPos;

    while((slashPos = sentence.find('/', searchFrom)) != string::npos)
    {
        word.wordName = sentence.substr(tokenBegin, slashPos - tokenBegin);

        const string::size_type tokenEnd = sentence.find(' ', slashPos);
        if(tokenEnd == string::npos)
        {
            // Last token on the line: the tag runs to the end of the string.
            word.senseID = sentence.substr(slashPos + 1);
            vectword.push_back(word);
            break;
        }

        word.senseID = sentence.substr(slashPos + 1, tokenEnd - slashPos - 1);
        vectword.push_back(word); // all tokens of the sentence

        tokenBegin = tokenEnd + 1;
        searchFrom = tokenBegin + 1;
    }
}
// Records one occurrence of context word ID in the binary search tree T.
//
// The tree is keyed on ContextWordID: smaller IDs go to ChildL, larger ones
// to ChildR. If a node for ID already exists its ContextWordCount is
// incremented; otherwise a new leaf with count 1 is attached at the place
// where the search ended (which is T itself when the tree is empty).
void CMethod::CreateTree(BTREE &T, int ID)
{
    // 'slot' always addresses the link currently being examined.
    BTREE *slot = &T;

    while(*slot != NULL)
    {
        if((*slot)->ContextWordID == ID)
        {
            (*slot)->ContextWordCount ++;
            return;
        }

        if((*slot)->ContextWordID > ID)
        {
            slot = &(*slot)->ChildL;
        }
        else
        {
            slot = &(*slot)->ChildR;
        }
    }

    // Reached an empty link: hang a fresh leaf there.
    *slot = new CContext;
    (*slot)->ContextWordID = ID;
    (*slot)->ContextWordCount = 1;
    (*slot)->ChildL = NULL;
    (*slot)->ChildR = NULL;
}
// Computes P_LAM and Psk for every entry of vectWordList.
//
// Entries are processed in runs of consecutive equal wordID (the senses of
// one word; this relies on such entries being adjacent in the list).
//
//   P_LAM = LAMDA / (wordCount + NN * LAMDA)              for every entry
//
//   Run with several senses, total count Sum, n entries:
//       Psk = (wordCount + LAM) / (Sum + n * LAM)          if Sum != 0
//       Psk = LAM                                           otherwise
//
//   Run with a single sense (unambiguous word):
//       Psk = 1 if the word was seen, LAM otherwise; its Context_T is reset
//       because context is not used for unambiguous words.
//
// An empty list is a no-op, and a list holding one entry is handled like any
// other single-sense run.
void CMethod::CountPara()
{
    const int total = vectWordList.size();
    int groupBegin = 0;

    while(groupBegin < total)
    {
        // Scan the run of entries sharing this word ID, summing their counts.
        const int groupID = vectWordList[groupBegin].wordID;
        int groupEnd = groupBegin;
        int Sum = 0;
        while(groupEnd < total && vectWordList[groupEnd].wordID == groupID)
        {
            vectWordList[groupEnd].P_LAM =
                (float)LAMDA / (float)(vectWordList[groupEnd].wordCount + NN * LAMDA);
            Sum += vectWordList[groupEnd].wordCount;
            groupEnd++;
        }

        const int n_count = groupEnd - groupBegin;
        float Psk;

        if(n_count == 1)
        {
            if(Sum != 0)
            {
                Psk = 1;
            }
            else
            {
                Psk = (float)LAM;
            }
            vectWordList[groupBegin].Context_T = 0; // no context for unambiguous words
            vectWordList[groupBegin].Psk = Psk;
        }
        else
        {
            for(int k = groupBegin; k < groupEnd; k++)
            {
                if(Sum != 0)
                {
                    Psk = (float)(vectWordList[k].wordCount + LAM)/(float)(Sum + n_count * LAM);
                }
                else
                {
                    Psk = (float)LAM;
                }
                vectWordList[k].Psk = Psk;
            }
        }

        groupBegin = groupEnd;
    }
}
// Writes the trained parameters to disk.
//
// "Pvs-hebing" (binary) receives, for every entry that has a context tree,
// the records produced by IntravTree(). "Psk-hebing.txt" (text) receives one
// line per entry:
//     wordID senseID Psk P_LAM begin end
// where begin/end are the write positions in "Pvs-hebing" before and after
// that entry's records, or "0 0" when the entry has no context tree.
void CMethod::CountParaII()
{
    const int entryCount = vectWordList.size();
    fstream foutPsk("Psk-hebing.txt", ios::out);
    fstream foutPvs("Pvs-hebing", ios::out|ios::binary);

    for(int idx = 0; idx < entryCount; idx++)
    {
        // Offsets stay 0 for entries without context data.
        long startPos = 0;
        long endPos = 0;

        if(vectWordList[idx].Context_T != NULL)
        {
            startPos = foutPvs.tellp();                               // position before this entry's records
            IntravTree(foutPvs, vectWordList[idx].Context_T, idx);    // dump the tree's Pvs records
            endPos = foutPvs.tellp();                                 // position after them
        }

        foutPsk<<vectWordList[idx].wordID<<" "<<vectWordList[idx].word.senseID
               <<" "<<vectWordList[idx].Psk<<" "<<vectWordList[idx].P_LAM
               <<" "<<startPos<<" "<<endPos<<endl;
    }
}
// Walks the context tree T in order (ChildL, node, ChildR) and writes one
// binary wordPVS record per node to fout.
//
//   fout - output stream receiving the raw wordPVS records
//   T    - (sub)tree to traverse; NULL ends the recursion
//   k    - index in vectWordList of the entry that owns the tree
//
// For each node:
//   Pvs = (ContextWordCount + LAMDA) / (wordCount of entry k + LAMDA * NN)
// with the entry's P_LAM substituted should the quotient come out as 0.
void CMethod::IntravTree(fstream & fout, BTREE & T, int k)
{
    if(T == NULL)
    {
        return;
    }

    IntravTree(fout, T->ChildL, k);

    const float smoothedCount = (float)(T->ContextWordCount + LAMDA);
    const float smoothedTotal = (float)(vectWordList[k].wordCount + LAMDA * NN);
    float probability = smoothedCount / smoothedTotal;
    if(probability == 0) // practically never happens
    {
        probability = vectWordList[k].P_LAM;
    }

    wordPVS record;
    record.IDw = T->ContextWordID;
    record.Pvs = probability;
    fout.write((char *) & record, sizeof(wordPVS)); // raw binary output

    IntravTree(fout, T->ChildR, k);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -