📄 annotator.cpp
字号:
// Annotator.cpp: implementation of the CAnnotator class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
//#include "NETagger.h"
//#include "OrgTag.h"
#include "Annotator.h"
#include "math.h"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
extern CString gWorkingPath;
//////////////////////////////////////////////////////////////////////
//
// CTree
//
//////////////////////////////////////////////////////////////////////
// Creates a node with no children and with both the ORG and NONNE
// occurrence counters of its dictionary entry set to zero.
CTree::CTree()
{
    m_dict.m_count[NONNE] = 0;
    m_dict.m_count[ORG] = 0;
    m_right = NULL;
    m_left = NULL;
}
// Destroys the node together with both of its subtrees (left first,
// then right); each child's destructor recurses in turn.
CTree::~CTree()
{
    // Deleting a null pointer is a no-op, so no explicit guard is needed.
    delete m_left;
    delete m_right;
}
// Records one occurrence of `word` in the tree rooted at this node.
//
// Starting at this node, the word is compared against each visited node
// with WordCompare(node, word):
//   - 0        : the node already holds this entry; its counter indexed
//                by word.m_ne is incremented.
//   - negative : the search continues in (or a new node is attached to)
//                m_left.
//   - positive : the search continues in (or a new node is attached to)
//                m_right.
// A newly created node copies m_word and m_pos from `word` and starts with
// a count of 1 for word.m_ne.
void CTree::Insert(CWord word)
{
    CTree* node = this;
    for (;;)
    {
        const int order = WordCompare(node, word);
        if (order == 0)
        {
            node->m_dict.m_count[word.m_ne]++;
            return;
        }

        // Pick the child link to follow; a reference lets us attach in place.
        CTree*& slot = (order < 0) ? node->m_left : node->m_right;
        if (slot == NULL)
        {
            CTree* leaf = new CTree();
            leaf->m_dict.m_word = word.m_word;
            leaf->m_dict.m_pos = word.m_pos;
            leaf->m_dict.m_count[word.m_ne] = 1;
            slot = leaf;
            return;
        }
        node = slot;
    }
}
// Writes the tree to gWorkingPath\<filename>.
//
// If that file does not exist yet, the tree is dumped straight into it.
// If it already exists, the tree is dumped into gWorkingPath\temp.txt and
// then combined with the existing file through MergeFile("temp.txt", filename).
//
// The output file is opened with the throwing CStdioFile constructor (as
// MergeFile and print do), so a failed open is reported as a CFileException
// instead of being ignored and followed by writes to an unopened file.
void CTree::ToFile(CString filename)
{
    CString outPath = gWorkingPath + "\\" + filename;

    CFileFind find;
    const int found = find.FindFile(outPath);
    if (found)
        outPath = gWorkingPath + "\\temp.txt";  // dump aside, merge afterwards

    CStdioFile file(outPath, CFile::modeCreate | CFile::modeWrite);
    TraverseToFile(&file);
    file.Close();

    if (found)
        MergeFile("temp.txt", filename);
}
// Three-way comparison between the entry stored in `node` and `word`.
//
// The word strings are compared first with CString::Compare; if they
// differ, that result is returned. Otherwise the m_pos values decide:
// -1 when the node's m_pos is smaller, 0 when equal, 1 when larger.
//
// The function only reads from `node`.
int WordCompare(CTree* node, CWord word)
{
    const int result = node->m_dict.m_word.Compare(word.m_word);
    if (result != 0)
        return result;
    if (node->m_dict.m_pos < word.m_pos)
        return -1;
    // Must be an equality test: an assignment here would overwrite the
    // node's m_pos and report "equal" for every non-zero word.m_pos.
    if (node->m_dict.m_pos == word.m_pos)
        return 0;
    return 1;
}
// Writes this subtree to `file`: left subtree, then this node, then the
// right subtree.
//
// Each node with a non-empty word produces one line of the form
// "<word> <pos> <ORG count> <NONNE count>\n". Nodes whose word is empty
// are skipped.
void CTree::TraverseToFile(CStdioFile* file)
{
    if (m_left != NULL)
        m_left->TraverseToFile(file);

    if (m_dict.m_word.Compare("") != 0)
    {
        CString str;
        // A CString passed through "..." must be cast to LPCTSTR explicitly;
        // no implicit conversion happens for variadic arguments.
        str.Format("%s %d %d %d\n",
                   (LPCTSTR)m_dict.m_word,
                   m_dict.m_pos,
                   m_dict.m_count[ORG],
                   m_dict.m_count[NONNE]);
        file->WriteString(str);
    }

    if (m_right != NULL)
        m_right->TraverseToFile(file);
}
// Merges two dictionary files located in gWorkingPath into one.
//
// Both files are read line by line and each line is parsed with Extract.
// While both inputs still have lines, the current entries are compared
// with WordCompare(d1, d2):
//   - positive : the line from file1 is written and file1 advances;
//   - negative : the line from file2 is written and file2 advances;
//   - zero     : one line is written whose ORG and NONNE counts are the
//                sums of both entries, and both files advance.
// Whatever remains in either file is then copied through unchanged.
//
// The result is built in gWorkingPath\temp2.txt. Afterwards file1 and
// file2 are deleted, temp2.txt is copied to file2's path, and temp2.txt
// is deleted.
//
// Every record is terminated with "\n". The original tail loops wrote
// "\r\n", which a text-mode CStdioFile expands to "\r\r\n".
void CTree::MergeFile(CString file1, CString file2)
{
    const CString path1 = gWorkingPath + "\\" + file1;
    const CString path2 = gWorkingPath + "\\" + file2;
    const CString mergedPath = gWorkingPath + "\\temp2.txt";

    CStdioFile f1(path1, CFile::modeRead);
    CStdioFile f2(path2, CFile::modeRead);
    CStdioFile f3(mergedPath, CFile::modeCreate | CFile::modeWrite);

    CString str1, str2;
    CDict d1, d2;

    // neofN is non-zero while fileN still delivered a line ("not EOF").
    int neof1 = f1.ReadString(str1);
    int neof2 = f2.ReadString(str2);
    if (neof1) d1 = Extract(str1);
    if (neof2) d2 = Extract(str2);

    while (neof1 && neof2)
    {
        const int result = WordCompare(d1, d2);
        if (result > 0)
        {
            f3.WriteString(str1 + "\n");
            neof1 = f1.ReadString(str1);
            if (neof1) d1 = Extract(str1);
        }
        else if (result < 0)
        {
            f3.WriteString(str2 + "\n");
            neof2 = f2.ReadString(str2);
            if (neof2) d2 = Extract(str2);
        }
        else
        {
            // Same word and pos in both files: emit a single combined record.
            CString str;
            str.Format("%s %d %d %d\n",
                       (LPCTSTR)d1.m_word,
                       d1.m_pos,
                       d1.m_count[ORG] + d2.m_count[ORG],
                       d1.m_count[NONNE] + d2.m_count[NONNE]);
            f3.WriteString(str);
            neof1 = f1.ReadString(str1);
            neof2 = f2.ReadString(str2);
            if (neof1) d1 = Extract(str1);
            if (neof2) d2 = Extract(str2);
        }
    }

    // Drain whichever input still has lines.
    while (neof1)
    {
        f3.WriteString(str1 + "\n");
        neof1 = f1.ReadString(str1);
    }
    while (neof2)
    {
        f3.WriteString(str2 + "\n");
        neof2 = f2.ReadString(str2);
    }

    f1.Close();
    f2.Close();
    f3.Close();

    // Replace file2 with the merged result and remove the temporaries.
    DeleteFile(path1);
    DeleteFile(path2);
    CopyFile(mergedPath, path2, FALSE);
    DeleteFile(mergedPath);
}
////////////////////////////////////////////////////////////
//
// CDict
//
//////////////////////////////////////////////////////////////
// Copy constructor: duplicates the word, its pos value, and the ORG and
// NONNE counters of `d`.
CDict::CDict(const CDict& d)
{
    m_count[NONNE] = d.m_count[NONNE];
    m_count[ORG] = d.m_count[ORG];
    m_pos = d.m_pos;
    m_word = d.m_word;
}
// Copy assignment: overwrites the word, pos value, and the ORG and NONNE
// counters with those of `d`, and returns *this to allow chaining.
CDict& CDict::operator =(const CDict& d)
{
    m_count[NONNE] = d.m_count[NONNE];
    m_count[ORG] = d.m_count[ORG];
    m_pos = d.m_pos;
    m_word = d.m_word;
    return *this;
}
////////////////////////////////////////////////////////////////
//
// CSentence
//
/////////////////////////////////////////////////////////////////
// Copy constructor: appends every element of `s`, head to tail, to this
// (initially empty) sentence.
CSentence::CSentence(const CSentence& s)
{
    POSITION cursor = s.GetHeadPosition();
    // GetNext advances the cursor and leaves it NULL after the last element.
    while (cursor != NULL)
        this->AddTail(s.GetNext(cursor));
}
// Copy assignment: replaces the contents of this sentence with a copy of
// the elements of `s`, in the same order, and returns *this.
//
// Assigning a sentence to itself is a no-op. Without the guard, the list
// would be emptied before being copied from, losing all of its elements.
CSentence& CSentence::operator = (const CSentence& s)
{
    if (this == &s)
        return *this;

    this->RemoveAll();

    POSITION pos = s.GetHeadPosition();
    while (pos != NULL)
        this->AddTail(s.GetNext(pos));

    return *this;
}
//////////////////////////////////////////////////////////////////
//
// CSentenceList
//
//////////////////////////////////////////////////////////////////
// Copy constructor: appends every sentence of `s`, head to tail, to this
// (initially empty) list.
CSentenceList::CSentenceList(const CSentenceList& s)
{
    POSITION cursor = s.GetHeadPosition();
    // GetNext advances the cursor and leaves it NULL after the last element.
    while (cursor != NULL)
        this->AddTail(s.GetNext(cursor));
}
// Copy assignment: replaces the contents of this list with a copy of the
// sentences of `sl`, in the same order, and returns *this.
//
// Assigning a list to itself is a no-op. Without the guard, the list
// would be emptied before being copied from, losing all of its elements.
CSentenceList& CSentenceList::operator = (const CSentenceList& sl)
{
    if (this == &sl)
        return *this;

    this->RemoveAll();

    POSITION pos = sl.GetHeadPosition();
    while (pos != NULL)
        this->AddTail(sl.GetNext(pos));

    return *this;
}
// Dumps the list to gWorkingPath\<filename>, creating or truncating it.
//
// For each sentence a header "\nSentence <index>:\n" is written (index
// starting at 0), followed by one "<word> <pos> <ne>\n" line per word.
// The CStdioFile constructor throws a CFileException if the file cannot
// be opened.
void CSentenceList::print(CString filename)
{
    CStdioFile file(gWorkingPath + "\\" + filename, CFile::modeCreate | CFile::modeWrite);

    POSITION p1 = this->GetHeadPosition();
    for (int i = 0; p1 != NULL; i++)
    {
        // Bind by reference: printing does not need a deep copy of the sentence.
        const CSentence& s = this->GetNext(p1);

        CString header;
        header.Format("\nSentence %d:\n", i);
        file.WriteString(header);

        POSITION p2 = s.GetHeadPosition();
        while (p2 != NULL)
        {
            const CWord& w = s.GetNext(p2);
            CString line;
            // A CString passed through "..." must be cast to LPCTSTR explicitly.
            line.Format("%s %d %d\n", (LPCTSTR)w.m_word, w.m_pos, w.m_ne);
            file.WriteString(line);
        }
    }

    file.Close();
}
/////////////////////////////////////////////////////////////////
//
// Global
//
//////////////////////////////////////////////////////////////////
// Parses one dictionary line into a CDict.
//
// The line is split on single spaces into four fields, matching the
// "%s %d %d %d" records written elsewhere in this file:
//   field 1 -> m_word
//   field 2 -> m_pos   (converted with atoi and cast to CODE)
//   field 3 -> m_count[ORG]
//   field 4 -> m_count[NONNE] (everything after the third space)
//
// The substrings are handed to atoi through a read-only LPCTSTR view.
// GetBuffer() is meant for writable access and must be paired with
// ReleaseBuffer(), which is neither needed nor done here.
CDict Extract(CString str)
{
    CDict dict;

    int start = 0;
    int stop = str.Find(" ");
    dict.m_word = str.Mid(start, stop - start);

    start = stop + 1;
    stop = str.Find(" ", start);
    dict.m_pos = (CODE)atoi((LPCTSTR)str.Mid(start, stop - start));

    start = stop + 1;
    stop = str.Find(" ", start);
    dict.m_count[ORG] = atoi((LPCTSTR)str.Mid(start, stop - start));

    start = stop + 1;
    dict.m_count[NONNE] = atoi((LPCTSTR)str.Mid(start));

    return dict;
}
// Three-way comparison of two dictionary entries.
//
// The words are compared first with CString::Compare; a non-zero result is
// returned as is. For equal words the m_pos values decide: -1 if d1's is
// smaller, 0 if they are equal, 1 otherwise.
int WordCompare(CDict& d1, CDict& d2)
{
    const int byWord = d1.m_word.Compare(d2.m_word);
    if (byWord != 0)
        return byWord;

    if (d1.m_pos == d2.m_pos)
        return 0;
    return (d1.m_pos < d2.m_pos) ? -1 : 1;
}
/////////////////////////////////////////////////////////////////////
//
// CStateWord
//
////////////////////////////////////////////////////////////////////
// Default constructor: fills every cell of the 3x3x3 m_prob table with
// MINPROB and every cell of the 3x3x3 m_prev table with S_BEGIN.
CStateWord::CStateWord()
{
    const int kDim = 3;
    for (int a = 0; a < kDim; ++a)
    {
        for (int b = 0; b < kDim; ++b)
        {
            for (int c = 0; c < kDim; ++c)
            {
                m_prev[a][b][c] = S_BEGIN;
                m_prob[a][b][c] = MINPROB;
            }
        }
    }
}
// Builds a state word from `w`: fills every cell of the 3x3x3 m_prob table
// with MINPROB and of the 3x3x3 m_prev table with S_BEGIN, then copies
// m_word and m_pos from `w`.
CStateWord::CStateWord(CWord w)
{
    const int kDim = 3;
    for (int a = 0; a < kDim; ++a)
    {
        for (int b = 0; b < kDim; ++b)
        {
            for (int c = 0; c < kDim; ++c)
            {
                m_prev[a][b][c] = S_BEGIN;
                m_prob[a][b][c] = MINPROB;
            }
        }
    }

    m_pos = w.m_pos;
    m_word = w.m_word;
}
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
// Default constructor: no progress control, no tree yet, and the
// m_dataIn flag cleared.
CAnnotator::CAnnotator()
{
    m_progress = NULL;
    m_tree = NULL;
    m_dataIn = false;
}
// Constructs an annotator that stores `progress` in m_progress; the tree
// pointer starts out NULL and the m_dataIn flag cleared.
CAnnotator::CAnnotator(CProgressCtrl* progress)
{
    m_progress = progress;
    m_tree = NULL;
    m_dataIn = false;
}
// Releases the tree owned by this annotator.
//
// Train() allocates m_tree with new (deleting any previous tree first), so
// the annotator owns that object; without this delete the last tree built
// would be leaked. Deleting a NULL m_tree is a no-op.
//
// NOTE(review): assumes CAnnotator objects are not copied by value and that
// no other code deletes m_tree without resetting it -- confirm against the
// class declaration and the remaining methods.
CAnnotator::~CAnnotator()
{
    delete m_tree;
    m_tree = NULL;
}
void CAnnotator::Train(CSentenceList corpus)
{
Init();
if (m_tree != NULL) delete m_tree;
m_tree = new CTree();
m_progress->SetPos(0);
POSITION p1 = corpus.GetHeadPosition();
for (int i = 0; i < corpus.GetCount(); i++)
{
CSentence sentence;
sentence = (CSentence)(corpus.GetNext(p1));
int cur = S_BEGIN;
int prev = S_BEGIN;
int pprev = S_BEGIN;
POSITION p2 = sentence.GetHeadPosition();
for (int j = 0; j < sentence.GetCount(); j++)
{
CWord word = (CWord)(sentence.GetNext(p2));
if (word.m_pos != w && word.m_pos != tt)
{
m_tree->Insert(word);
m_count[word.m_ne]++;
m_poscount[word.m_pos][word.m_ne]++;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -