📄 annotator.cpp

📁 分词词典软件
💻 CPP
📖 第 1 页 / 共 2 页
字号:
12 下一页
// Annotator.cpp: implementation of the CAnnotator class.
//
//////////////////////////////////////////////////////////////////////

#include "stdafx.h"
//#include "NETagger.h"
//#include "OrgTag.h"
#include "Annotator.h"
#include "math.h"

#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif

extern CString gWorkingPath;

//////////////////////////////////////////////////////////////////////
//
// CTree
//
//////////////////////////////////////////////////////////////////////

// Creates a node with no children and both per-category counters
// (ORG and NONNE) set to zero.
CTree::CTree()
{
	m_left = m_right = NULL;
	m_dict.m_count[NONNE] = 0;
	m_dict.m_count[ORG] = 0;
}

// Releases both subtrees; each child's destructor in turn releases its own
// children.
CTree::~CTree()
{
	// delete on a null pointer is a no-op, so no explicit checks are needed.
	delete m_left;
	delete m_right;
}

// Adds one occurrence of `word` to the subtree rooted at this node.
//
// If this node already holds the same entry (WordCompare returns 0), the
// counter for the word's category (word.m_ne) is incremented. Otherwise the
// word is passed down: to the left child when WordCompare(this, word) is
// negative, to the right child when it is positive. A new leaf with a count
// of 1 is created when the chosen child does not exist yet.
void CTree::Insert(CWord word)
{
	const int order = WordCompare(this, word);
	if (order == 0)
	{
		m_dict.m_count[word.m_ne]++;
		return;
	}

	// Pick the child link to descend into (or to fill in).
	CTree*& slot = (order < 0) ? m_left : m_right;
	if (slot != NULL)
	{
		slot->Insert(word);
		return;
	}

	CTree* leaf = new CTree();
	leaf->m_dict.m_word = word.m_word;
	leaf->m_dict.m_pos = word.m_pos;
	leaf->m_dict.m_count[word.m_ne] = 1;
	slot = leaf;
}

// Writes the tree to `filename` inside gWorkingPath.
//
// If `filename` does not exist yet, the tree is written straight to it.
// If it already exists, the tree is written to "temp.txt" instead and then
// merged into the existing file via MergeFile("temp.txt", filename).
void CTree::ToFile(CString filename)
{
	CFileFind finder;
	const int exists = finder.FindFile(gWorkingPath + "\\" + filename);

	CStdioFile out;
	if (!exists)
		out.Open(gWorkingPath + "\\" + filename, CFile::modeCreate|CFile::modeWrite);
	else
		out.Open(gWorkingPath + "\\temp.txt", CFile::modeCreate|CFile::modeWrite);

	TraverseToFile(&out);
	out.Close();

	if (exists)
		MergeFile("temp.txt", filename);
}

// Three-way comparison of the entry stored in `node` against `word`.
//
// Entries are ordered first by their word string (CString::Compare) and,
// for equal strings, by part-of-speech code.
//
// Returns a negative value if the node's entry orders before `word`,
// 0 if word and part of speech are both equal, a positive value otherwise.
// The node is not modified.
int WordCompare(CTree* node, CWord word)
{
	int result = node->m_dict.m_word.Compare(word.m_word);
	if (result != 0)
		return result;
	if (node->m_dict.m_pos < word.m_pos)
		return -1;
	// Must be a comparison: the previous `=` overwrote the node's part of
	// speech with the word's and reported "equal" for every non-zero code.
	if (node->m_dict.m_pos == word.m_pos)
		return 0;
	return 1;
}

// Writes the subtree rooted at this node to `file`, one entry per line, in
// the order left subtree, this node, right subtree.
//
// Each line has the form "<word> <pos> <ORG count> <NONNE count>". A node
// whose word is empty is skipped, but its children are still visited.
void CTree::TraverseToFile(CStdioFile* file)
{
	if (m_left != NULL)
		m_left->TraverseToFile(file);

	if (!m_dict.m_word.IsEmpty())
	{
		CString str;
		// A CString passed through "..." must be cast to LPCTSTR explicitly;
		// passing the object itself to a %s specifier is not supported.
		str.Format("%s %d %d %d\n",
			static_cast<LPCTSTR>(m_dict.m_word),
			m_dict.m_pos,
			m_dict.m_count[ORG],
			m_dict.m_count[NONNE]);
		file->WriteString(str);
	}

	if (m_right != NULL)
		m_right->TraverseToFile(file);
}

// Merges two dictionary files located in gWorkingPath and stores the result
// under the name `file2`.
//
// Both files are read line by line and parsed with Extract(). At each step
// the two current entries are compared with WordCompare(): the entry that
// compares greater is written first, and entries that compare equal are
// written once with their ORG and NONNE counts summed. Whatever remains in
// either file after the other is exhausted is copied through unchanged.
// NOTE(review): this assumes both inputs are already sorted in the same
// (greater-first) order -- confirm against the code that produces them.
//
// The merged output goes to "temp2.txt"; afterwards `file1` and `file2` are
// deleted, "temp2.txt" is copied to `file2`, and "temp2.txt" is deleted.
void CTree::MergeFile(CString file1, CString file2)
{
	CStdioFile f1(gWorkingPath + "\\" + file1, CFile::modeRead);
	CStdioFile f2(gWorkingPath + "\\" + file2, CFile::modeRead);
	CStdioFile f3(gWorkingPath + "\\temp2.txt", CFile::modeCreate|CFile::modeWrite);
	CString str1, str2;
	CDict d1, d2;
	int neof1 = f1.ReadString(str1);
	int neof2 = f2.ReadString(str2);
	if (neof1) d1 = Extract(str1);
	if (neof2) d2 = Extract(str2);

	while (neof1 && neof2)
	{
		int result = WordCompare(d1, d2);
		if (result > 0)
		{
			// Entry from file1 compares greater: emit it and advance file1.
			f3.WriteString(str1 + "\n");
			neof1 = f1.ReadString(str1);
			if (neof1) d1 = Extract(str1);
		}
		else if (result < 0)
		{
			// Entry from file2 compares greater: emit it and advance file2.
			f3.WriteString(str2 + "\n");
			neof2 = f2.ReadString(str2);
			if (neof2) d2 = Extract(str2);
		}
		else
		{
			// Same word and part of speech in both files: sum the counts.
			CString str;
			// CString must be cast to LPCTSTR when passed through "...".
			str.Format("%s %d %d %d\n",
				static_cast<LPCTSTR>(d1.m_word),
				d1.m_pos,
				d1.m_count[ORG] + d2.m_count[ORG],
				d1.m_count[NONNE] + d2.m_count[NONNE]);
			f3.WriteString(str);
			neof1 = f1.ReadString(str1);
			neof2 = f2.ReadString(str2);
			if (neof1) d1 = Extract(str1);
			if (neof2) d2 = Extract(str2);
		}
	}

	// Copy the remainder of whichever file still has lines. The files are
	// opened in text mode, which already expands "\n" to CR-LF on write, so
	// only "\n" is appended here (appending "\r\n" would emit CR CR LF and
	// disagree with the lines written in the loop above).
	while (neof1)
	{
		f3.WriteString(str1 + "\n");
		neof1 = f1.ReadString(str1);
	}
	while (neof2)
	{
		f3.WriteString(str2 + "\n");
		neof2 = f2.ReadString(str2);
	}

	f1.Close();
	f2.Close();
	f3.Close();
	DeleteFile(gWorkingPath + "\\" + file1);
	DeleteFile(gWorkingPath + "\\" + file2);
	CopyFile(gWorkingPath + "\\temp2.txt", gWorkingPath + "\\" + file2, false);
	DeleteFile(gWorkingPath + "\\temp2.txt");
}


////////////////////////////////////////////////////////////
//
// CDict
//
//////////////////////////////////////////////////////////////
// Copy constructor: takes over the word, part-of-speech code and the ORG and
// NONNE counters of `d` by reusing the assignment operator.
CDict::CDict(const CDict& d)
{
	*this = d;
}

// Copies the word, part-of-speech code and the ORG and NONNE counters from
// `d` into this entry and returns this entry.
CDict& CDict::operator =(const CDict& d)
{
	m_count[NONNE] = d.m_count[NONNE];
	m_count[ORG] = d.m_count[ORG];
	m_pos = d.m_pos;
	m_word = d.m_word;
	return *this;
}

////////////////////////////////////////////////////////////////
//
// CSentence
//
/////////////////////////////////////////////////////////////////


// Copy constructor: appends every element of `s`, head to tail, to this
// (initially empty) sentence.
CSentence::CSentence(const CSentence& s)
{
	POSITION cursor = s.GetHeadPosition();
	// GetNext() advances the cursor and leaves it NULL after the last element.
	while (cursor != NULL)
		AddTail(s.GetNext(cursor));
}

// Replaces the contents of this sentence with a copy of the elements of `s`,
// in the same order, and returns this sentence.
CSentence& CSentence::operator = (const CSentence& s)
{
	// Self-assignment guard: emptying this list first would also empty the
	// source, so `x = x` used to discard every word in the sentence.
	if (this == &s)
		return *this;

	while (this->GetCount() > 0)
		this->RemoveHead();

	POSITION pos = s.GetHeadPosition();
	// GetNext() advances pos and leaves it NULL after the last element.
	while (pos != NULL)
		this->AddTail(s.GetNext(pos));
	return *this;
}


//////////////////////////////////////////////////////////////////
//
// CSentenceList
//
//////////////////////////////////////////////////////////////////

// Copy constructor: appends every sentence of `s`, head to tail, to this
// (initially empty) list.
CSentenceList::CSentenceList(const CSentenceList& s)
{
	POSITION cursor = s.GetHeadPosition();
	// GetNext() advances the cursor and leaves it NULL after the last element.
	while (cursor != NULL)
		AddTail(s.GetNext(cursor));
}

// Replaces the contents of this list with a copy of the sentences of `sl`,
// in the same order, and returns this list.
CSentenceList& CSentenceList::operator = (const CSentenceList& sl)
{
	// Self-assignment guard: emptying this list first would also empty the
	// source, so `x = x` used to discard every sentence.
	if (this == &sl)
		return *this;

	while (this->GetCount() > 0)
		this->RemoveHead();

	POSITION pos = sl.GetHeadPosition();
	// GetNext() advances pos and leaves it NULL after the last element.
	while (pos != NULL)
		this->AddTail(sl.GetNext(pos));
	return *this;
}

// Dumps the whole list to the text file `filename` inside gWorkingPath,
// creating or overwriting it.
//
// For every sentence a header "Sentence <index>:" (index starting at 0,
// preceded by a blank line) is written, followed by one line per word of the
// form "<word> <pos> <ne>".
void CSentenceList::print(CString filename)
{
	CStdioFile file(gWorkingPath + "\\" + filename, CFile::modeCreate | CFile::modeWrite);
	POSITION p1 = this->GetHeadPosition();
	for (int i = 0; i < this->GetCount(); i++)
	{
		// Bind by const reference: the sentence is only read, so there is no
		// need to copy the whole list of words for every iteration.
		const CSentence& s = this->GetNext(p1);
		POSITION p2 = s.GetHeadPosition();
		CString header;
		header.Format("\nSentence %d:\n", i);
		file.WriteString(header);
		for (int j = 0; j < s.GetCount(); j++)
		{
			const CWord& w = s.GetNext(p2);
			CString line;
			// CString must be cast to LPCTSTR when passed through "...".
			line.Format("%s %d %d\n", static_cast<LPCTSTR>(w.m_word), w.m_pos, w.m_ne);
			file.WriteString(line);
		}
	}
	file.Close();
}

/////////////////////////////////////////////////////////////////
// 
// Global
//
//////////////////////////////////////////////////////////////////

// Parses one dictionary line of the form
//     "<word> <pos> <ORG count> <NONNE count>"
// (fields separated by single spaces) into a CDict.
//
// The numeric fields are converted with atoi(); the part-of-speech value is
// cast to CODE. No validation of the line layout is performed.
CDict Extract(CString str)
{
	CDict entry;

	// Field 1: the word, up to the first space.
	int start = 0;
	int stop = str.Find(" ");
	entry.m_word = str.Mid(start, stop - start);

	// Field 2: part-of-speech code.
	start = stop + 1;
	stop = str.Find(" ", start);
	CString posField = str.Mid(start, stop - start);
	entry.m_pos = (CODE)atoi(posField);

	// Field 3: ORG count.
	start = stop + 1;
	stop = str.Find(" ", start);
	CString orgField = str.Mid(start, stop - start);
	entry.m_count[ORG] = atoi(orgField);

	// Field 4: NONNE count, everything after the third space.
	start = stop + 1;
	CString nonneField = str.Mid(start);
	entry.m_count[NONNE] = atoi(nonneField);

	return entry;
}

// Three-way comparison of two dictionary entries: first by word string
// (CString::Compare), then, for equal strings, by part-of-speech code.
//
// Returns a negative value if d1 orders before d2, 0 if word and part of
// speech are both equal, and a positive value otherwise.
int WordCompare(CDict& d1, CDict& d2)
{
	const int byWord = d1.m_word.Compare(d2.m_word);
	if (byWord != 0)
		return byWord;

	if (d1.m_pos == d2.m_pos)
		return 0;
	return (d1.m_pos < d2.m_pos) ? -1 : 1;
}


/////////////////////////////////////////////////////////////////////
//
// CStateWord
//
////////////////////////////////////////////////////////////////////

// Default constructor: fills every cell of the 3x3x3 tables with its initial
// value -- MINPROB for m_prob and S_BEGIN for m_prev.
CStateWord::CStateWord()
{
	for (int a = 0; a < 3; ++a)
	{
		for (int b = 0; b < 3; ++b)
		{
			for (int c = 0; c < 3; ++c)
			{
				m_prev[a][b][c] = S_BEGIN;
				m_prob[a][b][c] = MINPROB;
			}
		}
	}
}

// Builds a state word from `w`: copies its word and part-of-speech code and
// fills every cell of the 3x3x3 tables with its initial value -- MINPROB for
// m_prob and S_BEGIN for m_prev.
CStateWord::CStateWord(CWord w)
{
	for (int a = 0; a < 3; ++a)
	{
		for (int b = 0; b < 3; ++b)
		{
			for (int c = 0; c < 3; ++c)
			{
				m_prev[a][b][c] = S_BEGIN;
				m_prob[a][b][c] = MINPROB;
			}
		}
	}

	m_pos = w.m_pos;
	m_word = w.m_word;
}

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Default constructor: no tree, no progress control, and the data-in flag
// cleared.
CAnnotator::CAnnotator()
{
	m_progress = NULL;
	m_tree = NULL;
	m_dataIn = false;
}

// Constructs an annotator that stores the given progress control pointer;
// the tree starts out NULL and the data-in flag cleared.
CAnnotator::CAnnotator(CProgressCtrl* progress)
{
	m_progress = progress;
	m_tree = NULL;
	m_dataIn = false;
}

// Releases the word tree owned by this annotator.
//
// m_tree is allocated with new in Train() and was previously never freed
// when the annotator was destroyed. Deleting a NULL pointer is a no-op, so
// an annotator that was never trained is handled as well.
// m_progress is handed in through the constructor and is not deleted here.
CAnnotator::~CAnnotator()
{
	delete m_tree;
	m_tree = NULL;
}

void CAnnotator::Train(CSentenceList corpus)
{
	Init();
	if (m_tree != NULL) delete m_tree;
	m_tree = new CTree();
	m_progress->SetPos(0);

	POSITION p1 = corpus.GetHeadPosition();
	for (int i = 0; i < corpus.GetCount(); i++)
	{
		CSentence sentence;
		sentence = (CSentence)(corpus.GetNext(p1));
		int cur = S_BEGIN;
		int prev = S_BEGIN;
		int pprev = S_BEGIN;
		POSITION p2 = sentence.GetHeadPosition();
		for (int j = 0; j < sentence.GetCount(); j++)
		{
			CWord word = (CWord)(sentence.GetNext(p2));
			if (word.m_pos != w && word.m_pos != tt)
			{
				m_tree->Insert(word);
				m_count[word.m_ne]++;
				m_poscount[word.m_pos][word.m_ne]++;
12 下一页
💿 文件大小 2293 K
👤 上传用户 xujinliner
📂 所属分类多国语言处理
🏷️ 相关标签

#分 #词典 #软件
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -