annotator.cpp

来自「分词词典软件」· C++ 代码 · 共 783 行 · 第 1/2 页
CPP
783 行
				m_statep[pprev][prev][cur][word.m_ne]++;
				pprev = prev;
				prev = cur;
				cur = word.m_ne;
				//m_statep[pprev][prev][cur]++;
			}
		}
		m_progress->SetPos((int)((i + 1) * 100 / corpus.GetCount()));
	}
	m_tree->ToFile("worddata.txt");
	CStdioFile file(gWorkingPath + "\\" + "statedata.txt", CFile::modeCreate|CFile::modeWrite);
	CString str;
	str.Format("%ld\r\n%ld\r\n", m_count[0], m_count[1]);
	file.WriteString(str);
	for (i = 0; i < 3; i++)
		for (int j = 0; j < 3; j++)
			for (int k = 0; k < 3; k++)
			{
				str.Format("%ld\r\n%ld\r\n", m_statep[i][j][k][0], m_statep[i][j][k][1]);
				file.WriteString(str);
			}
	file.Close();
	file.Open(gWorkingPath + "\\" + "posdata.txt", CFile::modeCreate | CFile::modeWrite);
	for (i = 0; i < 45; i++)
	{
		str.Format("%ld\r\n%ld\r\n", m_poscount[i][0], m_poscount[i][1]);
		file.WriteString(str);
	}
	file.Close();

	delete m_tree;
	m_dataIn = false;
};

void CAnnotator::Init()
{
	CStdioFile file;
	if (file.Open(gWorkingPath + "\\statedata.txt", CFile::modeRead))
	{
		CString str;
		for (int i = 0; i < 2; i++)
		{
			file.ReadString(str);
			m_count[i] = atoi(str.GetBuffer(0));
		}
		for (i = 0; i < 3; i++)
			for (int j = 0; j < 3; j++)
				for (int k = 0; k < 3; k++)
				{
					file.ReadString(str);
					m_statep[i][j][k][0] = atoi(str.GetBuffer(0));
					file.ReadString(str);
					m_statep[i][j][k][1] = atoi(str.GetBuffer(0));
				}
		file.Close();
		file.Open(gWorkingPath + "\\" + "posdata.txt", CFile::modeRead);
		for (i = 0; i < 45; i++)
		{
			file.ReadString(str);
			m_poscount[i][0] = atoi(str.GetBuffer(0));
			file.ReadString(str);
			m_poscount[i][1] = atoi(str.GetBuffer(0));
		}
		file.Close();
		
	}
	else
	{
		for (int i = 0; i < 2; i++)
		{
			m_count[i] = 0;
			for (int j = 0; j < 45; j++)
				m_poscount[j][i] = 0;
		}
		for (i = 0; i < 3; i++)
			for (int j = 0; j < 3; j++)
				for (int k = 0; k < 3; k++)
					m_statep[i][j][k][0] = m_statep[i][j][k][1] = 0;		
	}
}

CSentenceList CAnnotator::Annotate(CSentenceList corpus)
{
	CSentenceList sl;
	if (!m_dataIn)
	{
		if (!ReadData())
		{
			throw("No data file error");
			return sl;
		}
		m_dataIn = true;
	}

	m_progress->SetPos(0);
	POSITION p1 = corpus.GetHeadPosition();
	for (int i = 0; i < corpus.GetCount(); i++)
	{
		CSentence sentence;
		sentence = (CSentence)(corpus.GetNext(p1));
		CObArray arr;
		POSITION p2 = sentence.GetHeadPosition();
		CStateWord *sw = new CStateWord();
		sw->m_prob[S_BEGIN][S_BEGIN][S_BEGIN] = 0;
		arr.Add(sw);
		for (int j = 0; j < sentence.GetCount(); j++)
		{
			sw = new CStateWord((CWord)(sentence.GetNext(p2)));
			arr.Add(sw);
		}
		
		for (j = 1; j < arr.GetSize(); j++)
		{
			CStateWord* prev = (CStateWord*)arr.GetAt(j - 1);
			CStateWord* cur = (CStateWord*)arr.GetAt(j);
			for (int x = 0; x < 3; x++)
				for (int y = 0; y < 3; y++)
					for (int z = 0; z < 3; z++)
					{
						if (prev->m_prob[x][y][z] > MINPROB)
						{
							double prob = prev->m_prob[x][y][z] + GetStateProb(x, y, z, ORG) + GetWordProb(cur->m_word, cur->m_pos, ORG, arr.GetSize());
							if (prob > cur->m_prob[y][z][ORG])
							{
								cur->m_prob[y][z][ORG] = prob;
								cur->m_prev[y][z][ORG] = x;
							}
							prob = prev->m_prob[x][y][z] + GetStateProb(x, y, z, NONNE) + GetWordProb(cur->m_word, cur->m_pos, NONNE, arr.GetSize());
							if (prob > cur->m_prob[y][z][NONNE])
							{
								cur->m_prob[y][z][NONNE] = prob;
								cur->m_prev[y][z][NONNE] = x;
							}
						}
					}
			arr.SetAt(j, cur);
		}

		CStateWord* last = (CStateWord*)arr.GetAt(arr.GetSize() - 1);
		double maxp = MINPROB;
		int curx, cury, curz;
		for (int x = 0; x < 3; x++)
			for (int y = 0; y < 3; y++)
				for (int z = 0; z < 3; z++)
				{
					if (last->m_prob[x][y][z] > maxp)
					{
						maxp = last->m_prob[x][y][z];
						curx = x;
						cury = y;
						curz = z;
					}
				}

		CSentence s;
		for (j = arr.GetSize() - 2; j >= 0; j--)
		{
			CWord w;
			w.m_word = last->m_word;
			w.m_pos = last->m_pos;
			w.m_ne = curz;
			s.AddHead(w);
			int prev = last->m_prev[curx][cury][curz];
			curz = cury;
			cury = curx;
			curx = prev;
			last = (CStateWord*)arr.GetAt(j);
		}

		Eliminate(s);
		sl.AddTail(s);
		sl.print("slprint.txt");
		m_progress->SetPos((int)((i + 1) * 100 / corpus.GetCount()));


	}

	return sl;
		

}

bool CAnnotator::ReadData()
{
	m_data.RemoveAll();
	CStdioFile file;
	if (!file.Open(gWorkingPath + "\\worddata.txt", CFile::modeRead))
		return false;

	CString str;
	while(file.ReadString(str))
	{
		CDict *d = new CDict();
		*d = Extract(str);
		m_data.Add(d);
	}
	file.Close();

	if (!file.Open(gWorkingPath + "\\" + "statedata.txt", CFile::modeRead))
		return false;
	for (int i = 0; i < 2; i++)
	{
		file.ReadString(str);
		m_count[i] = atoi(str.GetBuffer(0));
	}
	for (i = 0; i < 3; i++)
		for (int j = 0; j < 3; j++)
			for (int k = 0; k < 3; k++)
			{
				file.ReadString(str);
				m_statep[i][j][k][0] = atoi(str.GetBuffer(0));
				file.ReadString(str);
				m_statep[i][j][k][1] = atoi(str.GetBuffer(0));
			}
	file.Close();

	if (!file.Open(gWorkingPath + "\\" + "posdata.txt", CFile::modeRead))
		return false;
	for (i = 0; i < 45; i++)
	{
		file.ReadString(str);
		m_poscount[i][0] = atoi(str.GetBuffer(0));
		file.ReadString(str);
		m_poscount[i][1] = atoi(str.GetBuffer(0));
	}
	file.Close();
	return true;
}

double CAnnotator::GetStateProb(int x, int y, int z, int next)
{
	double prob = (double)m_statep[x][y][z][next] / (m_statep[x][y][z][ORG] + m_statep[x][y][z][NONNE]);

	if (prob < 0.000001) return MINPROB;
	return log(prob);
};

double CAnnotator::GetWordProb(CString word, CODE pos, int ne, int totalWordCount)
{
	double prob;
	if (pos == w || pos == tt)
		return (ne == ORG)?MINPROB: 0;
	if (pos == nt)
		return (ne == ORG)?0 :MINPROB;

	int wordCount = GetWordCount(word, pos, ne, 0, m_data.GetSize() - 1);
	if (wordCount >= 0)
	{
		prob = wordCount / ((double)m_count[ne]);
		//double posprob = m_poscount[pos][ne] / ((double)m_count[ne]) * 0.8;
		//prob *= posprob;
	}else
	{
		prob = m_poscount[pos][ne] / ((double)m_count[ne]);
	}


	if (word.GetLength() <= 2)
		prob *= (ne == ORG)? 0.5: 1.5;

	if (prob < 0.0000001) return MINPROB;
	return log(prob);
};

int CAnnotator::GetWordCount(CString word, CODE pos, int ne, int start, int end)
{
	if (start > end)
		return -1;

	CDict d;
	d.m_word = word;
	d.m_pos = pos;
	int mid = (start + end) / 2;
	CDict* getd = (CDict*)m_data.GetAt(mid);
	int result = WordCompare(d, *getd);
	if (result == 0)
		return getd->m_count[ne];
	else if (result > 0)
		return GetWordCount(word, pos, ne, start, mid - 1);
	else 
		return GetWordCount(word, pos, ne, mid + 1, end);

}

void CAnnotator::Eliminate(CSentence &s)
{
	CString orgstring[] = {"公司", "企业", "集团", "厂", "部门", "部", "中心", "社", "局", "处", "所"};
	POSITION pos = s.GetTailPosition();
	CSentence ns;
	CSentence orgs;
	int status = S_BEGIN;
	int nOrgString = 0;
	bool hasVerb = false;
	CWord hw;
	hw.m_ne = NONNE;
	s.AddHead(hw);
	for (int i = s.GetCount() - 1; i >= 0; i--)
	{
		CWord word = (CWord)s.GetPrev(pos);
		if (status == ORG && word.m_ne == NONNE)
		{
			if (orgs.GetCount() == 1)
			{
				CWord w = (CWord)orgs.GetHead();
				int count = GetWordCount(w.m_word, w.m_pos, NONNE, 0, m_data.GetSize() - 1);
				if (count > 2 || (w.m_pos != n && w.m_pos != Ag && w.m_pos != an && w.m_pos != ad
					&& w.m_pos != a && w.m_pos != d && w.m_pos != Dg && w.m_pos != nt && w.m_pos != nz))
					w.m_ne = NONNE;
				ns.AddHead(w);
			}
			else if (nOrgString == 1 && WordIn(orgs.GetHead().m_word, orgstring ,11))
			{
				POSITION p1 = orgs.GetTailPosition();
				for (int j = orgs.GetCount() - 1; j >= 0; j--)
				{
					CWord w = (CWord)orgs.GetPrev(p1);
					w.m_ne = NONNE;
					ns.AddHead(w);
				}
			}
			else
			{
				if (nOrgString > 0)
				{
					POSITION p1 = orgs.GetTailPosition();
					POSITION pp = p1;
					for (int j = orgs.GetCount() - 1; j >= 0; j--)
					{
						CWord w = (CWord)orgs.GetPrev(p1);
						if (WordIn(w.m_word, orgstring ,11))
							break;
						w.m_ne = NONNE;
						orgs.SetAt(pp, w);
						pp = p1;
					}
				}
				else
				{
					if (hasVerb)
					{
						POSITION p1 = orgs.GetTailPosition();
						POSITION pp = p1;
						for (int j = orgs.GetCount() - 1; j >= 0; j--)
						{
							CWord w = (CWord)orgs.GetPrev(p1);
							w.m_ne = NONNE;
							orgs.SetAt(pp, w);
							pp = p1;
						}

					}
				}
				POSITION p1 = orgs.GetTailPosition();
				for (int j = orgs.GetCount() - 1; j >= 0; j--)
				{
					CWord w = (CWord)orgs.GetPrev(p1);
					ns.AddHead(w);
				}


			}
			while (orgs.GetCount() > 0) orgs.RemoveHead();
			nOrgString = 0;
			hasVerb = false;
		}

		if (word.m_ne == ORG)
		{
			orgs.AddHead(word);
			if (WordIn(word.m_word, orgstring ,11)) nOrgString++;
			if (word.m_pos == v) hasVerb = true;
		}
		else
			ns.AddHead(word);
		
		status = word.m_ne;
	}
	ns.RemoveHead();
	s = ns;
}

bool CAnnotator::WordIn(CString word, CString wordlist[], int number)
{
	for (int i = 0; i < number; i++)
	{
		if (word.Find(wordlist[i]) == word.GetLength() - wordlist[i].GetLength())
			return true;
	}
	return false;
}
annotator.cpp - 源码说明

本页面展示了「分词词典软件」中的 annotator.cpp 源码文件，采用 C++ 编程语言编写，共 783 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与分相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?