📄 annotator.cpp
字号:
m_statep[pprev][prev][cur][word.m_ne]++;
pprev = prev;
prev = cur;
cur = word.m_ne;
//m_statep[pprev][prev][cur]++;
}
}
m_progress->SetPos((int)((i + 1) * 100 / corpus.GetCount()));
}
m_tree->ToFile("worddata.txt");
CStdioFile file(gWorkingPath + "\\" + "statedata.txt", CFile::modeCreate|CFile::modeWrite);
CString str;
str.Format("%ld\r\n%ld\r\n", m_count[0], m_count[1]);
file.WriteString(str);
for (i = 0; i < 3; i++)
for (int j = 0; j < 3; j++)
for (int k = 0; k < 3; k++)
{
str.Format("%ld\r\n%ld\r\n", m_statep[i][j][k][0], m_statep[i][j][k][1]);
file.WriteString(str);
}
file.Close();
file.Open(gWorkingPath + "\\" + "posdata.txt", CFile::modeCreate | CFile::modeWrite);
for (i = 0; i < 45; i++)
{
str.Format("%ld\r\n%ld\r\n", m_poscount[i][0], m_poscount[i][1]);
file.WriteString(str);
}
file.Close();
delete m_tree;
m_dataIn = false;
};
// Resets the model counters and reloads any statistics previously saved under
// gWorkingPath.
//
// m_count, m_statep and m_poscount are zeroed first.  If statedata.txt can be
// opened, its values (one decimal integer per line, in the order read below)
// replace m_count and m_statep; posdata.txt is then read into m_poscount in the
// same way.  A file that is missing or ends early leaves the affected counters
// at zero.
void CAnnotator::Init()
{
	// Declared once because several loops share these indices; this compiles
	// under both the standard and the pre-standard (VC6) for-scope rules.
	int i, j, k;

	for (i = 0; i < 2; i++)
	{
		m_count[i] = 0;
		for (j = 0; j < 45; j++)
			m_poscount[j][i] = 0;
	}
	for (i = 0; i < 3; i++)
		for (j = 0; j < 3; j++)
			for (k = 0; k < 3; k++)
				m_statep[i][j][k][0] = m_statep[i][j][k][1] = 0;

	CStdioFile file;
	CString str;

	// Without statedata.txt there is nothing to restore; posdata.txt is only
	// consulted when statedata.txt was present.
	if (!file.Open(gWorkingPath + "\\statedata.txt", CFile::modeRead))
		return;
	for (i = 0; i < 2; i++)
		m_count[i] = file.ReadString(str) ? atoi(str) : 0;
	for (i = 0; i < 3; i++)
		for (j = 0; j < 3; j++)
			for (k = 0; k < 3; k++)
			{
				m_statep[i][j][k][0] = file.ReadString(str) ? atoi(str) : 0;
				m_statep[i][j][k][1] = file.ReadString(str) ? atoi(str) : 0;
			}
	file.Close();

	// The open result must be checked: reading from a file that failed to open
	// is invalid, and m_poscount is already zeroed for that case.
	if (!file.Open(gWorkingPath + "\\" + "posdata.txt", CFile::modeRead))
		return;
	for (i = 0; i < 45; i++)
	{
		m_poscount[i][0] = file.ReadString(str) ? atoi(str) : 0;
		m_poscount[i][1] = file.ReadString(str) ? atoi(str) : 0;
	}
	file.Close();
}
// Labels every word of every sentence in corpus as ORG or NONNE and returns
// the relabelled sentences in the same order.
//
// The model data is loaded through ReadData() on first use (tracked by
// m_dataIn); if that fails the string literal "No data file error" is thrown.
//
// Per sentence, a dynamic-programming (Viterbi-style) search is run over a
// lattice whose node j keeps, for each triple of the three most recent labels,
// the best log-score (m_prob) and the label that precedes the triple (m_prev).
// The best final triple is traced back to recover one label per word, and the
// result is post-processed by Eliminate().  m_progress is updated as a
// percentage of sentences completed.
CSentenceList CAnnotator::Annotate(CSentenceList corpus)
{
	CSentenceList sl;
	if (!m_dataIn)
	{
		// Thrown as a string literal; callers that handle this catch it as such.
		if (!ReadData())
			throw("No data file error");
		m_dataIn = true;
	}
	m_progress->SetPos(0);
	POSITION p1 = corpus.GetHeadPosition();
	for (int i = 0; i < corpus.GetCount(); i++)
	{
		// Shared by the loops below; declared once so the code compiles under
		// both standard and pre-standard (VC6) for-scope rules.
		int j, x, y, z;

		CSentence sentence;
		sentence = (CSentence)(corpus.GetNext(p1));

		// Build the lattice: slot 0 is an artificial start node whose
		// (S_BEGIN, S_BEGIN, S_BEGIN) score is 0, slots 1..n wrap the words.
		CObArray arr;
		POSITION p2 = sentence.GetHeadPosition();
		CStateWord *sw = new CStateWord();
		sw->m_prob[S_BEGIN][S_BEGIN][S_BEGIN] = 0;
		arr.Add(sw);
		for (j = 0; j < sentence.GetCount(); j++)
		{
			sw = new CStateWord((CWord)(sentence.GetNext(p2)));
			arr.Add(sw);
		}

		// Forward pass: extend every live triple (x, y, z) of the previous
		// node by ORG and by NONNE, keeping the best score per new triple.
		for (j = 1; j < arr.GetSize(); j++)
		{
			CStateWord* prev = (CStateWord*)arr.GetAt(j - 1);
			CStateWord* cur = (CStateWord*)arr.GetAt(j);
			for (x = 0; x < 3; x++)
				for (y = 0; y < 3; y++)
					for (z = 0; z < 3; z++)
					{
						const double base = prev->m_prob[x][y][z];
						if (base > MINPROB)
						{
							double prob = base + GetStateProb(x, y, z, ORG) + GetWordProb(cur->m_word, cur->m_pos, ORG, arr.GetSize());
							if (prob > cur->m_prob[y][z][ORG])
							{
								cur->m_prob[y][z][ORG] = prob;
								cur->m_prev[y][z][ORG] = x;
							}
							prob = base + GetStateProb(x, y, z, NONNE) + GetWordProb(cur->m_word, cur->m_pos, NONNE, arr.GetSize());
							if (prob > cur->m_prob[y][z][NONNE])
							{
								cur->m_prob[y][z][NONNE] = prob;
								cur->m_prev[y][z][NONNE] = x;
							}
						}
					}
		}

		// Pick the best-scoring triple at the last node.
		CStateWord* last = (CStateWord*)arr.GetAt(arr.GetSize() - 1);
		double maxp = MINPROB;
		int curx = 0, cury = 0, curz = 0;
		bool pathFound = false;
		for (x = 0; x < 3; x++)
			for (y = 0; y < 3; y++)
				for (z = 0; z < 3; z++)
				{
					if (last->m_prob[x][y][z] > maxp)
					{
						maxp = last->m_prob[x][y][z];
						curx = x;
						cury = y;
						curz = z;
						pathFound = true;
					}
				}

		// Trace back from the last word to the first (slot 0 is the start
		// node and produces no word).  If no triple scored above MINPROB there
		// are no back-pointers to follow, so every word falls back to NONNE
		// instead of indexing m_prev with indeterminate values.
		CSentence s;
		for (j = arr.GetSize() - 1; j >= 1; j--)
		{
			CStateWord* node = (CStateWord*)arr.GetAt(j);
			CWord w;
			w.m_word = node->m_word;
			w.m_pos = node->m_pos;
			if (pathFound)
			{
				w.m_ne = curz;
				int before = node->m_prev[curx][cury][curz];
				curz = cury;
				cury = curx;
				curx = before;
			}
			else
			{
				w.m_ne = NONNE;
			}
			s.AddHead(w);
		}

		// CObArray does not own its elements; release the lattice nodes
		// allocated above before the array goes out of scope.
		for (j = 0; j < arr.GetSize(); j++)
			delete (CStateWord*)arr.GetAt(j);
		arr.RemoveAll();

		Eliminate(s);
		sl.AddTail(s);
		// NOTE(review): this dumps the whole list after every sentence, which
		// looks like leftover debugging output - confirm before removing.
		sl.print("slprint.txt");
		m_progress->SetPos((int)((i + 1) * 100 / corpus.GetCount()));
	}
	return sl;
}
bool CAnnotator::ReadData()
{
m_data.RemoveAll();
CStdioFile file;
if (!file.Open(gWorkingPath + "\\worddata.txt", CFile::modeRead))
return false;
CString str;
while(file.ReadString(str))
{
CDict *d = new CDict();
*d = Extract(str);
m_data.Add(d);
}
file.Close();
if (!file.Open(gWorkingPath + "\\" + "statedata.txt", CFile::modeRead))
return false;
for (int i = 0; i < 2; i++)
{
file.ReadString(str);
m_count[i] = atoi(str.GetBuffer(0));
}
for (i = 0; i < 3; i++)
for (int j = 0; j < 3; j++)
for (int k = 0; k < 3; k++)
{
file.ReadString(str);
m_statep[i][j][k][0] = atoi(str.GetBuffer(0));
file.ReadString(str);
m_statep[i][j][k][1] = atoi(str.GetBuffer(0));
}
file.Close();
if (!file.Open(gWorkingPath + "\\" + "posdata.txt", CFile::modeRead))
return false;
for (i = 0; i < 45; i++)
{
file.ReadString(str);
m_poscount[i][0] = atoi(str.GetBuffer(0));
file.ReadString(str);
m_poscount[i][1] = atoi(str.GetBuffer(0));
}
file.Close();
return true;
}
// Log-probability of emitting label `next` after the label history (x, y, z).
//
// The probability is m_statep[x][y][z][next] divided by the sum of the ORG and
// NONNE counts for the same history.  MINPROB is returned when the history
// has no counts at all (which would otherwise be 0/0) or when the probability
// is below 1e-6.
double CAnnotator::GetStateProb(int x, int y, int z, int next)
{
	const double total = (double)m_statep[x][y][z][ORG] + (double)m_statep[x][y][z][NONNE];
	// An unseen history would give 0/0 = NaN, which compares false against
	// the threshold below and would leak out through log().
	if (total <= 0)
		return MINPROB;

	const double prob = (double)m_statep[x][y][z][next] / total;
	if (prob < 0.000001) return MINPROB;
	return log(prob);
}
// Log-probability of seeing `word` (with part-of-speech `pos`) under label `ne`.
//
//   - pos w or tt : never ORG   (MINPROB for ORG, 0 for anything else)
//   - pos nt      : always ORG  (0 for ORG, MINPROB for anything else)
//   - otherwise   : count of (word, pos) under `ne` from m_data divided by
//                   m_count[ne]; when the word is not in m_data the count for
//                   the part of speech (m_poscount[pos][ne]) is used instead.
//
// Words whose length is at most 2 have the probability scaled by 0.5 for ORG
// and by 1.5 otherwise.  MINPROB is returned when m_count[ne] is zero or when
// the resulting probability is below 1e-7.
//
// totalWordCount is accepted for interface compatibility and is not used.
double CAnnotator::GetWordProb(CString word, CODE pos, int ne, int totalWordCount)
{
	if (pos == w || pos == tt)
		return (ne == ORG)?MINPROB: 0;
	if (pos == nt)
		return (ne == ORG)?0 :MINPROB;

	// With no observations for this label the ratios below would be NaN or
	// +infinity, both of which slip past the small-probability floor.
	const double labelTotal = (double)m_count[ne];
	if (labelTotal <= 0)
		return MINPROB;

	double prob;
	int wordCount = GetWordCount(word, pos, ne, 0, m_data.GetSize() - 1);
	if (wordCount >= 0)
	{
		prob = wordCount / labelTotal;
		// Disabled experiment kept from the original: weighting by the
		// part-of-speech probability (m_poscount[pos][ne] / m_count[ne] * 0.8).
	}
	else
	{
		// Word not found in m_data (GetWordCount returned -1): back off to
		// the part-of-speech count.
		prob = m_poscount[pos][ne] / labelTotal;
	}

	// NOTE(review): GetLength() <= 2 presumably targets single double-byte
	// characters in an MBCS build - confirm.
	if (word.GetLength() <= 2)
		prob *= (ne == ORG)? 0.5: 1.5;
	if (prob < 0.0000001) return MINPROB;
	return log(prob);
}
// Binary-searches m_data[start..end] for the entry matching (word, pos) and
// returns that entry's m_count[ne], or -1 when the range holds no match.
//
// Ordering is defined by WordCompare(probe, entry): a positive result sends
// the search to the lower half of the range, a negative one to the upper half.
int CAnnotator::GetWordCount(CString word, CODE pos, int ne, int start, int end)
{
	// Key object handed to WordCompare; only the word and its tag are set.
	CDict probe;
	probe.m_word = word;
	probe.m_pos = pos;

	int lo = start;
	int hi = end;
	while (lo <= hi)
	{
		const int mid = (lo + hi) / 2;
		CDict* entry = (CDict*)m_data.GetAt(mid);
		const int order = WordCompare(probe, *entry);
		if (order == 0)
			return entry->m_count[ne];
		if (order > 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return -1;
}
// Post-processes the labels in s, demoting ORG words to NONNE where a run of
// consecutive ORG words does not look like an organisation name.  s is
// replaced by the corrected sentence.
//
// The sentence is scanned from tail to head.  Consecutive ORG words are
// collected in `orgs`; when the run ends it is resolved as follows:
//   1. a single-word run is demoted when the word was seen more than twice as
//      NONNE in m_data, or when its part of speech is not one of
//      n, Ag, an, ad, a, d, Dg, nt, nz;
//   2. a run whose only organisation-suffix word is its first word is
//      demoted entirely;
//   3. otherwise, if the run contains a suffix word, the words after the last
//      suffix word are demoted; if it contains none but does contain a verb,
//      the whole run is demoted.
// A NONNE sentinel is temporarily added at the head so that a run touching the
// start of the sentence is resolved too.
void CAnnotator::Eliminate(CSentence &s)
{
	CString orgstring[] = {"公司", "企业", "集团", "厂", "部门", "部", "中心", "社", "局", "处", "所"};
	// Derived from the array so the list and its length cannot drift apart.
	const int nOrgSuffix = sizeof(orgstring) / sizeof(orgstring[0]);

	CSentence ns;      // corrected sentence, built head-first
	CSentence orgs;    // current run of ORG words, in sentence order
	int status = S_BEGIN;
	int nOrgString = 0;
	bool hasVerb = false;

	CWord hw;
	hw.m_ne = NONNE;
	s.AddHead(hw);
	// Taken after the sentinel is added: for an empty sentence the tail
	// position would otherwise be NULL while the loop below still runs once.
	POSITION pos = s.GetTailPosition();

	for (int i = s.GetCount() - 1; i >= 0; i--)
	{
		CWord word = (CWord)s.GetPrev(pos);
		if (status == ORG && word.m_ne == NONNE)
		{
			// The ORG run that was being collected has just ended.
			if (orgs.GetCount() == 1)
			{
				CWord w = (CWord)orgs.GetHead();
				int count = GetWordCount(w.m_word, w.m_pos, NONNE, 0, m_data.GetSize() - 1);
				if (count > 2 || (w.m_pos != n && w.m_pos != Ag && w.m_pos != an && w.m_pos != ad
					&& w.m_pos != a && w.m_pos != d && w.m_pos != Dg && w.m_pos != nt && w.m_pos != nz))
					w.m_ne = NONNE;
				ns.AddHead(w);
			}
			else if (nOrgString == 1 && WordIn(orgs.GetHead().m_word, orgstring, nOrgSuffix))
			{
				POSITION p1 = orgs.GetTailPosition();
				for (int j = orgs.GetCount() - 1; j >= 0; j--)
				{
					CWord w = (CWord)orgs.GetPrev(p1);
					w.m_ne = NONNE;
					ns.AddHead(w);
				}
			}
			else
			{
				if (nOrgString > 0)
				{
					// Walk back from the end of the run, demoting words until
					// the last suffix word is reached.
					POSITION p1 = orgs.GetTailPosition();
					POSITION pp = p1;
					for (int j = orgs.GetCount() - 1; j >= 0; j--)
					{
						CWord w = (CWord)orgs.GetPrev(p1);
						if (WordIn(w.m_word, orgstring, nOrgSuffix))
							break;
						w.m_ne = NONNE;
						orgs.SetAt(pp, w);
						pp = p1;
					}
				}
				else
				{
					if (hasVerb)
					{
						POSITION p1 = orgs.GetTailPosition();
						POSITION pp = p1;
						for (int j = orgs.GetCount() - 1; j >= 0; j--)
						{
							CWord w = (CWord)orgs.GetPrev(p1);
							w.m_ne = NONNE;
							orgs.SetAt(pp, w);
							pp = p1;
						}
					}
				}
				// Move the (possibly relabelled) run into the result.
				POSITION p1 = orgs.GetTailPosition();
				for (int j = orgs.GetCount() - 1; j >= 0; j--)
				{
					CWord w = (CWord)orgs.GetPrev(p1);
					ns.AddHead(w);
				}
			}
			while (orgs.GetCount() > 0) orgs.RemoveHead();
			nOrgString = 0;
			hasVerb = false;
		}
		if (word.m_ne == ORG)
		{
			orgs.AddHead(word);
			if (WordIn(word.m_word, orgstring, nOrgSuffix)) nOrgString++;
			if (word.m_pos == v) hasVerb = true;
		}
		else
			ns.AddHead(word);
		status = word.m_ne;
	}
	// Drop the sentinel, which was the last word processed and so sits at the head.
	ns.RemoveHead();
	s = ns;
}
// Returns true when `word` ends with any of the first `number` entries of
// `wordlist`.
//
// The tail of the word is compared directly instead of testing the position
// returned by Find(): Find() reports the first occurrence, so a suffix that
// also appears earlier in the word was missed, and its -1 "not found" result
// matched the length difference whenever an entry was exactly one unit longer
// than the word.
// NOTE(review): Right() counts TCHARs, so in an MBCS build the comparison is
// byte-wise - confirm that is acceptable for the word lists in use.
bool CAnnotator::WordIn(CString word, CString wordlist[], int number)
{
	const int wordLen = word.GetLength();
	for (int i = 0; i < number; i++)
	{
		const int suffixLen = wordlist[i].GetLength();
		// An entry longer than the word cannot be its suffix.
		if (suffixLen > wordLen)
			continue;
		if (word.Right(suffixLen) == wordlist[i])
			return true;
	}
	return false;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -