⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 unit1.cpp

📁 基于java的一个分词程序 速度比较快 精确度比较高
💻 CPP
字号:
//#include <iostream>
#include <fstream>
#include <string>
#include <vector>

using namespace std;
//---------------------------------------------------------------------------
// Output stream deliberately named `cout`: the #include of <iostream> is
// commented out in this file, so this identifier refers to this file stream
// and every `cout << ...` in this file writes to output.txt.
ofstream cout("output.txt");

// Byte ranges used to recognise and index a two-byte character: IsCC tests a
// lead byte against [START1, END1), and the HASH table is dimensioned
// (END1 - START1) x (END2 - START2) for lead/trail bytes.
// NOTE(review): these ranges look like the GB2312 hanzi area - confirm.
const int START1 = 0XB0, START2 = 0XA1, END1 = 0XF8, END2 = 0XFF;
// Size of the char buffers used when reading dictionary lines; also passed as
// the maximum count to substr when slicing the tail of a dictionary line.
const int MAXWORDLEN = 48;

// Node for the third and later characters of a dictionary word.
// Nodes at the same depth are chained through L; R points to the first node
// of the next depth (the characters that may follow this one).
struct ThirdPlusRec
{
    string Key;          // the two-byte character stored in this node
    bool IsWord;         // true when the path ending at this node is a complete word
    ThirdPlusRec *L;     // next sibling at the same depth (0 = end of chain)
    ThirdPlusRec *R;     // first node of the following depth (0 = none)

    ThirdPlusRec(string key = "", bool word = false,
                 ThirdPlusRec* sibling = 0, ThirdPlusRec* next = 0)
        : Key(key), IsWord(word), L(sibling), R(next)
    {
    }
};

// Record for the second character of a dictionary word, stored under the
// head record of the word's first character.
struct SecondRec
{
    string Key;            // the second two-byte character
    bool IsWord;           // true when head + Key is itself a complete word
    ThirdPlusRec *Child;   // first node for the third character (0 = none)

    SecondRec(string key = "", bool word = false, ThirdPlusRec* child = 0)
        : Key(key), IsWord(word), Child(child)
    {
    }
};

// Top-level dictionary entry: one leading character plus all second
// characters that follow it in some dictionary word.
struct HeadRec
{
    string Key;             // first two bytes of the head line (set in LoadDictionary)
    vector<SecondRec> Sec;  // second-character records; BinarySearch expects ascending Key order
};
//---------------------------------------------------------------------------
class Dictionary
{
    int HASH[END1 - START1][END2 - START2];
    vector<HeadRec> Dict;
    ifstream fin;

    string GetLine();
    int GetNumber();
    void LoadDictionary();
    void ProcessThird(string s, ThirdPlusRec* &Child);
    void PrintThird(string s, ThirdPlusRec* Child);
    void SkipNonChinese(string src, vector<string> &dest, unsigned &start, unsigned &end);
    int BinarySearch(unsigned x, string Sec);
    ThirdPlusRec* SeqSearch(ThirdPlusRec* p, string cc);
public:
    Dictionary(string FileName);
    void PrintDict();
    void Segment(string src, vector<string> &dest);
    void SegFileToFile(string FileNameSrc, string FileNameDest);
};
//---------------------------------------------------------------------------
// Reads the whole of FileNameSrc, segments it, and writes one segment per
// line to FileNameDest.
//
// Source lines are concatenated without their line terminators before
// segmentation.  Lines are read into a std::string so that a line of any
// length is accepted; a fixed-size char buffer would set failbit on a long
// line and silently drop the rest of the file.
void Dictionary::SegFileToFile(string FileNameSrc, string FileNameDest)
{
    ifstream SrcFile(FileNameSrc.c_str());
    string text;
    string line;
    while (getline(SrcFile, line))
        text += line;

    vector<string> dest;
    Segment(text, dest);

    ofstream DestFile(FileNameDest.c_str());
    for (unsigned i = 0; i < dest.size(); i++)
        DestFile << dest[i] << '\n';   // the stream is flushed when it is closed
}

// Returns the value of c as an unsigned byte (0..255), independent of
// whether plain char is signed on the platform.
unsigned CharToInt(char c)
{
    const unsigned char raw = static_cast<unsigned char>(c);
    return raw;
}

// True when c, read as an unsigned byte, lies in [START1, END1), i.e. it can
// be the lead byte of a character covered by the dictionary hash table.
bool IsCC(char c)
{
    const unsigned code = CharToInt(c);
    if (code < START1)
        return false;
    return code < END1;
}

// True when c is a single-byte character, i.e. its unsigned value is below 0x80.
bool IsEC(char c)
{
    return CharToInt(c) <= 0x7F;
}

// Advances EndPos over everything that is not a dictionary lead byte and, if
// anything was skipped, emits the skipped run as one segment.
//
// src      - text being segmented
// dest     - receives src[StartPos, EndPos) when that range is non-empty
// StartPos - in: start of the run; out: moved to EndPos when a run was emitted
// EndPos   - in: scan position; out: first position holding an IsCC byte, or a
//            position at/after the end of src
void Dictionary::SkipNonChinese(string src, vector<string> &dest,
                                unsigned &StartPos, unsigned &EndPos)
{
    const unsigned total = src.length();
    for (;;)
    {
        if (EndPos >= total)
            break;
        const char ch = src[EndPos];
        if (IsCC(ch))
            break;
        // Single-byte characters take one position, all others take two.
        EndPos += IsEC(ch) ? 1 : 2;
    }

    if (EndPos <= StartPos)
        return;
    dest.push_back(src.substr(StartPos, EndPos - StartPos));
    StartPos = EndPos;
}

int Dictionary::BinarySearch(unsigned x, string Sec)
{
    int L = 0, R = Dict[x].Sec.size() - 1;
    while (L <= R)
    {
        int M = (L + R) / 2;
        if (Dict[x].Sec[M].Key == Sec)
            return M;
        else if (Dict[x].Sec[M].Key < Sec)
            L = M + 1;
        else
            R = M - 1;
    }
    return -1;
}

// Walks the sibling chain starting at p (linked through L) and returns the
// first node whose Key equals cc, or 0 when the chain holds no such node.
ThirdPlusRec* Dictionary::SeqSearch(ThirdPlusRec* p, string cc)
{
    for (ThirdPlusRec* node = p; node != 0; node = node->L)
    {
        if (node->Key == cc)
            return node;
    }
    return 0;
}

void Dictionary::Segment(string src, vector<string> &dest)
{
    unsigned StrLen = src.length();
    unsigned StartPos = 0, EndPos;
    while (StartPos < StrLen)
    {
        EndPos = StartPos;
        SkipNonChinese(src, dest, StartPos, EndPos);
        if (StartPos >= StrLen) return;

        unsigned SegLen = 2;
        string HeadCC = src.substr(StartPos, 2);
        int HeadIndex = HASH[CharToInt(HeadCC[0]) - START1][CharToInt(HeadCC[1]) - START2];
        if (HeadIndex >= 0)
        {
            string SecCC = src.substr(StartPos + 2, 2);
            if (SecCC.length() > 0 && IsCC(SecCC[0]))
            {
                int B2 = BinarySearch(HeadIndex, SecCC);
                if (B2 >= 0)    // 双字部分存在
                {
                    if (Dict[HeadIndex].Sec[B2].IsWord) //是双字词
                        SegLen += 2;
                    EndPos = StartPos + 4;
                    ThirdPlusRec *p = Dict[HeadIndex].Sec[B2].Child;
                    while (EndPos < StrLen && (p = SeqSearch(p, src.substr(EndPos, 2))) != 0)
                    {
                        EndPos += 2;
                        if (p->IsWord)
                            SegLen = EndPos - StartPos;
                        p = p->R;
                    }
                }
            }
        }
        dest.push_back(src.substr(StartPos, SegLen));
        StartPos += SegLen;
    }
}

void Dictionary::PrintThird(string s, ThirdPlusRec* Child)
{
    for (ThirdPlusRec* p = Child; p != 0; p = p->L)
    {
        string t = s + p->Key;
        if (p->IsWord)
            cout << t << endl;
        if (p->R != 0)
            PrintThird(t, p->R);
    }
}

void Dictionary::PrintDict()
{
    for (unsigned i = 0; i < Dict.size(); i++)
    {
        cout << Dict[i].Key << endl;
        int n = Dict[i].Sec.size();
        cout << n << endl;
        for (int j = 0; j < n; j++)
        {
            string s = Dict[i].Key + Dict[i].Sec[j].Key;
            if (Dict[i].Sec[j].IsWord)
                cout << s << endl;
            if (Dict[i].Sec[j].Child != 0)
                PrintThird(s, Dict[i].Sec[j].Child);
        }
    }
}

// Opens the dictionary file FileName, marks every HASH slot as empty (-1),
// and loads the dictionary.
Dictionary::Dictionary(string FileName)
{
    fin.open(FileName.c_str());

    const int rows = END1 - START1;
    const int cols = END2 - START2;
    for (int r = 0; r < rows; ++r)
    {
        int* row = HASH[r];
        for (int c = 0; c < cols; ++c)
            row[c] = -1;
    }

    LoadDictionary();
}

// Parses the leading run of decimal digits in s and returns its value.
// Parsing stops at the first non-digit character, so trailing characters
// such as '\r' or spaces no longer corrupt the result.  Returns 0 when s
// does not start with a digit (including when s is empty).
int StrToInt(string s)
{
    int value = 0;
    for (unsigned i = 0; i < s.length(); i++)
    {
        const char ch = s[i];
        if (ch < '0' || ch > '9')
            break;
        value = value * 10 + int(ch - '0');
    }
    return value;
}

// Reads the next line from the dictionary stream and returns it without its
// line terminator.  Returns an empty string when nothing could be read.
//
// The line is read into a std::string, so a line of MAXWORDLEN bytes or more
// no longer puts the stream into a failed state, and a trailing '\r' left by
// CRLF line endings is removed so it cannot become part of a key.
string Dictionary::GetLine()
{
    string line;
    getline(fin, line);
    if (!line.empty() && line[line.length() - 1] == '\r')
        line.erase(line.length() - 1);
    return line;
}

int Dictionary::GetNumber()
{
    char cstr[MAXWORDLEN];
    fin.getline(cstr, MAXWORDLEN);
    string s(cstr);
    return StrToInt(s);
}

void Dictionary::LoadDictionary()
{
    char cstr[MAXWORDLEN];
    string s;
    int n, k = 0;
    while (fin.getline(cstr, MAXWORDLEN))
    {
        s = cstr;
        HeadRec H;
        H.Key = s.substr(0, 2);
        int m1 = (unsigned char)s[0] - START1,
        m2 = (unsigned char)s[1] - START2;
        HASH[m1][m2] = k++;
        n = GetNumber();
        for (int i = 0; i < n; i++)
        {
            s = GetLine();
            string t = s.substr(2, 2);

            int SIZE = H.Sec.size();
            if (SIZE == 0 || SIZE > 0 && H.Sec[SIZE - 1].Key != t)
            {
                SecondRec sec(t, (s.length() == 4), 0);
                H.Sec.push_back(sec);
            }
            SIZE = H.Sec.size();
            if (s.length() > 4)
                ProcessThird(s.substr(4, MAXWORDLEN), H.Sec[SIZE - 1].Child);
        }
        Dict.push_back(H);
    }
}

void Dictionary::ProcessThird(string s, ThirdPlusRec* &Child)
{
    int Len = s.length();
    string t = s.substr(0, 2);
    ThirdPlusRec* LAST = Child;
    if (Child == 0)
        LAST = Child = new ThirdPlusRec(t, (Len == 2), 0, 0);
    else
    {
        while (LAST->L != 0)
            LAST = LAST->L;
        if (LAST->Key != t)
        {
            LAST->L = new ThirdPlusRec(t, (Len == 2), 0, 0);
            LAST = LAST->L;
        }
    }
    if (Len > 2)
        ProcessThird(s.substr(2, MAXWORDLEN), LAST->R);
}
//---------------------------------------------------------------------------
// Demo driver: loads segdict.txt, segments a fixed sample text, and writes
// the text followed by one segment per line to the file-level `cout` stream.
int main(int argc, char* argv[])
{
    Dictionary S("segdict.txt");
    // Alternative entry points, currently disabled:
    //    S.SegFileToFile("src.txt", "dest.txt");
    //    S.PrintDict();

    string s;
    s += "中华人民共和国成立于1949年。";
    s += "中华民国成立于1911年。";
    s += "周恩来领导了1927年的南昌起义。";
    s += "他从马上下来。";
    cout << s << endl;

    vector<string> dest;
    S.Segment(s, dest);
    unsigned i = 0;
    while (i < dest.size())
    {
        cout << dest[i] << endl;
        ++i;
    }

    //    cin.get();
    //    S.PrintDict();
    return 0;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -