⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 form1.cs

📁 利用字典进行文章分词
💻 CS
字号:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.IO;
using Office;
using Word;
using System.Text.RegularExpressions;
namespace ParseWord
{
    /// <summary>
    /// Main window: lets the user pick a Word document, pulls its text out through
    /// Word automation, and displays the dictionary-segmented result.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Shows a file picker restricted to *.doc files and stores the chosen path
        /// in the path text box.
        /// </summary>
        private void buttonBrowse_Click(object sender, EventArgs e)
        {
            using (OpenFileDialog ofd = new OpenFileDialog())
            {
                ofd.Filter = "Word Documents(*.doc)|*.doc";

                // Only take the path when the user confirmed; cancelling must not
                // wipe out a path that was already selected.
                if (ofd.ShowDialog() == DialogResult.OK)
                {
                    textBoxWordPathName.Text = ofd.FileName;
                }
            }
        }

        /// <summary>
        /// Opens the selected document in Word, copies its whole story to the
        /// clipboard, reads the text back, and shows the segmented text.
        /// </summary>
        private void buttonParse_Click(object sender, EventArgs e)
        {
            string path = textBoxWordPathName.Text;
            if (string.IsNullOrEmpty(path))
            {
                // Nothing to open; avoid starting Word just to fail on an empty path.
                MessageBox.Show("Please choose a Word document first.");
                return;
            }

            object file = path;
            object nullobj = System.Reflection.Missing.Value;
            string filecontent = string.Empty;

            Word.ApplicationClass wordApp = new ApplicationClass();
            try
            {
                Word.Document doc = wordApp.Documents.Open(
                    ref file, ref nullobj, ref nullobj,
                    ref nullobj, ref nullobj, ref nullobj,
                    ref nullobj, ref nullobj, ref nullobj,
                    ref nullobj, ref nullobj, ref nullobj);
                try
                {
                    doc.ActiveWindow.Selection.WholeStory();
                    doc.ActiveWindow.Selection.Copy();

                    // The clipboard may hold no data object or no text payload;
                    // treat both as an empty document instead of crashing.
                    IDataObject data = Clipboard.GetDataObject();
                    object text = (data != null) ? data.GetData(DataFormats.Text) : null;
                    if (text != null)
                    {
                        filecontent = text.ToString();
                    }
                }
                finally
                {
                    doc.Close(ref nullobj, ref nullobj, ref nullobj);
                }
            }
            finally
            {
                // Always shut Word down, otherwise a failed open/copy leaves a
                // hidden Word process behind.
                wordApp.Quit(ref nullobj, ref nullobj, ref nullobj);
            }

            textBoxFile.Text = FileParse(filecontent);
        }

        /// <summary>
        /// Segments <paramref name="fc"/> using the word list in
        /// "word_freq_list.txt" with a maximum candidate length of 4 characters.
        /// </summary>
        /// <param name="fc">Text to segment.</param>
        /// <returns>The text with a space inserted after each recognised segment.</returns>
        private string FileParse(string fc)
        {
            WordDeal worddeal = new WordDeal(fc);
            worddeal.ReadWordList(@"word_freq_list.txt");
            worddeal.WordsParse(4);
            return worddeal.FILECONTENT;
        }
    }
    /// <summary>
    /// Dictionary-driven text segmenter: holds a text and a word list, and inserts
    /// a space after each segment found by greedy longest-match scanning.
    /// </summary>
    public class WordDeal
    {
        // Built once and reused; matches strings made only of ASCII letters,
        // digits and whitespace.
        private static readonly Regex AsciiWordPattern = new Regex(@"^[A-Za-z0-9\s]+$");

        private string FileContent;

        // Used as a set (values are ignored) so membership tests are hash lookups
        // rather than a scan over every dictionary entry.
        private Dictionary<string, bool> WordSet;

        /// <summary>The current text, including any spaces inserted so far.</summary>
        public string FILECONTENT
        {
            get
            {
                return FileContent;
            }
        }

        /// <summary>Creates a segmenter over the given text with an empty word list.</summary>
        /// <param name="filecontent">Text to segment; null is treated as empty text.</param>
        public WordDeal(string filecontent)
        {
            FileContent = filecontent ?? string.Empty;
            WordSet = new Dictionary<string, bool>();
        }

        /// <summary>
        /// Loads the word list. Each line is split on spaces and its second field
        /// is taken as the word; lines with fewer than two fields are skipped.
        /// </summary>
        /// <param name="filename">Path of the GB18030-encoded word list file.</param>
        public void ReadWordList(string filename)
        {
            Encoding ecode = Encoding.GetEncoding("GB18030");
            char[] seperator = new char[] { ' ' };

            // Read-only access: the dictionary is never written, and the default
            // read/write access would fail on read-only files.
            using (FileStream afile = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (StreamReader sr = new StreamReader(afile, ecode))
            {
                string strline;
                while ((strline = sr.ReadLine()) != null)
                {
                    string[] splitline = strline.Split(seperator, StringSplitOptions.RemoveEmptyEntries);
                    if (splitline.Length < 2)
                    {
                        // Blank or malformed line: there is no word column to read.
                        continue;
                    }
                    WordSet[splitline[1]] = true;
                }
            }
        }

        /// <summary>Returns <paramref name="length"/> characters of the text starting at the zero-based <paramref name="index"/>.</summary>
        public string ExtractWords(int index, int length)
        {
            return FileContent.Substring(index, length);
        }

        /// <summary>
        /// Inserts a single space at zero-based index <paramref name="pos"/>,
        /// i.e. directly after the first <paramref name="pos"/> characters.
        /// </summary>
        public void InsertBackSpace(int pos)
        {
            FileContent = FileContent.Insert(pos, " ");
        }

        /// <summary>Tells whether <paramref name="WordSegment"/> is an entry of the loaded word list.</summary>
        public bool isExist(string WordSegment)
        {
            return WordSegment != null && WordSet.ContainsKey(WordSegment);
        }

        /// <summary>
        /// Tells whether <paramref name="WordSegment"/> consists solely of ASCII
        /// letters, digits and whitespace.
        /// </summary>
        public bool isWord(string WordSegment)
        {
            return AsciiWordPattern.IsMatch(WordSegment);
        }

        /// <summary>
        /// Segments the text in place. At each position the longest candidate (up to
        /// <paramref name="ParseLen"/> characters) is tried first and shortened until
        /// one of these holds: it is a dictionary word (kept, followed by a space);
        /// it is pure ASCII letters/digits/whitespace (kept unchanged); or it is a
        /// single character (kept, followed by a space).
        /// </summary>
        /// <param name="ParseLen">Maximum candidate length in characters; 0 leaves the text untouched.</param>
        /// <exception cref="ArgumentOutOfRangeException"><paramref name="ParseLen"/> is negative.</exception>
        public void WordsParse(int ParseLen)
        {
            if (ParseLen < 0)
            {
                // A negative length could never advance the scan and would loop forever.
                throw new ArgumentOutOfRangeException("ParseLen", "Maximum segment length must not be negative.");
            }
            if (ParseLen == 0 || FileContent.Length == 0)
            {
                return;
            }

            // Build the result separately instead of re-inserting into the string on
            // every match, which copies the whole text each time.
            StringBuilder segmented = new StringBuilder(FileContent.Length);
            int i = 0;
            while (i < FileContent.Length)
            {
                int len = Math.Min(ParseLen, FileContent.Length - i);
                int consumed = 1;

                for (int j = len; j > 0; j--)
                {
                    string candidate = ExtractWords(i, j);
                    if (isExist(candidate))
                    {
                        segmented.Append(candidate).Append(' ');
                        consumed = j;
                        break;
                    }
                    if (isWord(candidate))
                    {
                        // No delimiter is added for ASCII runs, so advance by exactly
                        // the run length; skipping one more would hide the next
                        // character from the matcher.
                        segmented.Append(candidate);
                        consumed = j;
                        break;
                    }
                    if (j == 1)
                    {
                        // Unknown single character: emit it as its own segment.
                        segmented.Append(candidate).Append(' ');
                        break;
                    }
                }

                i += consumed;
            }

            FileContent = segmented.ToString();
        }
    }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -