⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 textsegment.java

📁 中文分词程序
💻 JAVA
字号:

import java.io.*;
import java.util.*;
import java.sql.*;
import java.awt.*;
import java.awt.event.*;
import javax.swing.*;
import javax.swing.filechooser.*;


/**
 * Dictionary-driven Chinese word segmenter that counts word frequencies over
 * every file of a directory and writes the totals to a database.
 *
 * <p>Segmentation is a forward longest-match: for each run of Chinese
 * characters the longest prefix (at most {@link #MAX_WORD_LENGTH} characters,
 * at least two) that occurs in {@code dictionary.txt} surrounded by single
 * spaces is taken as a word; if no such prefix exists a single character is
 * emitted.
 *
 * <p>The frequency accumulators ({@link #wordString}, {@link #wordNum},
 * {@link #endTag}) are {@code static}, so they are shared by every instance
 * of this class and are not synchronized.
 */
public class textsegment implements Runnable
{
    static private final String newline = "\n";

    /** Longest dictionary entry, in characters, that the matcher will try. */
    private static final int MAX_WORD_LENGTH = 7;

    int fileTag = 0;

    /** Occurrence count of {@code wordString[i]}; valid for indices below {@link #endTag}. */
    public static int[] wordNum = new int[6000];

    /** Distinct words seen so far; valid for indices below {@link #endTag}. */
    public static String[] wordString = new String[6000];

    /** Number of used slots in {@link #wordString} and in {@link #wordStringTemp} respectively. */
    public static int endTag=0, endTagTemp=0;

    /** Words produced for the token currently being segmented. */
    public static String[] wordStringTemp = new String[50];

    public static File file;

    /** Category description looked up in table {@code wordWeightDict}. */
    String fl;

    /** Directory whose entries are fed to {@link #train(String)}. */
    String filepath;

    /**
     * @param fl       category description matched against column
     *                 {@code wordWeightDescribe} of {@code wordWeightDict}
     * @param filepath directory containing the training files
     */
    public textsegment(String fl,String filepath)
    {
        this.fl = fl;
        this.filepath = filepath;
    }

    /**
     * Adds the accumulated word counts to table {@code wordWeight}.
     *
     * <p>The column that receives the counts is the {@code wordWeightField}
     * of the {@code wordWeightDict} row whose trimmed
     * {@code wordWeightDescribe} equals {@link #fl}. Words that occur as a
     * substring of the filtered content of {@code stop_list.txt} are skipped.
     * Existing rows are incremented, missing rows are inserted, and finally
     * the dictionary row is flagged with {@code wordWeightCreate = true}.
     *
     * @return {@code "Successed!!!"} when every statement ran; otherwise a
     *         string starting with {@code "Failed: "} describing the problem
     *         (the problem is also printed to {@code System.err})
     */
    public String storeToDatabase()
    {
        Connection con = null;
        Statement reader = null;
        PreparedStatement update = null;
        PreparedStatement insert = null;
        PreparedStatement markCreated = null;
        ResultSet rs = null;
        try
        {
            // NOTE(review): the JDBC-ODBC bridge was removed in Java 8; on newer
            // runtimes this lookup throws and the method reports failure.
            Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
            con = DriverManager.getConnection("jdbc:odbc:WORKLOAD");
            con.setAutoCommit(true);
            reader = con.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY);

            String stopWords = divsubstring("stop_list.txt");

            // Resolve which wordWeight column belongs to this category.
            String describe = null;
            String field = null;
            rs = reader.executeQuery("SELECT * FROM wordWeightDict");
            while (rs.next())
            {
                String rowDescribe = rs.getString("wordWeightDescribe");
                String rowField = rs.getString("wordWeightField");
                if (rowDescribe != null && rowDescribe.trim().equals(fl))
                {
                    describe = rowDescribe.trim();
                    field = (rowField == null) ? null : rowField.trim();
                    break;
                }
            }
            rs.close();
            rs = null;

            // Without a matching row there is no column to write to; writing to
            // whichever row happened to be read last would corrupt another category.
            if (field == null || field.length() == 0)
            {
                String problem = "no wordWeightDict entry with a wordWeightField for '" + fl + "'";
                System.err.println(problem);
                return "Failed: " + problem;
            }

            // A column name cannot be bound as a parameter, so it is concatenated;
            // it originates from the wordWeightDict table, not from user input.
            update = con.prepareStatement("update wordWeight set " + field + " = ? where wordItem = ?");
            insert = con.prepareStatement("INSERT INTO wordWeight(wordItem," + field + ") VALUES (?,?)");

            rs = reader.executeQuery("SELECT *  FROM wordWeight;");
            for (int i = 0; i < endTag; i++)
            {
                String word = wordString[i].trim();
                if (stopWords.indexOf(word) >= 0)
                {
                    continue;
                }

                // Rewind to before the first row so that row 1 is examined too.
                rs.beforeFirst();
                boolean found = false;
                while (rs.next())
                {
                    String wordItem = rs.getString("wordItem");
                    if (wordItem != null && word.equals(wordItem.trim()))
                    {
                        int count = rs.getInt(field);
                        System.out.println("count = "+count);
                        update.setInt(1, count + wordNum[i]);
                        update.setString(2, wordString[i]);
                        update.executeUpdate();
                        found = true;
                        break;
                    }
                }

                if (!found)
                {
                    System.out.println("Insert="+word);
                    insert.setString(1, word);
                    insert.setInt(2, wordNum[i]);
                    insert.executeUpdate();
                }
            }
            rs.close();
            rs = null;

            markCreated = con.prepareStatement(
                "update wordWeightDict set wordWeightCreate = true where wordWeightDescribe = ?");
            markCreated.setString(1, describe);
            markCreated.executeUpdate();
            return "Successed!!!";
        }
        catch ( Exception e )
        {
            System.err.println(e.getMessage());
            return "Failed: " + e.getMessage();
        }
        finally
        {
            closeQuietly(rs);
            closeQuietly(markCreated);
            closeQuietly(insert);
            closeQuietly(update);
            closeQuietly(reader);
            closeQuietly(con);
        }
    }

    /**
     * Segments one file and merges its word counts into the shared totals.
     *
     * <p>The file is reduced to runs of Chinese characters by
     * {@link #divsubstring(String)}; each run is segmented against the
     * content of {@code dictionary.txt} (read from the working directory on
     * every call) and then folded in via {@link #addWordString()}.
     *
     * @param trainFile path of the file to segment
     */
    public  void train(String trainFile)
    {
        String sourceString = divsubstring(trainFile);
        String dictionary = readInput("dictionary.txt");

        StringTokenizer st = new StringTokenizer(sourceString);
        while (st.hasMoreTokens())
        {
            endTagTemp = 0;
            segmentToken(st.nextToken().trim(), dictionary);
            addWordString();
        }
    }

    /**
     * Splits one whitespace-free token into words and appends them to
     * {@link #wordStringTemp}.
     */
    private static void segmentToken(String token, String dictionary)
    {
        String rest = token;
        while (rest.length() > 1)
        {
            int window = Math.min(MAX_WORD_LENGTH, rest.length());
            int matched = 1;    // no dictionary hit: emit a single character
            for (int len = window; len >= 2; len--)
            {
                // Entries are recognised only when delimited by single spaces.
                String candidate = ' ' + rest.substring(0, len).trim() + ' ';
                if (dictionary.indexOf(candidate) >= 0)
                {
                    matched = len;
                    break;
                }
            }
            appendTempWord(rest.substring(0, matched).trim());
            rest = rest.substring(matched);
        }
        if (rest.trim().length() != 0)
        {
            appendTempWord(rest);
        }
    }

    /** Stores a word in the next free slot of {@link #wordStringTemp}, enlarging the array when it is full. */
    private static void appendTempWord(String word)
    {
        if (endTagTemp >= wordStringTemp.length)
        {
            String[] grown = new String[Math.max(endTagTemp + 1, wordStringTemp.length * 2)];
            System.arraycopy(wordStringTemp, 0, grown, 0, wordStringTemp.length);
            wordStringTemp = grown;
        }
        wordStringTemp[endTagTemp] = word;
        endTagTemp++;
    }

    /**
     * Merges the first {@link #endTagTemp} entries of {@link #wordStringTemp}
     * into the totals: a word already present in {@link #wordString} has its
     * counter in {@link #wordNum} incremented, a new word is appended with
     * count 1. {@code null} entries are ignored. Words are compared after
     * trimming. The total arrays are enlarged when they run out of room.
     */
    public static void addWordString()
    {
        for (int j = 0; j < endTagTemp; j++)
        {
            if (wordStringTemp[j] == null)
            {
                continue;
            }
            String word = wordStringTemp[j].trim();

            int hit = -1;
            for (int i = 0; i < endTag; i++)
            {
                if (wordString[i] != null && wordString[i].trim().equals(word))
                {
                    hit = i;
                    break;
                }
            }

            if (hit >= 0)
            {
                wordNum[hit]++;
            }
            else
            {
                ensureWordCapacity(endTag + 1);
                wordString[endTag] = word;
                wordNum[endTag] = 1;
                endTag++;
            }
        }
    }

    /** Makes sure {@link #wordString} and {@link #wordNum} can each hold {@code needed} entries. */
    private static void ensureWordCapacity(int needed)
    {
        if (wordString.length < needed)
        {
            String[] grownWords = new String[Math.max(needed, wordString.length * 2)];
            System.arraycopy(wordString, 0, grownWords, 0, wordString.length);
            wordString = grownWords;
        }
        if (wordNum.length < needed)
        {
            int[] grownCounts = new int[Math.max(needed, wordNum.length * 2)];
            System.arraycopy(wordNum, 0, grownCounts, 0, wordNum.length);
            wordNum = grownCounts;
        }
    }

    /**
     * Reads a whole file decoded as GBK.
     *
     * @param strInFile path of the file to read
     * @return the decoded content; whatever was read before an I/O error
     *         (possibly the empty string) if one occurs, in which case the
     *         stack trace is printed
     */
    public static String readInput(String strInFile)
    {
        StringBuffer buffer = new StringBuffer();
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(strInFile);
            Reader in = new BufferedReader(new InputStreamReader(fis, "GBK"));
            int ch;
            while ((ch = in.read()) > -1)
            {
                buffer.append((char)ch);
            }
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
        finally
        {
            // Closing the underlying stream releases the file even when the
            // reader could not be constructed or a read failed.
            closeQuietly(fis);
        }
        return buffer.toString();
    }

    /**
     * Reads a GBK file and replaces every character outside the Chinese
     * range with a space, so that the result can be tokenised on whitespace.
     *
     * @param sourcefile path of the file to read
     * @return the filtered text, or the empty string if an I/O error occurs
     *         (the exception is printed to {@code System.out})
     */
    public static String divsubstring(String sourcefile)
    {
        StringBuffer filtered = new StringBuffer();
        String result = "";
        FileInputStream fis = null;
        try
        {
            fis = new FileInputStream(sourcefile);
            Reader in = new BufferedReader(new InputStreamReader(fis, "GBK"));
            for (int c = in.read(); c != -1; c = in.read())
            {
                // 19800..41000 brackets the CJK Unified Ideographs block
                // (U+4E00..U+9FFF); letters, digits and punctuation fall outside.
                if ((c > 19800) && (c < 41000))
                {
                    filtered.append((char)c);
                }
                else
                {
                    filtered.append((char)32);
                }
            }
            result = filtered.toString();
        }
        catch (IOException e)
        {
            System.out.println(e);
        }
        finally
        {
            closeQuietly(fis);
        }
        return result;
    }

    /**
     * Trains on every entry of {@link #filepath} and then stores the totals
     * with {@link #storeToDatabase()}. If the path cannot be listed as a
     * directory, a message is printed and nothing is stored.
     */
    public void run()
    {
        File dir = new File(filepath);
        String[] names = dir.list();
        if (names == null)
        {
            // File.list() yields null for a missing path or a non-directory.
            System.err.println("Cannot list directory: " + dir);
            return;
        }
        for (int i = 0; i < names.length; i++)
        {
            String path = new File(dir, names[i]).getPath();
            System.out.println(path);
            train(path);
        }
        System.out.println("finished!!!");
        this.storeToDatabase();
    }

    /** Runs the trainer for one hard-coded category directory on a new thread. */
    public static void main(String args[])
    {
        textsegment test = new textsegment("经济325","D:\\MailTool\\分词程序\\分类文件库\\经济325");
        new Thread(test).start();
    }

    /** Closes a stream, ignoring {@code null} and close failures. */
    private static void closeQuietly(InputStream in)
    {
        if (in != null)
        {
            try { in.close(); } catch (IOException ignored) { /* nothing useful to do */ }
        }
    }

    /** Closes a result set, ignoring {@code null} and close failures. */
    private static void closeQuietly(ResultSet rs)
    {
        if (rs != null)
        {
            try { rs.close(); } catch (SQLException ignored) { /* nothing useful to do */ }
        }
    }

    /** Closes a statement, ignoring {@code null} and close failures. */
    private static void closeQuietly(Statement stmt)
    {
        if (stmt != null)
        {
            try { stmt.close(); } catch (SQLException ignored) { /* nothing useful to do */ }
        }
    }

    /** Closes a connection, ignoring {@code null} and close failures. */
    private static void closeQuietly(Connection con)
    {
        if (con != null)
        {
            try { con.close(); } catch (SQLException ignored) { /* nothing useful to do */ }
        }
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -