// textsegment.java
import java.io.*;
import java.util.*;
import java.sql.*;
import java.awt.*;
import java.awt.event.*;
import javax.swing.*;
import javax.swing.filechooser.*;
/**
 * Counts how often each word occurs in a directory of GBK-encoded text files and
 * adds those counts to the {@code wordWeight} table of the {@code WORKLOAD} ODBC
 * data source.
 *
 * <p>Text is reduced to runs of characters in a fixed code-point window (see
 * {@link #divsubstring(String)}), each run is cut into words by greedy
 * longest-prefix matching against {@code dictionary.txt}, and the words are tallied
 * in the public static arrays below.
 *
 * <p>The tallies are static, so every instance shares them, and nothing in this
 * class synchronizes access to them or resets them between runs.
 */
public class textsegment implements Runnable
{
    /** Longest prefix, in characters, that is looked up in the dictionary. */
    private static final int MAX_WORD_LENGTH = 7;

    /** Charset name used for every file this class reads. */
    private static final String TEXT_ENCODING = "GBK";

    /** Not used by this class; kept because it is visible to the package. */
    int fileTag = 0;

    /** {@code wordNum[i]} is the number of occurrences of {@code wordString[i]}. */
    public static int[] wordNum = new int[6000];

    /** Distinct words seen so far; only the first {@code endTag} entries are valid. */
    public static String[] wordString = new String[6000];

    /** Number of valid entries in {@code wordString} / {@code wordStringTemp}. */
    public static int endTag=0, endTagTemp=0;

    /** Words of the token currently being segmented, not yet merged into the tallies. */
    public static String[] wordStringTemp = new String[50];

    /** Not used by this class; kept because it is public. */
    public static File file;

    /** Category description that is matched against {@code wordWeightDict.wordWeightDescribe}. */
    String fl;

    /** Directory whose files are processed by {@link #run()}. */
    String filepath;

    /**
     * @param fl       category description to look up in {@code wordWeightDict}
     * @param filepath directory containing the files to process
     */
    public textsegment(String fl,String filepath)
    {
        this.fl = fl;
        this.filepath = filepath;
    }

    /**
     * Adds the current tallies to the column of {@code wordWeight} that belongs to
     * this instance's category, then sets {@code wordWeightCreate} for that category.
     *
     * <p>Words whose text occurs anywhere inside the filtered contents of
     * {@code stop_list.txt} are skipped. Existing rows are updated with the stored
     * count plus the new count; unknown words are inserted.
     *
     * @return {@code "Successed!!!"} when every statement ran, otherwise a string
     *         starting with {@code "Failed: "} that describes the problem
     */
    public String storeToDatabase()
    {
        try
        {
            // The JDBC-ODBC bridge is not auto-registered, so it is loaded explicitly.
            Class.forName("sun.jdbc.odbc.JdbcOdbcDriver");
            try (Connection con = DriverManager.getConnection("jdbc:odbc:WORKLOAD"))
            {
                con.setAutoCommit(true);

                String[] category = findCategory(con);
                if (category == null)
                {
                    // Without a matching row there is no column to write to; carrying on
                    // would update whichever column happened to be read last.
                    String problem = "Failed: no wordWeightDict row describes '" + fl + "'";
                    System.err.println(problem);
                    return problem;
                }
                String describe = category[0];
                String field = category[1];

                String stopWords = divsubstring("stop_list.txt");
                Map<String, Integer> stored = loadStoredCounts(con, field);

                // The column name cannot be a bind parameter; it comes from wordWeightDict.
                try (PreparedStatement update = con.prepareStatement(
                         "update wordWeight set " + field + " = ? where wordItem = ?");
                     PreparedStatement insert = con.prepareStatement(
                         "INSERT INTO wordWeight(wordItem," + field + ") VALUES (?, ?)"))
                {
                    for (int i = 0; i < endTag; i++)
                    {
                        if (wordString[i] == null)
                        {
                            continue;
                        }
                        String word = wordString[i].trim();
                        // NOTE(review): this is a substring test, so a word that is merely
                        // part of a longer stop word is skipped too - confirm that is wanted.
                        if (stopWords.indexOf(word) >= 0)
                        {
                            continue;
                        }
                        Integer existing = stored.get(word);
                        if (existing != null)
                        {
                            System.out.println("count = " + existing);
                            int total = existing.intValue() + wordNum[i];
                            update.setInt(1, total);
                            update.setString(2, word);
                            update.executeUpdate();
                            stored.put(word, Integer.valueOf(total));
                        }
                        else
                        {
                            System.out.println("Insert=" + word);
                            insert.setString(1, word);
                            insert.setInt(2, wordNum[i]);
                            insert.executeUpdate();
                            stored.put(word, Integer.valueOf(wordNum[i]));
                        }
                    }
                }

                try (PreparedStatement done = con.prepareStatement(
                         "update wordWeightDict set wordWeightCreate = true where wordWeightDescribe = ?"))
                {
                    done.setString(1, describe);
                    done.executeUpdate();
                }
            }
        }
        catch (Exception e)
        {
            // Boundary catch: a failed store must not kill the worker thread.
            System.err.println(e.getMessage());
            return "Failed: " + e;
        }
        return "Successed!!!";
    }

    /**
     * Finds the {@code wordWeightDict} row whose trimmed description equals {@code fl}.
     *
     * @return {@code {description, columnName}}, both trimmed, or {@code null} if no row matches
     */
    private String[] findCategory(Connection con) throws SQLException
    {
        try (Statement stmt = con.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY);
             ResultSet rs = stmt.executeQuery("SELECT wordWeightDescribe, wordWeightField FROM wordWeightDict"))
        {
            while (rs.next())
            {
                String describe = rs.getString(1);
                String field = rs.getString(2);
                if (describe != null && field != null && describe.trim().equals(fl))
                {
                    return new String[] { describe.trim(), field.trim() };
                }
            }
        }
        return null;
    }

    /**
     * Reads every {@code wordWeight} row once and maps the trimmed word to its count in
     * the given column. When a word occurs in several rows the first one read wins.
     */
    private static Map<String, Integer> loadStoredCounts(Connection con, String field) throws SQLException
    {
        Map<String, Integer> counts = new HashMap<>();
        try (Statement stmt = con.createStatement(ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY);
             ResultSet rs = stmt.executeQuery("SELECT wordItem, " + field + " FROM wordWeight"))
        {
            while (rs.next())
            {
                String item = rs.getString(1);
                int count = rs.getInt(2);
                if (item != null && !counts.containsKey(item.trim()))
                {
                    counts.put(item.trim(), Integer.valueOf(count));
                }
            }
        }
        return counts;
    }

    /**
     * Segments one file and adds its words to the shared tallies. The dictionary is
     * read from {@code dictionary.txt} in the working directory on every call.
     *
     * @param trainFile path of the GBK-encoded file to process
     */
    public void train(String trainFile)
    {
        train(trainFile, loadDictionary());
    }

    /** Same as {@link #train(String)} but with an already loaded dictionary. */
    private void train(String trainFile, Set<String> dictionary)
    {
        StringTokenizer tokens = new StringTokenizer(divsubstring(trainFile));
        while (tokens.hasMoreTokens())
        {
            endTagTemp = 0;
            segment(tokens.nextToken(), dictionary);
            addWordString();
        }
    }

    /**
     * Loads {@code dictionary.txt} as a set of its whitespace-separated entries, so the
     * first and last entries and entries next to a line break are found as well.
     */
    private static Set<String> loadDictionary()
    {
        Set<String> words = new HashSet<>();
        StringTokenizer entries = new StringTokenizer(readInput("dictionary.txt"));
        while (entries.hasMoreTokens())
        {
            words.add(entries.nextToken());
        }
        return words;
    }

    /**
     * Cuts {@code token} into words from left to right: at each position the longest
     * prefix of at most {@code MAX_WORD_LENGTH} characters that is a dictionary entry is
     * taken; if no prefix of two or more characters is an entry, a single character is taken.
     */
    private static void segment(String token, Set<String> dictionary)
    {
        int pos = 0;
        while (pos < token.length())
        {
            int len = Math.min(MAX_WORD_LENGTH, token.length() - pos);
            while (len > 1 && !dictionary.contains(token.substring(pos, pos + len)))
            {
                len--;
            }
            bufferWord(token.substring(pos, pos + len));
            pos += len;
        }
    }

    /** Appends a word to {@code wordStringTemp}, merging the buffer into the tallies first when it is full. */
    private static void bufferWord(String word)
    {
        if (endTagTemp >= wordStringTemp.length)
        {
            addWordString();
            endTagTemp = 0;
        }
        wordStringTemp[endTagTemp++] = word;
    }

    /**
     * Merges the first {@code endTagTemp} entries of {@code wordStringTemp} into
     * {@code wordString} / {@code wordNum}: a known word has its count incremented, a new
     * word is appended with count 1. Words are compared and stored trimmed; {@code null}
     * entries are ignored. The tally arrays are replaced by larger copies when they are full.
     */
    public static void addWordString()
    {
        int pending = Math.min(endTagTemp, wordStringTemp.length);
        for (int j = 0; j < pending; j++)
        {
            if (wordStringTemp[j] == null)
            {
                continue;
            }
            String word = wordStringTemp[j].trim();
            int slot = indexOfWord(word);
            if (slot >= 0)
            {
                wordNum[slot]++;
                continue;
            }
            if (endTag >= wordString.length || endTag >= wordNum.length)
            {
                int capacity = Math.max(endTag + 1, 2 * Math.max(wordString.length, wordNum.length));
                wordString = Arrays.copyOf(wordString, capacity);
                wordNum = Arrays.copyOf(wordNum, capacity);
            }
            wordString[endTag] = word;
            wordNum[endTag] = 1;
            endTag++;
        }
    }

    /** Returns the index of {@code word} among the first {@code endTag} tallied words, or -1. */
    private static int indexOfWord(String word)
    {
        for (int i = 0; i < endTag; i++)
        {
            if (wordString[i] != null && wordString[i].trim().equals(word))
            {
                return i;
            }
        }
        return -1;
    }

    /** Opens a file for reading as GBK text; the stream is closed again if the reader cannot be created. */
    private static Reader openText(String path) throws IOException
    {
        FileInputStream in = new FileInputStream(path);
        try
        {
            return new BufferedReader(new InputStreamReader(in, TEXT_ENCODING));
        }
        catch (IOException e)
        {
            in.close();
            throw e;
        }
    }

    /**
     * Reads a whole GBK-encoded file.
     *
     * @param strInFile path of the file
     * @return the decoded text; if reading fails the error is reported on standard error
     *         and whatever was read up to that point is returned
     */
    public static String readInput(String strInFile)
    {
        StringBuilder text = new StringBuilder();
        try (Reader in = openText(strInFile))
        {
            char[] chunk = new char[8192];
            int n;
            while ((n = in.read(chunk)) != -1)
            {
                text.append(chunk, 0, n);
            }
        }
        catch (IOException e)
        {
            System.err.println("Cannot read " + strInFile + ": " + e);
        }
        return text.toString();
    }

    /**
     * Reads a GBK-encoded file and replaces every character whose code is not strictly
     * between 19800 and 41000 with a space, so the remaining characters form
     * space-separated runs.
     *
     * @param sourcefile path of the file
     * @return the filtered text, or an empty string if reading fails (the error is
     *         reported on standard error)
     */
    public static String divsubstring(String sourcefile)
    {
        StringBuilder filtered = new StringBuilder();
        try (Reader in = openText(sourcefile))
        {
            int c;
            while ((c = in.read()) != -1)
            {
                // NOTE(review): the window looks chosen to bracket CJK ideographs - confirm.
                boolean keep = c > 19800 && c < 41000;
                filtered.append(keep ? (char) c : ' ');
            }
        }
        catch (IOException e)
        {
            System.err.println("Cannot read " + sourcefile + ": " + e);
            return "";
        }
        return filtered.toString();
    }

    /**
     * Processes every regular file directly inside {@code filepath} and then stores the
     * tallies. If {@code filepath} cannot be listed, the problem is reported and nothing
     * is written to the database.
     */
    @Override
    public void run()
    {
        File dir = new File(filepath);
        String[] names = dir.list();
        if (names == null)
        {
            System.err.println("Cannot list directory " + dir);
            return;
        }
        // Read the dictionary once for the whole run rather than once per file.
        Set<String> dictionary = loadDictionary();
        for (int i = 0; i < names.length; i++)
        {
            File entry = new File(dir, names[i]);
            if (!entry.isFile())
            {
                continue;
            }
            System.out.println(entry.getPath());
            train(entry.getPath(), dictionary);
        }
        System.out.println("finished!!!");
        this.storeToDatabase();
    }

    /**
     * Starts one run on a new thread.
     *
     * @param args optional: {@code args[0]} is the category description and
     *             {@code args[1]} the directory; the built-in defaults are used for
     *             whichever is missing
     */
    public static void main(String args[])
    {
        String category = args.length > 0 ? args[0] : "经济325";
        String directory = args.length > 1 ? args[1] : "D:\\MailTool\\分词程序\\分类文件库\\经济325";
        textsegment test = new textsegment(category, directory);
        new Thread(test).start();
    }
}
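// Code-viewer keyboard shortcuts (page residue, not part of the program):
//   Copy code:        Ctrl + C
//   Search code:      Ctrl + F
//   Full screen:      F11
//   Switch theme:     Ctrl + Shift + D
//   Show shortcuts:   ?
//   Larger font:      Ctrl + =
//   Smaller font:     Ctrl + -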