📄 segmenter.java
字号:
import java.lang.*;
import java.io.*;
import java.util.*;
import java.lang.Math.*;
/* Integery by @Author Kelven.JU
erik AT mandarintools.com
Last modified Jan. 13, 2004
*/
public class segmenter {
// Earlier Hashtable-based declaration of the lexicon, kept for reference.
//private Hashtable zhwords;
// Lexicon: the constructor maps each dictionary word to "1" and each
// 2- or 3-character prefix of a longer word that is not itself a word to "2".
public TreeMap zhwords;
// Character sets filled by loadset(): surnames, foreign-name characters,
// number characters and entries loaded from the "notname" data files.
public TreeSet csurname, cforeign, cnumbers, cnotname;
// Encoding name used when printing debug output (set to "UTF-8" by the constructor).
public String debugencoding;
// Term and term-frequency statistics
// Words of the current document and, at the same index, their counts.
public ArrayList wordSum ;
public ArrayList wordCount;
// Words and counts for all documents; only assigned through setters in the visible code.
public ArrayList wordAll;
public ArrayList wordCountAll;
// Records the maximum term frequency (used when computing TF values)
public int wordCountMax=-1;
// Value handed to setWordMaxCountOfOneDocument(); kept separately from wordCountMax.
public Integer wordCountMaxInteger;
// When true, isNumber() prints the tested word and its result.
public boolean debug;
// Char form
public final static int TRAD = 0;
public final static int SIMP = 1;
public final static int BOTH = 2;
/**
 * Creates a segmenter and loads its character sets and, optionally, its lexicon.
 *
 * The character sets (numbers, foreign characters, surnames, not-names) are
 * always loaded through {@link #loadset}. When {@code loadwordfile} is true the
 * lexicon resource is read line by line: every line shorter than five
 * characters that contains no '#' is stored in {@code zhwords} with value "1",
 * and each 2- or 3-character proper prefix of such a word that is not yet
 * present is stored with value "2".
 *
 * @param charform     TRAD, SIMP or BOTH; any other value is treated as BOTH
 * @param loadwordfile whether the lexicon should be loaded into {@code zhwords}
 */
public segmenter(int charform, boolean loadwordfile) {
debug = false;
debugencoding = "UTF-8";
csurname = new TreeSet();
cforeign = new TreeSet();
cnumbers = new TreeSet();
cnotname = new TreeSet();
// Simplified sets are needed for SIMP and BOTH, traditional sets for TRAD and BOTH.
if (charform != TRAD) {
loadset(cnumbers, "data/snumbers_u8.txt");
loadset(cforeign, "data/sforeign_u8.txt");
loadset(csurname, "data/ssurname_u8.txt");
loadset(cnotname, "data/snotname_u8.txt");
}
if (charform != SIMP) {
loadset(cnumbers, "data/tnumbers_u8.txt");
loadset(cforeign, "data/tforeign_u8.txt");
loadset(csurname, "data/tsurname_u8.txt");
loadset(cnotname, "data/tnotname_u8.txt");
}
zhwords = new TreeMap();
if (!loadwordfile) {
return;
}
// Pick the lexicon the same way the sets were picked: anything that is
// neither SIMP nor TRAD falls back to BOTH instead of leaving the stream null.
String lexiconName;
if (charform == SIMP) {
lexiconName = "simplexu8.txt";
} else if (charform == TRAD) {
lexiconName = "tradlexu8.txt";
} else {
lexiconName = "bothlexu8.txt";
}
InputStream worddata = getClass().getResourceAsStream(lexiconName);
if (worddata == null) {
// getResourceAsStream returns null when the resource is missing.
System.err.println("IOException: lexicon resource not found: " + lexiconName);
return;
}
BufferedReader in = null;
try {
in = new BufferedReader(new InputStreamReader(worddata, "UTF8"));
int count = 0;
String newword;
while ((newword = in.readLine()) != null) {
// Skip comment lines and words of five or more characters.
if (newword.indexOf("#") != -1 || newword.length() >= 5) {
continue;
}
zhwords.put(newword.intern(), "1");
// Register the 2- and 3-character proper prefixes of 3- and 4-character words.
for (int prefixLen = 2; prefixLen < newword.length(); prefixLen++) {
String prefix = newword.substring(0, prefixLen).intern();
if (!zhwords.containsKey(prefix)) {
zhwords.put(prefix, "2");
}
}
// Progress indicator on stderr every 20000 accepted words.
if (count++ % 20000 == 0) { System.err.println(count); }
}
}
catch (IOException e) {
System.err.println("IOException: "+e);
}
finally {
// Always release the lexicon stream, even when reading failed.
try {
if (in != null) {
in.close();
} else {
worddata.close();
}
}
catch (IOException e) {
System.err.println("IOException: "+e);
}
}
}
/**
 * Prints every word of the current document to standard output, followed by
 * " : " and the count stored at the same index of {@code wordCount}.
 */
public void outputWorkCount(){
int position = 0;
while (position < wordSum.size()) {
Object word = wordSum.get(position);
System.out.print(word);
Integer occurrences = (Integer) wordCount.get(position);
System.out.println(" : " + occurrences.intValue());
position++;
}
}
/**
 * Computes the TF value of every word in the current document and writes the
 * values to the file {@code args + ".tf"}.
 *
 * The TF value of entry k is {@code wordCount[k] / wordCountMax}. Each value is
 * written on its own line as {@code "k : value"}, preceded by a line separator.
 *
 * @param args path prefix of the output file; ".tf" is appended
 * @return the TF values as boxed doubles, in {@code wordSum} order; empty when
 *         the output file could not be opened
 */
public ArrayList getTfValue(String args){
ArrayList tmpArrayList = new ArrayList(3);
// NOTE(review): the "tf" directory is created but the output file is not
// placed inside it unless the caller's path says so -- confirm intent.
File tmpTfFilePath=new File("tf");
tmpTfFilePath.mkdir();
BufferedWriter fileOut = null;
try{
fileOut=new BufferedWriter ( new FileWriter (args+".tf"));
for(int k = 0;k<wordSum.size();k++){
// NOTE(review): wordCountMax is -1 unless assigned elsewhere; the visible
// setter only updates wordCountMaxInteger -- verify against callers.
double tf = ((double) ((Integer)wordCount.get(k)).intValue())/(double)wordCountMax;
tmpArrayList.add(Double.valueOf(tf));
try{
fileOut.newLine();
fileOut.write(k+" : "+tf);
fileOut.flush();
}catch (IOException e){
// A failed write is reported but does not stop the computation.
System.out.println("Function getTfValue [segmenter.java 166] IO error!");}
}
}catch (IOException e){
System.out.println("Function getTfValue [segmenter.java 159] IO error!");
}finally{
// Close the writer on every path, including an empty wordSum or an
// unexpected runtime exception.
if(fileOut!=null){
try{
fileOut.close();
}catch (IOException e){
System.out.println("Function getTfValue [segmenter.java 166] IO error!");
}
}
}
return tmpArrayList;
}
/**
 * Tells whether the current document's word list holds an entry equal to the
 * given word.
 *
 * @param args the word to look for
 * @return true if some element of {@code wordSum} equals {@code args}
 */
public boolean containsWord(String args){
for (Iterator entries = wordSum.iterator(); entries.hasNext();) {
Object candidate = entries.next();
if (candidate.equals(args)) {
return true;
}
}
return false;
}
/**
 * Installs the word list of the current document.
 *
 * @param args list stored in {@code wordSum}; kept by reference, not copied
 */
public void setWordOfOneDocument(ArrayList args){
this.wordSum = args;
}
/**
 * Installs the per-word counts of the current document.
 *
 * @param args list stored in {@code wordCount}; kept by reference, not copied
 */
public void setWordCountOfOneDocument(ArrayList args){
this.wordCount = args;
}
/**
 * Stores the maximum word count of the current document in
 * {@code wordCountMaxInteger}.
 *
 * NOTE(review): this does not touch the int field {@code wordCountMax} --
 * confirm whether the two are meant to stay in sync.
 *
 * @param args value stored in {@code wordCountMaxInteger}
 */
public void setWordMaxCountOfOneDocument(Integer args){
this.wordCountMaxInteger = args;
}
/**
 * Installs the word list covering all documents.
 *
 * @param args list stored in {@code wordAll}; kept by reference, not copied
 */
public void setWordOfAllDocument(ArrayList args){
this.wordAll = args;
}
/**
 * Installs the word counts covering all documents.
 *
 * @param args list stored in {@code wordCountAll}; kept by reference, not copied
 */
public void setWordCountOfAllDocument(ArrayList args){
this.wordCountAll = args;
}
/**
 * Reads the int field holding the maximum word count.
 *
 * @return the current value of {@code wordCountMax} (initialised to -1)
 */
public int getWordCountMaxOfOneDocument(){
return this.wordCountMax;
}
/**
 * Loads a set of character data from a classpath resource.
 *
 * The resource is read as UTF-8, one entry per line. Empty lines and lines
 * containing '#' are skipped; every other line is interned and added to the
 * target set. Failures are reported on stderr and otherwise ignored, so a
 * missing data file leaves the set as it was.
 *
 * @param targetset  set that receives the entries
 * @param sourcefile resource name, resolved relative to this class
 */
public void loadset(TreeSet targetset, String sourcefile) {
InputStream setdata = null;
try {
setdata = getClass().getResourceAsStream(sourcefile);
if (setdata == null) {
// getResourceAsStream signals a missing resource with null.
System.err.println("Exception loading data file " + sourcefile + " resource not found");
return;
}
BufferedReader in = new BufferedReader(new InputStreamReader(setdata, "UTF-8"));
String dataline;
while ((dataline = in.readLine()) != null) {
if (dataline.length() == 0 || dataline.indexOf("#") > -1) {
continue;
}
targetset.add(dataline.intern());
}
}
catch (Exception e) {
System.err.println("Exception loading data file " + sourcefile + " " + e);
}
finally {
// Release the stream whether or not reading succeeded.
if (setdata != null) {
try {
setdata.close();
}
catch (IOException e) {
System.err.println("Exception loading data file " + sourcefile + " " + e);
}
}
}
}
/**
 * Checks whether every character of the word is a member of {@code cnumbers}.
 * An empty word yields true. When {@code debug} is set, the word and the
 * result are printed to standard output.
 *
 * @param testword the word to examine
 * @return true if no character outside {@code cnumbers} was found
 */
public boolean isNumber(String testword) {
final int length = testword.length();
int pos = 0;
// Advance while characters are known number characters.
while (pos < length && cnumbers.contains(testword.substring(pos, pos + 1).intern())) {
pos++;
}
final boolean allNumeric = (pos == length);
if (debug) {
try {
System.out.println(new String(testword.getBytes("UTF-8")) + " " + allNumeric);
}
catch (Exception a) {
// Debug output only; failures here are ignored.
}
}
return allNumeric;
}
/**
 * Checks whether every character of the word is a member of {@code cforeign}.
 * An empty word yields true.
 *
 * @param testword the word to examine
 * @return false as soon as a character outside {@code cforeign} is found
 */
public boolean isAllForeign(String testword) {
for (int pos = 0; pos < testword.length(); pos++) {
String single = testword.substring(pos, pos + 1).intern();
if (!cforeign.contains(single)) {
return false;
}
}
return true;
}
/**
 * Checks that the word contains no character from the Unicode block
 * CJK_UNIFIED_IDEOGRAPHS. An empty word yields true.
 *
 * @param testword the word to examine
 * @return false if any char lies in CJK_UNIFIED_IDEOGRAPHS, true otherwise
 */
public boolean isNotCJK(String testword) {
int pos = 0;
while (pos < testword.length()) {
Character.UnicodeBlock block = Character.UnicodeBlock.of(testword.charAt(pos));
if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
return false;
}
pos++;
}
return true;
}
/*
String add_ChineseNames(String tmpline) {
int tlen = tmpline.length();
StringBuffer newline = new StringBuffer();
for (int m = 0; m < tlen; m++) {
$tchar = substr($tmpline, $m, 1);
$currtoken = "";
if ($tchar =~ /^\s$/) {
$newline .= $tchar;
} else {
$currtoken = "";
while ($tchar !~ /^\s$/ and $m < $tlen) {
$currtoken .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
if (defined($csurname{$currtoken}) or
defined($uncommoncsurname{$currtoken})) { # found a surname, see what follows
# go past following spaces
$tchar = substr($tmpline, $m, 1);
$spaces = "";
while ($tchar =~ /\s/ and $m < $tlen) {
$spaces .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# Get next token
$tchar = substr($tmpline, $m, 1);
$currtoken2 = "";
while ($tchar !~ /\s/ and $m < $tlen) {
$currtoken2 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# go past following spaces
$tchar = substr($tmpline, $m, 1);
$spaces2 = "";
while ($tchar =~ /\s/ and $m < $tlen) {
$spaces2 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
# Get next token
$tchar = substr($tmpline, $m, 1);
$currtoken3 = "";
while ($tchar !~ /\s/ and $m < $tlen) {
$currtoken3 .= $tchar;
$m++;
$tchar = substr($tmpline, $m, 1);
}
if (isChinese($currtoken2) and (length($currtoken2) == 2) and
(!defined($cnotname{$currtoken2})) and
isChinese($currtoken3) and length($currtoken3) == 2 and
!defined($cnotname{$currtoken3}))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $currtoken3 . $cname[1];
$cwords{$currtoken . $currtoken2 . $currtoken3} = 1;
$cwords{$currtoken . $currtoken2} = 2; # short version for checking
} elsif (isChinese($currtoken2) and (length($currtoken2) == 2)
and (!defined($cnotname{$currtoken2})))
{
$newline .= $currtoken . $currtoken2 . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
} elsif (defined($csurname{$currtoken}) and
isChinese($currtoken2) and (length($currtoken2) == 4) and
($cwords{$currtoken2} != 1) and
(!defined($cnotname{$currtoken2})))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
$cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
} elsif (defined($uncommoncsurname{$currtoken}) and
isChinese($currtoken2) and (length($currtoken2) == 4)
and (!defined($cnotname{$currtoken2}))
and ($cwords{$currtoken2} != 1))
{
$newline .= $cname[0] . $currtoken . $currtoken2 . $cname[1] . $spaces2 . $currtoken3;
$cwords{$currtoken . $currtoken2} = 1;
$cwords{$currtoken . substr($currtoken2, 0, 2)} = 2; # short version to check
} else {
$newline .= $currtoken . $spaces . $currtoken2 . $spaces2 . $currtoken3;
}
} else {
$newline .= $currtoken;
}
$m--; # reset so won't skip space
}
}
$newline;
}
*/
public String stemWord(String word) {
String[] prefix = new String[] {"\u7b2c", "\u526f", "\u4e0d"};
String[] suffix = new String[] {"\u4e86", "\u7684", "\u5730", "\u4e0b", "\u4e0a", "\u4e2d", "\u91cc",
"\u5230", "\u5185", "\u5916", "\u4eec"};
String[] infix = new String[] {"\u5f97", "\u4e0d"};
int i;
StringBuffer unstemmed = new StringBuffer(word);
for (i = 0; i < prefix.length; i++) {
if (unstemmed.substring(0, 1).equals(prefix[i]) == true &&
(zhwords.get(unstemmed.substring(1, unstemmed.length()).intern()) != null ||
unstemmed.length() == 2)) {
System.out.println("Stemmed prefix");
try {System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) { };
unstemmed.deleteCharAt(0);
return unstemmed.toString();
}
}
for (i = 0; i < suffix.length; i++) {
if (unstemmed.substring(unstemmed.length()-1, unstemmed.length()).equals(suffix[i]) == true &&
(zhwords.get(unstemmed.substring(0, unstemmed.length()-1).intern()) != null ||
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -