📄 segmenter.java
字号:
unstemmed.length() == 2)) {
System.out.println("Stemmed suffix");
try {System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) { };
unstemmed.deleteCharAt(unstemmed.length()-1);
return unstemmed.toString();
}
}
for (i = 0; i < infix.length; i++) {
if (unstemmed.length() == 3 && unstemmed.substring(1, 2).equals(infix[i]) == true &&
zhwords.get(new String(unstemmed.substring(0, 1) + unstemmed.substring(2, 3)).intern()) != null) {
System.out.println("Stemmed infix");
unstemmed.deleteCharAt(1);
return unstemmed.toString();
}
}
return unstemmed.toString();
}
/**
 * Segments one line of text and returns it with {@code separator} inserted
 * at the word boundaries that were found.
 *
 * <p>Characters in the CJK Unified Ideographs block, and characters accepted
 * by {@code isNumber}, are accumulated greedily into a pending word. A
 * character extends the pending word when one of the following holds,
 * tested in this order:
 * <ol>
 *   <li>pending word + character is stored in {@code zhwords} with value
 *       {@code "1"} (a complete word); the occurrence is also tallied,</li>
 *   <li>the pending word is all "foreign" characters, the character is in
 *       {@code cforeign}, and the character plus its successor is not a
 *       lexicon key,</li>
 *   <li>the pending word is a number and the character is in
 *       {@code cnumbers},</li>
 *   <li>pending word + character is stored with value {@code "2"} (a word
 *       prefix) and adding the next character of the line still yields a
 *       lexicon key.</li>
 * </ol>
 * Otherwise the pending word is flushed to the output and a new word starts.
 * All other characters are copied through unchanged.
 *
 * <p>Side effects: sets the class-level {@code debug} flag to {@code false}
 * and updates the word statistics ({@code wordSum}, {@code wordCount},
 * {@code wordCountMax}, {@code wordAll}, {@code wordCountAll}).
 *
 * @param cline     the line to segment
 * @param separator text inserted between adjacent words
 * @return the segmented line
 */
public String segmentLine(String cline, String separator) {
    StringBuffer currentword = new StringBuffer();
    StringBuffer outline = new StringBuffer();
    // Tracing is switched off on every call; set to true here to trace.
    debug = false;
    //separator = " ";
    final int clength = cline.length();

    for (int i = 0; i < clength; i++) {
        final char currentchar = cline.charAt(i);
        final boolean segmentable =
            Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || isNumber(cline.substring(i, i + 1));

        if (!segmentable) {
            // Not a Chinese character: flush the pending word, copy the char through.
            if (currentword.length() > 0) {
                outline.append(currentword.toString());
                if (!Character.isWhitespace(currentchar)) {
                    outline.append(separator);
                }
                currentword.setLength(0);
            }
            outline.append(currentchar);
            continue;
        }

        if (currentword.length() == 0) {
            // Start looking for the next word; separate it from preceding
            // non-whitespace text.
            if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
                outline.append(separator);
            }
            currentword.append(currentchar);
            debugPrintWord(currentword);
            continue;
        }

        final String pending = currentword.toString();
        // Keys are interned, matching how addword stores them.
        final String candidate = (pending + currentchar).intern();
        // "1" marks a complete word, "2" a prefix of a longer word, null = absent.
        final Object status = zhwords.get(candidate);

        if ("1".equals(status)) {
            // Word is in lexicon
            currentword.append(currentchar);
            recordWordOccurrence(currentword.toString());
            debugPrintWord(currentword);
        } else if (isAllForeign(pending)
                && cforeign.contains(String.valueOf(currentchar).intern())
                && i + 2 < clength
                && !zhwords.containsKey(cline.substring(i, i + 2).intern())) {
            // Possibly a transliteration of a foreign name
            currentword.append(currentchar);
            debugPrintWord(currentword);
        } else if (isNumber(pending)
                && cnumbers.contains(String.valueOf(currentchar).intern())) {
            // Put all consecutive number characters together
            currentword.append(currentchar);
            debugPrintWord(currentword);
        } else if ("2".equals(status)
                && i + 1 < clength
                && zhwords.containsKey((candidate + cline.charAt(i + 1)).intern())) {
            // Starts a word in the lexicon (trace shows the word before it grows)
            debugPrintWord(currentword);
            currentword.append(currentchar);
        } else {
            // Start anew
            debugPrintWord(currentword);
            outline.append(pending);
            if (!Character.isWhitespace(currentchar)) {
                outline.append(separator);
            }
            currentword.setLength(0);
            currentword.append(currentchar);
        }
    }

    outline.append(currentword.toString());
    return outline.toString();
}

/** Prints {@code word} re-encoded with {@code debugencoding} when tracing is enabled. */
private void debugPrintWord(StringBuffer word) {
    if (!debug) {
        return;
    }
    try {
        System.out.println(new String(word.toString().getBytes(debugencoding)));
    } catch (Exception e) {
        // Tracing is best effort: report the failure instead of hiding it.
        System.err.println("Unable to print debug word: " + e);
    }
}

/** Counts one occurrence of {@code word} in the per-document and the global statistics. */
private void recordWordOccurrence(String word) {
    if (debug) {
        System.out.println(word);
        System.out.println("wordSum.size() = " + wordSum.size());
    }

    // Per-document statistics: wordSum holds the words, wordCount the
    // occurrence count at the same index.
    final int docIndex = wordSum.indexOf(word);
    if (debug) {
        System.out.println("tmpIndex = " + docIndex);
    }
    if (docIndex < 0) {
        wordSum.add(word);
        final int firstCount = 1;
        if (wordCountMax < firstCount) {
            wordCountMax = firstCount;
        }
        wordCount.add(firstCount);
        if (debug) {
            System.out.println("tmpCnt1 = " + firstCount);
            System.out.println("indexOf1 = " + wordSum.indexOf(word));
        }
    } else {
        final int updated = ((Integer) wordCount.get(docIndex)).intValue() + 1;
        wordCount.set(docIndex, updated);
        if (debug) {
            System.out.println("tmpCnt2 = " + updated);
        }
        if (wordCountMax < updated) {
            wordCountMax = updated;
            if (debug) {
                System.out.println("WordCountMax = " + wordCountMax);
            }
        }
    }

    // Global statistics: wordAll / wordCountAll, same parallel-list layout.
    final int globalIndex = wordAll.indexOf(word);
    if (globalIndex < 0) {
        wordAll.add(word);
        wordCountAll.add(1);
    } else {
        final int updated = ((Integer) wordCountAll.get(globalIndex)).intValue() + 1;
        wordCountAll.set(globalIndex, updated);
    }
}
/**
 * Segments one line and returns the character offsets of the boundaries
 * that were found instead of a segmented string.
 *
 * <p>The returned list always starts with {@code 0}; every further entry is
 * the offset in {@code cline} at which a segment ends. A segment is either
 * a word built from CJK Unified Ideographs / {@code isNumber} characters
 * (using the {@code zhwords} lexicon, where {@code "1"} marks a complete
 * word and {@code "2"} a prefix of a longer word), or a run of consecutive
 * characters outside the CJK Unified Ideographs block.
 *
 * @param cline the line to segment
 * @return a list of {@code Integer} offsets in ascending order
 */
public LinkedList segmentLine(String cline) {
    StringBuffer currentword = new StringBuffer();
    LinkedList offsets = new LinkedList();
    int currentoffset = 0;
    offsets.add(Integer.valueOf(0));
    final int clength = cline.length();

    for (int i = 0; i < clength; i++) {
        final char currentchar = cline.charAt(i);
        if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
                || isNumber(cline.substring(i, i + 1))) {
            // Character in CJK block (or a number character)
            if (currentword.length() == 0) {
                // Start looking for the next word
                currentword.append(currentchar);
                continue;
            }

            final String pending = currentword.toString();
            // Keys are interned, matching how addword stores them.
            final String candidate = (pending + currentchar).intern();
            final Object status = zhwords.get(candidate);

            if ("1".equals(status)) {
                // Word is in lexicon
                currentword.append(currentchar);
            } else if (isNumber(pending)
                    && cnumbers.contains(String.valueOf(currentchar).intern())) {
                // Put all consecutive number characters together
                currentword.append(currentchar);
                if (debug) {
                    try {
                        System.out.println(new String(currentword.toString().getBytes(debugencoding)));
                    } catch (Exception e) {
                        // Tracing is best effort: report the failure instead of hiding it.
                        System.err.println("Unable to print debug word: " + e);
                    }
                }
            } else if (isAllForeign(pending)
                    && cforeign.contains(String.valueOf(currentchar).intern())
                    && i + 2 < clength
                    && !zhwords.containsKey(cline.substring(i, i + 2).intern())) {
                // Possibly a transliteration of a foreign name
                currentword.append(currentchar);
            } else if ("2".equals(status)
                    && i + 1 < clength
                    && zhwords.containsKey((candidate + cline.charAt(i + 1)).intern())) {
                // Starts a word in the lexicon
                currentword.append(currentchar);
            } else {
                // Start anew: the pending word ends here.
                currentoffset += currentword.length();
                offsets.add(Integer.valueOf(currentoffset));
                currentword.setLength(0);
                currentword.append(currentchar);
            }
        } else {
            // Not a Chinese character: close the pending word first.
            if (currentword.length() > 0) {
                currentoffset += currentword.length();
                offsets.add(Integer.valueOf(currentoffset));
                currentword.setLength(0);
            }
            // Consume the whole run of characters outside the CJK block as
            // one segment. The run has at least one character (the current one).
            final int runStart = i;
            while (i < clength
                    && Character.UnicodeBlock.of(cline.charAt(i)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                i++;
            }
            currentoffset += i - runStart;
            offsets.add(Integer.valueOf(currentoffset));
            i--; // the for loop's increment moves to the first unconsumed character
        }
    }

    if (currentword.length() > 0) {
        currentoffset += currentword.length();
        offsets.add(Integer.valueOf(currentoffset));
    }
    return offsets;
}
/**
 * Adds a word to the {@code zhwords} lexicon.
 *
 * <p>The word itself is stored with value {@code "1"}. For words of three
 * to six characters, every proper prefix of at least two characters that
 * is not yet a key is stored with value {@code "2"}. Existing entries are
 * never overwritten by a prefix. Words longer than six characters get no
 * prefix entries.
 *
 * @param newword the word to add
 */
public void addword(String newword) {
    zhwords.put(newword.intern(), "1");

    final int wordLength = newword.length();
    if (wordLength > 6) {
        // Prefixes are only registered for words of up to six characters.
        return;
    }

    // Shorter words (length <= 2) have no prefix of two or more characters,
    // so the loop body never runs for them.
    for (int prefixLength = 2; prefixLength < wordLength; prefixLength++) {
        if (!zhwords.containsKey(newword.substring(0, prefixLength).intern())) {
            zhwords.put(newword.substring(0, prefixLength).intern(), "2");
        }
    }
}
/**
 * Segments a text file line by line and writes the result to
 * {@code inputfile + ".seg"}, using a single space as word separator.
 *
 * <p>{@code wordCountMax} is reset to {@code -1} before processing starts.
 * Failures are reported on {@code System.err} and are not propagated to
 * the caller. Both files are closed on every path, including when opening
 * the output or reading a line fails.
 *
 * @param inputfile path of the file to segment
 * @param encoding  charset name used for reading the input and writing the output
 */
public void segmentFile(String inputfile, String encoding) {
    String outfile = inputfile + ".seg";
    wordCountMax = -1;
    // Local trace switch; it shadows the class-level debug flag inside this method.
    boolean debug = false;

    InputStream srcdata = null;
    OutputStream dstdata = null;
    BufferedReader in = null;
    BufferedWriter outbuffer = null;
    try {
        // The input is opened first so that no output file is created when
        // the input cannot be read.
        srcdata = new FileInputStream(inputfile);
        in = new BufferedReader(new InputStreamReader(srcdata, encoding));
        dstdata = new FileOutputStream(outfile);
        outbuffer = new BufferedWriter(new OutputStreamWriter(dstdata, encoding));
        try {
            String dataline;
            while ((dataline = in.readLine()) != null) {
                String segstring = segmentLine(dataline, " ");
                if (debug) {
                    System.err.println("Output: " + new String(segstring.getBytes(encoding)));
                }
                outbuffer.write(segstring);
                outbuffer.newLine();
            }
        } catch (Exception e) {
            System.err.println("Exception [segmenter.java 776 while] " + e.toString());
        }
    } catch (Exception e) {
        System.err.println("Exception [segmenter.java 768] " + e.toString());
    } finally {
        // Close the outermost wrapper that was successfully created; fall
        // back to the raw stream when wrapping it failed.
        closeReporting(in != null ? (java.io.Closeable) in : (java.io.Closeable) srcdata);
        closeReporting(outbuffer != null ? (java.io.Closeable) outbuffer : (java.io.Closeable) dstdata);
    }
}

/** Closes {@code resource} if it is non-null, reporting a failure on {@code System.err}. */
private static void closeReporting(java.io.Closeable resource) {
    if (resource == null) {
        return;
    }
    try {
        resource.close();
    } catch (Exception e) {
        System.err.println("Exception [segmenter.java 768] " + e.toString());
    }
}
/**
 * Prints the command-line usage text to standard output and then
 * terminates the JVM with exit status 0.
 */
public static void printHelp() {
    final String[] usageLines = {
        "Usage:\njava -jar segmenter.jar [-b|-g|-8] inputfile.txt",
        "\t-b Big5, -g GB2312, -8 UTF-8",
        " Segmented text will be saved to inputfile.txt.seg"
    };
    for (int line = 0; line < usageLines.length; line++) {
        System.out.println(usageLines[line]);
    }
    System.exit(0);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -