📄 segmenter.java

📁 1、锁定某个主题抓取； 2、能够产生日志文本文件
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
		 unstemmed.length() == 2)) {
		System.out.println("Stemmed suffix");
		try {System.out.println(new String(unstemmed.toString().getBytes(debugencoding)));} catch (Exception a) { };
		unstemmed.deleteCharAt(unstemmed.length()-1);
		return unstemmed.toString();
	    }
	}
    
	for (i = 0; i < infix.length; i++) {
	    if (unstemmed.length() == 3 && unstemmed.substring(1, 2).equals(infix[i]) == true &&
		zhwords.get(new String(unstemmed.substring(0, 1) + unstemmed.substring(2, 3)).intern()) != null) {
		System.out.println("Stemmed infix");
		unstemmed.deleteCharAt(1);
		return unstemmed.toString();
	    }
	}

	return unstemmed.toString();
    }
    

    /**
     * Segments one line of text into words and returns the line with
     * <code>separator</code> inserted at the word boundaries it finds.
     *
     * <p>A character takes part in word building when it lies in the CJK
     * Unified Ideographs block or when <code>isNumber</code> accepts it.  Such a
     * character extends the pending word when one of the following holds,
     * tested in this order:
     * <ol>
     *   <li>pending word + character is a lexicon entry tagged "1"
     *       (a complete word, see <code>addword</code>);</li>
     *   <li>the pending word is all "foreign" characters, the character is in
     *       <code>cforeign</code>, and the next two characters of the line are
     *       not a lexicon entry;</li>
     *   <li>the pending word is a number and the character is in
     *       <code>cnumbers</code>;</li>
     *   <li>pending word + character is a lexicon entry tagged "2" (a prefix)
     *       and adding the following character is also a lexicon entry.</li>
     * </ol>
     * Otherwise the pending word is flushed to the output and the character
     * starts a new word.  Every other character flushes the pending word and
     * is copied through unchanged.
     *
     * <p>Side effects: sets the <code>debug</code> field to <code>false</code>,
     * and updates the word statistics (<code>wordSum</code>/<code>wordCount</code>,
     * <code>wordAll</code>/<code>wordCountAll</code>, <code>wordCountMax</code>)
     * each time rule 1 fires.  Because the tally is taken at every such
     * extension, a shorter lexicon word that is later extended into a longer
     * one is counted as well.
     *
     * @param cline     the line to segment; must not be null
     * @param separator text inserted between adjacent words
     * @return the segmented line
     */
    public String segmentLine(String cline, String separator) {
	StringBuffer currentword = new StringBuffer();
	StringBuffer outline = new StringBuffer();
	// Tracing is switched off on every call, exactly as before.
	debug = false;

	int clength = cline.length();

	for (int i = 0; i < clength; i++) {
	    char currentchar = cline.charAt(i);
	    boolean segmentable =
		Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
		|| isNumber(cline.substring(i, i + 1));

	    if (!segmentable) {
		// Not a word character: flush the pending word, copy the character through.
		if (currentword.length() > 0) {
		    outline.append(currentword.toString());
		    if (!Character.isWhitespace(currentchar)) {
			outline.append(separator);
		    }
		    currentword.setLength(0);
		}
		outline.append(currentchar);
		continue;
	    }

	    if (currentword.length() == 0) {
		// Start of a new word; separate it from a preceding non-blank character.
		if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
		    outline.append(separator);
		}
		currentword.append(currentchar);
		dumpWordForDebug(currentword);
		continue;
	    }

	    // Look the candidate up once instead of rebuilding it for every test.
	    String candidate = (currentword.toString() + currentchar).intern();
	    String tag = zhwords.containsKey(candidate) ? (String) zhwords.get(candidate) : null;

	    if ("1".equals(tag)) {
		// Candidate is a complete word in the lexicon.
		currentword.append(currentchar);
		tallyWord(currentword.toString());
		dumpWordForDebug(currentword);

	    } else if (isAllForeign(currentword.toString())
		       && cforeign.contains(String.valueOf(currentchar).intern())
		       && i + 2 < clength
		       && !zhwords.containsKey(cline.substring(i, i + 2).intern())) {
		// Possibly a transliteration of a foreign name.
		currentword.append(currentchar);
		dumpWordForDebug(currentword);

	    } else if (isNumber(currentword.toString())
		       && cnumbers.contains(String.valueOf(currentchar).intern())) {
		// Keep all consecutive number characters together.
		currentword.append(currentchar);
		dumpWordForDebug(currentword);

	    } else if ("2".equals(tag)
		       && i + 1 < clength
		       && zhwords.containsKey((candidate + cline.charAt(i + 1)).intern())) {
		// Candidate is a prefix and the next character completes a lexicon entry.
		dumpWordForDebug(currentword);
		currentword.append(currentchar);

	    } else {
		// Nothing matches: emit the pending word and start anew.
		dumpWordForDebug(currentword);
		outline.append(currentword.toString());
		if (!Character.isWhitespace(currentchar)) {
		    outline.append(separator);
		}
		currentword.setLength(0);
		currentword.append(currentchar);
	    }
	}

	outline.append(currentword.toString());
	return outline.toString();
    }

    /**
     * Records one occurrence of <code>word</code> in the per-document statistics
     * (<code>wordSum</code>/<code>wordCount</code>, tracking the highest count in
     * <code>wordCountMax</code>) and in the global statistics
     * (<code>wordAll</code>/<code>wordCountAll</code>).
     *
     * NOTE(review): each lookup is a linear scan of the word list; a map from
     * word to count would be faster, but the fields are declared outside this
     * block so their types are left untouched.
     */
    private void tallyWord(String word) {
	if (debug) {
	    System.out.println(word);
	    System.out.println("wordSum.size() = " + wordSum.size());
	}

	// ---- statistics for the current document ----
	int docIndex = wordSum.indexOf(word);
	if (debug) {
	    System.out.println("tmpIndex = " + docIndex);
	}
	int count;
	if (docIndex == -1) {
	    wordSum.add(word);
	    docIndex = wordSum.indexOf(word);
	    count = 1;
	    if (wordCountMax < count) {
		wordCountMax = count;
	    }
	    wordCount.add(count);
	    if (debug) {
		System.out.println("tmpCnt1 = " + count);
		System.out.println("indexOf1 = " + docIndex);
	    }
	} else {
	    count = ((Integer) wordCount.get(docIndex)).intValue() + 1;
	    wordCount.set(docIndex, count);
	    if (debug) {
		System.out.println("tmpCnt2 = " + count);
	    }
	    if (wordCountMax < count) {
		wordCountMax = count;
		if (debug) {
		    System.out.println("WordCountMax = " + wordCountMax);
		}
	    }
	}

	// ---- statistics across all documents ----
	int globalIndex = wordAll.indexOf(word);
	if (globalIndex == -1) {
	    int first = 1;
	    wordAll.add(word);
	    wordCountAll.add(first);
	} else {
	    int total = ((Integer) wordCountAll.get(globalIndex)).intValue() + 1;
	    wordCountAll.set(globalIndex, total);
	}
    }

    /**
     * Prints <code>word</code> to standard output, re-encoded through
     * <code>debugencoding</code>, when the <code>debug</code> field is set.
     */
    private void dumpWordForDebug(StringBuffer word) {
	if (!debug) {
	    return;
	}
	try {
	    System.out.println(new String(word.toString().getBytes(debugencoding)));
	} catch (Exception ignored) {
	    // Trace output only: a failure to encode must not disturb segmentation.
	}
    }


    /**
     * Segments one line of text and returns the segment boundaries instead of
     * the segmented text.
     *
     * <p>The returned list holds <code>Integer</code> values.  The first is
     * always 0; each later value is the cumulative length of the line after
     * one more segment has been closed.  No separator is inserted, so every
     * value is an index into <code>cline</code>.
     *
     * <p>Characters in the CJK Unified Ideographs block, or accepted by
     * <code>isNumber</code>, are grouped into words using the lexicon
     * <code>zhwords</code> (tag "1" = complete word, tag "2" = prefix, see
     * <code>addword</code>), the number characters in <code>cnumbers</code> and
     * the transliteration characters in <code>cforeign</code>.  Any other
     * character starts a run that extends up to the next CJK ideograph and is
     * reported as a single segment.
     *
     * @param cline the line to segment; must not be null
     * @return list of <code>Integer</code> boundary offsets, starting with 0
     */
    public LinkedList segmentLine(String cline) {
	StringBuffer currentword = new StringBuffer();
	LinkedList offsets = new LinkedList();
	int currentoffset = 0;
	offsets.add(Integer.valueOf(0));

	int clength = cline.length();

	for (int i = 0; i < clength; i++) {
	    char currentchar = cline.charAt(i);
	    if (Character.UnicodeBlock.of(currentchar) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS
		|| isNumber(cline.substring(i, i + 1))) {

		if (currentword.length() == 0) {
		    // First character of a new word.
		    currentword.append(currentchar);
		    continue;
		}

		// Look the candidate up once instead of rebuilding it for every test.
		String candidate = (currentword.toString() + currentchar).intern();
		String tag = zhwords.containsKey(candidate) ? (String) zhwords.get(candidate) : null;

		if ("1".equals(tag)) {
		    // Candidate is a complete word in the lexicon.
		    currentword.append(currentchar);
		} else if (isNumber(currentword.toString())
			   && cnumbers.contains(String.valueOf(currentchar).intern())) {
		    // Keep all consecutive number characters together.
		    currentword.append(currentchar);
		    if (debug) {
			try {
			    System.out.println(new String(currentword.toString().getBytes(debugencoding)));
			} catch (Exception ignored) {
			    // Trace output only: a failure to encode must not disturb segmentation.
			}
		    }
		} else if (isAllForeign(currentword.toString())
			   && cforeign.contains(String.valueOf(currentchar).intern())
			   && i + 2 < clength
			   && !zhwords.containsKey(cline.substring(i, i + 2).intern())) {
		    // Possibly a transliteration of a foreign name.
		    currentword.append(currentchar);
		} else if ("2".equals(tag)
			   && i + 1 < clength
			   && zhwords.containsKey((candidate + cline.charAt(i + 1)).intern())) {
		    // Candidate is a prefix and the next character completes a lexicon entry.
		    currentword.append(currentchar);
		} else {
		    // Nothing matches: close the pending word and start anew.
		    currentoffset += currentword.length();
		    offsets.add(Integer.valueOf(currentoffset));
		    currentword.setLength(0);
		    currentword.append(currentchar);
		}

	    } else {
		// Not a word character: close the pending word first.
		if (currentword.length() > 0) {
		    currentoffset += currentword.length();
		    offsets.add(Integer.valueOf(currentoffset));
		    currentword.setLength(0);
		}

		// Consume everything up to the next CJK ideograph as one segment.
		// Only the Unicode block is tested here, so number characters
		// outside that block are absorbed into the run.
		int runStart = i;
		while (i < clength
		       && Character.UnicodeBlock.of(cline.charAt(i)) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
		    i++;
		}
		currentoffset += i - runStart;
		offsets.add(Integer.valueOf(currentoffset));
		i--;  // compensate for the increment of the enclosing for loop
	    }
	}

	if (currentword.length() > 0) {
	    currentoffset += currentword.length();
	    offsets.add(Integer.valueOf(currentoffset));
	}

	return offsets;
    }


    /**
     * Adds a word to the lexicon <code>zhwords</code>.
     *
     * <p>The word itself is stored with the tag "1".  For words of three to
     * six characters, every proper prefix of two or more characters that is
     * not yet in the lexicon is stored with the tag "2".  An existing entry
     * for a prefix is never overwritten.  Words shorter than three or longer
     * than six characters get no prefix entries.
     *
     * @param newword the word to add; must not be null
     */
    public void addword(String newword) {
	zhwords.put(newword.intern(), "1");

	int wordLength = newword.length();
	if (wordLength < 3 || wordLength > 6) {
	    return;
	}

	// Prefixes of length 2 .. wordLength-1, shortest first.
	for (int prefixEnd = 2; prefixEnd < wordLength; prefixEnd++) {
	    String prefix = newword.substring(0, prefixEnd).intern();
	    if (!zhwords.containsKey(prefix)) {
		zhwords.put(prefix, "2");
	    }
	}
    }


    /**
     * Segments a text file line by line and writes the result to
     * <code>inputfile + ".seg"</code>, using a single space as word separator.
     *
     * <p>Resets <code>wordCountMax</code> to -1 before processing.  Input and
     * output both use <code>encoding</code>.  Failures are reported on standard
     * error and never propagate to the caller; the input and output streams
     * are closed on every path.
     *
     * @param inputfile path of the file to segment
     * @param encoding  name of the character encoding used to read and write
     */
    public void segmentFile(String inputfile, String encoding) {
	String outfile = inputfile + ".seg";
	wordCountMax = -1;

	InputStream srcdata = null;
	FileOutputStream dstdata = null;
	BufferedReader in = null;
	BufferedWriter outbuffer = null;

	try {
	    // The output file is only created once the input opened successfully.
	    srcdata = new FileInputStream(inputfile);
	    in = new BufferedReader(new InputStreamReader(srcdata, encoding));
	    dstdata = new FileOutputStream(outfile);
	    outbuffer = new BufferedWriter(new OutputStreamWriter(dstdata, encoding));

	    try {
		String dataline;
		while ((dataline = in.readLine()) != null) {
		    outbuffer.write(segmentLine(dataline, " "));
		    outbuffer.newLine();
		}
	    } catch (Exception e) {
		System.err.println("Exception [segmenter.java 776 while] " + e.toString());
	    }
	} catch (Exception e) {
	    System.err.println("Exception [segmenter.java 768] " + e.toString());
	} finally {
	    // Close the wrapper when it exists, otherwise the raw stream, so
	    // nothing leaks when construction failed half-way.
	    try {
		if (in != null) {
		    in.close();
		} else if (srcdata != null) {
		    srcdata.close();
		}
	    } catch (Exception e) {
		System.err.println("Exception [segmenter.java 768] " + e.toString());
	    }
	    try {
		if (outbuffer != null) {
		    outbuffer.close();  // also flushes the buffered output
		} else if (dstdata != null) {
		    dstdata.close();
		}
	    } catch (Exception e) {
		System.err.println("Exception [segmenter.java 768] " + e.toString());
	    }
	}
    }

    /**
     * Prints the command-line usage text to standard output and terminates
     * the JVM with exit status 0.
     */
    public static void printHelp() {
	String[] usageLines = {
	    "Usage:\njava -jar segmenter.jar [-b|-g|-8] inputfile.txt",
	    "\t-b Big5, -g GB2312, -8 UTF-8",
	    "  Segmented text will be saved to inputfile.txt.seg"
	};
	for (int k = 0; k < usageLines.length; k++) {
	    System.out.println(usageLines[k]);
	}
	System.exit(0);
    }

}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -