⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 chinesewordsdemo.java

📁 LingPipe(开源自然语言处理的Java开源工具包) 中文分词java程序
💻 JAVA
字号:
package com.aliasi.demo.demos;

import com.aliasi.demo.framework.AbstractTextDemo;
import com.aliasi.spell.CompiledSpellChecker;
import com.aliasi.xml.SAXWriter;
import com.aliasi.util.Streams;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.net.URL;
import java.net.MalformedURLException;
import java.util.Properties;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;

/**
 * LingPipe demo that segments Chinese text into words ("tokenization")
 * by running a compiled spell checker restricted to inserting single
 * spaces, then wraps each resulting word in a {@code <tok>} XML element.
 *
 * <p>The notion of "word" comes from whichever training corpus the
 * serialized model was built from, identified by the corpus name passed
 * to the constructor.
 */
public class ChineseWordsDemo extends AbstractTextDemo {

    /** Display name of the corpus the segmentation model was trained on. */
    private String mCorpusName;

    /** Compiled spell checker configured to act as a word segmenter. */
    private CompiledSpellChecker mTokenizer;

    /**
     * Constructs a demo for the specified corpus, deserializing the
     * compiled segmentation model from the given classpath resource.
     *
     * @param corpusName name of the training corpus, used in titles
     *     and descriptions.
     * @param modelResource classpath resource path of the serialized
     *     {@link CompiledSpellChecker} model.
     * @throws IllegalArgumentException if the model resource cannot be
     *     read or its class cannot be resolved; the underlying exception
     *     is attached as the cause.
     */
    public ChineseWordsDemo(String corpusName, String modelResource) {
        mCorpusName = corpusName;
        addModel("Chinese Word Segmentation Model: " + corpusName,
                 modelResource);
        InputStream in = null;
        BufferedInputStream bufIn = null;
        ObjectInputStream objIn = null;
        try {
            in = ChineseWordsDemo.class.getResourceAsStream(modelResource);
            bufIn = new BufferedInputStream(in);
            objIn = new ObjectInputStream(bufIn);
            Object tokenizer = objIn.readObject();
            mTokenizer = (CompiledSpellChecker) tokenizer;
            // Restrict edits to matching characters and inserting at most
            // one consecutive "character" (a space) between inputs; this
            // turns spelling correction into word segmentation.
            mTokenizer.setAllowInsert(true);
            mTokenizer.setAllowMatch(true);
            mTokenizer.setAllowDelete(false);
            mTokenizer.setAllowSubstitute(false);
            mTokenizer.setAllowTranspose(false);
            mTokenizer.setNumConsecutiveInsertionsAllowed(1);
            mTokenizer.setNBest(128);
        } catch (IOException e) {
            String msg = "Corpus Name=" + corpusName
                + " Model resource=" + modelResource
                + " IOException=" + e;
            // Fix: attach the cause so the original stack trace survives.
            throw new IllegalArgumentException(msg, e);
        } catch (ClassNotFoundException e) {
            // Fix: original message mislabeled this as "IOException=".
            String msg = "Corpus Name=" + corpusName
                + " Model resource=" + modelResource
                + " ClassNotFoundException=" + e;
            throw new IllegalArgumentException(msg, e);
        } finally {
            Streams.closeInputStream(objIn);
            Streams.closeInputStream(bufIn);
            Streams.closeInputStream(in);
        }
    }

    /**
     * Returns the title of this demo, including the corpus name.
     *
     * @return the demo title.
     */
    public String title() {
        return "Chinese Word Segmentation: " + mCorpusName;
    }

    /**
     * Returns a prose description of this demo for display.
     *
     * @return the demo description.
     */
    public String description() {
        return "This is the LingPipe demo for Chinese word segmentation,"
            + " also known as tokenization."
            + " It wraps Chinese words in XML elements."
            + " The notion of word is derived from the corpus prepared by "
            + mCorpusName + ".";
    }

    /**
     * Segments the given character slice into words and writes each word
     * to the SAX writer wrapped in a {@code <tok>} element.
     *
     * @param cs character buffer holding the input text.
     * @param start index of the first character to process (inclusive).
     * @param end index one past the last character to process (exclusive).
     * @param writer SAX writer receiving the {@code <tok>} elements.
     * @param properties request properties (unused here).
     * @throws SAXException if the writer raises an error.
     */
    public void process(char[] cs, int start, int end,
                        SAXWriter writer,
                        Properties properties) throws SAXException {
        String input = new String(cs, start, end - start);
        // The "correction" of the input is the input with spaces inserted
        // at word boundaries; splitting on spaces recovers the words.
        String response = mTokenizer.didYouMean(input);
        String[] tokens = response.split(" ");
        for (int i = 0; i < tokens.length; ++i) {
            writer.startSimpleElement("tok");
            writer.characters(tokens[i]);
            writer.endSimpleElement("tok");
        }
    }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -