// ChineseSegmenterTwo.java — statistical (character-bigram) Chinese word segmenter
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.Iterator;
import java.util.TreeMap;
import java.util.TreeSet;
/**
 * Statistical Chinese word segmenter.
 *
 * <p>Pipeline: lines containing '#' are read from a classpath resource and
 * concatenated; per-character and per-bigram frequency tables are built; an
 * ad-hoc log-score threshold selects candidate two-character words
 * ({@code result1}); {@link #SegmentLineDouber()} resolves overlapping
 * candidates into {@code result2}; {@link #SegmentLineMore()} is a second,
 * partially implemented pass.
 *
 * <p>All frequency tables store numbers as {@code String} values — kept from
 * the original design so the internal contract is unchanged.
 */
public class ChineseSegmenterTwo {
    // Bigram scores / character relative frequencies, values serialized as strings.
    private TreeMap<String, String> wordfrequency, characterfrequency;
    // Distinct input lines (key) mapped to the line number they were read at (value).
    private TreeMap<String, String> originaltext;
    // result1: bigrams passing the log-score threshold; result3: scratch table for pass 2.
    private TreeMap<String, String> result1, result3;
    // Bigrams accepted as words by SegmentLineDouber().
    private TreeSet<String> result2;
    // Concatenation of all distinct input lines, in TreeMap (sorted) key order.
    // BUG FIX: was initialized to null, so the original built "null" + text.
    private String input = "";
    // Total number of characters counted by Characterfrequency().
    private int textlength;

    /**
     * Loads the text resource and immediately builds every frequency table.
     * Only lines containing '#' are kept, each with its first character stripped.
     *
     * @param filename classpath resource name; {@code null} leaves the segmenter empty
     */
    public ChineseSegmenterTwo(String filename) {
        originaltext = new TreeMap<>();
        if (filename == null) {
            return;
        }
        InputStream worddata = getClass().getResourceAsStream(filename);
        if (worddata == null) {
            // BUG FIX: the original passed null straight into InputStreamReader and
            // died with an uncaught NullPointerException when the resource was absent.
            System.err.println("resource not found: " + filename);
            return;
        }
        // NOTE(review): uses the platform default charset, exactly as the original did;
        // Chinese input almost certainly needs an explicit charset (UTF-8/GBK) — confirm
        // against the encoding of text.txt before changing.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(worddata))) {
            String newword;
            for (int i = 0; (newword = in.readLine()) != null; i++) {
                if (newword.indexOf('#') != -1) {
                    // Keep the line minus its leading character, keyed to its line number.
                    originaltext.put(newword.substring(1), String.valueOf(i));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        // Build the working text with a StringBuilder instead of repeated String +.
        StringBuilder all = new StringBuilder();
        for (String line : originaltext.keySet()) {
            all.append(line);
        }
        input = all.toString();
        Mresult();
    }

    /**
     * Returns {@code true} iff every character of {@code testword} already appears
     * as a key of the character-frequency table.
     */
    public boolean inCharacter(String testword) {
        for (int i = 0; i < testword.length(); i++) {
            if (!characterfrequency.containsKey(testword.substring(i, i + 1))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Counts every character of every kept line, then normalizes each count to a
     * relative frequency (count / total length). Also records the total length
     * in {@link #textlength}.
     */
    private void Characterfrequency() {
        characterfrequency = new TreeMap<>();
        int length = 0;
        for (String newword : originaltext.keySet()) {
            length += newword.length();
            for (int i = 0; i < newword.length(); i++) {
                String ch = newword.substring(i, i + 1);
                if (characterfrequency.containsKey(ch)) {
                    characterfrequency.put(ch, String.valueOf(Integer.parseInt(characterfrequency.get(ch)) + 1));
                } else {
                    characterfrequency.put(ch, "1");
                }
            }
        }
        // Replacing values for existing keys is not a structural modification,
        // so iterating keySet() while put()-ting is safe here.
        for (String ch : characterfrequency.keySet()) {
            characterfrequency.put(ch, String.valueOf(Float.parseFloat(characterfrequency.get(ch)) / length));
        }
        textlength = length;
    }

    /** Returns {@code true} iff {@code testword} is a key of the bigram table. */
    public boolean inWord(String testword) {
        return wordfrequency.containsKey(testword);
    }

    /** Recovers the approximate raw count of a character: relative frequency * textlength, truncated. */
    private int Seachchar(String schar) {
        return (int) (Float.parseFloat(characterfrequency.get(schar)) * textlength);
    }

    /**
     * Builds raw bigram counts over every kept line, then rescales each count by
     * the ad-hoc smoothing formula inherited from the original implementation.
     */
    private void Wordfrequency() {
        Characterfrequency();
        wordfrequency = new TreeMap<>();
        for (String newword : originaltext.keySet()) {
            for (int i = 0; i < newword.length() - 1; i++) {
                String bigram = newword.substring(i, i + 2);
                if (inWord(bigram)) {
                    wordfrequency.put(bigram, String.valueOf(Integer.parseInt(wordfrequency.get(bigram)) + 1));
                } else {
                    wordfrequency.put(bigram, "1");
                }
            }
        }
        for (String th : wordfrequency.keySet()) {
            int firstchar = Seachchar(th.substring(0, 1));
            int nextchar = Seachchar(th.substring(1, 2));
            float y = Float.parseFloat(wordfrequency.get(th));
            // Formula preserved verbatim; note textlength/8 and textlength/16 are
            // deliberate INTEGER divisions, as in the original.
            y = y / (firstchar * (textlength / 8) + nextchar * (textlength / 8)
                    + (y - firstchar) * (textlength / 16) + (y - nextchar) * (textlength / 16)
                    + textlength);
            wordfrequency.put(th, String.valueOf(y));
        }
    }

    /** Returns a character's relative frequency from the character table. */
    private float Search(String schar) {
        return Float.parseFloat(characterfrequency.get(schar));
    }

    /**
     * Scores every bigram as log(1 / (bigramScore * (f1 + f2))) and keeps in
     * {@code result1} those below a text-length-dependent threshold {@code lim}.
     */
    private void Mresult() {
        result1 = new TreeMap<>();
        Wordfrequency();
        double lim = 4.0 / textlength;
        lim = lim + lim;
        lim = lim * (4.0 / (4.0 * (textlength / 8) + 4.0 * (textlength / 8) + textlength));
        lim = (float) Math.log(1.0 / lim); // float cast preserved from the original
        System.out.println(lim); // debug trace, kept from the original
        for (String th : wordfrequency.keySet()) {
            float firstchar = Search(th.substring(0, 1));
            float nextchar = Search(th.substring(1, 2));
            float y = Float.parseFloat(wordfrequency.get(th)) * (firstchar + nextchar);
            double z = Math.log(1 / y);
            System.out.println(z); // debug trace, kept from the original
            if (z < lim) {
                result1.put(th, String.valueOf(z));
            }
        }
    }

    /**
     * First segmentation pass over {@link #input}: when a candidate bigram is in
     * {@code result1}, it is accepted into {@code result2} unless the overlapping
     * bigram starting one character later scores strictly better (difference
     * outside the +/-0.1 tie band). Non-CJK characters are skipped.
     */
    public void SegmentLineDouber() {
        result2 = new TreeSet<>();
        StringBuilder currentword = new StringBuilder();
        int clength = input.length();
        for (int i = 0; i < clength; i++) {
            char currentchar = input.charAt(i);
            if (Character.UnicodeBlock.of(currentchar) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                continue; // ignore everything outside the CJK Unified Ideographs block
            }
            if (currentword.length() == 0) {
                currentword.append(currentchar); // start a new candidate
                continue;
            }
            String candidate = currentword.toString() + currentchar;
            if (!result1.containsKey(candidate)) {
                // Candidate is not a known word: restart from the current character.
                currentword.setLength(0);
                currentword.append(currentchar);
                continue;
            }
            // BUG FIX: the original read input.charAt(i + 1) with no bounds check and
            // threw StringIndexOutOfBoundsException when a candidate ended the input.
            String overlap = (i + 1 < clength)
                    ? new String(new char[] {currentchar, input.charAt(i + 1)})
                    : null;
            if (overlap == null || !result1.containsKey(overlap)) {
                // No competing overlapping word: accept the candidate.
                result2.add(candidate);
                currentword.setLength(0);
                continue;
            }
            float x = Float.parseFloat(result1.get(candidate));
            float y = Float.parseFloat(result1.get(overlap));
            if ((x - y) < 0.1 && (x - y) > -0.1) {
                // Scores effectively tied: keep the candidate.
                result2.add(candidate);
                System.out.println(x - y); // debug trace, kept from the original
                currentword.setLength(0);
            } else if (x < y) {
                result2.add(candidate);
                currentword.setLength(0);
            } else {
                // Overlapping word wins: restart the candidate at the current character.
                currentword.setLength(0);
                currentword.append(currentchar);
            }
        }
    }

    /** Looks up a score previously stored in {@code result3}. (Currently unused.) */
    private float Seachmore(String schar) {
        return Float.parseFloat(result3.get(schar));
    }

    /**
     * Second segmentation pass over {@link #input} against {@code result2}.
     * NOTE(review): the branch taken when the following bigram is also a word is
     * empty in the original and is preserved as-is — this pass records nothing.
     */
    public void SegmentLineMore() {
        result3 = new TreeMap<>();
        StringBuilder currentword = new StringBuilder();
        int clength = input.length();
        for (int i = 0; i < clength; i++) {
            char currentchar = input.charAt(i);
            if (Character.UnicodeBlock.of(currentchar) != Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
                continue;
            }
            if (currentword.length() == 0) {
                currentword.append(currentchar);
                continue;
            }
            String candidate = currentword.toString() + currentchar;
            if (result2.contains(candidate)) {
                // BUG FIX: guard the two-character look-ahead; the original indexed
                // charAt(i + 1) / charAt(i + 2) past the end of the input.
                boolean nextIsWord = (i + 2 < clength)
                        && result2.contains(new String(new char[] {input.charAt(i + 1), input.charAt(i + 2)}));
                if (!nextIsWord) {
                    currentword.setLength(0);
                }
                // else: deliberately empty, as in the original.
            } else {
                currentword.setLength(0);
                currentword.append(currentchar);
            }
        }
    }

    /** Demo driver: segment the bundled resource and print the accepted words. */
    public static void main(String[] args) throws Exception {
        ChineseSegmenterTwo seg = new ChineseSegmenterTwo("text.txt");
        seg.SegmentLineDouber();
        seg.SegmentLineMore();
        for (String word : seg.result2) {
            System.out.println(word);
        }
    }
}