// stemmingfilter.java
// (Source-viewer residue, not part of the program: "font size" control.)
package de.spieleck.app.jacson.filter;
import de.spieleck.app.jacson.JacsonConfigException;
import de.spieleck.app.jacson.JacsonException;
import de.spieleck.app.jacson.JacsonRegistry;
import de.spieleck.app.jacson.JacsonReport;
import de.spieleck.app.jacson.util.ConfigUtil;
import de.spieleck.app.lang.Stemmer;
import de.spieleck.config.ConfigNode;
import de.spieleck.config.ConfigVerify.Acceptor;
import de.spieleck.util.FastClassForName;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
/**
 * Filter that stems every chunk passing through it.
 * <p>
 * A chunk whose stem differs from the chunk itself is recorded in the
 * statistics and forwarded either as the stem or, when the
 * <code>word</code> option is set, as the representative of that stem,
 * i.e. the first recorded word that reduced to it.
 * A chunk that is already equal to its stem is forwarded unchanged and is
 * <em>not</em> recorded in the statistics.
 *
 * @author fsn
 */
public class StemmingFilter
    extends FilterBase
    implements Acceptor
{
    /** Config node to choose the stemmer. */
    public final static String STEMMER_NODE = "stemmer";

    /** Config node to tell the filter to return representatives, not stems. */
    public final static String WORD_NODE = "word";

    /** Default package for stemmers. */
    public final static String STEMMER_PCK = "de.spieleck.app.lang.";

    /** Name of the stemmer used by this filter; set by {@link #getStemmer}. */
    protected String stemmerName;

    /** The stemmer instance used by this filter; set by {@link #init}. */
    protected Stemmer stemmer;

    /** Maps each stem (String) to the {@link Stat} collected for it. */
    protected Map stats = new HashMap();

    /** If true, forward the representative word instead of the stem. */
    protected boolean returnWord;

    public StemmingFilter()
    {
    }

    /**
     * Configures the filter: reads the <code>word</code> flag (read with
     * <code>false</code> as the second argument), creates the stemmer and
     * finally hands the config node to <code>ConfigUtil.verify</code>
     * together with this object as {@link Acceptor}.
     *
     * @param config   configuration node of this filter
     * @param registry registry used to look up the stemmer node
     * @throws JacsonConfigException if no usable stemmer can be configured
     */
    public void init(ConfigNode config, JacsonRegistry registry)
        throws JacsonConfigException
    {
        returnWord = config.getBoolean(WORD_NODE, false);
        stemmer = getStemmer(config, registry);
        // NOTE(review): presumably rejects child nodes not accepted by
        // accept(ConfigNode) -- confirm against ConfigUtil.
        ConfigUtil.verify(config, this);
    }

    /**
     * Tells whether a config node is one this filter knows about.
     *
     * @param node the node to check
     * @return true if the node is named {@link #STEMMER_NODE} or
     *         {@link #WORD_NODE}
     */
    public boolean accept(ConfigNode node)
    {
        String name = node.getName();
        return STEMMER_NODE.equals(name) || WORD_NODE.equals(name);
    }

    /**
     * Stems a chunk and forwards the result to the drain.
     * A <code>null</code> chunk is passed through as <code>null</code>.
     *
     * @param chunk the chunk to stem, may be null
     * @throws JacsonException if the drain throws it
     */
    public void putChunk(String chunk)
        throws JacsonException
    {
        if ( chunk == null )
        {
            drain.putChunk(null);
            return;
        }
        String stemmed = stemmer.stem(chunk);
        if ( stemmed.equals(chunk) )
        {
            // Unchanged words bypass the statistics, even with returnWord set.
            drain.putChunk(stemmed);
            return;
        }
        // Statistics are updated regardless of which form gets forwarded.
        String word = statistic(chunk, stemmed);
        drain.putChunk(returnWord ? word : stemmed);
    }

    /**
     * Records that a word was reduced to a stem.
     *
     * @param old  the original word
     * @param stem the stem it was reduced to
     * @return the representative of the stem, i.e. the first word recorded
     *         for it
     */
    protected String statistic(String old, String stem)
    {
        Stat s = (Stat) stats.get(stem);
        if ( s == null )
        {
            s = new Stat(stem);
            stats.put(stem, s);
        }
        s.add(old);
        return s.getRepresent();
    }

    /**
     * Writes the collected statistics to the report: the stemmer name and,
     * for every recorded stem, its representative, its count and all
     * distinct original words.
     */
    public void summary()
    {
        JacsonReport jr = getRegReport();
        jr.begin("stemming");
        jr.report("stemmer", stemmerName);
        // Walk the entries directly instead of a get() per key.
        Iterator entries = stats.entrySet().iterator();
        while ( entries.hasNext() )
        {
            Map.Entry entry = (Map.Entry) entries.next();
            String stem = (String) entry.getKey();
            jr.begin("stem");
            jr.report("stem", stem);
            Stat stat = (Stat) entry.getValue();
            jr.report("represent", stat.getRepresent());
            jr.report("count", String.valueOf(stat.getCount()));
            Iterator orgs = stat.getOrgs();
            while ( orgs.hasNext() )
            {
                jr.report("org", (String) orgs.next());
            }
            jr.end();
        }
        jr.end();
    }

    /**
     * Statistics for a single stem: the first word seen (representative),
     * the set of distinct original words and the number of recorded
     * occurrences.
     */
    public static class Stat
    {
        /** The stem these statistics belong to. */
        protected String stemm = null;

        /** First word added for this stem; null until the first add. */
        protected String represent = null;

        /** Distinct original words added for this stem. */
        protected Set orgs = new HashSet();

        /** Number of calls to {@link #add}, duplicates included. */
        int count;

        public Stat(String stemm)
        {
            this.stemm = stemm;
        }

        /**
         * Records one occurrence of an original word. The first word ever
         * added becomes the representative.
         *
         * @param org the original (unstemmed) word
         */
        public void add(String org)
        {
            if ( represent == null )
            {
                represent = org;
            }
            orgs.add(org);
            count++;
        }

        /** @return the stem these statistics belong to */
        public String getStem()
        {
            return stemm;
        }

        /** @return the first word added, or null if none was added yet */
        public String getRepresent()
        {
            return represent;
        }

        /** @return the number of recorded occurrences */
        public int getCount()
        {
            return count;
        }

        /** @return an iterator over the distinct original words */
        public Iterator getOrgs()
        {
            return orgs.iterator();
        }
    }

    /**
     * Creates the stemmer selected by the single {@link #STEMMER_NODE}
     * child found via the registry. As a side effect the chosen name is
     * stored in {@link #stemmerName} for the summary report.
     *
     * @param node     configuration node of this filter
     * @param registry registry used to find the stemmer node and its value
     * @return the newly created stemmer, never null
     * @throws JacsonConfigException if there is not exactly one stemmer
     *         node or the stemmer cannot be created
     */
    public Stemmer getStemmer(ConfigNode node, JacsonRegistry registry)
        throws JacsonConfigException
    {
        Iterator it = registry.find(node, STEMMER_NODE);
        if ( !it.hasNext() )
        {
            throw new JacsonConfigException("Need a "+STEMMER_NODE+" child.");
        }
        ConfigNode stemConfig = (ConfigNode) it.next();
        if ( it.hasNext() )
        {
            throw new JacsonConfigException(
                    "Need exactly one "+STEMMER_NODE+" child.");
        }
        stemmerName = registry.scanForValue(stemConfig);
        // NOTE(review): presumably resolves the name relative to
        // STEMMER_PCK and returns null on failure -- confirm against
        // FastClassForName.
        Stemmer created = (Stemmer) FastClassForName.newInstance(
                stemmerName, STEMMER_PCK, Stemmer.class);
        if ( created == null )
        {
            throw new JacsonConfigException(
                    "Cannot create stemmer "+stemmerName+".");
        }
        return created;
    }
}
//
// Jacson - Text Filtering with Java.
// Copyright (C) 2002 Frank S. Nestel (nestefan -at- users.sourceforge.net)
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
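// Source-viewer residue (not part of the program) -- keyboard shortcuts:
//   Copy code: Ctrl + C
//   Search code: Ctrl + F
//   Full-screen mode: F11
//   Toggle theme: Ctrl + Shift + D
//   Show shortcuts: ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -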