⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stemmingfilter.java

📁 java编写的OCR软件
💻 JAVA
字号:
package de.spieleck.app.jacson.filter;

import de.spieleck.app.jacson.JacsonConfigException;
import de.spieleck.app.jacson.JacsonException;
import de.spieleck.app.jacson.JacsonRegistry;
import de.spieleck.app.jacson.JacsonReport;
import de.spieleck.app.jacson.util.ConfigUtil;
import de.spieleck.app.lang.Stemmer;
import de.spieleck.config.ConfigNode;
import de.spieleck.config.ConfigVerify.Acceptor;
import de.spieleck.util.FastClassForName;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

/**
 * Filter that stems a chunk.
 * Every non-null chunk is handed to the configured {@link Stemmer}.
 * Depending on the {@link #WORD_NODE} setting the filter forwards either
 * the stem itself or the first word that was recorded for that stem.
 * Words which are changed by stemming are collected per stem and are
 * written out by {@link #summary()}.
 * @author fsn
 */
public class StemmingFilter
    extends FilterBase
    implements Acceptor
{
    /** Config node to choose the stemmer */
    public final static String STEMMER_NODE = "stemmer";
    
    /** Config node to tell the filter to return representatives not stems */
    public final static String WORD_NODE = "word";

    /** Default package for stemmers */
    public final static String STEMMER_PCK = "de.spieleck.app.lang.";

    /** The name of the Stemmer used by this filter, set by getStemmer() */
    protected String stemmerName;

    /** The Stemmer instance used by this filter */
    protected Stemmer stemmer;

    /** Maps each stem (String) to the Stat collected for it. */
    protected Map stats = new HashMap();

    /** Shall we return chunks which are words (true) or stems (false) */
    protected boolean returnWord;

    public StemmingFilter()
    {
    }

    /**
     * Configure this filter: read the {@link #WORD_NODE} flag (default
     * false), create the stemmer and verify the configuration against
     * the nodes accepted by {@link #accept(ConfigNode)}.
     * @param config configuration node of this filter
     * @param registry registry used to look up the stemmer selection
     * @throws JacsonConfigException if no usable stemmer is configured
     *         or the configuration does not verify
     */
    public void init(ConfigNode config, JacsonRegistry registry)
        throws JacsonConfigException
    {
        returnWord = config.getBoolean(WORD_NODE, false);
        stemmer = getStemmer(config, registry);
        ConfigUtil.verify(config, this);
    }

    /**
     * Tell the configuration verification which child nodes are known.
     * @param node the node to check
     * @return true if the node is named {@link #STEMMER_NODE} or
     *         {@link #WORD_NODE}
     */
    public boolean accept(ConfigNode node)
    {
        String name = node.getName();
        return STEMMER_NODE.equals(name)
            || WORD_NODE.equals(name);
    }

    /**
     * Stem a chunk and pass the result on to the drain.
     * A null chunk is forwarded as null without stemming.
     * A chunk that already equals its stem is forwarded as is and is
     * not recorded in the statistics.
     * Any other chunk is recorded; the drain then receives the
     * representative word if {@link #returnWord} is set, the stem otherwise.
     * NOTE(review): in word mode a chunk equal to its stem is emitted as
     * itself even when a different representative is already recorded
     * for that stem - confirm that this is intended.
     * @param chunk the chunk to stem, may be null
     * @throws JacsonException passed on from the drain
     */
    public void putChunk(String chunk)
        throws JacsonException
    {
        if ( chunk == null )
        {
            drain.putChunk(null);
            return;
        }
        String stemmed = stemmer.stem(chunk);
        String result = stemmed;
        if ( !stemmed.equals(chunk) )
        {
            // Always record the word, even when the stem is returned,
            // so that summary() can report it.
            String word = statistic(chunk, stemmed);
            if ( returnWord )
                result = word;
        }
        drain.putChunk(result);
    }

    /**
     * Record that a word was reduced to a stem.
     * Creates the Stat for the stem on first use.
     * @param old the original word
     * @param stem the stem the word was reduced to
     * @return the representative of the stem, i.e. the first word
     *         recorded for it
     */
    protected String statistic(String old, String stem)
    {
        Stat s = (Stat) stats.get(stem);
        if ( s == null )
        {
            s = new Stat(stem);
            stats.put(stem, s);
        }
        s.add(old);
        return s.getRepresent();
    }

    /**
     * Report the stemmer name and, for every recorded stem, its
     * representative, the number of recorded words and the distinct
     * original words.
     */
    public void summary()
    {
        JacsonReport jr = getRegReport();
        jr.begin("stemming");
        jr.report("stemmer", stemmerName);
        // Walk the entries directly instead of looking up every key again.
        Iterator entries = stats.entrySet().iterator();
        while ( entries.hasNext() )
        {
            Map.Entry entry = (Map.Entry) entries.next();
            String stem = (String) entry.getKey();
            Stat stat = (Stat) entry.getValue();
            jr.begin("stem");
            jr.report("stem", stem);
            jr.report("represent", stat.getRepresent());
            jr.report("count", ""+stat.getCount());
            Iterator orgs = stat.getOrgs();
            while ( orgs.hasNext() )
            {
                String org = (String) orgs.next();
                jr.report("org", org);
            }
            jr.end();  
        }
        jr.end();
    }

    /**
     * Statistics for a single stem: the first word recorded for it,
     * the set of distinct words and the total number of recorded words.
     */
    public static class Stat
    {
        /** The stem this statistic belongs to. */
        protected String stemm = null;
        /** The first word added, null as long as nothing was added. */
        protected String represent = null;
        /** The distinct original words (Strings) added so far. */
        protected Set orgs = new HashSet();
        /** Number of calls to add(), duplicates included. */
        int count;

        public Stat(String stemm) 
        {
            this.stemm = stemm;
        }

        /**
         * Record one occurrence of an original word.
         * The first word ever added becomes the representative.
         * @param org the original word
         */
        public void add(String org)
        {
            if ( represent == null )
                represent = org;
            orgs.add(org);
            count++;
        }

        /** @return the stem given at construction */
        public String getStem()
        {
            return stemm;
        }

        /** @return the first word added, or null if none was added yet */
        public String getRepresent()
        {
            return represent;
        }

        /** @return the number of words added, duplicates included */
        public int getCount()
        {
            return count;
        }

        /** @return an iterator over the distinct words added */
        public Iterator getOrgs()
        {
            return orgs.iterator();
        }
    }

    /**
     * Create the stemmer selected in the configuration.
     * Exactly one {@link #STEMMER_NODE} node must be found by the registry;
     * its value is stored in {@link #stemmerName} and used, together with
     * {@link #STEMMER_PCK}, to instantiate the stemmer.
     * @param node configuration node of this filter
     * @param registry registry used to find and evaluate the stemmer node
     * @return the newly created stemmer, never null
     * @throws JacsonConfigException if there is no stemmer node, more than
     *         one stemmer node, or the stemmer cannot be created
     */
    public Stemmer getStemmer(ConfigNode node, JacsonRegistry registry)
        throws JacsonConfigException
    {
        Iterator it = registry.find(node, STEMMER_NODE);
        if ( !it.hasNext() )
            throw new JacsonConfigException("Need a "+STEMMER_NODE+" child.");
        ConfigNode stemConfig = (ConfigNode) it.next();
        if ( it.hasNext() )
            throw new JacsonConfigException(
                              "Need exactly one "+STEMMER_NODE+" child.");
        stemmerName = registry.scanForValue(stemConfig);
        // Local is deliberately not named "stemmer" to avoid hiding the field.
        Stemmer created = (Stemmer) FastClassForName.newInstance(
                                stemmerName, STEMMER_PCK, Stemmer.class);
        if ( created == null )
            throw new JacsonConfigException(
                                  "Cannot create stemmer "+stemmerName+".");
        return created;
    }
}

//
//    Jacson - Text Filtering with Java.
//    Copyright (C) 2002 Frank S. Nestel (nestefan -at- users.sourceforge.net)
//
//    This library is free software; you can redistribute it and/or
//    modify it under the terms of the GNU Lesser General Public
//    License as published by the Free Software Foundation; either
//    version 2.1 of the License, or (at your option) any later version.
//
//    This library is distributed in the hope that it will be useful,
//    but WITHOUT ANY WARRANTY; without even the implied warranty of
//    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//    Lesser General Public License for more details.
//
//    You should have received a copy of the GNU Lesser General Public
//    License along with this library; if not, write to the Free Software
//    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -