germanstemmer.java

来自「一个很不错的词频统计程序,目前只支持英文,中文的本人正在修改中.改好后上传给大家」· Java 代码 · 共 156 行

JAVA

156 行

package edu.udo.cs.wvtool.external;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A stemmer for German words. The algorithm is based on the report
 * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
 * Caumanns (joerg.caumanns@isst.fhg.de).
 *
 * @author    Gerhard Schwarz
 * @version   $Id$
 */
public class GermanStemmer
{
    /**
     * Working buffer that holds the term while it is being stemmed.
     * stem() clears and refills it on every call, so it is shared mutable
     * state of this instance.
     */
    private StringBuffer sb = new StringBuffer();

    /**
     * Amount of characters that are removed with substitute() while stemming.
     * strip() adds this value to the current buffer length when it evaluates
     * its minimum-length conditions.
     * NOTE(review): the code that updates this counter is not part of the
     * visible section -- presumably substitute(); confirm.
     */
    private int substCount = 0;

    /**
     * Stems the given term to a unique discriminator.
     *
     * The term is lowercased first. If it is empty or contains any character
     * that is not a letter, the lowercased term is returned unchanged.
     * Otherwise the term is copied into the instance buffer and passed through
     * substitute(), strip(), optimize(), resubstitute() and
     * removeParticleDenotion(), in that order.
     *
     * Because the instance buffer is reused on every call, a single instance
     * must not be used by several threads at the same time.
     *
     * @param term  The term that should be stemmed; must not be null.
     * @return      Discriminator for term
     * @throws NullPointerException if term is null
     */
    public String stem( String term )
    {
      // Lowercase with a fixed locale: the default-locale variant produces
      // different characters on some JVM locales (e.g. Turkish maps 'I' to a
      // dotless 'i'), which would make the stems depend on the host settings.
      term = term.toLowerCase( java.util.Locale.GERMAN );
      // Empty input has nothing to stem; return it without running the pipeline.
      if ( term.length() == 0 || !isStemmable( term ) ) {
        return term;
      }
      // Reset the shared buffer and load the term into it.
      sb.setLength( 0 );
      sb.append( term );
      // Stemming pipeline; every step mutates the buffer in place.
      substitute( sb );
      strip( sb );
      optimize( sb );
      resubstitute( sb );
      removeParticleDenotion( sb );
      return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @param term  the term to inspect
     * @return  true if, and only if, every character of the given term is a
     *          letter according to Character.isLetter(); an empty term
     *          therefore yields true.
     */
    private boolean isStemmable( String term )
    {
      final int length = term.length();
      int index = 0;
      // Advance past the leading run of letters.
      while ( index < length && Character.isLetter( term.charAt( index ) ) ) {
        index++;
      }
      // The term is stemmable only if that run covers the whole term.
      return index == length;
    }

    /**
     * Suffix stripping (stemming) on the current term. The stripping is reduced
     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
     * from which all regular suffixes are built. The simplification causes
     * some overstemming, and way more irregular stems, but still provides
     * unique discriminators in most of those cases.
     * The algorithm is context free, except for the length restrictions:
     * stripping continues only while the buffer is longer than three
     * characters, "nd" is removed only if buffer length plus substCount
     * exceeds five, and "em" / "er" only if that sum exceeds four.
     *
     * @param buffer  the term to strip; modified in place
     */
    private void strip( StringBuffer buffer )
    {
      while ( buffer.length() > 3 ) {
        final int length = buffer.length();
        // Length of the term including the characters removed by substitute().
        final int effectiveLength = length + substCount;
        final String tail = buffer.substring( length - 2 );
        final char last = buffer.charAt( length - 1 );

        if ( effectiveLength > 5 && tail.equals( "nd" ) ) {
          buffer.setLength( length - 2 );
        }
        else if ( effectiveLength > 4 && ( tail.equals( "em" ) || tail.equals( "er" ) ) ) {
          buffer.setLength( length - 2 );
        }
        // "t" occurs only as suffix of verbs.
        else if ( last == 'e' || last == 's' || last == 'n' || last == 't' ) {
          buffer.setLength( length - 1 );
        }
        else {
          // No base suffix left at the end of the term.
          break;
        }
      }
    }

    /**
     * Does some optimizations on the term. These optimizations are
     * contextual:
     *
     * - If the buffer is longer than five characters and ends in "erin*", the
     *   trailing '*' is removed and strip() is run again (female plurals of
     *   professions and inhabitants). NOTE(review): the '*' is presumably a
     *   placeholder written by substitute(), which is not visible here.
     * - A trailing 'z' is replaced by 'x' (irregular plural nouns like
     *   "Matrizen -> Matrix").
     *
     * An empty buffer is left untouched.
     *
     * @param buffer  the term to optimize; modified in place
     */
    private void optimize( StringBuffer buffer )
    {
      final int length = buffer.length();
      // Additional step for female plurals of professions and inhabitants.
      if ( length > 5 && buffer.substring( length - 5, length ).equals( "erin*" ) ) {
        buffer.deleteCharAt( length - 1 );
        strip( buffer );
      }
      // Additional step for irregular plural nouns like "Matrizen -> Matrix".
      // The length guard keeps charAt() from being called with index -1 on an
      // empty buffer, which would throw StringIndexOutOfBoundsException.
      if ( buffer.length() > 0 && buffer.charAt( buffer.length() - 1 ) == 'z' ) {
        buffer.setCharAt( buffer.length() - 1, 'x' );
      }
    }

    /**
     * Removes a particle denotion ("ge") from a term: in a buffer longer than
     * four characters, the first two characters of the first occurrence of
     * "gege" are deleted. At most one occurrence is handled per call.
     *
     * @param buffer  the term to process; modified in place
     */
    private void removeParticleDenotion( StringBuffer buffer )
    {
      // Terms of four characters or fewer are never touched.
      if ( buffer.length() <= 4 ) {
        return;
      }
      final int start = buffer.indexOf( "gege" );
      if ( start >= 0 ) {
        // Drop the leading "ge" and keep the second one.
        buffer.delete( start, start + 2 );
      }
    }

    /**
     * Do some substitutions for the term to reduce overstemming:
     *
     * - Substitute Umlauts with their corresponding vowel: 漩

germanstemmer.java - 源码说明

本页面展示了「一个很不错的词频统计程序,目前只支持英文,中文的本人正在修改中.改好后上传给大家分享」中的 germanstemmer.java 源码文件，采用 Java 编程语言编写，共 156 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与词频统计相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?