⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 germanstemmer.java

📁 天乙代码src_531.rar 天乙代码src_531.rar 天乙代码src_531.rar 天乙代码src_531.rar
💻 JAVA
字号:
package org.apache.lucene.analysis.de;/* ==================================================================== * The Apache Software License, Version 1.1 * * Copyright (c) 2001 The Apache Software Foundation.  All rights * reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * * 1. Redistributions of source code must retain the above copyright *    notice, this list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright *    notice, this list of conditions and the following disclaimer in *    the documentation and/or other materials provided with the *    distribution. * * 3. The end-user documentation included with the redistribution, *    if any, must include the following acknowledgment: *       "This product includes software developed by the *        Apache Software Foundation (http://www.apache.org/)." *    Alternately, this acknowledgment may appear in the software itself, *    if and wherever such third-party acknowledgments normally appear. * * 4. The names "Apache" and "Apache Software Foundation" and *    "Apache Lucene" must not be used to endorse or promote products *    derived from this software without prior written permission. For *    written permission, please contact apache@apache.org. * * 5. Products derived from this software may not be called "Apache", *    "Apache Lucene", nor may "Apache" appear in their name, without *    prior written permission of the Apache Software Foundation. * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * ==================================================================== * * This software consists of voluntary contributions made by many * individuals on behalf of the Apache Software Foundation.  For more * information on the Apache Software Foundation, please see * <http://www.apache.org/>. *//** * A stemmer for German words. The algorithm is based on the report * "A Fast and Simple Stemming Algorithm for German Words" by J鰎g * Caumanns (joerg.caumanns@isst.fhg.de). * * @author    Gerhard Schwarz * @version   $Id: GermanStemmer.java,v 1.4 2002/04/19 19:09:36 otis Exp $ */public class GermanStemmer {    /**     * Buffer for the terms while stemming them.     */    private StringBuffer sb = new StringBuffer();    /**     * Indicates if a term is handled as a noun.     */    private boolean uppercase = false;    /**     * Amount of characters that are removed with <tt>substitute()</tt> while stemming.     */    private int substCount = 0;    public GermanStemmer() {    }    /**     * Stemms the given term to an unique <tt>discriminator</tt>.     *     * @param term  The term that should be stemmed.     * @return      Discriminator for <tt>term</tt>     */    protected String stem( String term ) {	if ( !isStemmable( term ) ) {	    return term;	}	// Mark a possible noun.	if ( Character.isUpperCase( term.charAt( 0 ) ) ) {	    uppercase = true;	}	else {	    uppercase = false;	}	// Use lowercase for medium stemming.	term = term.toLowerCase();	// Reset the StringBuffer.	sb.delete( 0, sb.length() );	sb.insert( 0, term );	sb = substitute( sb );	// Nouns have only seven possible suffixes.	if ( uppercase && sb.length() > 3 ) {	    if ( sb.substring( sb.length() - 3, sb.length() ).equals( "ern" ) ) {		sb.delete( sb.length() - 3, sb.length() );	    }	    else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "en" ) ) {		sb.delete( sb.length() - 2, sb.length() );	    }	    else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) {		sb.delete( sb.length() - 2, sb.length() );	    }	    else if ( sb.substring( sb.length() - 2, sb.length() ).equals( "es" ) ) {		sb.delete( sb.length() - 2, sb.length() );	    }	    else if ( sb.charAt( sb.length() - 1 ) == 'e' ) {		sb.deleteCharAt( sb.length() - 1 );	    }	    else if ( sb.charAt( sb.length() - 1 ) == 'n' ) {		sb.deleteCharAt( sb.length() - 1 );	    }	    else if ( sb.charAt( sb.length() - 1 ) == 's' ) {		sb.deleteCharAt( sb.length() - 1 );	    }	    // Additional step for female plurals of professions and inhabitants.	    if ( sb.length() > 5 && sb.substring( sb.length() - 3, sb.length() ).equals( "erin*" ) ) {		sb.deleteCharAt( sb.length() -1 );	    }	    // Additional step for irregular plural nouns like "Matrizen -> Matrix".	    if ( sb.charAt( sb.length() - 1 ) == ( 'z' ) ) {		sb.setCharAt( sb.length() - 1, 'x' );	    }	}	// Strip the 7 "base" suffixes: "e", "s", "n", "t", "em", "er", "nd" from all	// other terms. Adjectives, Verbs and Adverbs have a total of 52 different	// possible suffixes, stripping only the characters from they are build	// does mostly the same	else {	    // Strip base suffixes as long as enough characters remain.	    boolean doMore = true;	    while ( sb.length() > 3 && doMore ) {		if ( ( sb.length() + substCount > 5 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "nd" ) ) {		    sb.delete( sb.length() - 2, sb.length() );		}		else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "er" ) ) {		    sb.delete( sb.length() - 2, sb.length() );		}		else if ( ( sb.length() + substCount > 4 ) && sb.substring( sb.length() - 2, sb.length() ).equals( "em" ) ) {		    sb.delete( sb.length() - 2, sb.length() );		}		else if ( sb.charAt( sb.length() - 1 ) == 't' ) {		    sb.deleteCharAt( sb.length() - 1 );		}		else if ( sb.charAt( sb.length() - 1 ) == 'n' ) {		    sb.deleteCharAt( sb.length() - 1 );		}		else if ( sb.charAt( sb.length() - 1 ) == 's' ) {		    sb.deleteCharAt( sb.length() - 1 );		}		else if ( sb.charAt( sb.length() - 1 ) == 'e' ) {		    sb.deleteCharAt( sb.length() - 1 );		}		else {		    doMore = false;		}	    }	}	sb = resubstitute( sb );	if ( !uppercase ) {	    sb = removeParticleDenotion( sb );	}	return sb.toString();    }    /**     * Removes a particle denotion ("ge") from a term, but only if at least 3     * characters will remain.     *     * @return  The term without particle denotion, if there was one.     */    private StringBuffer removeParticleDenotion( StringBuffer buffer ) {	for ( int c = 0; c < buffer.length(); c++ ) {	    // Strip from the beginning of the string to the "ge" inclusive	    if ( c < ( buffer.length() - 4 ) && buffer.charAt( c ) == 'g' && buffer.charAt ( c + 1 ) == 'e' ) {		buffer.delete( 0, c + 2 );	    }	}	return sb;    }    /**     * Do some substitutions for the term to reduce overstemming:     *     * - Substitute Umlauts with their corresponding vowel: 漩

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -