frenchstemmer.java

来自「一套java版本的搜索引擎源码」· Java 代码 · 共 710 行 · 第 1/2 页
JAVA
710 行
package org.apache.lucene.analysis.fr;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A stemmer for French words. The algorithm is based on the work of
 * Dr Martin Porter on his snowball project<br>
 * refer to http://snowball.sourceforge.net/french/stemmer.html<br>
 * (French stemming algorithm) for details
 *
 * @author    Patrick Talbot
 */
public class FrenchStemmer {

	/**
	 * Buffer for the terms while stemming them.
	 */
	private StringBuffer sb = new StringBuffer();

	/**
	 * A temporary buffer, used to reconstruct R2:
	 * it is loaded with R1 so that R2 can be derived from it.
	 */
	private StringBuffer tb = new StringBuffer();

	/**
	 * Region R0 is equal to the whole buffer
	 */
	private String R0;

	/**
	 * Region RV
	 * "If the word begins with two vowels, RV is the region after the third letter,
	 * otherwise the region after the first vowel not at the beginning of the word,
	 * or the end of the word if these positions cannot be found."
*/    private String RV;	/**	 * Region R1	 * "R1 is the region after the first non-vowel following a vowel	 * or is the null region at the end of the word if there is no such non-vowel"	 */    private String R1;	/**	 * Region R2	 * "R2 is the region after the first non-vowel in R1 following a vowel	 * or is the null region at the end of the word if there is no such non-vowel"	 */    private String R2;	/**	 * Set to true if we need to perform step 2	 */    private boolean suite;	/**	 * Set to true if the buffer was modified	 */    private boolean modified;    /**     * Stemms the given term to a unique <tt>discriminator</tt>.     *     * @param term  java.langString The term that should be stemmed     * @return java.lang.String  Discriminator for <tt>term</tt>     */    protected String stem( String term ) {		if ( !isStemmable( term ) ) {			return term;		}		// Use lowercase for medium stemming.		term = term.toLowerCase();		// Reset the StringBuffer.		sb.delete( 0, sb.length() );		sb.insert( 0, term );		// reset the booleans		modified = false;		suite = false;		sb = treatVowels( sb );		setStrings();		step1();		if (!modified || suite)		{			if (RV != null)			{				suite = step2a();				if (!suite)					step2b();			}		}		if (modified || suite)			step3();		else			step4();		step5();		step6();		return sb.toString();    }	/**	 * Sets the search region Strings<br>	 * it needs to be done each time the buffer was modified	 */	private void setStrings() {		// set the strings		R0 = sb.toString();		RV = retrieveRV( sb );		R1 = retrieveR( sb );		if ( R1 != null )		{			tb.delete( 0, tb.length() );			tb.insert( 0, R1 );			R2 = retrieveR( tb );		}		else			R2 = null;	}	/**	 * First step of the Porter Algorithmn<br>	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation	 */	private void step1( ) {		String[] suffix = { "ances", "iqUes", "ismes", "ables", "istes", "ance", "iqUe", "isme", "able", "iste" };		deleteFrom( R2, suffix );		replaceFrom( R2, new String[] { 
"logies", "logie" }, "log" );		replaceFrom( R2, new String[] { "usions", "utions", "usion", "ution" }, "u" );		replaceFrom( R2, new String[] { "ences", "ence" }, "ent" );		String[] search = { "atrices", "ateurs", "ations", "atrice", "ateur", "ation"};		deleteButSuffixFromElseReplace( R2, search, "ic",  true, R0, "iqU" );		deleteButSuffixFromElseReplace( R2, new String[] { "ements", "ement" }, "eus", false, R0, "eux" );		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "ativ", false );		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iv", false );		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "abl", false );		deleteButSuffixFrom( R2, new String[] { "ements", "ement" }, "iqU", false );		deleteFromIfTestVowelBeforeIn( R1, new String[] { "issements", "issement" }, false, R0 );		deleteFrom( RV, new String[] { "ements", "ement" } );		deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "abil", false, R0, "abl" );		deleteButSuffixFromElseReplace( R2, new String[] { "ités", "ité" }, "ic", false, R0, "iqU" );		deleteButSuffixFrom( R2, new String[] { "ités", "ité" }, "iv", true );		String[] autre = { "ifs", "ives", "if", "ive" };		deleteButSuffixFromElseReplace( R2, autre, "icat", false, R0, "iqU" );		deleteButSuffixFromElseReplace( R2, autre, "at", true, R2, "iqU" );		replaceFrom( R0, new String[] { "eaux" }, "eau" );		replaceFrom( R1, new String[] { "aux" }, "al" );		deleteButSuffixFromElseReplace( R2, new String[] { "euses", "euse" }, "", true, R1, "eux" );		deleteFrom( R2, new String[] { "eux" } );		// if one of the next steps is performed, we will need to perform step2a		boolean temp = false;		temp = replaceFrom( RV, new String[] { "amment" }, "ant" );		if (temp == true)			suite = true;		temp = replaceFrom( RV, new String[] { "emment" }, "ent" );		if (temp == true)			suite = true;		temp = deleteFromIfTestVowelBeforeIn( RV, new String[] { "ments", "ment" }, true, RV );		if (temp == true)			suite = true;	}	/**	 * 
Second step (A) of the Porter Algorithmn<br>	 * Will be performed if nothing changed from the first step	 * or changed were done in the amment, emment, ments or ment suffixes<br>	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation	 *	 * @return boolean - true if something changed in the StringBuffer	 */	private boolean step2a() {		String[] search = { "îmes", "îtes", "iraIent", "irait", "irais", "irai", "iras", "ira",							"irent", "iriez", "irez", "irions", "irons", "iront",							"issaIent", "issais", "issantes", "issante", "issants", "issant",							"issait", "issais", "issions", "issons", "issiez", "issez", "issent",							"isses", "isse", "ir", "is", "ît", "it", "ies", "ie", "i" };		return deleteFromIfTestVowelBeforeIn( RV, search, false, RV );	}	/**	 * Second step (B) of the Porter Algorithmn<br>	 * Will be performed if step 2 A was performed unsuccessfully<br>	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation	 */	private void step2b() {		String[] suffix = { "eraIent", "erais", "erait", "erai", "eras", "erions", "eriez",							"erons", "eront","erez", "èrent", "era", "ées", "iez",							"ée", "és", "er", "ez", "é" };		deleteFrom( RV, suffix );		String[] search = { "assions", "assiez", "assent", "asses", "asse", "aIent",							"antes", "aIent", "Aient", "ante", "âmes", "âtes", "ants", "ant",							"ait", "aît", "ais", "Ait", "Aît", "Ais", "ât", "as", "ai", "Ai", "a" };		deleteButSuffixFrom( RV, search, "e", true );		deleteFrom( R2, new String[] { "ions" } );	}	/**	 * Third step of the Porter Algorithmn<br>	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation	 */	private void step3() {		if (sb.length()>0)		{			char ch = sb.charAt( sb.length()-1 );			if (ch == 'Y')			{				sb.setCharAt( sb.length()-1, 'i' );				setStrings();			}			else if (ch == 'ç')			{				sb.setCharAt( sb.length()-1, 'c' );				setStrings();			}		}	}	/**	 * Fourth step of the Porter Algorithmn<br>	 * 
refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation	 */	private void step4() {		if (sb.length() > 1)		{			char ch = sb.charAt( sb.length()-1 );			if (ch == 's')			{				char b = sb.charAt( sb.length()-2 );				if (b != 'a' && b != 'i' && b != 'o' && b != 'u' && b != 'è' && b != 's')				{					sb.delete( sb.length() - 1, sb.length());					setStrings();				}			}		}		boolean found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "s" );		if (!found)		found = deleteFromIfPrecededIn( R2, new String[] { "ion" }, RV, "t" );		replaceFrom( RV, new String[] { "Ière", "ière", "Ier", "ier" }, "i" );		deleteFrom( RV, new String[] { "e" } );		deleteFromIfPrecededIn( RV, new String[] { "ë" }, R0, "gu" );	}	/**	 * Fifth step of the Porter Algorithmn<br>	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation	 */	private void step5() {		if (R0 != null)		{			if (R0.endsWith("enn") || R0.endsWith("onn") || R0.endsWith("ett") || R0.endsWith("ell") || R0.endsWith("eill"))			{				sb.delete( sb.length() - 1, sb.length() );				setStrings();			}		}	}	/**	 * Sixth (and last!) 
step of the Porter Algorithmn<br>	 * refer to http://snowball.sourceforge.net/french/stemmer.html for an explanation	 */	private void step6() {		if (R0!=null && R0.length()>0)		{			boolean seenVowel = false;			boolean seenConson = false;			int pos = -1;			for (int i = R0.length()-1; i > -1; i--)			{				char ch = R0.charAt(i);				if (isVowel(ch))				{					if (!seenVowel)					{						if (ch == 'é' || ch == 'è')						{							pos = i;							break;						}					}					seenVowel = true;				}				else				{					if (seenVowel)						break;					else						seenConson = true;				}			}			if (pos > -1 && seenConson && !seenVowel)				sb.setCharAt(pos, 'e');		}	}	/**	 * Delete a suffix searched in zone "source" if zone "from" contains prefix + search string	 *	 * @param source java.lang.String - the primary source zone for search	 * @param search java.lang.String[] - the strings to search for suppression	 * @param from java.lang.String - the secondary source zone for search	 * @param prefix java.lang.String - the prefix to add to the search string to test	 * @return boolean - true if modified	 */	private boolean deleteFromIfPrecededIn( String source, String[] search, String from, String prefix ) {		boolean found = false;		if (source!=null )		{			for (int i = 0; i < search.length; i++) {				if ( source.endsWith( search[i] ))				{
frenchstemmer.java - 源码说明

本页面展示了「一套java版本的搜索引擎源码」中的 frenchstemmer.java 源码文件，采用 Java 编程语言编写，共 710 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与java相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码：Ctrl + C
搜索代码：Ctrl + F
全屏模式：F11
增大字号：Ctrl + =
减小字号：Ctrl + -
显示快捷键：?