📄 brazilianstemmer.java

📁 一套java版本的搜索引擎源码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package org.apache.lucene.analysis.br;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements.  See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License.  You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *//** * A stemmer for Brazilian words. */public class BrazilianStemmer {	/**	 * Changed term	 */	private   String TERM ;	private   String CT ;	private   String R1 ;	private   String R2 ;	private   String RV ;	public BrazilianStemmer() {	}	/**	 * Stemms the given term to an unique <tt>discriminator</tt>.	 *	 * @param term  The term that should be stemmed.	 * @return      Discriminator for <tt>term</tt>	 */	protected String stem( String term ) {    boolean altered = false ; // altered the term    // creates CT    createCT(term) ;		if ( !isIndexable( CT ) ) {			return null;		}		if ( !isStemmable( CT ) ) {			return CT ;		}    R1 = getR1(CT) ;    R2 = getR1(R1) ;    RV = getRV(CT) ;    TERM = term + ";" +CT ;    altered = step1() ;    if (!altered) {      altered = step2() ;    }    if (altered) {      step3();    } else {      step4();    }    step5() ;    return CT ;	}	/**	 * Checks a term if it can be processed correctly.	 *	 * @return  true if, and only if, the given term consists in letters.	 */	private boolean isStemmable( String term ) {		for ( int c = 0; c < term.length(); c++ ) {			// Discard terms that contain non-letter characters.			if ( !Character.isLetter(term.charAt(c))) {				return false;			}		}		return true;	}	/**	 * Checks a term if it can be processed indexed.	 *	 * @return  true if it can be indexed	 */	private boolean isIndexable( String term ) {		return (term.length() < 30) && (term.length() > 2) ;	}	/**	 * See if string is 'a','e','i','o','u'   *   * @return true if is vowel	 */	private boolean isVowel( char value ) {    return (value == 'a') ||           (value == 'e') ||           (value == 'i') ||           (value == 'o') ||           (value == 'u') ;  }	/**	 * Gets R1   *   * R1 - is the region after the first non-vowel follwing a vowel,   *      or is the null region at the end of the word if there is   *      no such non-vowel.   *   * @return null or a string representing R1	 */	private String getR1( String value ) {    int     i;    int     j;    // be-safe !!!    if (value == null) {      return null ;    }    // find 1st vowel    i = value.length()-1 ;    for (j=0 ; j < i ; j++) {      if (isVowel(value.charAt(j))) {        break ;      }    }    if (!(j < i)) {      return null ;    }    // find 1st non-vowel    for ( ; j < i ; j++) {      if (!(isVowel(value.charAt(j)))) {        break ;      }    }    if (!(j < i)) {      return null ;    }    return value.substring(j+1) ;  }	/**	 * Gets RV   *   * RV - IF the second letter is a consoant, RV is the region after   *      the next following vowel,   *   *      OR if the first two letters are vowels, RV is the region   *      after the next consoant,   *   *      AND otherwise (consoant-vowel case) RV is the region after   *      the third letter.   *   *      BUT RV is the end of the word if this positions cannot be   *      found.   *   * @return null or a string representing RV	 */	private String getRV( String value ) {    int     i;    int     j;    // be-safe !!!    if (value == null) {      return null ;    }    i = value.length()-1 ;    // RV - IF the second letter is a consoant, RV is the region after    //      the next following vowel,    if ((i > 0) && !isVowel(value.charAt(1))) {      // find 1st vowel      for (j=2 ; j < i ; j++) {        if (isVowel(value.charAt(j))) {          break ;        }      }      if (j < i) {        return value.substring(j+1) ;      }    }    // RV - OR if the first two letters are vowels, RV is the region    //      after the next consoant,    if ((i > 1) &&        isVowel(value.charAt(0)) &&        isVowel(value.charAt(1))) {      // find 1st consoant      for (j=2 ; j < i ; j++) {        if (!isVowel(value.charAt(j))) {          break ;        }      }      if (j < i) {        return value.substring(j+1) ;      }    }    // RV - AND otherwise (consoant-vowel case) RV is the region after    //      the third letter.    if (i > 2) {      return value.substring(3) ;    }    return null ;  }	/**   * 1) Turn to lowercase   * 2) Remove accents   * 3) ã -> a ; õ -> o   * 4) ç -> c   *   * @return null or a string transformed	 */	private String changeTerm( String value ) {    int     j;    String  r = "" ;    // be-safe !!!    if (value == null) {      return null ;    }    value = value.toLowerCase() ;    for (j=0 ; j < value.length() ; j++) {      if ((value.charAt(j) == 'á') ||          (value.charAt(j) == 'â') ||          (value.charAt(j) == 'ã')) {        r= r + "a" ; continue ;      }      if ((value.charAt(j) == 'é') ||          (value.charAt(j) == 'ê')) {        r= r + "e" ; continue ;      }      if (value.charAt(j) == 'í') {        r= r + "i" ; continue ;      }      if ((value.charAt(j) == 'ó') ||          (value.charAt(j) == 'ô') ||          (value.charAt(j) == 'õ')) {        r= r + "o" ; continue ;      }      if ((value.charAt(j) == 'ú') ||          (value.charAt(j) == 'ü')) {        r= r + "u" ; continue ;      }      if (value.charAt(j) == 'ç') {        r= r + "c" ; continue ;      }      if (value.charAt(j) == 'ñ') {        r= r + "n" ; continue ;      }      r= r+ value.charAt(j) ;    }    return r ;  }	/**   * Check if a string ends with a suffix   *   * @return true if the string ends with the specified suffix	 */	private boolean suffix( String value, String suffix ) {    // be-safe !!!    if ((value == null) || (suffix == null)) {      return false ;    }    if (suffix.length() > value.length()) {      return false ;    }    return value.substring(value.length()-suffix.length()).equals(suffix);  }	/**   * Replace a string suffix by another   *   * @return the replaced String	 */	private String replaceSuffix( String value, String toReplace, String changeTo ) {    String vvalue ;    // be-safe !!!    if ((value == null) ||        (toReplace == null) ||        (changeTo == null) ) {      return value ;    }    vvalue = removeSuffix(value,toReplace) ;    if (value.equals(vvalue)) {      return value ;    } else {      return vvalue + changeTo ;    }  }	/**   * Remove a string suffix   *   * @return the String without the suffix	 */	private String removeSuffix( String value, String toRemove ) {    // be-safe !!!    if ((value == null) ||        (toRemove == null) ||        !suffix(value,toRemove) ) {      return value ;    }    return value.substring(0,value.length()-toRemove.length()) ;  }	/**   * See if a suffix is preceded by a String   *   * @return true if the suffix is preceded	 */	private boolean suffixPreceded( String value, String suffix, String preceded ) {    // be-safe !!!    if ((value == null) ||        (suffix == null) ||        (preceded == null) ||        !suffix(value,suffix) ) {      return false ;    }    return suffix(removeSuffix(value,suffix),preceded) ;  }	/**	 * Creates CT (changed term) , substituting * 'ã' and 'õ' for 'a~' and 'o~'.	 */	private void createCT( String term ) {    CT = changeTerm(term) ;    if (CT.length() < 2) return ;    // if the first character is ... , remove it    if ((CT.charAt(0) == '"')  ||        (CT.charAt(0) == '\'') ||        (CT.charAt(0) == '-')  ||        (CT.charAt(0) == ',')  ||        (CT.charAt(0) == ';')  ||        (CT.charAt(0) == '.')  ||        (CT.charAt(0) == '?')  ||        (CT.charAt(0) == '!')        ) {        CT = CT.substring(1);    }    if (CT.length() < 2) return ;    // if the last character is ... , remove it    if ((CT.charAt(CT.length()-1) == '-') ||        (CT.charAt(CT.length()-1) == ',') ||        (CT.charAt(CT.length()-1) == ';') ||        (CT.charAt(CT.length()-1) == '.') ||        (CT.charAt(CT.length()-1) == '?') ||        (CT.charAt(CT.length()-1) == '!') ||        (CT.charAt(CT.length()-1) == '\'') ||        (CT.charAt(CT.length()-1) == '"')        ) {        CT = CT.substring(0,CT.length()-1);    }  }	/**	 * Standart suffix removal.   * Search for the longest among the following suffixes, and perform   * the following actions:   *   * @return false if no ending was removed	 */	private boolean step1() {    if (CT == null) return false ;    // suffix lenght = 7    if (suffix(CT,"uciones") && suffix(R2,"uciones")) {        CT = replaceSuffix(CT,"uciones","u") ; return true;    }    // suffix lenght = 6    if (CT.length() >= 6) {      if (suffix(CT,"imentos") && suffix(R2,"imentos")) {          CT = removeSuffix(CT,"imentos") ; return true;      }      if (suffix(CT,"amentos") && suffix(R2,"amentos")) {          CT = removeSuffix(CT,"amentos") ; return true;      }      if (suffix(CT,"adores") && suffix(R2,"adores")) {          CT = removeSuffix(CT,"adores") ; return true;      }      if (suffix(CT,"adoras") && suffix(R2,"adoras")) {          CT = removeSuffix(CT,"adoras") ; return true;      }      if (suffix(CT,"logias") && suffix(R2,"logias")) {          replaceSuffix(CT,"logias","log") ; return true;      }      if (suffix(CT,"encias") && suffix(R2,"encias")) {          CT = replaceSuffix(CT,"encias","ente") ; return true;      }      if (suffix(CT,"amente") && suffix(R1,"amente")) {          CT = removeSuffix(CT,"amente") ; return true;      }      if (suffix(CT,"idades") && suffix(R2,"idades")) {          CT = removeSuffix(CT,"idades") ; return true;      }    }    // suffix lenght = 5    if (CT.length() >= 5) {      if (suffix(CT,"acoes") && suffix(R2,"acoes")) {          CT = removeSuffix(CT,"acoes") ; return true;      }      if (suffix(CT,"imento") && suffix(R2,"imento")) {          CT = removeSuffix(CT,"imento") ; return true;      }      if (suffix(CT,"amento") && suffix(R2,"amento")) {          CT = removeSuffix(CT,"amento") ; return true;      }      if (suffix(CT,"adora") && suffix(R2,"adora")) {          CT = removeSuffix(CT,"adora") ; return true;      }      if (suffix(CT,"ismos") && suffix(R2,"ismos")) {          CT = removeSuffix(CT,"ismos") ; return true;      }      if (suffix(CT,"istas") && suffix(R2,"istas")) {          CT = removeSuffix(CT,"istas") ; return true;      }      if (suffix(CT,"logia") && suffix(R2,"logia")) {          CT = replaceSuffix(CT,"logia","log") ; return true;      }      if (suffix(CT,"ucion") && suffix(R2,"ucion")) {          CT = replaceSuffix(CT,"ucion","u") ; return true;      }      if (suffix(CT,"encia") && suffix(R2,"encia")) {          CT = replaceSuffix(CT,"encia","ente") ; return true;      }      if (suffix(CT,"mente") && suffix(R2,"mente")) {          CT = removeSuffix(CT,"mente") ; return true;      }      if (suffix(CT,"idade") && suffix(R2,"idade")) {          CT = removeSuffix(CT,"idade") ; return true;      }    }    // suffix lenght = 4    if (CT.length() >= 4) {      if (suffix(CT,"acao") && suffix(R2,"acao")) {          CT = removeSuffix(CT,"acao") ; return true;      }      if (suffix(CT,"ezas") && suffix(R2,"ezas")) {          CT = removeSuffix(CT,"ezas") ; return true;      }      if (suffix(CT,"icos") && suffix(R2,"icos")) {          CT = removeSuffix(CT,"icos") ; return true ;      }      if (suffix(CT,"icas") && suffix(R2,"icas")) {          CT = removeSuffix(CT,"icas") ; return true ;      }      if (suffix(CT,"ismo") && suffix(R2,"ismo")) {          CT = removeSuffix(CT,"ismo") ; return true ;      }      if (suffix(CT,"avel") && suffix(R2,"avel")) {          CT = removeSuffix(CT,"avel") ; return true ;      }      if (suffix(CT,"ivel") && suffix(R2,"ivel")) {          CT = removeSuffix(CT,"ivel") ; return true ;      }      if (suffix(CT,"ista") && suffix(R2,"ista")) {          CT = removeSuffix(CT,"ista") ; return true ;      }      if (suffix(CT,"osos") && suffix(R2,"osos")) {          CT = removeSuffix(CT,"osos") ; return true ;      }      if (suffix(CT,"osas") && suffix(R2,"osas")) {          CT = removeSuffix(CT,"osas") ; return true ;      }      if (suffix(CT,"ador") && suffix(R2,"ador")) {          CT = removeSuffix(CT,"ador") ; return true ;      }      if (suffix(CT,"ivas") && suffix(R2,"ivas")) {
12 下一页
💿 文件大小 8964 K
👤 上传用户 wldxmy
📂 所属分类中间件编程
🏷️ 相关标签

#java #版本 #搜索引擎 #源码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -