📄 brazilianstemmer.java
字号:
package org.apache.lucene.analysis.br;/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. *//** * A stemmer for Brazilian words. */public class BrazilianStemmer { /** * Changed term */ private String TERM ; private String CT ; private String R1 ; private String R2 ; private String RV ; public BrazilianStemmer() { } /** * Stemms the given term to an unique <tt>discriminator</tt>. * * @param term The term that should be stemmed. * @return Discriminator for <tt>term</tt> */ protected String stem( String term ) { boolean altered = false ; // altered the term // creates CT createCT(term) ; if ( !isIndexable( CT ) ) { return null; } if ( !isStemmable( CT ) ) { return CT ; } R1 = getR1(CT) ; R2 = getR1(R1) ; RV = getRV(CT) ; TERM = term + ";" +CT ; altered = step1() ; if (!altered) { altered = step2() ; } if (altered) { step3(); } else { step4(); } step5() ; return CT ; } /** * Checks a term if it can be processed correctly. * * @return true if, and only if, the given term consists in letters. */ private boolean isStemmable( String term ) { for ( int c = 0; c < term.length(); c++ ) { // Discard terms that contain non-letter characters. if ( !Character.isLetter(term.charAt(c))) { return false; } } return true; } /** * Checks a term if it can be processed indexed. * * @return true if it can be indexed */ private boolean isIndexable( String term ) { return (term.length() < 30) && (term.length() > 2) ; } /** * See if string is 'a','e','i','o','u' * * @return true if is vowel */ private boolean isVowel( char value ) { return (value == 'a') || (value == 'e') || (value == 'i') || (value == 'o') || (value == 'u') ; } /** * Gets R1 * * R1 - is the region after the first non-vowel follwing a vowel, * or is the null region at the end of the word if there is * no such non-vowel. * * @return null or a string representing R1 */ private String getR1( String value ) { int i; int j; // be-safe !!! if (value == null) { return null ; } // find 1st vowel i = value.length()-1 ; for (j=0 ; j < i ; j++) { if (isVowel(value.charAt(j))) { break ; } } if (!(j < i)) { return null ; } // find 1st non-vowel for ( ; j < i ; j++) { if (!(isVowel(value.charAt(j)))) { break ; } } if (!(j < i)) { return null ; } return value.substring(j+1) ; } /** * Gets RV * * RV - IF the second letter is a consoant, RV is the region after * the next following vowel, * * OR if the first two letters are vowels, RV is the region * after the next consoant, * * AND otherwise (consoant-vowel case) RV is the region after * the third letter. * * BUT RV is the end of the word if this positions cannot be * found. * * @return null or a string representing RV */ private String getRV( String value ) { int i; int j; // be-safe !!! if (value == null) { return null ; } i = value.length()-1 ; // RV - IF the second letter is a consoant, RV is the region after // the next following vowel, if ((i > 0) && !isVowel(value.charAt(1))) { // find 1st vowel for (j=2 ; j < i ; j++) { if (isVowel(value.charAt(j))) { break ; } } if (j < i) { return value.substring(j+1) ; } } // RV - OR if the first two letters are vowels, RV is the region // after the next consoant, if ((i > 1) && isVowel(value.charAt(0)) && isVowel(value.charAt(1))) { // find 1st consoant for (j=2 ; j < i ; j++) { if (!isVowel(value.charAt(j))) { break ; } } if (j < i) { return value.substring(j+1) ; } } // RV - AND otherwise (consoant-vowel case) RV is the region after // the third letter. if (i > 2) { return value.substring(3) ; } return null ; } /** * 1) Turn to lowercase * 2) Remove accents * 3) ã -> a ; õ -> o * 4) ç -> c * * @return null or a string transformed */ private String changeTerm( String value ) { int j; String r = "" ; // be-safe !!! if (value == null) { return null ; } value = value.toLowerCase() ; for (j=0 ; j < value.length() ; j++) { if ((value.charAt(j) == 'á') || (value.charAt(j) == 'â') || (value.charAt(j) == 'ã')) { r= r + "a" ; continue ; } if ((value.charAt(j) == 'é') || (value.charAt(j) == 'ê')) { r= r + "e" ; continue ; } if (value.charAt(j) == 'í') { r= r + "i" ; continue ; } if ((value.charAt(j) == 'ó') || (value.charAt(j) == 'ô') || (value.charAt(j) == 'õ')) { r= r + "o" ; continue ; } if ((value.charAt(j) == 'ú') || (value.charAt(j) == 'ü')) { r= r + "u" ; continue ; } if (value.charAt(j) == 'ç') { r= r + "c" ; continue ; } if (value.charAt(j) == 'ñ') { r= r + "n" ; continue ; } r= r+ value.charAt(j) ; } return r ; } /** * Check if a string ends with a suffix * * @return true if the string ends with the specified suffix */ private boolean suffix( String value, String suffix ) { // be-safe !!! if ((value == null) || (suffix == null)) { return false ; } if (suffix.length() > value.length()) { return false ; } return value.substring(value.length()-suffix.length()).equals(suffix); } /** * Replace a string suffix by another * * @return the replaced String */ private String replaceSuffix( String value, String toReplace, String changeTo ) { String vvalue ; // be-safe !!! if ((value == null) || (toReplace == null) || (changeTo == null) ) { return value ; } vvalue = removeSuffix(value,toReplace) ; if (value.equals(vvalue)) { return value ; } else { return vvalue + changeTo ; } } /** * Remove a string suffix * * @return the String without the suffix */ private String removeSuffix( String value, String toRemove ) { // be-safe !!! if ((value == null) || (toRemove == null) || !suffix(value,toRemove) ) { return value ; } return value.substring(0,value.length()-toRemove.length()) ; } /** * See if a suffix is preceded by a String * * @return true if the suffix is preceded */ private boolean suffixPreceded( String value, String suffix, String preceded ) { // be-safe !!! if ((value == null) || (suffix == null) || (preceded == null) || !suffix(value,suffix) ) { return false ; } return suffix(removeSuffix(value,suffix),preceded) ; } /** * Creates CT (changed term) , substituting * 'ã' and 'õ' for 'a~' and 'o~'. */ private void createCT( String term ) { CT = changeTerm(term) ; if (CT.length() < 2) return ; // if the first character is ... , remove it if ((CT.charAt(0) == '"') || (CT.charAt(0) == '\'') || (CT.charAt(0) == '-') || (CT.charAt(0) == ',') || (CT.charAt(0) == ';') || (CT.charAt(0) == '.') || (CT.charAt(0) == '?') || (CT.charAt(0) == '!') ) { CT = CT.substring(1); } if (CT.length() < 2) return ; // if the last character is ... , remove it if ((CT.charAt(CT.length()-1) == '-') || (CT.charAt(CT.length()-1) == ',') || (CT.charAt(CT.length()-1) == ';') || (CT.charAt(CT.length()-1) == '.') || (CT.charAt(CT.length()-1) == '?') || (CT.charAt(CT.length()-1) == '!') || (CT.charAt(CT.length()-1) == '\'') || (CT.charAt(CT.length()-1) == '"') ) { CT = CT.substring(0,CT.length()-1); } } /** * Standart suffix removal. * Search for the longest among the following suffixes, and perform * the following actions: * * @return false if no ending was removed */ private boolean step1() { if (CT == null) return false ; // suffix lenght = 7 if (suffix(CT,"uciones") && suffix(R2,"uciones")) { CT = replaceSuffix(CT,"uciones","u") ; return true; } // suffix lenght = 6 if (CT.length() >= 6) { if (suffix(CT,"imentos") && suffix(R2,"imentos")) { CT = removeSuffix(CT,"imentos") ; return true; } if (suffix(CT,"amentos") && suffix(R2,"amentos")) { CT = removeSuffix(CT,"amentos") ; return true; } if (suffix(CT,"adores") && suffix(R2,"adores")) { CT = removeSuffix(CT,"adores") ; return true; } if (suffix(CT,"adoras") && suffix(R2,"adoras")) { CT = removeSuffix(CT,"adoras") ; return true; } if (suffix(CT,"logias") && suffix(R2,"logias")) { replaceSuffix(CT,"logias","log") ; return true; } if (suffix(CT,"encias") && suffix(R2,"encias")) { CT = replaceSuffix(CT,"encias","ente") ; return true; } if (suffix(CT,"amente") && suffix(R1,"amente")) { CT = removeSuffix(CT,"amente") ; return true; } if (suffix(CT,"idades") && suffix(R2,"idades")) { CT = removeSuffix(CT,"idades") ; return true; } } // suffix lenght = 5 if (CT.length() >= 5) { if (suffix(CT,"acoes") && suffix(R2,"acoes")) { CT = removeSuffix(CT,"acoes") ; return true; } if (suffix(CT,"imento") && suffix(R2,"imento")) { CT = removeSuffix(CT,"imento") ; return true; } if (suffix(CT,"amento") && suffix(R2,"amento")) { CT = removeSuffix(CT,"amento") ; return true; } if (suffix(CT,"adora") && suffix(R2,"adora")) { CT = removeSuffix(CT,"adora") ; return true; } if (suffix(CT,"ismos") && suffix(R2,"ismos")) { CT = removeSuffix(CT,"ismos") ; return true; } if (suffix(CT,"istas") && suffix(R2,"istas")) { CT = removeSuffix(CT,"istas") ; return true; } if (suffix(CT,"logia") && suffix(R2,"logia")) { CT = replaceSuffix(CT,"logia","log") ; return true; } if (suffix(CT,"ucion") && suffix(R2,"ucion")) { CT = replaceSuffix(CT,"ucion","u") ; return true; } if (suffix(CT,"encia") && suffix(R2,"encia")) { CT = replaceSuffix(CT,"encia","ente") ; return true; } if (suffix(CT,"mente") && suffix(R2,"mente")) { CT = removeSuffix(CT,"mente") ; return true; } if (suffix(CT,"idade") && suffix(R2,"idade")) { CT = removeSuffix(CT,"idade") ; return true; } } // suffix lenght = 4 if (CT.length() >= 4) { if (suffix(CT,"acao") && suffix(R2,"acao")) { CT = removeSuffix(CT,"acao") ; return true; } if (suffix(CT,"ezas") && suffix(R2,"ezas")) { CT = removeSuffix(CT,"ezas") ; return true; } if (suffix(CT,"icos") && suffix(R2,"icos")) { CT = removeSuffix(CT,"icos") ; return true ; } if (suffix(CT,"icas") && suffix(R2,"icas")) { CT = removeSuffix(CT,"icas") ; return true ; } if (suffix(CT,"ismo") && suffix(R2,"ismo")) { CT = removeSuffix(CT,"ismo") ; return true ; } if (suffix(CT,"avel") && suffix(R2,"avel")) { CT = removeSuffix(CT,"avel") ; return true ; } if (suffix(CT,"ivel") && suffix(R2,"ivel")) { CT = removeSuffix(CT,"ivel") ; return true ; } if (suffix(CT,"ista") && suffix(R2,"ista")) { CT = removeSuffix(CT,"ista") ; return true ; } if (suffix(CT,"osos") && suffix(R2,"osos")) { CT = removeSuffix(CT,"osos") ; return true ; } if (suffix(CT,"osas") && suffix(R2,"osas")) { CT = removeSuffix(CT,"osas") ; return true ; } if (suffix(CT,"ador") && suffix(R2,"ador")) { CT = removeSuffix(CT,"ador") ; return true ; } if (suffix(CT,"ivas") && suffix(R2,"ivas")) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -