porter.java

来自「wekaUT是 university texas austin 开发的基于wek」· Java 代码 · 共 420 行

JAVA
420
字号
package weka.deduping.metrics;import java.io.*;/* author:   Fotis Lazarinis (actually I translated from C to Java)   date:     June 1997   address:  Psilovraxou 12, Agrinio, 30100   comments: Compile it, import the Porter class into you program and create an instance.	     Then use the stripAffixes method of this method which takes a String as              input and returns the stem of this String again as a String.*/ class NewString {  public String str;  NewString() {    str = "";  }}/** The Porter stemmer for reducing words to their base stem form. * * @author Fotis Lazarinis */public class Porter implements Serializable {  private String Clean( String str ) {    int last = str.length();         Character ch = new Character( str.charAt(0) );    String temp = "";    for ( int i=0; i < last; i++ ) {      if ( ch.isLetterOrDigit( str.charAt(i) ) )	temp += str.charAt(i);    }       return temp;  } //clean   private boolean hasSuffix( String word, String suffix, NewString stem ) {    String tmp = "";    if ( word.length() <= suffix.length() )      return false;    if (suffix.length() > 1)       if ( word.charAt( word.length()-2 ) != suffix.charAt( suffix.length()-2 ) )	return false;      stem.str = "";    for ( int i=0; i<word.length()-suffix.length(); i++ )      stem.str += word.charAt( i );    tmp = stem.str;    for ( int i=0; i<suffix.length(); i++ )      tmp += suffix.charAt( i );    if ( tmp.compareTo( word ) == 0 )      return true;    else      return false;  }  private boolean vowel( char ch, char prev ) {    switch ( ch ) {    case 'a': case 'e': case 'i': case 'o': case 'u':       return true;    case 'y': {      switch ( prev ) {      case 'a': case 'e': case 'i': case 'o': case 'u': 	return false;      default: 	return true;      }    }            default :       return false;    }  }  private int measure( String stem ) {        int i=0, count = 0;    int length = stem.length();    while ( i < length ) {      for ( ; i < length ; i++ ) {	if ( i > 0 ) {	  if ( vowel(stem.charAt(i),stem.charAt(i-1)) )	    break;	}	else {  	  if ( vowel(stem.charAt(i),'a') )	    break; 	}      }      for ( i++ ; i < length ; i++ ) {	if ( i > 0 ) {	  if ( !vowel(stem.charAt(i),stem.charAt(i-1)) )	    break;	}	else {  	  if ( !vowel(stem.charAt(i),'?') )	    break;	}      }       if ( i < length ) {	count++;	i++;      }    } //while        return(count);  }  private boolean containsVowel( String word ) {    for (int i=0 ; i < word.length(); i++ )      if ( i > 0 ) {	if ( vowel(word.charAt(i),word.charAt(i-1)) )	  return true;      }      else {  	if ( vowel(word.charAt(0),'a') )	  return true;      }            return false;  }  private boolean cvc( String str ) {    int length=str.length();    if ( length < 3 )      return false;        if ( (!vowel(str.charAt(length-1),str.charAt(length-2)) )	 && (str.charAt(length-1) != 'w') && (str.charAt(length-1) != 'x') && (str.charAt(length-1) != 'y')	 && (vowel(str.charAt(length-2),str.charAt(length-3))) ) {      if (length == 3) {	if (!vowel(str.charAt(0),'?')) 	  return true;	else	  return false;      }      else {	if (!vowel(str.charAt(length-3),str.charAt(length-4)) ) 	  return true; 	else	  return false;      }     }         return false;  }  private String step1( String str ) {     NewString stem = new NewString();    if ( str.charAt( str.length()-1 ) == 's' ) {      if ( (hasSuffix( str, "sses", stem )) || (hasSuffix( str, "ies", stem)) ){	String tmp = "";	for (int i=0; i<str.length()-2; i++)	  tmp += str.charAt(i);	str = tmp;      }      else {	if ( ( str.length() == 1 ) && ( str.charAt(str.length()-1) == 's' ) ) {	  str = "";	  return str;	}	if ( str.charAt( str.length()-2 ) != 's' ) {	  String tmp = "";	  for (int i=0; i<str.length()-1; i++)	    tmp += str.charAt(i);	  str = tmp;	}      }      }    if ( hasSuffix( str,"eed",stem ) ) {      if ( measure( stem.str ) > 0 ) {	String tmp = "";	for (int i=0; i<str.length()-1; i++)	  tmp += str.charAt( i );	str = tmp;      }    }    else {        if (  (hasSuffix( str,"ed",stem )) || (hasSuffix( str,"ing",stem )) ) { 	if (containsVowel( stem.str ))  {	  String tmp = "";	  for ( int i = 0; i < stem.str.length(); i++)	    tmp += str.charAt( i );	  str = tmp;	  if ( str.length() == 1 )	    return str;	  if ( ( hasSuffix( str,"at",stem) ) || ( hasSuffix( str,"bl",stem ) ) || ( hasSuffix( str,"iz",stem) ) ) {	    str += "e";           	  }	  else {   	    int length = str.length(); 	    if ( (str.charAt(length-1) == str.charAt(length-2)) 		 && (str.charAt(length-1) != 'l') && (str.charAt(length-1) != 's') && (str.charAt(length-1) != 'z') ) {                     	      tmp = "";	      for (int i=0; i<str.length()-1; i++)		tmp += str.charAt(i);	      str = tmp;	    }	    else	      if ( measure( str ) == 1 ) {		if ( cvc(str) ) 		  str += "e";	      }	  }	}      }    }    if ( hasSuffix(str,"y",stem) )       if ( containsVowel( stem.str ) ) {	String tmp = "";	for (int i=0; i<str.length()-1; i++ )	  tmp += str.charAt(i);	str = tmp + "i";      }    return str;    }  private String step2( String str ) {    String[][] suffixes = { { "ational", "ate" },			    { "tional",  "tion" },			    { "enci",    "ence" },			    { "anci",    "ance" },			    { "izer",    "ize" },			    { "iser",    "ize" },			    { "abli",    "able" },			    { "alli",    "al" },			    { "entli",   "ent" },			    { "eli",     "e" },			    { "ousli",   "ous" },			    { "ization", "ize" },			    { "isation", "ize" },			    { "ation",   "ate" },			    { "ator",    "ate" },			    { "alism",   "al" },			    { "iveness", "ive" },			    { "fulness", "ful" },			    { "ousness", "ous" },			    { "aliti",   "al" },			    { "iviti",   "ive" },			    { "biliti",  "ble" }};    NewString stem = new NewString();         for ( int index = 0 ; index < suffixes.length; index++ ) {      if ( hasSuffix ( str, suffixes[index][0], stem ) ) {	if ( measure ( stem.str ) > 0 ) {	  str = stem.str + suffixes[index][1];	  return str;	}      }    }    return str;  }  private String step3( String str ) {    String[][] suffixes = { { "icate", "ic" },			    { "ative", "" },			    { "alize", "al" },			    { "alise", "al" },			    { "iciti", "ic" },			    { "ical",  "ic" },			    { "ful",   "" },			    { "ness",  "" },			    {"sion", "s"},			    {"tion", "t"}    };    NewString stem = new NewString();    for ( int index = 0 ; index<suffixes.length; index++ ) {      if ( hasSuffix ( str, suffixes[index][0], stem ))	if ( measure ( stem.str ) > 0 ) {	  str = stem.str + suffixes[index][1];	  return str;	}    }    return str;  }  private String step4( String str ) {            String[] suffixes = { "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent",			  "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"};         NewString stem = new NewString();            for ( int index = 0 ; index<suffixes.length; index++ ) {      if ( hasSuffix ( str, suffixes[index], stem ) ) {            	if ( measure ( stem.str ) > 1 ) {	  str = stem.str;	  return str;	}      }    }    return str;  }  private String step5( String str ) {    if ( str.charAt(str.length()-1) == 'e' ) {       if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends in vowel */	String tmp = "";	for ( int i=0; i<str.length()-1; i++ ) 	  tmp += str.charAt( i );	str = tmp;      }      else	if ( measure(str) == 1 ) {	  String stem = "";	  for ( int i=0; i<str.length()-1; i++ ) 	    stem += str.charAt( i );	  if ( !cvc(stem) )	    str = stem;	}    }         if ( str.length() == 1 )      return str;    if ( (str.charAt(str.length()-1) == 'l') && (str.charAt(str.length()-2) == 'l') && (measure(str) > 1) )      if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends in vowel */	String tmp = "";	for ( int i=0; i<str.length()-1; i++ ) 	  tmp += str.charAt( i );	str = tmp;      }     return str;  }  private String stripPrefixes ( String str) {    String[] prefixes = { "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"};    int last = prefixes.length;    for ( int i=0 ; i<last; i++ ) {      if ( str.startsWith( prefixes[i] ) ) {	String temp = "";	for ( int j=0 ; j< str.length()-prefixes[i].length(); j++ )	  temp += str.charAt( j+prefixes[i].length() );	return temp;      }    }         return str;  }  private String stripSuffixes( String str ) {    str = step1( str );    if ( str.length() >= 1 )      str = step2( str );    if ( str.length() >= 1 )      str = step3( str );    if ( str.length() >= 1 )      str = step4( str );    if ( str.length() >= 1 )      str = step5( str );     return str;   }  /**  Takes a String as input and returns its stem as a String.*/  public String stripAffixes( String str ) {    str = str.toLowerCase();    str = Clean(str);      if (( str != "" ) && (str.length() > 2)) {      str = stripPrefixes(str);      if (str != "" ) 	str = stripSuffixes(str);    }       return str;  } //stripAffixes  /** For testing, print the stemmed version of a word */  public static void main(String[] args) throws IOException {    String word = args[0];    Porter stemmer = new Porter();    String stem = stemmer.stripAffixes(word);    System.out.println(stem);  }} //class

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?