⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stemmer.java

📁 n algorithm for domain independent linear text segmentation This the Windows version of the C99 al
💻 JAVA
字号:
package uk.ac.man.cs.choif.nlp.surface;/* author:   Fotis Lazarinis (actually I translated from C to Java)   date:     June 1997   address:  Psilovraxou 12, Agrinio, 30100   comments: Compile it, import the Porter class into you program and create an instance.	     Then use the stripAffixes method of this method which takes a String as 			 input and returns the stem of this String again as a String.*/ public final class Stemmer {	private static class NewString {		public String str = "";	}  private static String Clean( String str ) {	 int last = str.length();	 	 Character ch = new Character( str.charAt(0) );	 String temp = "";	 for ( int i=0; i < last; i++ ) {		 if ( ch.isLetterOrDigit( str.charAt(i) ) )			temp += str.charAt(i);	 }   	 return temp;  } //clean                                      private static boolean containsVowel( String word ) {	 for (int i=0 ; i < word.length(); i++ )		 if ( i > 0 ) {			if ( vowel(word.charAt(i),word.charAt(i-1)) )			   return true;		 }		 else {  			if ( vowel(word.charAt(0),'a') )			   return true;		 }			 return false;  }  private static boolean cvc( String str ) {	 int length=str.length();	 if ( length < 3 )		return false;		 if ( (!vowel(str.charAt(length-1),str.charAt(length-2)) )		&& (str.charAt(length-1) != 'w') && (str.charAt(length-1) != 'x') && (str.charAt(length-1) != 'y')		&& (vowel(str.charAt(length-2),str.charAt(length-3))) ) {		if (length == 3) {		   if (!vowel(str.charAt(0),'?')) 			  return true;		   else			  return false;		}		else {		   if (!vowel(str.charAt(length-3),str.charAt(length-4)) ) 			  return true; 		   else			  return false;		} 	 }     	 return false;  }  private static boolean hasSuffix( String word, String suffix, NewString stem ) {	 String tmp = "";	 if ( word.length() <= suffix.length() )		return false;	 if (suffix.length() > 1) 		if ( word.charAt( word.length()-2 ) != suffix.charAt( suffix.length()-2 ) )		   return false;  	 stem.str = "";	 for ( int i=0; i<word.length()-suffix.length(); i++ )		 stem.str += word.charAt( i );	 tmp = stem.str;	 for ( int i=0; i<suffix.length(); i++ )		 tmp += suffix.charAt( i );	 if ( tmp.compareTo( word ) == 0 )		return true;	 else		return false;  }  private static int measure( String stem ) {		int i=0, count = 0;	int length = stem.length();	while ( i < length ) {	   for ( ; i < length ; i++ ) {		   if ( i > 0 ) {			  if ( vowel(stem.charAt(i),stem.charAt(i-1)) )				 break;		   }		   else {  			  if ( vowel(stem.charAt(i),'a') )				break; 		   }	   }	   for ( i++ ; i < length ; i++ ) {		   if ( i > 0 ) {			  if ( !vowel(stem.charAt(i),stem.charAt(i-1)) )				  break;			  }		   else {  			  if ( !vowel(stem.charAt(i),'?') )				 break;		   }	   } 	  if ( i < length ) {		 count++;		 i++;	  }	} //while		return(count);  }  public final static String stemOf( String str ) {	str = str.toLowerCase();	str = Clean(str);  	if (( str != "" ) && (str.length() > 2)) {	   str = stripPrefixes(str);	   if (str != "" ) 		  str = stripSuffixes(str);	}   	return str;	} //stripAffixesprivate static String step1( String str ) { 	 NewString stem = new NewString();	 if ( str.charAt( str.length()-1 ) == 's' ) {		if ( (hasSuffix( str, "sses", stem )) || (hasSuffix( str, "ies", stem)) ){		   String tmp = "";		   for (int i=0; i<str.length()-2; i++)			   tmp += str.charAt(i);		   str = tmp;		}		else {		   if ( ( str.length() == 1 ) && ( str.charAt(str.length()-1) == 's' ) ) {			  str = "";			  return str;		   }		   if ( str.charAt( str.length()-2 ) != 's' ) {			  String tmp = "";			  for (int i=0; i<str.length()-1; i++)				  tmp += str.charAt(i);			  str = tmp;		   }		}  	 }	 if ( hasSuffix( str,"eed",stem ) ) {		   if ( measure( stem.str ) > 0 ) {			  String tmp = "";			  for (int i=0; i<str.length()-1; i++)				  tmp += str.charAt( i );			  str = tmp;		   }	 }	 else {  		if (  (hasSuffix( str,"ed",stem )) || (hasSuffix( str,"ing",stem )) ) { 		   if (containsVowel( stem.str ))  {			  String tmp = "";			  for ( int i = 0; i < stem.str.length(); i++)				  tmp += str.charAt( i );			  str = tmp;			  if ( str.length() == 1 )				 return str;			  if ( ( hasSuffix( str,"at",stem) ) || ( hasSuffix( str,"bl",stem ) ) || ( hasSuffix( str,"iz",stem) ) ) {				 str += "e";		   			  }			  else {   				 int length = str.length(); 				 if ( (str.charAt(length-1) == str.charAt(length-2)) 					&& (str.charAt(length-1) != 'l') && (str.charAt(length-1) != 's') && (str.charAt(length-1) != 'z') ) {					 					tmp = "";					for (int i=0; i<str.length()-1; i++)						tmp += str.charAt(i);					str = tmp;				 }				 else					if ( measure( str ) == 1 ) {					   if ( cvc(str) ) 						  str += "e";					}			  }		   }		}	 }	 if ( hasSuffix(str,"y",stem) ) 		if ( containsVowel( stem.str ) ) {		   String tmp = "";		   for (int i=0; i<str.length()-1; i++ )			   tmp += str.charAt(i);		   str = tmp + "i";		}	 return str;    }  private static String step2( String str ) {	 String[][] suffixes = { { "ational", "ate" },									{ "tional",  "tion" },									{ "enci",    "ence" },									{ "anci",    "ance" },									{ "izer",    "ize" },									{ "iser",    "ize" },									{ "abli",    "able" },									{ "alli",    "al" },									{ "entli",   "ent" },									{ "eli",     "e" },									{ "ousli",   "ous" },									{ "ization", "ize" },									{ "isation", "ize" },									{ "ation",   "ate" },									{ "ator",    "ate" },									{ "alism",   "al" },									{ "iveness", "ive" },									{ "fulness", "ful" },									{ "ousness", "ous" },									{ "aliti",   "al" },									{ "iviti",   "ive" },									{ "biliti",  "ble" }};	 NewString stem = new NewString();	 	 for ( int index = 0 ; index < suffixes.length; index++ ) {		 if ( hasSuffix ( str, suffixes[index][0], stem ) ) {			if ( measure ( stem.str ) > 0 ) {			   str = stem.str + suffixes[index][1];			   return str;			}		 }	 }	 return str;  }  private static String step3( String str ) {		String[][] suffixes = { { "icate", "ic" },									   { "ative", "" },									   { "alize", "al" },									   { "alise", "al" },									   { "iciti", "ic" },									   { "ical",  "ic" },									   { "ful",   "" },									   { "ness",  "" }};		NewString stem = new NewString();		for ( int index = 0 ; index<suffixes.length; index++ ) {			if ( hasSuffix ( str, suffixes[index][0], stem ))			   if ( measure ( stem.str ) > 0 ) {				  str = stem.str + suffixes[index][1];				  return str;			   }		}		return str;  }  private static String step4( String str ) {			 String[] suffixes = { "al", "ance", "ence", "er", "ic", "able", "ible", "ant", "ement", "ment", "ent", "sion", "tion",						   "ou", "ism", "ate", "iti", "ous", "ive", "ize", "ise"};	 	 NewString stem = new NewString();			 for ( int index = 0 ; index<suffixes.length; index++ ) {		 if ( hasSuffix ( str, suffixes[index], stem ) ) {		   			if ( measure ( stem.str ) > 1 ) {			   str = stem.str;			   return str;			}		 }	 }	 return str;  }  private static String step5( String str ) {	 if ( str.charAt(str.length()-1) == 'e' ) { 		if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends in vowel */		   String tmp = "";		   for ( int i=0; i<str.length()-1; i++ ) 			   tmp += str.charAt( i );		   str = tmp;		}		else		   if ( measure(str) == 1 ) {			  String stem = "";			  for ( int i=0; i<str.length()-1; i++ ) 				  stem += str.charAt( i );			  if ( !cvc(stem) )				 str = stem;		   }	 }	 	 if ( str.length() == 1 )		return str;	 if ( (str.charAt(str.length()-1) == 'l') && (str.charAt(str.length()-2) == 'l') && (measure(str) > 1) )		if ( measure(str) > 1 ) {/* measure(str)==measure(stem) if ends in vowel */		   String tmp = "";		   for ( int i=0; i<str.length()-1; i++ ) 			   tmp += str.charAt( i );		   str = tmp;		} 	 return str;  }  private static String stripPrefixes ( String str) {	 String[] prefixes = { "kilo", "micro", "milli", "intra", "ultra", "mega", "nano", "pico", "pseudo"};	 int last = prefixes.length;	 for ( int i=0 ; i<last; i++ ) {		 if ( str.startsWith( prefixes[i] ) ) {			String temp = "";			for ( int j=0 ; j< str.length()-prefixes[i].length(); j++ )				temp += str.charAt( j+prefixes[i].length() );			return temp;		 }	 }	 	 return str;  }  private static String stripSuffixes( String str ) {	 str = step1( str );	 if ( str.length() >= 1 )		str = step2( str );	 if ( str.length() >= 1 )		str = step3( str );	 if ( str.length() >= 1 )		str = step4( str );	 if ( str.length() >= 1 )		str = step5( str ); 	 return str;   }  private static boolean vowel( char ch, char prev ) {	 switch ( ch ) {		case 'a': case 'e': case 'i': case 'o': case 'u': 		  return true;		case 'y': {		  switch ( prev ) {			case 'a': case 'e': case 'i': case 'o': case 'u': 			  return false;			default: 			  return true;		  }		}				default : 		  return false;	 }  }  }

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -