📄 generictransformator.java
字号:
/*
Jazzy - a Java library for Spell Checking
Copyright (C) 2001 Mindaugas Idzelis
Full text of license can be found in LICENSE.txt
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
package com.swabunga.spell.engine;
import com.swabunga.util.StringUtility;
import java.io.*;
import java.util.HashMap;
import java.util.Vector;
/**
* A Generic implementation of a transformator takes an
* <a href="http://aspell.net/man-html/Phonetic-Code.html">
* aspell phonetics file</a> and constructs some sort of transformation
* table using the inner class TransformationRule.
* </p>
* Basically, each transformation rule represent a line in the phonetic file.
* One line contains two groups of characters separated by white space(s).
* The first group is the <em>match expression</em>.
* The <em>match expression</em> describe letters to associate with a syllable.
* The second group is the <em>replacement expression</em> giving the phonetic
* equivalent of the <em>match expression</em>.
*
* @see SpellDictionaryASpell SpellDictionaryASpell for information on getting
* phonetic files for aspell.
*
* @author Robert Gustavsson (robert@lindesign.se)
*/
public class GenericTransformator implements Transformator {

    /**
     * This replace list is used if no phonetic file is supplied or it doesn't
     * contain the alphabet.
     */
    private static final char[] defaultEnglishAlphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'};

    /**
     * The alphabet start marker.
     * @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
     */
    public static final char ALPHABET_START = '[';

    /**
     * The alphabet end marker.
     * @see GenericTransformator#KEYWORD_ALPHBET KEYWORD_ALPHBET
     */
    public static final char ALPHABET_END = ']';

    /**
     * Phonetic file keyword indicating that a different alphabet is used
     * for this language. The keyword must be followed by an
     * {@link GenericTransformator#ALPHABET_START ALPHABET_START} marker,
     * a list of characters defining the alphabet and an
     * {@link GenericTransformator#ALPHABET_END ALPHABET_END} marker.
     */
    public static final String KEYWORD_ALPHBET = "alphabet";

    /**
     * Phonetic file lines starting with one of these keywords are skipped.
     * The keywords are: version, followup, collapse_result.
     * Comments, starting with '#', are also skipped to the end of line.
     */
    public static final String[] IGNORED_KEYWORDS = {"version", "followup", "collapse_result"};

    /**
     * Starts a group of alternative characters inside the match expression
     * of the phonetic file. A whole group matches exactly one character of
     * the word.
     */
    public static final char STARTMULTI = '(';

    /**
     * Ends a group of alternative characters inside the match expression
     * of the phonetic file.
     */
    public static final char ENDMULTI = ')';

    /**
     * During phonetic transformation of a word each numeric character is
     * replaced by this DIGITCODE.
     */
    public static final String DIGITCODE = "0";

    /**
     * Phonetic file character code indicating that the replace expression
     * is empty.
     */
    public static final String REPLACEVOID = "_";

    // The transformation table, in the order the rules appear in the phonetic
    // file. The first matching rule wins in transform().
    private TransformationRule[] ruleArray = null;

    // Starts out as the default English alphabet, may be replaced by an
    // "alphabet" line while parsing, and is finally replaced by its washed
    // version at the end of every constructor.
    private char[] alphabetString = defaultEnglishAlphabet;

    /**
     * Construct a transformation table from the phonetic file. The file is
     * read with {@link FileReader} and closed before the constructor returns.
     * @param phonetic the phonetic file as specified in aspell
     * @throws java.io.IOException indicates a problem while reading
     * the phonetic file
     */
    public GenericTransformator(File phonetic) throws IOException {
        BufferedReader in = new BufferedReader(new FileReader(phonetic));
        try {
            buildRules(in);
        } finally {
            // Release the file handle whether or not parsing succeeded.
            in.close();
        }
        alphabetString = washAlphabetIntoReplaceList(getReplaceList());
    }

    /**
     * Construct a transformation table from the phonetic file. The file is
     * closed before the constructor returns.
     * @param phonetic the phonetic file as specified in aspell
     * @param encoding the character set required
     * @throws java.io.IOException indicates a problem while reading
     * the phonetic file
     */
    public GenericTransformator(File phonetic, String encoding) throws IOException {
        InputStream stream = new FileInputStream(phonetic);
        try {
            // The InputStreamReader constructor itself may throw for an
            // unsupported encoding, so the raw stream is what gets closed.
            buildRules(new BufferedReader(new InputStreamReader(stream, encoding)));
        } finally {
            stream.close();
        }
        alphabetString = washAlphabetIntoReplaceList(getReplaceList());
    }

    /**
     * Construct a transformation table from the phonetic file
     * @param phonetic the phonetic file as specified in aspell. The file is
     * supplied as a reader. The reader is read to its end but is not closed
     * by this constructor.
     * @throws java.io.IOException indicates a problem while reading
     * the phonetic information
     */
    public GenericTransformator(Reader phonetic) throws IOException {
        buildRules(new BufferedReader(phonetic));
        alphabetString = washAlphabetIntoReplaceList(getReplaceList());
    }

    /**
     * Goes through an alphabet and makes sure that only one of those letters
     * that are coded equally will be in the replace list.
     * In other words, it removes any letters in the alphabet
     * that are redundant phonetically.
     *
     * This is done to improve speed in the getSuggestion method.
     *
     * @param alphabet The complete alphabet to wash.
     * @return The washed alphabet to be used as replace list. The order of
     * the letters follows the iteration order of a HashMap.
     */
    private char[] washAlphabetIntoReplaceList(char[] alphabet) {
        // Maps phonetic code -> first letter (as a one-char String) that
        // produced that code.
        HashMap letters = new HashMap(alphabet.length);
        for (int i = 0; i < alphabet.length; i++) {
            String letter = String.valueOf(alphabet[i]);
            String code = transform(letter);
            if (!letters.containsKey(code)) {
                letters.put(code, letter);
            }
        }
        Object[] kept = letters.values().toArray();
        char[] washedArray = new char[kept.length];
        for (int i = 0; i < kept.length; i++) {
            washedArray[i] = ((String) kept[i]).charAt(0);
        }
        return washedArray;
    }

    /**
     * Takes out all single character replacements and put them in a char array.
     * This array can later be used for adding or changing letters in getSuggestion().
     * @return char[] An array of chars with replacements characters, or null
     * if no rules have been built.
     */
    public char[] getCodeReplaceList() {
        if (ruleArray == null)
            return null;
        Vector singles = new Vector();
        for (int i = 0; i < ruleArray.length; i++) {
            String replaceExp = ruleArray[i].getReplaceExp();
            if (replaceExp.length() == 1)
                singles.addElement(replaceExp);
        }
        char[] replacements = new char[singles.size()];
        for (int i = 0; i < replacements.length; i++) {
            replacements[i] = ((String) singles.elementAt(i)).charAt(0);
        }
        return replacements;
    }

    /**
     * Returns the replace list: the alphabet of the language (as read from
     * the alphabet tag in the phonetic file, or the default English alphabet
     * when the file has no such tag) with phonetically redundant letters
     * removed.
     * @return char[] An array of chars representing the washed alphabet.
     */
    public char[] getReplaceList() {
        return alphabetString;
    }

    /**
     * Builds the phonetic code of the word.
     * The word is upper-cased and scanned from left to right. Digits are
     * replaced by {@link #DIGITCODE}. At every other position the first rule
     * that matches is applied; if no rule matches the character is kept.
     * @param word the word to transform
     * @return the phonetic transformation of the word, or null if no rules
     * have been built
     */
    public String transform(String word) {
        if (ruleArray == null)
            return null;
        StringBuffer str = new StringBuffer(word.toUpperCase());
        int strLength = str.length();
        int startPos = 0;
        while (startPos < strLength) {
            // Number of characters to step over; 1 when no rule applies.
            int add = 1;
            if (Character.isDigit(str.charAt(startPos))) {
                StringUtility.replace(str, startPos, startPos + DIGITCODE.length(), DIGITCODE);
                startPos += add;
                continue;
            }
            for (int i = 0; i < ruleArray.length; i++) {
                TransformationRule rule = ruleArray[i];
                // Rules anchored with '^' only apply at the start of the word.
                if (rule.startsWithExp() && startPos > 0)
                    continue;
                // Not enough characters left for this rule to match.
                if (startPos + rule.lengthOfMatch() > strLength)
                    continue;
                if (rule.isMatching(str, startPos)) {
                    String replaceExp = rule.getReplaceExp();
                    int takeOut = rule.getTakeOut();
                    add = replaceExp.length();
                    StringUtility.replace(str, startPos, startPos + takeOut, replaceExp);
                    strLength += add - takeOut;
                    // A rule that removes nothing and inserts nothing leaves
                    // the buffer unchanged; without forcing progress the same
                    // rule would match at the same position forever.
                    if (add == 0 && takeOut == 0)
                        add = 1;
                    break;
                }
            }
            // With an empty replacement add is 0, so the characters that
            // moved into startPos are examined on the next iteration.
            startPos += add;
        }
        return str.toString();
    }

    // Used to build up the transformation table: every line of the phonetic
    // file is trimmed and handed to buildRule().
    private void buildRules(BufferedReader in) throws IOException {
        String read = null;
        Vector ruleList = new Vector();
        while ((read = in.readLine()) != null) {
            buildRule(realTrimmer(read), ruleList);
        }
        ruleArray = new TransformationRule[ruleList.size()];
        ruleList.copyInto(ruleArray);
    }

    // Here is where the real work of reading the phonetics file is done.
    // Parses one trimmed, comment-free line. Empty lines and lines starting
    // with an ignored keyword are skipped, an "alphabet" line updates
    // alphabetString, anything else becomes a rule appended to ruleList.
    private void buildRule(String str, Vector ruleList) {
        if (str.length() < 1)
            return;
        for (int i = 0; i < IGNORED_KEYWORDS.length; i++) {
            if (str.startsWith(IGNORED_KEYWORDS[i]))
                return;
        }
        // A different alphabet is used for this language, will be read into
        // the alphabetString variable.
        if (str.startsWith(KEYWORD_ALPHBET)) {
            int open = str.indexOf(ALPHABET_START);
            int close = str.lastIndexOf(ALPHABET_END);
            // Require the end marker after the start marker; otherwise the
            // substring call would throw on a malformed line.
            if (open != -1 && close > open) {
                alphabetString = str.substring(open + 1, close).toCharArray();
            }
            return;
        }
        // str contains two groups of characters separated by white space(s).
        // The first group is the "match expression". The second group is the
        // "replacement expression" giving the phonetic equivalent of the
        // "match expression".
        StringBuffer matchExp = new StringBuffer();
        StringBuffer replaceExp = new StringBuffer();
        boolean start = false, end = false;
        int takeOutPart = 0, matchLength = 0;
        boolean match = true, inMulti = false;
        for (int i = 0; i < str.length(); i++) {
            char ch = str.charAt(i);
            if (Character.isWhitespace(ch)) {
                // First white space ends the match expression.
                match = false;
                continue;
            }
            if (!match) {
                replaceExp.append(ch);
                continue;
            }
            if (!isReservedChar(ch)) {
                matchExp.append(ch);
                // The opening marker of a group is counted (inMulti is still
                // false when it is seen), its members and closing marker are
                // not, so a whole group counts as one character.
                if (!inMulti) {
                    takeOutPart++;
                    matchLength++;
                }
                if (ch == STARTMULTI || ch == ENDMULTI)
                    inMulti = !inMulti;
            }
            // Each '-' means one matched character less is replaced.
            if (ch == '-')
                takeOutPart--;
            if (ch == '^')
                start = true;
            if (ch == '$')
                end = true;
        }
        if (replaceExp.toString().equals(REPLACEVOID)) {
            replaceExp = new StringBuffer("");
        }
        ruleList.addElement(new TransformationRule(matchExp.toString(), replaceExp.toString(), takeOutPart, matchLength, start, end));
    }

    // Chars with special meaning to aspell. Not everyone is implemented here.
    // Reserved chars are never copied into the match expression.
    private boolean isReservedChar(char ch) {
        return ch == '<' || ch == '>' || ch == '^' || ch == '$' || ch == '-' || Character.isDigit(ch);
    }

    // Trims off everything we don't care about: a trailing '#' comment and
    // surrounding white space.
    private String realTrimmer(String row) {
        int pos = row.indexOf('#');
        if (pos != -1) {
            row = row.substring(0, pos);
        }
        return row.trim();
    }

    // Inner Classes
    /*
     * Holds the match string and the replace string and all the rule attributes.
     * Is responsible for indicating matches.
     * Static because it only uses constants of the enclosing class.
     */
    private static class TransformationRule {
        private String replace;
        private char[] match;
        // takeOut=number of chars to replace;
        // matchLength=length of matching string counting multies as one.
        private int takeOut, matchLength;
        private boolean start, end;

        // Constructor
        public TransformationRule(String match, String replace, int takeout, int matchLength, boolean start, boolean end) {
            this.match = match.toCharArray();
            this.replace = replace;
            this.takeOut = takeout;
            this.matchLength = matchLength;
            this.start = start;
            this.end = end;
        }

        /*
         * Returns true if word from pos and forward matches the match string.
         * Every literal of the match string and every group consumes exactly
         * one character of the word. A group matches when any one of its
         * members equals the current character. If the rule is end-anchored
         * the match must also reach the end of the word.
         * Returns false (instead of throwing) if the word is too short.
         */
        public boolean isMatching(StringBuffer word, int wordPos) {
            boolean inMulti = false, multiMatch = false;
            for (int matchPos = 0; matchPos < match.length; matchPos++) {
                char matchCh = match[matchPos];
                if (matchCh == STARTMULTI || matchCh == ENDMULTI) {
                    inMulti = !inMulti;
                    if (inMulti) {
                        // Group opens: no member has matched yet.
                        multiMatch = false;
                    } else {
                        // Group closes: it must have matched, and it consumed
                        // one character of the word.
                        if (!multiMatch)
                            return false;
                        wordPos++;
                    }
                    continue;
                }
                if (wordPos >= word.length())
                    return false;
                if (inMulti) {
                    if (matchCh == word.charAt(wordPos))
                        multiMatch = true;
                } else {
                    if (matchCh != word.charAt(wordPos))
                        return false;
                    wordPos++;
                }
            }
            if (end && wordPos != word.length())
                return false;
            return true;
        }

        public String getReplaceExp() {
            return replace;
        }

        public int getTakeOut() {
            return takeOut;
        }

        public boolean startsWithExp() {
            return start;
        }

        public int lengthOfMatch() {
            return matchLength;
        }

        // Just for debugging purposes.
        public String toString() {
            return "Match:" + String.valueOf(match) + " Replace:" + replace + " TakeOut:" + takeOut + " MatchLength:" + matchLength + " Start:" + start + " End:" + end;
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -