📄 searchsieve.java
字号:
/*
* SearchSieve.java. Copyright (c) 2000, David Faden. All rights reserved (for now).
*
* This code is distributed without warranty of any kind.
*/
//SearchToHTML c1999 The Gilbert Post by David Faden
//The applet and code are distributed as linkware...
//If you use this applet or a variant on its code,
//include a link to The Gilbert Post,
//http://www.geocities.com/Athens/Parthenon/1911
//The Gilbert Post and David Faden take no responsibility
//for anything bad that happens as a result of using this applet.
//Please send reports of problems to gilbertnews@hotmail.com, anyway, though.
//Partial modification record
// August 16, 2000: Corrected some inaccurate comments.
//
// August 17, 2000: Removed method isSpace(char). Added the
// isWordChar(int) method to be used as negative replacement.
// isWordChar uses table look-up rather than a loop so it should be
// substantially faster. Added method setWordChar(char, boolean)
// to match isWordChar(int).
// I consider this to be an interim version of isWordChar - the final
// one will use some sort of hash and correctly handle the full range of
// Unicode characters.
//
// August 17, 2000: changed the names of the methods numOfMatches() and keyLength()
// to getNumOfMatches() and getKeyLength() respectively. Documented the undocumented.
/**
* The SearchSieve class sifts for a key in a sequence
* of Java (Unicode) characters fed to the sieve with addChar.
*
* @author David Faden
* @version 0.9a August 17, 2000
*/
public class SearchSieve {
private int numOfMatches;//how many matches have been found?
private int kposition;//current position in key
private char[] key;//search word/phrase
private boolean exact;//must the word be an exact match or
//can it be part of a larger word
private char prevc;//the previous char examined
private char[] buffer;//store chars as one sequence is examined in case
//the key starts at another spot within the chain
private int bufferLength;
private boolean buffering;
private char[] temp;
/**
* Word character table used for fast look-up.
*/
private static boolean[] wordCharTable;
//XXX! This table excludes multi-byte characters. A search for a better
//solution is under way. DF.
static {
wordCharTable = new boolean[256];
for (int i = 0; i < 256; i++) {
if (i >= 'A' && i <= 'Z' ||
i >= 'a' && i <= 'z' ||
i >= '0' && i <= '9' ||
i >= 0xC0) //LATIN CAPITAL LETTER A WITH GRAVE and up
wordCharTable[i] = true;
else
wordCharTable[i] = false;
//I believe that the Java Lang Spec calls for
//booleans to have an uninitialized value of false anyway but...
}
}
public SearchSieve(char[] c, boolean bexact) {
setKey(c, bexact);
}
public void setKey(char[] c, boolean bexact) {
exact=bexact;
key=new char[c.length];
System.arraycopy(c,0,key,0,c.length);
buffer=new char[key.length];
temp=new char[buffer.length];//temp is created here to
//avoid the overhead of recreating it each time it is needed
reset();
}
public void reset() {
kposition=0;
numOfMatches=0;
prevc='\0'; //null according to the Unicode table, nul according to others
bufferLength=0;//clear the buffer
buffering=false;
}
public int getKeyLength() {
return key.length;
}
/**
* Returns true if the sequence of chars added to the SearchSieve,
* including <code>c</code>, constitute a match. For exact searches,
* the last char of a match must be a non-word char.
*/
public final boolean addChar(char c) {
//If there's a match, return true.
boolean matchfound=false;//left unchanged if nothing found
boolean eatthebuffer=false;
if (exact) {
if (kposition == key.length) {
//found a match
if (!isWordChar(c)) {
numOfMatches++;
matchfound=true;
}
kposition=0;
eatthebuffer=true;
}
else if(kposition==0) {
if (c==key[0] && !isWordChar(prevc))
kposition++;
}
else if (c == key[kposition])
kposition++;
else {
//found no match
kposition=0;
eatthebuffer=true;
}
}
//Inexact match check
else {
if (kposition == 0) {
if (c == key[0])
kposition++;
}
else if (c == key[kposition])
kposition++;
//Special case space characters - consider any two to match.
//XXX! Character.isSpace(char) is deprecated in JDK 1.1 but our target
//JDK is 1.0.
else if (Character.isSpace(c) && Character.isSpace(key[kposition]))
kposition++;
else {
//no match found
kposition=0;
eatthebuffer=true;
}
if (kposition == key.length) {
numOfMatches++;
kposition=0;
eatthebuffer=true;
matchfound=true;
}
}
//a kposition of 1 indicates that one char match has been found
if(kposition>1 && !buffering && c==key[0]) {
if(!exact) {
bufferLength=0;
buffering=true;
}//if exact, matches have to be surrounded by spaces
else if(!isWordChar(prevc)) {
bufferLength=0;
buffering=true;
}
}
if(buffering) {
if(c==key[bufferLength]){
buffer[bufferLength]=c;
bufferLength++;
}
else {
bufferLength=0;
buffering=false;
}
}
if(eatthebuffer && bufferLength>0) eatTheBuffer();//feed the buffer back into the
//search system on failures and finds
prevc=c;
return matchfound;
}
//Feed the buffer through addChar(char) as if it were being externally entered.
//No match can ever be found in the buffer alone, so there is
//no need to check the return of addChar(char).
//Huge problem with this method which will cause fatal error...
//Buffer itself may contain a second ambiguous char...
//However, if the buffer doesn't match up, it is overwritten
//When the next call to eatTheBuffer is made
//Idea, what if temp is made internal to eatTheBuffer?
//This could lead to a major memory leak, though...
//Fixed...breath easy now!
private final void eatTheBuffer() {
//reset search
kposition=0;
System.arraycopy(buffer,0,temp,0,bufferLength);
int tempLength=bufferLength;//save the value of bufferLength
bufferLength=0; //reset the buffer
buffering=false;
//Don't bother checking addChar(char) return value - It can never be true
//in this case.
for (int i = 0; i < tempLength; i++)
addChar(temp[i]);
}
public final int getNumOfMatches() {
return numOfMatches;
}
/**
* Returns true if SearchSieves consider <code>c</code> to
* be part of a word.
* <p>
* Currently, this method is limited to
* characters < 256 and will return false for all
* values out of range. This failing will only affect
* "exact" searches, potentially returning false positives.
*
* @param c - The char to test.
* @return True if <code>c</code> is a word char.
*/
public static final boolean isWordChar(int c) {
if (c < 0 || c > 255)
return false;
return wordCharTable[c];
}
/**
* Set whether SearchSieves should consider <code>c</code>
* to be part of a word.
* <p>
* Currently, this method is limited to char < 256 and will
* will throw an IllegalArgumentException for out of range char.
*
* @param c - The char to set.
* @param isWord - A true value will cause SearchSieves to consider c part of a word.
* @exception IllegalArgumentException - Thrown if c > 255.
*/
public static final void setWordChar(char c, boolean isWord) {
int i = c;
//XXX! Remember to chop this section once the code is fully
//internationalized.
if (i == 0 || i > 255)
throw new IllegalArgumentException("Out of range.");
wordCharTable[i] = isWord;
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -