// Source file: queryextractor.java
// (Code-viewer page residue: "Font size" control; not part of the program.)
package searchingEngine.queryPrepocessing;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Arrays;
/**
 * Breaks a boolean search query into a flat list of tokens.
 *
 * <p>A token is either a maximal run of word characters ({@code [a-zA-Z_0-9]})
 * or a single <em>key symbol</em> (by default {@code ( ) ! & |}). Every other
 * character only separates tokens. Key symbols from index {@code opStart}
 * onwards are treated as operators: a run of the same operator
 * ({@code "&&"}, {@code "||"}, {@code "!!"}) is collapsed into one token.
 * Finally, tokens found in the stop-word file are dropped.
 */
public class QueryExtractor {
    /** Default key symbols: grouping parentheses first, then the operators. */
    private static final char[] DEFAULT_KEY_SYMBOLS = {'(', ')', '!', '&', '|'};
    /** Index in {@link #DEFAULT_KEY_SYMBOLS} of the first operator symbol. */
    private static final int DEFAULT_OP_START = 2;

    /** Symbols emitted as one-character tokens; {@code null} means "no key symbols". */
    private char keySymbols[] = DEFAULT_KEY_SYMBOLS.clone();
    /** Word forms of the boolean operators; see {@link #isKeyWordSymbols(String)}. */
    private final String keyWordSymbols[] = {"and", "or", "not"};
    /** Index of the first operator in {@link #keySymbols}; negative disables collapsing. */
    private int opStart = DEFAULT_OP_START;
    /** Stop words read by the most recent {@link #readStopWordList(String)} call. */
    private List<String> stopWordList = null;
    /** Path of the stop-word file, one word per line. */
    private String stopWordFilename = "estop.lst";

    /**
     * Small demo: stems and tokenises a sample query and prints the tokens.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String orgQuery = "(happy && anull)||((people !!A)|haha)";
        System.out.println("org query:\t\t" + orgQuery);
        QueryExtractor getList = new QueryExtractor();
        System.out.println();
        String temp[] = getList.getStemedList(orgQuery);
        System.out.print("Stemmed List:\t\t");
        for (int i = 0; i < temp.length; i++) {
            System.out.print(temp[i] + " ");
        }
    }

    /**
     * Creates an extractor with the default key symbols {@code ( ) ! & |},
     * of which {@code ! & |} are operators, and the default stop-word file.
     */
    public QueryExtractor() {
        // All defaults come from the field initialisers above.
    }

    /**
     * Creates an extractor with caller-supplied key symbols.
     *
     * @param keySymbols symbols that become one-character tokens; may be
     *                   {@code null} to disable symbol handling. The array is
     *                   copied, so later changes by the caller have no effect.
     * @param opStart    index of the first symbol in {@code keySymbols} that is
     *                   an operator (runs of the same operator are collapsed);
     *                   a negative value disables collapsing
     */
    public QueryExtractor(char keySymbols[], int opStart) {
        this();
        this.keySymbols = (keySymbols == null) ? null : keySymbols.clone();
        this.opStart = opStart;
    }

    /**
     * Stems the query and returns its tokens.
     *
     * <p>The query is first passed through {@code StemRevised.fullStem}
     * (declared elsewhere in the project; its exact behaviour is not visible
     * here) and the result is tokenised and stop-word filtered.
     *
     * @param orgQuery the raw query text
     * @return the remaining tokens in query order, never {@code null}
     */
    public String[] getStemedList(String orgQuery) {
        StemRevised stem = new StemRevised();
        String stemmed = stem.fullStem(orgQuery);
        List<String> tokens = getQueryList(stemmed);
        return tokens.toArray(new String[0]);
    }

    /**
     * Tokenises a query, collapses repeated operators and removes stop words.
     *
     * <p>The scan is done character by character rather than with a regular
     * expression assembled from the key symbols, so symbols that are regex
     * metacharacters (such as {@code ]}, {@code ^} or {@code \}) are handled
     * like any other symbol.
     *
     * @param orgQuery the query text; {@code null} yields an empty list
     * @return the filtered tokens in query order
     */
    private List<String> getQueryList(String orgQuery) {
        LinkedList<String> tokens = new LinkedList<String>();
        if (orgQuery == null) {
            return tokens;
        }
        StringBuilder word = new StringBuilder();
        for (int i = 0; i < orgQuery.length(); i++) {
            char c = orgQuery.charAt(i);
            if (isKeySymbol(c)) {
                // A key symbol ends the current word and is a token itself.
                flushWord(word, tokens);
                addToken(tokens, String.valueOf(c));
            } else if (isWordChar(c)) {
                word.append(c);
            } else {
                // Any other character is a pure separator.
                flushWord(word, tokens);
            }
        }
        flushWord(word, tokens);
        return filtStopWord(tokens);
    }

    /** Appends the pending word (if any) as a token and resets the buffer. */
    private void flushWord(StringBuilder word, LinkedList<String> tokens) {
        if (word.length() > 0) {
            addToken(tokens, word.toString());
            word.setLength(0);
        }
    }

    /**
     * Appends a token unless it is an operator identical to the token just
     * before it, which collapses runs such as {@code "&&"} into one token.
     */
    private void addToken(LinkedList<String> tokens, String token) {
        boolean repeatsPrevious = !tokens.isEmpty() && tokens.getLast().equals(token);
        if (repeatsPrevious && isOperatorToken(token)) {
            return;
        }
        tokens.add(token);
    }

    /** Returns whether {@code c} is one of the configured key symbols. */
    private boolean isKeySymbol(char c) {
        if (keySymbols == null) {
            return false;
        }
        for (int i = 0; i < keySymbols.length; i++) {
            if (keySymbols[i] == c) {
                return true;
            }
        }
        return false;
    }

    /**
     * Returns whether {@code token} is a single key symbol located at index
     * {@code opStart} or later, i.e. an operator whose repeats are collapsed.
     */
    private boolean isOperatorToken(String token) {
        if (keySymbols == null || opStart < 0 || token.length() != 1) {
            return false;
        }
        char c = token.charAt(0);
        for (int i = opStart; i < keySymbols.length; i++) {
            if (keySymbols[i] == c) {
                return true;
            }
        }
        return false;
    }

    /** Returns whether {@code c} is a word character: {@code [a-zA-Z_0-9]}. */
    private static boolean isWordChar(char c) {
        return (c >= 'a' && c <= 'z')
                || (c >= 'A' && c <= 'Z')
                || (c >= '0' && c <= '9')
                || c == '_';
    }

    /** Returns the path of the stop-word file. */
    private String getStopWordFilename() {
        return stopWordFilename;
    }

    /** Sets the path of the stop-word file used by later filtering. */
    private void setStopWordFilename(String filename) {
        this.stopWordFilename = filename;
    }

    /**
     * Reads the stop-word file into {@link #stopWordList}, one entry per line.
     *
     * <p>This is best effort: if the file cannot be opened or read, the error
     * is reported on {@code System.err} and whatever was read so far (possibly
     * nothing) becomes the stop-word list.
     *
     * @param filename path of the stop-word file; {@code null} yields an empty list
     */
    private void readStopWordList(String filename) {
        List<String> result = new LinkedList<String>();
        if (filename != null) {
            BufferedReader br = null;
            try {
                br = new BufferedReader(new java.io.FileReader(filename));
                String line;
                while ((line = br.readLine()) != null) {
                    result.add(line);
                }
            } catch (IOException e) {
                System.err.println("Could not read stop-word list '" + filename + "': " + e);
            } finally {
                // br is still null when the file could not be opened.
                if (br != null) {
                    try {
                        br.close();
                    } catch (IOException ignored) {
                        // Nothing useful can be done if close fails.
                    }
                }
            }
        }
        stopWordList = result;
    }

    /**
     * Returns whether {@code word} equals a loaded stop word, ignoring case.
     * Returns {@code false} when no stop-word list has been loaded.
     */
    private boolean isStopWord(String word) {
        if (stopWordList == null) {
            return false;
        }
        for (String stopWord : stopWordList) {
            if (word.equalsIgnoreCase(stopWord)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reloads the stop-word file and returns the tokens that are not stop words.
     *
     * <p>NOTE(review): keyword operators ("and", "or", "not") are not exempt
     * from filtering; the {@link #isKeyWordSymbols(String)} check was disabled
     * in the original code and that behaviour is kept here.
     *
     * @param queryList tokens in query order
     * @return a new list with the stop words removed, order preserved
     */
    private List<String> filtStopWord(List<String> queryList) {
        readStopWordList(stopWordFilename);
        List<String> kept = new LinkedList<String>();
        for (String token : queryList) {
            if (!isStopWord(token)) {
                kept.add(token);
            }
        }
        return kept;
    }

    /** Returns whether {@code token} is exactly "and", "or" or "not". Currently unused. */
    private boolean isKeyWordSymbols(String token) {
        return Arrays.asList(keyWordSymbols).contains(token);
    }

    /**
     * A numbered query: the first token of the input is the query ID and the
     * remaining tokens are the query terms. Currently unused in this class.
     */
    private static class QueryTermList {
        private final Integer id;
        private final LinkedList<String> termList = new LinkedList<String>();

        /**
         * Parses a line of the form {@code "<id> term term ..."}.
         *
         * @param unprocessedStr text split on space, '-', ',' and '.'
         * @throws IOException never thrown by this implementation; kept in the
         *                     signature for compatibility
         * @throws NumberFormatException if the first token is not an integer
         * @throws java.util.NoSuchElementException if the text has no tokens
         */
        public QueryTermList(String unprocessedStr) throws IOException {
            // Split the string and drop the separator characters.
            StringTokenizer tokens = new StringTokenizer(unprocessedStr, " -,.");
            id = Integer.valueOf(tokens.nextToken());
            while (tokens.hasMoreTokens()) {
                termList.add(tokens.nextToken());
            }
        }

        /** Returns the query ID. */
        public Integer getID() {
            return id;
        }

        /** Returns the query terms (everything after the ID) as a linked list. */
        public LinkedList<String> getRemainingList() {
            return termList;
        }
    }
}
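// Code-viewer page residue (keyboard shortcut help); not part of the program:
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -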