⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 queryextractor.java

📁 自己写的search engine, 有 boolean search, fuzzy search
💻 JAVA
字号:
package searchingEngine.queryPrepocessing;


import java.io.BufferedReader;
import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import java.util.StringTokenizer;
import java.util.Arrays;


/**
 * Breaks a boolean-search query string into a flat list of tokens.
 *
 * <p>The query is first passed through {@code StemRevised.fullStem} (declared
 * elsewhere in the project), then cut into word tokens and single-character
 * symbol tokens, and finally stripped of every token found in the stop-word
 * file.
 *
 * <p>Tokenisation rules:
 * <ul>
 *   <li>every character listed in {@code keySymbols} becomes a token of its own;</li>
 *   <li>runs of ASCII letters, digits and underscores become word tokens;</li>
 *   <li>any other character only separates tokens and is dropped;</li>
 *   <li>a key symbol located at index {@code opStart} or later in
 *       {@code keySymbols} is emitted only once when it is repeated
 *       back-to-back (so {@code "&&"} yields a single {@code "&"}).</li>
 * </ul>
 */
public class QueryExtractor {

	/** Characters that are always emitted as single-character tokens. May be null (no symbols). */
	private char keySymbols[] = {'(',')','!','&','|'};
	/** Word spellings of the boolean operators; only consulted by {@link #isKeyWordSymbols(String)}. */
	private final String keyWordSymbols[] = {"and","or","not"};
	/**
	 * Index into {@code keySymbols} from which on the symbols are collapsed when
	 * repeated back-to-back. A negative value disables the collapsing.
	 */
	private int opStart = 2;
	/** Stop words read from {@code stopWordFilename}; null until they are first needed. */
	private List<String> stopWordList = null;
	/** Path of the stop-word file, one word per line. */
	private String stopWordFilename = "estop.lst";

	/**
	 * Small manual demo: prints a sample query and its stemmed token list.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String orgQuery = "(happy && anull)||((people !!A)|haha)";
		System.out.println("org query:\t\t"+orgQuery);
		QueryExtractor extractor = new QueryExtractor();
		System.out.println();
		String[] stemmed = extractor.getStemedList(orgQuery);
		System.out.print("Stemmed List:\t\t");
		for (String token : stemmed) {
			System.out.print(token+" ");
		}
	}

	/** Creates an extractor with the default key symbols, operator start index and stop-word file. */
	public QueryExtractor(){
		setStopWordFilename(stopWordFilename);
	}

	/**
	 * Creates an extractor with a custom symbol set.
	 *
	 * @param keySymbols characters to emit as single-character tokens; null means
	 *                   no symbol handling at all. The array is copied.
	 * @param opStart    index into {@code keySymbols} from which on repeated
	 *                   symbols are collapsed; negative disables collapsing
	 */
	public QueryExtractor(char keySymbols[],int opStart){
		this();
		// Defensive copy: later changes to the caller's array must not alter tokenisation.
		this.keySymbols = (keySymbols == null) ? null : keySymbols.clone();
		this.opStart=opStart;
	}

	/**
	 * Stems the query and returns its tokens with stop words removed.
	 *
	 * @param orgQuery raw query text; it is handed unchanged to {@code StemRevised.fullStem}
	 * @return the remaining tokens in query order
	 */
	public String[] getStemedList(String orgQuery) {
		StemRevised stem = new StemRevised();
		String stemmedQuery = stem.fullStem(orgQuery);
		List<String> tokens = getQueryList(stemmedQuery);
		return tokens.toArray(new String[0]);
	}

	/**
	 * Tokenises {@code orgQuery} in a single left-to-right scan and removes stop words.
	 *
	 * <p>The scan needs no regular expression, so arbitrary characters
	 * (including {@code ]}, {@code ^} or {@code \}) are safe as key symbols, and
	 * an empty symbol set or an {@code opStart} beyond the end of the array is
	 * simply treated as "no symbols" / "no operators".
	 *
	 * <p>NOTE(review): with the default settings a doubled {@code '!'} is reduced
	 * to a single {@code '!'}, exactly like {@code "&&"} and {@code "||"} —
	 * confirm that this is the intended meaning of a double negation.
	 *
	 * @param orgQuery text to tokenise; null is treated as an empty query
	 * @return mutable list of tokens with stop words already removed
	 */
	private List<String> getQueryList(String orgQuery){
		List<String> tokens = new LinkedList<String>();
		if (orgQuery != null) {
			StringBuilder word = new StringBuilder();
			String lastToken = null;
			for (int i = 0; i < orgQuery.length(); i++) {
				char c = orgQuery.charAt(i);
				boolean keySymbol = isKeySymbol(c);
				// Key symbols win over word characters, so a letter listed as a
				// key symbol still splits the surrounding word.
				if (!keySymbol && isWordChar(c)) {
					word.append(c);
					continue;
				}
				// Both a symbol and a separator terminate the word collected so far.
				if (word.length() > 0) {
					lastToken = word.toString();
					tokens.add(lastToken);
					word.setLength(0);
				}
				if (keySymbol) {
					String symbol = String.valueOf(c);
					boolean repeatedOperator = isOperatorSymbol(c) && symbol.equals(lastToken);
					if (!repeatedOperator) {
						tokens.add(symbol);
						lastToken = symbol;
					}
				}
			}
			if (word.length() > 0) {
				tokens.add(word.toString());
			}
		}
		return filtStopWord(tokens);
	}

	/** Returns true for the characters that may form a word token: ASCII letters, digits and '_'. */
	private static boolean isWordChar(char c) {
		return (c >= 'a' && c <= 'z')
			|| (c >= 'A' && c <= 'Z')
			|| (c >= '0' && c <= '9')
			|| c == '_';
	}

	/** Returns true if {@code c} is listed anywhere in {@code keySymbols}. */
	private boolean isKeySymbol(char c) {
		if (keySymbols == null) {
			return false;
		}
		for (int i = 0; i < keySymbols.length; i++) {
			if (keySymbols[i] == c) {
				return true;
			}
		}
		return false;
	}

	/** Returns true if {@code c} is listed in {@code keySymbols} at index {@code opStart} or later. */
	private boolean isOperatorSymbol(char c) {
		if (keySymbols == null || opStart < 0) {
			return false;
		}
		for (int i = opStart; i < keySymbols.length; i++) {
			if (keySymbols[i] == c) {
				return true;
			}
		}
		return false;
	}

	/** Returns the path of the stop-word file currently in use. */
	private String getStopWordFilename() {
		return stopWordFilename;
	}

	/**
	 * Selects the stop-word file and discards any stop words already loaded, so
	 * the next query reads the new file.
	 */
	private void setStopWordFilename(String filename) {
		this.stopWordFilename=filename;
		this.stopWordList=null;
	}

	/**
	 * Loads the stop words from {@code filename}, one word per line. Surrounding
	 * whitespace is trimmed and blank lines are skipped.
	 *
	 * <p>Loading is best-effort: on any failure the error is written to
	 * {@code System.err} and the words read so far (possibly none) become the
	 * stop-word list.
	 */
	private void readStopWordList(String filename) {
		List<String> words = new LinkedList<String>();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new java.io.FileReader(filename));
			String line;
			while ((line = reader.readLine()) != null) {
				String word = line.trim();
				if (word.length() > 0) {
					words.add(word);
				}
			}
		} catch (Exception e) {
			// Deliberately broad: a missing, unreadable or null file must not abort the query.
			System.err.println(e);
		} finally {
			if (reader != null) {
				try {
					reader.close();
				} catch (IOException ignored) {
					// Nothing useful can be done if closing fails.
				}
			}
		}
		stopWordList = words;
	}

	/** Returns true if {@code word} equals one of the loaded stop words, ignoring case. */
	private boolean isStopWord(String word){
		if (stopWordList == null) {
			return false;
		}
		for (String stopWord : stopWordList) {
			if (word.equalsIgnoreCase(stopWord)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Removes every stop word from {@code queryList} in place. The stop-word
	 * file is read on first use only and then served from memory.
	 *
	 * <p>NOTE(review): tokens equal to "and", "or" or "not" receive no special
	 * treatment here, so they are dropped whenever the stop-word file lists
	 * them — confirm this is intended (see {@link #isKeyWordSymbols(String)}).
	 *
	 * @return the same list instance, for call chaining
	 */
	private List<String> filtStopWord(List<String> queryList){
		if (stopWordList == null) {
			readStopWordList(stopWordFilename);
		}
		for (java.util.Iterator<String> it = queryList.iterator(); it.hasNext();) {
			if (isStopWord(it.next())) {
				it.remove();
			}
		}
		return queryList;
	}

	/** Returns true if {@code token} is exactly "and", "or" or "not" (case-sensitive). */
	private boolean isKeyWordSymbols(String token) {
		return  Arrays.asList(keyWordSymbols).contains(token);
	}

	/**
	 * A query line whose first token is a numeric id and whose remaining tokens
	 * are the query terms. Tokens are separated by spaces, '-', ',' and '.'.
	 */
	private static class QueryTermList{
		private final Integer id;
		private final LinkedList<String> termList=new LinkedList<String>();

		/**
		 * Parses {@code unprocessedStr} into an id and its terms.
		 *
		 * @throws java.util.NoSuchElementException if the string contains no token
		 * @throws NumberFormatException if the first token is not an integer
		 * @throws IOException declared by the signature; nothing in this body throws it
		 */
		public QueryTermList(String unprocessedStr) throws IOException {
			// Split on spaces, hyphens, commas and full stops; the separators are dropped.
			StringTokenizer tokens=new StringTokenizer(unprocessedStr, " -,.");
			id=Integer.valueOf(tokens.nextToken());

			while (tokens.hasMoreTokens()) {
				termList.add(tokens.nextToken());
			}
		}

		/** Returns the query id. */
		public Integer getID() {
			return id;
		}

		/** Returns the query terms (every token after the id) as a linked list. */
		public LinkedList<String> getRemainingList() {
			return termList;
		}
	}
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -