⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 searchquery.java

📁 nutch搜索的改进型工具和优化爬虫的相关工具
💻 JAVA
字号:

/*
 * Created on 2005-2-18
 *
 * TODO To change the template for this generated file, go to
 * Window - Preferences - Java - Code Style - Code Templates
 */
package net.nutch.searcher;

import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.StringTokenizer;

import net.nutch.searcher.Query.Clause;

import org.apache.log4j.Logger;

import kit.nlp.util.Stopwords;
import kit.nlp.util.Token;
import kit.nlp.util.WordsSegment;
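/**
 * Parses raw user search strings into {@link Query} objects, using the
 * {@code kit.nlp} word segmenter and stopword list to split clause text into
 * terms.
 *
 * @author Administrator
 *
 * TODO To change the template for this generated type comment, go to
 * Window - Preferences - Java - Code Style - Code Templates
 */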
public class SearchQuery {
	public static final Logger LOGQuery = Logger.getLogger("clientsearch");
	public static final Logger LOG = Logger.getLogger("search");
	//private static WordsSegment ws = new WordsSegment();
	
	/**
	 * Tests whether the span of {@code token2} lies entirely inside the span of
	 * {@code token1}.
	 * <p>
	 * A span begins at the token's offset and extends for the number of bytes its
	 * term occupies when encoded with the platform default charset.
	 *
	 * @param token1 the candidate enclosing token
	 * @param token2 the candidate enclosed token
	 * @return {@code true} if {@code token2} starts no earlier than {@code token1}
	 *         starts and ends no later than {@code token1} ends
	 */
	private static boolean in(Token token1, Token token2){
		int outerStart = token1.getOffset();
		int outerEnd = outerStart + token1.getTerm().getBytes().length;
		int innerStart = token2.getOffset();
		int innerEnd = innerStart + token2.getTerm().getBytes().length;

		boolean startsInside = innerStart >= outerStart && innerStart <= outerEnd;
		return startsInside && innerEnd <= outerEnd;
	}
	
	/**
	 * Orders {@link Token}s by the byte length of their terms (platform default
	 * charset), shortest first. Terms of equal byte length compare as equal.
	 */
	static class TokenComparator implements Comparator {
		public int compare(Object o1, Object o2) {
			Token left = (Token)o1;
			Token right = (Token)o2;
			int leftLen = left.getTerm().getBytes().length;
			int rightLen = right.getTerm().getBytes().length;
			if (leftLen == rightLen)
				return 0;
			return leftLen < rightLen ? -1 : 1;
		}
	}
	
	/**
	 * Segments {@code str} into terms and phrases and joins the survivors into a
	 * single string in which every entry is followed by {@code '/'}.
	 * <p>
	 * Stopwords are dropped. Phrases reported by
	 * {@code WordsSegment.tokenPhrase} are inserted in front of the term that
	 * starts at the same offset, and tokens covered by a phrase are blanked so
	 * they do not appear twice in the output.
	 * <p>
	 * Note that this method mutates the {@link Token} objects returned by the
	 * segmenter (it calls {@code setTerm("")} on suppressed tokens).
	 *
	 * @param str the text to segment
	 * @return the {@code '/'}-terminated entries (possibly the empty string), or
	 *         {@code null} if {@code str} is null or empty, if segmentation
	 *         produced no tokens, or if any exception was raised
	 */
	private static String wordSeg(String str){
		if (str == null || str.length() == 0)
			return null;
		try{
			Token[] terms = WordsSegment.segmentToken(str);
			if (terms == null || terms.length == 0){
				LOG.error("Segment String : "+ str + " Error!");
				return null;
			}
			// Keep every non-stopword term, in segmentation order.
			ArrayList<Token> termList = new ArrayList<Token>();
			for (Token term : terms){
				if (Stopwords.isStopword(term.getTerm()))
					continue;
				termList.add(term);
			}

			Token[] phrases = WordsSegment.tokenPhrase(terms);
			if (phrases != null && phrases.length >0){
				if (phrases.length > 1){
					// Sort shortest first, then blank any phrase whose span lies
					// inside a later (equal-length or longer) phrase.
					Arrays.sort(phrases,new TokenComparator());
					for (int i=0; i<phrases.length-1; i++){
						Token token1 = phrases[i];
						for (int j=i+1; j<phrases.length; j++){
							Token token2 = phrases[j];
							if (in(token2,token1)){
								token1.setTerm("");
							}
						}
					}
				}
				// Insert each surviving phrase just before the term that starts at
				// the same offset; phrases with no such term are dropped.
				for (Token phrase : phrases){
					if (phrase.getTerm() == null || phrase.getTerm().length() == 0)
						continue;
					int offset = phrase.getOffset();
					int i = 0;
					for (;i<termList.size();i++){
						Token term = termList.get(i);
						if (term.getOffset() == offset){
							break;
						}
					}
					if (i < termList.size()){
						termList.add(i,phrase);
					}
				}
			}
			// Blank every token with a non-zero type whose span is covered by a
			// token with type <= 0.
			// NOTE(review): presumably phrases carry type 0 and plain terms a
			// positive type - confirm against kit.nlp.util.Token.
			for (Token term : termList){
				if (term.getType() == 0)
					continue;
				for (Token phrase : termList){
					if (phrase.getType() > 0)
						continue;
					if (in(phrase, term)){
						term.setTerm("");
						break;
					}
				}
			}
			// Inserted phrases were never stopword-filtered, so filter again here.
			StringBuilder joined = new StringBuilder();
			for( Token term : termList ){
				String word = term.getTerm();
				if (word == null || word.length() == 0)
					continue;
				if ( Stopwords.isStopword(word) )
					continue;
				joined.append(word).append('/');
			}
			return joined.toString();
		}catch(Exception e){
			// Best effort: report the failure (with its stack trace) and let the
			// caller see null.
			LOG.error("Segment Error:" + str + "******" + e.getMessage(), e);
			return null;
		}
	}

	/**
	 * Parses a raw user query string into a {@link Query}.
	 * <p>
	 * The string is round-tripped through GBK URL-encoding (the byte pair
	 * {@code %A6%DC} becomes a space), normalized by
	 * {@link #formatOperator(String)}, has full-width spaces (U+3000) turned into
	 * ASCII spaces, and is then split on spaces into clauses:
	 * <ul>
	 * <li>{@code -} prohibits the clause that follows it;</li>
	 * <li>{@code OR} makes both the preceding and the following clause
	 * non-required;</li>
	 * <li>a query consisting of one prohibited clause is searched as a
	 * non-prohibited clause.</li>
	 * </ul>
	 * A clause of the form {@code prefix:value} is routed by its prefix:
	 * {@code site} to field "url", {@code class} to "class", {@code rel} to "gid"
	 * and {@code cid} to "cid" (value used verbatim); {@code url}/{@code http} to
	 * "url" and {@code author} to "author" (value word-segmented). Any other
	 * prefix is treated as ordinary text and searched in the default field.
	 * <p>
	 * The segmented text of the non-prohibited default-field clauses (terms
	 * separated by spaces, clauses by tabs) is stored with
	 * {@code Query.setQueryStr}.
	 * <p>
	 * NOTE(review): {@code wordSeg} can return {@code null}; that value is passed
	 * straight to the {@code Query} add methods, as before - confirm they accept it.
	 *
	 * @param queryStr   the raw query text
	 * @param searchFrom {@code 1} searches the "anchor" field; any other value
	 *                   searches {@code Clause.DEFAULT_FIELD}
	 * @param sort       not used by this method
	 * @return the parsed query, or {@code null} if {@code queryStr} is null or empty
	 * @throws Exception if encoding, segmentation or query construction fails
	 */
	public static Query parse(String queryStr, int searchFrom, int sort ) throws Exception {
		if (queryStr == null || queryStr.length() == 0)
			return null;

		String fromField = Clause.DEFAULT_FIELD;
		if (searchFrom == 1)// title search
			fromField = "anchor";

		String enQueryStr = URLEncoder.encode(queryStr,"GBK");
		enQueryStr = enQueryStr.replaceAll("%A6%DC","%20");
		String deQueryStr = formatOperator(URLDecoder.decode(enQueryStr,"GBK"));
		// Treat the full-width space (U+3000, GBK A1A1) as a clause separator.
		// Strings are immutable, so the result has to be assigned back.
		deQueryStr = deQueryStr.replace('\u3000',' ');

		// Pass 1: split into clauses and resolve the "-" and "OR" operators.
		ArrayList<MyClause> clauseList = new ArrayList<MyClause>();
		StringTokenizer st = new StringTokenizer(deQueryStr," ");
		while( st.hasMoreTokens() ){
			String clause = st.nextToken();
			if (clause.equals("-")){
				// A trailing "-" with nothing after it is dropped.
				if (st.hasMoreTokens()){
					clauseList.add(new MyClause(st.nextToken(),true,false));
				}
			}
			else if(clause.equals("OR")){
				if (st.hasMoreTokens() && clauseList.size() > 0){
					clauseList.get(clauseList.size()-1).setRequired(false);
					clauseList.add(new MyClause(st.nextToken(),false,false));
				}else{
					// Leading or trailing "OR" is searched as a literal word.
					clauseList.add(new MyClause(clause,false,true));
				}
			}
			else if(clause.charAt(0)=='"'){
				// formatOperator replaces every ASCII double quote with a space, so
				// no token is expected to start with one; kept as a defensive branch.
				if(clause.length()>1)
					clauseList.add(new MyClause(clause.substring(1,clause.length()-1),false, true));
			}
			else{
				clauseList.add(new MyClause(clause,false,true));
			}
		}

		// A query made of one prohibited clause would match nothing useful.
		if (clauseList.size() == 1 && clauseList.get(0).isProhibited()){
			clauseList.get(0).setProhibited(false);
		}

		// Pass 2: turn every clause into a query term.
		Query query = new Query();
		String querySeg = "";
		for (MyClause tempClause : clauseList){
			if (querySeg.length() > 0){
				// trim() also strips a tab left by a clause that added no text,
				// so clauses are always separated by exactly one tab.
				querySeg = querySeg.trim() + '\t';
			}
			String clause = tempClause.getClause();
			int colon = clause.indexOf(":");
			if (colon <= 0){
				if (!tempClause.isProhibited())
					querySeg += segmentedText(clause);
				addClauseTerm(query,tempClause,wordSeg(clause),fromField);
				continue;
			}

			String field = clause.substring(0,colon);
			String value = clause.substring(colon+1);
			if (value.length()==0){
				continue;
			}
			if (field.equals("site")){
				addClauseTerm(query,tempClause,value,"url");
			}else if (field.equals("class")){
				addClauseTerm(query,tempClause,value,"class");
			}else if (field.equals("url") || field.equals("http")){
				addClauseTerm(query,tempClause,wordSeg(value),"url");
			}else if (field.equals("author")){
				addClauseTerm(query,tempClause,wordSeg(value),"author");
			}else if(field.equals("rel")){
				addClauseTerm(query,tempClause,value,"gid");
			}else if(field.equals("cid")){
				addClauseTerm(query,tempClause,value,"cid");
			}else if (!tempClause.isProhibited()){
				// Unknown prefix: the colon was ordinary text, search both halves.
				String text = field + " " + value;
				querySeg += segmentedText(text);
				query.addRequiredTerm(wordSeg(text),fromField,tempClause.isRequired());
			}else{
				// NOTE(review): the prohibited form drops the prefix and excludes
				// only the value, unlike the branch above - confirm this is intended.
				query.addProhibitedTerm(wordSeg(value),fromField);
			}
		}
		query.setQueryStr(querySeg.trim());
		return query;
	}

	/**
	 * Adds {@code term} to {@code query} as a prohibited term when the clause is
	 * prohibited, otherwise as a term carrying the clause's required flag.
	 */
	private static void addClauseTerm(Query query, MyClause clause, String term, String field) throws Exception {
		if (clause.isProhibited())
			query.addProhibitedTerm(term,field);
		else
			query.addRequiredTerm(term,field,clause.isRequired());
	}

	/**
	 * Segments {@code text} and returns its non-symbol terms, each followed by a
	 * space; returns the empty string when the segmenter yields no tokens.
	 */
	private static String segmentedText(String text) throws Exception {
		Token[] words = WordsSegment.segmentToken(text);
		if (words == null)
			return "";
		StringBuilder out = new StringBuilder();
		for (Token t : words){
			if (Stopwords.isSymbol(t.getTerm()))
				continue;
			out.append(t.getTerm()).append(' ');
		}
		return out.toString();
	}
	/**
	 * Normalizes the operators of a raw query string so that it can be split on
	 * spaces. (Added by liubin, 2006-03-09.)
	 * <ol>
	 * <li>Chinese curly quotes are turned into ASCII double quotes.</li>
	 * <li>Spaces inside each quoted phrase are removed, so the phrase survives
	 * as a single space-delimited token. An opening quote is one at the very
	 * start of the string or one preceded by a space; if there is none, the next
	 * quote of any kind is used. A closing quote followed by a space is preferred
	 * over a bare one.</li>
	 * <li>{@code " -"} becomes {@code " - "}, {@code " +"} becomes a space,
	 * {@code " or "} becomes {@code " OR "}, and every remaining double quote
	 * becomes a space.</li>
	 * </ol>
	 *
	 * @param queryStr the raw query text; must not be {@code null}
	 * @return the normalized query text
	 */
	private static String formatOperator(String queryStr){

		// Change Chinese quotes to ASCII quotes.
		queryStr = queryStr.replace("“","\"").replace("”","\"");

		int index = 0;
		while (true){
			int bareQuote = queryStr.indexOf("\"", index);
			if (bareQuote < 0)
				break;
			int spacedQuote = queryStr.indexOf(" \"", index);

			// Locate the first character after the opening quote.
			int contentStart;
			if (bareQuote == 0){
				// A phrase at the very start has no leading space; without this
				// case it was skipped whenever a later " \"" existed.
				contentStart = 1;
			}else if (spacedQuote >= 0){
				contentStart = spacedQuote + 2;
			}else{
				contentStart = bareQuote + 1;
			}

			int subEnd = queryStr.indexOf("\" ", contentStart);
			if (subEnd < 0){
				subEnd = queryStr.indexOf("\"", contentStart);
				if (subEnd < 0)
					break; // unbalanced quote: leave the rest untouched
			}

			// Phrase text plus its closing quote, with the spaces squeezed out.
			String squeezed = queryStr.substring(contentStart, subEnd + 1).replace(" ","");
			// Splice in place: only this occurrence may change, otherwise the
			// resume index computed below would be wrong.
			queryStr = queryStr.substring(0, contentStart) + squeezed
					+ queryStr.substring(subEnd + 1);

			// Resume right after the closing quote.
			index = contentStart + squeezed.length();
		}

		return queryStr.replace(" -"," - ")
				.replace(" +", " ")
				.replace(" or "," OR ")
				.replace("\""," ");
	}
	
	
/*
	public static Query parse(String queryStr) throws Exception {
		if (queryStr == null || queryStr.length() == 0)
			return null;
		return parse(queryStr,0);
	}
	*/
	/*
	public static Query parse(String queryStr,String client) throws Exception{
		if (queryStr == null || queryStr.length() == 0)
			return null;
		LOGQuery.info("client:"+client + " | query:" +queryStr);
		return parse(queryStr);
	}
	*/
	/*
	public static Query parse(String queryStr,int from,String client) throws Exception {
		if (queryStr == null || queryStr.length() == 0)
			return null;
		String searchFrom = "content";
		if (from == 1)
			searchFrom = "title";
		LOGQuery.info("client:" + client + " | query:" +queryStr + " | from:" + 
				searchFrom);
		return parse(queryStr,from);
	}
	*/
	/**
	 * Logs the incoming request to the client-search log and delegates to
	 * {@link #parse(String, int, int)}.
	 *
	 * @param queryStr the raw query text
	 * @param from     {@code 1} is logged as "title", any other value as "content"
	 * @param sort     {@code 0} is logged as "relativity", any other value as "time"
	 * @param client   identifier of the caller, written to the log only
	 * @return the parsed query, or {@code null} if {@code queryStr} is null or empty
	 * @throws Exception if parsing fails
	 */
	public static Query parse(String queryStr,int from,int sort, String client) throws Exception {
		boolean nothingToParse = queryStr == null || queryStr.length() == 0;
		if (nothingToParse)
			return null;

		String searchFrom = (from == 1) ? "title" : "content";
		String sortType = (sort == 0) ? "relativity" : "time";

		String logLine = "client:" + client + " | query:" + queryStr
				+ " | from:" + searchFrom + " | sort:" + sortType;
		LOGQuery.info(logLine);

		return parse(queryStr, from, sort);
	}
	
	/**
	 * Command-line check: segments the first argument with {@code wordSeg} and
	 * prints the result to standard output.
	 *
	 * @param args the text to segment is expected in {@code args[0]}
	 */
	public static void main(String[] args){
		// Report a usage error instead of failing with an index exception.
		if (args == null || args.length == 0){
			System.err.println("Usage: SearchQuery <text>");
			System.exit(1);
			return;
		}
		System.out.println(SearchQuery.wordSeg(args[0]));
	}
	
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -