📄 searchquery.java
字号:
/*
* Created on 2005-2-18
*
* TODO To change the template for this generated file, go to
* Window - Preferences - Java - Code Style - Code Templates
*/
package net.nutch.searcher;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.StringTokenizer;
import net.nutch.searcher.Query.Clause;
import org.apache.log4j.Logger;
import kit.nlp.util.Stopwords;
import kit.nlp.util.Token;
import kit.nlp.util.WordsSegment;
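/**
* Turns a user-entered search string into a {@link Query}: the string is split
* into clauses, each clause is word-segmented with {@link WordsSegment}, and the
* resulting terms are added to the query as required or prohibited terms.
*
* @author Administrator
*
* TODO To change the template for this generated type comment, go to
* Window - Preferences - Java - Code Style - Code Templates
*/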
public class SearchQuery {
// Logger for the "clientsearch" category; parse(String,int,int,String) writes one line per client query to it.
public static final Logger LOGQuery = Logger.getLogger("clientsearch");
// Logger for the "search" category; wordSeg reports segmentation failures to it.
public static final Logger LOG = Logger.getLogger("search");
//private static WordsSegment ws = new WordsSegment();
/**
 * Tells whether the span of {@code token2} lies completely inside the span of
 * {@code token1}. A span starts at the token's offset and its length is the
 * number of bytes of the token's term in the platform default charset.
 *
 * @param token1 the candidate enclosing token
 * @param token2 the candidate enclosed token
 * @return {@code true} when token2 starts at or after token1's start and ends
 *         at or before token1's end
 */
private static boolean in(Token token1, Token token2){
    int outerStart = token1.getOffset();
    int outerEnd = outerStart + token1.getTerm().getBytes().length;
    int innerStart = token2.getOffset();
    int innerEnd = innerStart + token2.getTerm().getBytes().length;

    boolean startsInside = innerStart >= outerStart && innerStart <= outerEnd;
    return startsInside && innerEnd <= outerEnd;
}
/**
 * Orders tokens by the byte length of their term (platform default charset),
 * shortest first. Tokens whose terms have the same byte length compare as equal.
 */
static class TokenComparator implements Comparator<Token> {
    /**
     * @param token1 first token
     * @param token2 second token
     * @return -1, 0 or 1 when token1's term is shorter than, as long as, or
     *         longer than token2's term, measured in bytes
     */
    public int compare(Token token1, Token token2) {
        int length1 = token1.getTerm().getBytes().length;
        int length2 = token2.getTerm().getBytes().length;
        if (length1 < length2)
            return -1;
        if (length1 > length2)
            return 1;
        return 0;
    }
}
/**
 * Segments {@code str} into words and phrases and joins the surviving terms
 * into one string in which every term is followed by "/".
 *
 * Steps: segment the text, drop stopwords, ask {@link WordsSegment#tokenPhrase}
 * for phrases, blank every phrase that is contained in a longer phrase, insert
 * the remaining phrases in front of the term that starts at the same offset,
 * then blank every token with a non-zero type that is contained in a token
 * whose type is not positive.
 * NOTE(review): type 0 presumably marks phrase tokens - confirm against Token.
 *
 * @param str text to segment
 * @return the "/"-terminated terms (may be empty when every term was dropped),
 *         or {@code null} when {@code str} is null or empty, when segmentation
 *         yields nothing, or when any exception is thrown while segmenting
 */
private static String wordSeg(String str){
    if (str == null || str.length() == 0)
        return null;
    try{
        Token[] terms = WordsSegment.segmentToken(str);
        if (terms == null || terms.length == 0){
            LOG.error("Segment String : "+ str + " Error!");
            return null;
        }

        // Keep only the non-stopword terms; phrases are still built from all terms.
        ArrayList<Token> termList = new ArrayList<Token>();
        for (Token term : terms){
            if (!Stopwords.isStopword(term.getTerm()))
                termList.add(term);
        }

        Token[] phrases = WordsSegment.tokenPhrase(terms);
        if (phrases != null && phrases.length > 0){
            if (phrases.length > 1){
                // Shortest first, so each phrase only has to be checked against
                // the phrases that come after it.
                Arrays.sort(phrases, new TokenComparator());
                for (int i = 0; i < phrases.length - 1; i++){
                    Token shorter = phrases[i];
                    for (int j = i + 1; j < phrases.length; j++){
                        if (in(phrases[j], shorter)){
                            shorter.setTerm("");
                            break; // already blanked, no need to test the rest
                        }
                    }
                }
            }
            for (Token phrase : phrases){
                String phraseTerm = phrase.getTerm();
                if (phraseTerm == null || phraseTerm.length() == 0)
                    continue;
                // Insert the phrase in front of the first entry starting at its offset;
                // phrases with no matching entry are not added.
                int offset = phrase.getOffset();
                for (int i = 0; i < termList.size(); i++){
                    if (termList.get(i).getOffset() == offset){
                        termList.add(i, phrase);
                        break;
                    }
                }
            }
        }

        // Blank tokens with a non-zero type that are covered by a token whose type is <= 0.
        for (Token term : termList){
            if (term.getType() == 0)
                continue;
            for (Token phrase : termList){
                if (phrase.getType() > 0)
                    continue;
                if (in(phrase, term)){
                    term.setTerm("");
                    break;
                }
            }
        }

        StringBuilder result = new StringBuilder();
        for (Token term : termList){
            String word = term.getTerm();
            if (word == null || word.length() == 0)
                continue;
            if (Stopwords.isStopword(word))
                continue;
            result.append(word).append('/');
        }
        termList.clear();
        return result.toString();
    }catch(Exception e){
        // Pass the exception itself so the stack trace is not lost.
        LOG.error("Segment Error:" + str + "******" + e.getMessage(), e);
        return null;
    }
}
/**
 * Parses a user-entered search string into a {@link Query}.
 *
 * The string is normalised (GBK round trip, full-width spaces turned into
 * ASCII spaces, operators spaced out by {@code formatOperator}) and split on
 * spaces into clauses:
 * <ul>
 * <li>{@code - x} marks {@code x} as prohibited (a lone prohibited clause is
 *     treated as a normal one);</li>
 * <li>{@code a OR b} makes both neighbours non-required;</li>
 * <li>{@code site:}, {@code class:}, {@code rel:} and {@code cid:} clauses are
 *     added verbatim to the fields "url", "class", "gid" and "cid";</li>
 * <li>{@code url:}/{@code http:} and {@code author:} clauses are word-segmented
 *     and added to "url" and "author";</li>
 * <li>everything else is word-segmented and added to the default field, or to
 *     "anchor" when {@code searchFrom == 1}.</li>
 * </ul>
 * The segmented words of the non-prohibited free-text clauses are also stored
 * on the query via {@code setQueryStr}, clauses separated by a tab.
 *
 * @param queryStr   raw query text
 * @param searchFrom 1 to search the "anchor" field, anything else for the default field
 * @param sort       not used by this method
 * @return the parsed query, or {@code null} when {@code queryStr} is null or empty
 * @throws Exception if encoding, segmentation or query construction fails
 */
public static Query parse(String queryStr, int searchFrom, int sort ) throws Exception {
    if (queryStr == null || queryStr.length() == 0)
        return null;

    String fromField = Clause.DEFAULT_FIELD;
    if (searchFrom == 1) // title search
        fromField = "anchor";
    Query query = new Query();
    String querySeg = "";

    String enQueryStr = URLEncoder.encode(queryStr,"GBK");
    // NOTE(review): the GBK byte pair A6 DC is mapped to a space; which character
    // it stands for is not evident from this file.
    enQueryStr = enQueryStr.replaceAll("%A6%DC","%20");
    String decoded = URLDecoder.decode(enQueryStr,"GBK");
    // Turn full-width spaces (U+3000) into ASCII spaces. The previous code called
    // String.replace and discarded the result, so this never took effect. It is done
    // before formatOperator so that operators and quotes next to a full-width space
    // are recognised like those next to an ASCII space.
    decoded = decoded.replace('\u3000', ' ');
    String deQueryStr = formatOperator(decoded);

    // Split into clauses. MyClause arguments are (text, prohibited, required) as far
    // as their use below shows.
    StringTokenizer st = new StringTokenizer(deQueryStr," ");
    ArrayList<MyClause> clauseList = new ArrayList<MyClause>();
    while (st.hasMoreTokens()){
        String clause = st.nextToken();
        if (clause.equals("-")){
            // A trailing "-" with nothing after it is dropped.
            if (st.hasMoreTokens())
                clauseList.add(new MyClause(st.nextToken(),true,false));
        }
        else if (clause.equals("OR")){
            if (st.hasMoreTokens() && clauseList.size() > 0){
                clauseList.get(clauseList.size()-1).setRequired(false);
                clauseList.add(new MyClause(st.nextToken(),false,false));
            }else{
                // "OR" with no left or right operand is searched as a plain word.
                clauseList.add(new MyClause(clause,false,true));
            }
        }
        else if (clause.charAt(0)=='"'){
            if (clause.length()>1)
                clauseList.add(new MyClause(clause.substring(1,clause.length()-1),false, true));
        }
        else{
            clauseList.add(new MyClause(clause,false,true));
        }
    }

    // A query made of a single prohibited clause would match nothing useful.
    if (clauseList.size() == 1){
        MyClause only = clauseList.get(0);
        if (only.isProhibited())
            only.setProhibited(false);
    }

    for (MyClause tempClause : clauseList){
        if (querySeg.length() > 0)
            querySeg = querySeg.trim() + '\t';

        String clause = tempClause.getClause();
        int colon = clause.indexOf(":");
        if (colon > 0){
            String field = clause.substring(0,colon);
            clause = clause.substring(colon+1);
            if (clause.length() == 0)
                continue;

            if (field.equals("site")){
                addClauseTerm(query, tempClause, clause, "url");
            }else if (field.equals("class")){
                addClauseTerm(query, tempClause, clause, "class");
            }else if (field.equals("url") || field.equals("http")){
                addClauseTerm(query, tempClause, wordSeg(clause), "url");
            }else if (field.equals("author")){
                addClauseTerm(query, tempClause, wordSeg(clause), "author");
            }else if (field.equals("rel")){
                addClauseTerm(query, tempClause, clause, "gid");
            }else if (field.equals("cid")){
                addClauseTerm(query, tempClause, clause, "cid");
            }else{
                // Unknown prefix: not a field, so search "prefix text" as free text.
                // NOTE(review): for a prohibited clause only the text after the colon
                // is used (the prefix is dropped), as in the original code - confirm intent.
                if (!tempClause.isProhibited()){
                    clause = field + " " + clause;
                    querySeg += segmentedWords(clause);
                }
                addClauseTerm(query, tempClause, wordSeg(clause), fromField);
            }
        }else{
            if (!tempClause.isProhibited())
                querySeg += segmentedWords(clause);
            addClauseTerm(query, tempClause, wordSeg(clause), fromField);
        }
    }
    query.setQueryStr(querySeg.trim());
    return query;
}

/** Adds {@code term} to {@code field} as a prohibited or required term, as the clause's flags say. */
private static void addClauseTerm(Query query, MyClause clause, String term, String field){
    if (clause.isProhibited())
        query.addProhibitedTerm(term, field);
    else
        query.addRequiredTerm(term, field, clause.isRequired());
}

/** Segments {@code clause} and returns its non-symbol terms, each followed by one space. */
private static String segmentedWords(String clause){
    StringBuilder words = new StringBuilder();
    Token[] tokens = WordsSegment.segmentToken(clause);
    // segmentToken can return null (wordSeg checks for it too); treat that as "no words".
    if (tokens != null){
        for (Token t : tokens){
            if (Stopwords.isSymbol(t.getTerm()))
                continue;
            words.append(t.getTerm()).append(' ');
        }
    }
    return words.toString();
}
/**
 * Add by liubin.2006-03-09
 *
 * Normalises the operators of a query string so that it can be split on spaces:
 * <ul>
 * <li>curly quotes are turned into ASCII double quotes;</li>
 * <li>the spaces inside every quoted phrase are removed, so the phrase becomes
 *     one space-free token;</li>
 * <li>{@code " -"} becomes {@code " - "}, {@code " +"} becomes {@code " "},
 *     {@code " or "} becomes {@code " OR "};</li>
 * <li>finally every double quote is replaced by a space.</li>
 * </ul>
 * An opening quote is a quote sitting exactly at the scan position (start of
 * the string or right after the previous closing quote); otherwise the next
 * quote preceded by a space; otherwise the next quote of any kind. The closing
 * quote is the next quote followed by a space, or failing that the next quote.
 * An unclosed quote ends the phrase processing.
 *
 * @param queryStr query text, must not be {@code null}
 * @return the normalised query text
 */
private static String formatOperator(String queryStr){
    // Curly quotes (U+201C / U+201D) -> ASCII quotes; written as escapes so the
    // source file's encoding cannot corrupt them.
    queryStr = queryStr.replace("\u201C","\"").replace("\u201D","\"");

    int index = 0;
    while (true){
        int bare = queryStr.indexOf('"', index);
        if (bare < 0)
            break;

        int open;
        if (bare == index){
            // Quote right at the scan position. Always preferring a later
            // space-quote here would skip a phrase at the start of the query.
            open = bare;
        }else{
            int spaced = queryStr.indexOf(" \"", index);
            open = (spaced >= 0) ? spaced + 1 : bare;
        }

        int contentStart = open + 1;
        int close = queryStr.indexOf("\" ", contentStart);
        if (close < 0){
            close = queryStr.indexOf('"', contentStart);
            if (close < 0)
                break;
        }

        // The phrase includes its closing quote, so it is never empty.
        String phrase = queryStr.substring(contentStart, close + 1);
        String compact = phrase.replace(" ","");
        // Splice only this span; a global replace would also rewrite identical
        // text elsewhere and shift the positions tracked by index.
        queryStr = queryStr.substring(0, contentStart) + compact + queryStr.substring(close + 1);
        index = contentStart + compact.length();
    }

    return queryStr.replace(" -"," - ")
            .replace(" +", " ")
            .replace(" or "," OR ")
            .replace("\""," ");
}
/*
public static Query parse(String queryStr) throws Exception {
if (queryStr == null || queryStr.length() == 0)
return null;
return parse(queryStr,0);
}
*/
/*
public static Query parse(String queryStr,String client) throws Exception{
if (queryStr == null || queryStr.length() == 0)
return null;
LOGQuery.info("client:"+client + " | query:" +queryStr);
return parse(queryStr);
}
*/
/*
public static Query parse(String queryStr,int from,String client) throws Exception {
if (queryStr == null || queryStr.length() == 0)
return null;
String searchFrom = "content";
if (from == 1)
searchFrom = "title";
LOGQuery.info("client:" + client + " | query:" +queryStr + " | from:" +
searchFrom);
return parse(queryStr,from);
}
*/
/**
 * Logs the incoming client query to the "clientsearch" logger and then parses
 * it with {@link #parse(String, int, int)}.
 *
 * @param queryStr raw query text
 * @param from     1 is logged as "title", any other value as "content"
 * @param sort     0 is logged as "relativity", any other value as "time"
 * @param client   client identifier, used only in the log line
 * @return the parsed query, or {@code null} when {@code queryStr} is null or
 *         empty (nothing is logged in that case)
 * @throws Exception whatever the three-argument parse throws
 */
public static Query parse(String queryStr,int from,int sort, String client) throws Exception {
    boolean nothingToParse = (queryStr == null) || queryStr.length() == 0;
    if (nothingToParse) {
        return null;
    }

    String searchFrom = (from == 1) ? "title" : "content";
    String sortType = (sort == 0) ? "relativity" : "time";

    StringBuilder logLine = new StringBuilder();
    logLine.append("client:").append(client);
    logLine.append(" | query:").append(queryStr);
    logLine.append(" | from:").append(searchFrom);
    logLine.append(" | sort:").append(sortType);
    LOGQuery.info(logLine.toString());

    return parse(queryStr, from, sort);
}
/**
 * Command-line helper: segments the first argument with {@code wordSeg} and
 * prints the result to standard output.
 *
 * @param args {@code args[0]} is the text to segment; when it is missing a
 *             usage line is printed to standard error instead of failing
 *             with an ArrayIndexOutOfBoundsException
 */
public static void main(String[] args){
    if (args == null || args.length == 0){
        System.err.println("Usage: SearchQuery <text-to-segment>");
        return;
    }
    System.out.println(SearchQuery.wordSeg(args[0]));
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -