📄 fieldqueryfilter.java
字号:
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.searcher;
import java.util.ArrayList;
import java.util.StringTokenizer;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.index.Term;
import net.nutch.searcher.Query.Clause;
import net.nutch.searcher.Query.Phrase;
import kit.nlp.util.Stopwords;
/**
 * Translate query fields to search the same-named field, as indexed by an
 * IndexingFilter. Best for tokenized fields.
 *
 * <p>The text of each matching clause is interpreted as follows:
 * <ul>
 * <li>{@code a/b/c} (two or more '/'-separated words): every word is lowercased.
 *     Words for which {@code Stopwords.isHighFreq(field, word)} is true become
 *     separate term clauses. Of the remaining words, those of the {@code url}
 *     field become one term clause each; for any other field they are combined
 *     into a single unordered span-near clause (or a plain term clause when
 *     only one word remains).</li>
 * <li>{@code a;b;c} (one word holding two or more ';'-separated values): a
 *     nested boolean query with one optional term clause per value.</li>
 * <li>a single word: one term clause.</li>
 * </ul>
 * Clause text that contains no word at all (empty, or made only of separators)
 * contributes nothing to the output.
 */
public abstract class FieldQueryFilter implements QueryFilter {

    /** Slop handed to {@link SpanNearQuery} when several words are combined. */
    private static final int SPAN_SLOP = 6;

    /** Name of the query field handled here; also the index field searched. */
    private String field;

    /** Boost set on the Lucene clauses this filter generates. */
    private float boost = 1.0f;

    /**
     * Construct for the named field, with a boost of 1.0.
     *
     * @param field name of the query/index field
     */
    protected FieldQueryFilter(String field) {
        this(field, 1.0f);
    }

    /**
     * Construct for the named field, boosting as specified.
     *
     * @param field name of the query/index field
     * @param boost boost to set on generated clauses
     */
    protected FieldQueryFilter(String field, float boost) {
        this.field = field;
        this.boost = boost;
    }

    /**
     * Adds Lucene clauses to {@code output} for every clause of {@code input}
     * whose field equals this filter's field; other clauses are ignored.
     *
     * @param input  the parsed query whose clauses are examined
     * @param output the Lucene query the generated clauses are added to
     * @return {@code output}, the same instance that was passed in
     * @throws QueryException declared by the filter contract; nothing in this
     *         method throws it directly
     */
    public BooleanQuery filter(Query input, BooleanQuery output) throws QueryException {
        Clause[] clauses = input.getClauses();
        for (int i = 0; i < clauses.length; i++) {
            Clause c = clauses[i];

            // skip non-matching clauses
            if (!c.getField().equals(field)) {
                continue;
            }

            // NOTE(review): getTerm() is called without consulting c.isPhrase();
            // confirm that phrase clauses cannot reach this filter.
            String queryStr = c.getTerm().toString();

            StringTokenizer words = new StringTokenizer(queryStr, "/");
            int wordCount = words.countTokens();
            if (wordCount > 1) {
                addMultiWordClauses(words, c, output);
            } else if (wordCount == 1) {
                addSingleWordClauses(words.nextToken().toLowerCase(), c, output);
            }
            // wordCount == 0: the text was empty or only '/' characters. Calling
            // nextToken() here would throw NoSuchElementException, so the clause
            // is skipped instead.
        }
        return output;
    }

    /**
     * Handles clause text made of several '/'-separated words.
     *
     * <p>Clauses are added in this order: {@code url} term clauses (as the words
     * are read), then the span-near or single term clause built from the other
     * words, then the high-frequency term clauses.
     */
    private void addMultiWordClauses(StringTokenizer words, Clause c, BooleanQuery output) {
        ArrayList<SpanTermQuery> spanTerms = new ArrayList<SpanTermQuery>();
        ArrayList<TermQuery> highFreqTerms = new ArrayList<TermQuery>();

        while (words.hasMoreTokens()) {
            String word = words.nextToken().toLowerCase();
            if (Stopwords.isHighFreq(field, word)) {
                // NOTE(review): the meaning of the boolean constructor flag is
                // defined by the TermQuery class in use and is not visible here.
                highFreqTerms.add(new TermQuery(new Term(field, word), true));
            } else if ("url".equals(field)) {
                addClause(new TermQuery(new Term(field, word)), c, output);
            } else {
                spanTerms.add(new SpanTermQuery(new Term(field, word)));
            }
        }

        if (spanTerms.size() > 1) {
            SpanTermQuery[] spans = spanTerms.toArray(new SpanTermQuery[spanTerms.size()]);
            addClause(new SpanNearQuery(spans, SPAN_SLOP, false), c, output);
        } else if (spanTerms.size() == 1) {
            // a span of one word is just a term query
            addClause(new TermQuery(spanTerms.get(0).getTerm()), c, output);
        }

        for (TermQuery highFreq : highFreqTerms) {
            addClause(highFreq, c, output);
        }
    }

    /**
     * Handles clause text consisting of one (already lowercased) word, which may
     * itself be a ';'-separated list of values, e.g. {@code class:ent;lady;sports}.
     */
    private void addSingleWordClauses(String word, Clause c, BooleanQuery output) {
        StringTokenizer values = new StringTokenizer(word, ";");
        int valueCount = values.countTokens();

        if (valueCount > 1) {
            // any of the listed values may match: one optional clause per value
            BooleanQuery anyOf = new BooleanQuery();
            while (values.hasMoreTokens()) {
                TermQuery alternative = new TermQuery(new Term(field, values.nextToken()));
                alternative.setBoost(boost);
                anyOf.add(alternative, false, false);
            }
            // the boost is carried by the inner clauses, so the wrapper gets none
            output.add(anyOf, c.isRequired(), c.isProhibited());
        } else if (valueCount == 1) {
            String term = values.nextToken();
            TermQuery single;
            if (Stopwords.isHighFreq(field, term)) {
                single = new TermQuery(new Term(field, term), true);
            } else {
                single = new TermQuery(new Term(field, term));
            }
            addClause(single, c, output);
        }
        // valueCount == 0: the word was made only of ';' characters; nothing to add.
    }

    /**
     * Sets this filter's boost on {@code luceneClause} and adds it to
     * {@code output} with the required/prohibited flags of {@code c}.
     */
    private void addClause(org.apache.lucene.search.Query luceneClause, Clause c,
                           BooleanQuery output) {
        luceneClause.setBoost(boost);
        output.add(luceneClause, c.isRequired(), c.isProhibited());
    }

    /** Returns the boost set on generated clauses. */
    public float getBoost() {
        return boost;
    }

    /** Sets the boost used for clauses generated by later calls to {@code filter}. */
    public void setBoost(float boost) {
        this.boost = boost;
    }

    /** Returns the name of the field this filter handles. */
    public String getField() {
        return field;
    }

    /** Sets the name of the field this filter handles. */
    public void setField(String field) {
        this.field = field;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -