⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 queryfilters.java

📁 nutch搜索的改进型工具和优化爬虫的相关工具
💻 JAVA
字号:
/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.searcher;

import net.nutch.plugin.*;
import net.nutch.searcher.Query.Clause;
import net.nutch.util.LogFormatter;
import java.util.logging.Logger;
import java.util.*;

import org.apache.lucene.search.BooleanQuery;

/** Creates and caches {@link QueryFilter} implementing plugins.  QueryFilter
 * implementations should define either the "fields" or "raw-fields" attributes
 * for any fields that they process, otherwise these will be ignored by the
 * query parser.  Raw fields are parsed as a single Query.Term, including
 * internal punctuation, while non-raw fields containing punctuation are
 * parsed as multi-token Query.Phrase's.
 */
public class QueryFilters {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.searcher.QueryFilters");

  /** Instantiated filters, in extension order.  Holds only the extensions
   * that named at least one field, so it never contains a null slot. */
  private static final QueryFilter[] CACHE;

  /** Every field name claimed by some filter, raw or not. */
  private static final HashSet FIELD_NAMES = new HashSet();

  /** The subset of {@link #FIELD_NAMES} declared through "raw-fields". */
  private static final HashSet RAW_FIELD_NAMES = new HashSet();

  static {
    try {
      ExtensionPoint point = PluginRepository.getInstance()
        .getExtensionPoint(QueryFilter.X_POINT_ID);
      if (point == null)
        throw new RuntimeException(QueryFilter.X_POINT_ID+" not found.");
      Extension[] extensions = point.getExtentens();

      // Accumulate into a list rather than an array indexed by extension
      // position: extensions that name no fields are skipped, and a skipped
      // index would otherwise leave a null entry that filter() dereferences.
      ArrayList filters = new ArrayList(extensions.length);
      for (int i = 0; i < extensions.length; i++) {
        Extension extension = extensions[i];
        ArrayList fieldNames = parseFieldNames(extension, "fields");
        ArrayList rawFieldNames = parseFieldNames(extension, "raw-fields");

        if (fieldNames.size() == 0 && rawFieldNames.size() == 0) {
          LOG.warning("QueryFilter: "+extension.getId()+" names no fields.");
          continue;
        }
        filters.add((QueryFilter)extension.getExtensionInstance());
        FIELD_NAMES.addAll(fieldNames);
        FIELD_NAMES.addAll(rawFieldNames);
        RAW_FIELD_NAMES.addAll(rawFieldNames);
      }
      CACHE = (QueryFilter[])filters.toArray(new QueryFilter[filters.size()]);
    } catch (PluginRuntimeException e) {
      throw new RuntimeException(e);
    }
  }

  /** Reads the named attribute of an extension and splits it into field
   * names on spaces, commas, tabs and line breaks.
   *
   * @param extension the extension whose attribute is read
   * @param attribute the attribute name, e.g. "fields" or "raw-fields"
   * @return the listed names; empty when the attribute is absent or blank
   */
  private static ArrayList parseFieldNames(Extension extension,
                                           String attribute) {
    String fields = extension.getAttribute(attribute);
    if (fields == null) fields = "";
    return Collections.list(new StringTokenizer(fields, " ,\t\n\r"));
  }

  private  QueryFilters() {}                  // no public ctor

  /** Run all defined filters.
   *
   * @param input the parsed query to translate
   * @return the query produced by passing <code>input</code> through each
   *         cached filter in turn, starting from an empty BooleanQuery
   * @throws QueryException if a clause uses a field name that no filter
   *         has claimed, or if a filter itself throws one
   */
  public static BooleanQuery filter(Query input) throws QueryException {
    // first check that all field names are claimed by some plugin
    Clause[] clauses = input.getClauses();
    for (int i = 0; i < clauses.length; i++) {
      Clause c = clauses[i];
      if (!isField(c.getField()))
        throw new QueryException("Not a known field name:"+c.getField());
    }
    // then run each plugin, feeding each one the previous one's output
    BooleanQuery output = new BooleanQuery();
    for (int i = 0 ; i < CACHE.length; i++) {
      output = CACHE[i].filter(input, output);
    }
    return output;
  }

  /** Returns true if some filter declared <code>name</code> in either its
   * "fields" or "raw-fields" attribute. */
  public static boolean isField(String name) {
    return FIELD_NAMES.contains(name);
  }

  /** Returns true if some filter declared <code>name</code> in its
   * "raw-fields" attribute. */
  public static boolean isRawField(String name) {
    return RAW_FIELD_NAMES.contains(name);
  }
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -