
📄 CommonGrams.java

📁 Improvements to the crawler data, with some bug fixes
💻 Java
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */

package net.nutch.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;

import java.io.*;
import java.util.*;
import java.util.logging.Logger;

import net.nutch.util.*;

import net.nutch.searcher.Query.*;

/** Construct n-grams for frequently occurring terms and phrases while indexing.
 * Optimize phrase queries to use the n-grams. Single terms are still indexed
 * too, with n-grams overlaid.  This is achieved through the use of {@link
 * Token#setPositionIncrement(int)}.*/
public class CommonGrams {
  private static final Logger LOG =
    LogFormatter.getLogger("net.nutch.analysis.CommonGrams");
  private static final char SEPARATOR = '-';
  private static final HashMap COMMON_TERMS = new HashMap();

  static { init(); }

  private CommonGrams() {}                        // no public ctor

  private static class Filter extends TokenFilter {
    private HashSet common;
    private Token previous;
    private LinkedList gramQueue = new LinkedList();
    private LinkedList nextQueue = new LinkedList();
    private StringBuffer buffer = new StringBuffer();

    /** Construct an n-gram producing filter. */
    public Filter(TokenStream input, HashSet common) {
      super(input);
      this.common = common;
    }
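
    /* Worked example, derived from next() below: with "the" registered as
     * a common term, the input "quick the brown" comes out as
     *   quick(+1) quick-the(0) the(+1) the-brown(0) brown(+1)
     * where parentheses show each token's position increment, so every
     * gram is overlaid on the position of its first single term. */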

    /** Inserts n-grams into a token stream. */
    public Token next() throws IOException {
      if (gramQueue.size() != 0)                  // consume any queued tokens
        return (Token)gramQueue.removeFirst();

      final Token token = popNext();
      if (token == null)
        return null;

      if (!isCommon(token)) {                     // optimize simple case
        previous = token;
        return token;
      }

      gramQueue.add(token);                       // queue the token

      ListIterator i = nextQueue.listIterator();
      Token gram = token;
      while (isCommon(gram)) {
        if (previous != null && !isCommon(previous)) // queue prev gram first
          gramQueue.addFirst(gramToken(previous, gram));

        Token next = peekNext(i);
        if (next == null)
          break;

        gram = gramToken(gram, next);             // queue next gram last
        gramQueue.addLast(gram);
      }

      previous = token;
      return (Token)gramQueue.removeFirst();
    }

    /** True iff token is for a common term. */
    private boolean isCommon(Token token) {
      return common != null && common.contains(token.termText());
    }

    /** Pops nextQueue or, if empty, reads a new token. */
    private Token popNext() throws IOException {
      if (nextQueue.size() > 0)
        return (Token)nextQueue.removeFirst();
      else
        return input.next();
    }

    /** Return next token in nextQueue, extending it when empty. */
    private Token peekNext(ListIterator i) throws IOException {
      if (!i.hasNext()) {
        Token next = input.next();
        if (next == null)
          return null;
        i.add(next);
        i.previous();
      }
      return (Token)i.next();
    }

    /** Construct a compound token. */
    private Token gramToken(Token first, Token second) {
      buffer.setLength(0);
      buffer.append(first.termText());
      buffer.append(SEPARATOR);
      buffer.append(second.termText());
      Token result = new Token(buffer.toString(),
                               first.startOffset(), second.endOffset(),
                               "gram");
      result.setPositionIncrement(0);
      return result;
    }
  }
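
  /* Expected format of the common-terms file, as implied by the parsing
   * below (the entries shown are hypothetical): one entry per line, a
   * field name followed by a common term or phrase for that field.
   * Lines starting with '#' and blank lines are ignored.
   *
   *   # common single terms for the url field
   *   url http
   *   url the
   *   # a common phrase for the anchor field (indexed as the gram "the-of")
   *   anchor the of
   */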

  /** Loads the common-terms table from the file named by the
   * "analysis.common.terms.file" configuration property. */
  private static void init() {
    try {
      Reader reader = NutchConf.getConfResourceAsReader
        (NutchConf.get("analysis.common.terms.file"));
      BufferedReader in = new BufferedReader(reader);
      String line;
      while ((line = in.readLine()) != null) {
        line = line.trim();
        if (line.startsWith("#") || "".equals(line)) // skip comments and blank lines
          continue;
        TokenStream ts = new NutchDocumentTokenizer(new StringReader(line));
        Token token = ts.next();
        if (token == null) {
          LOG.warning("Line does not contain a field name: " + line);
          continue;
        }
        String field = token.termText();
        token = ts.next();
        if (token == null) {
          LOG.warning("Line contains only a field name, no word: " + line);
          continue;
        }
        String gram = token.termText();
        while ((token = ts.next()) != null) {
          gram = gram + SEPARATOR + token.termText();
        }
        HashSet table = (HashSet)COMMON_TERMS.get(field);
        if (table == null) {
          table = new HashSet();
          COMMON_TERMS.put(field, table);
        }
        table.add(gram);
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }

  /** Construct a token filter that inserts n-grams for common terms.  For use
   * while indexing documents.  */
  public static TokenFilter getFilter(TokenStream ts, String field) {
    return new Filter(ts, (HashSet)COMMON_TERMS.get(field));
  }

  /** Utility to convert an array of Query.Terms into a token stream. */
  private static class ArrayTokens extends TokenStream {
    private Term[] terms;
    private int index;

    public ArrayTokens(Phrase phrase) { this.terms = phrase.getTerms(); }
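
    /** Emits one token per term.  Java's left-to-right argument evaluation
     * gives each token the offsets (index, index+1) while advancing the
     * cursor in a single expression. */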
    
    public Token next() {
      if (index == terms.length)
        return null;
      else
        return new Token(terms[index].toString(), index, ++index);
    }
  }
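
  /* Example, derived from optimizePhrase() below: if "the" is a common term
   * for the field, the phrase "the quick brown" is rewritten to the terms
   *   the-quick quick brown
   * i.e. the common single term is replaced by the gram that overlays it,
   * while the remaining terms are kept. */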

  /** Optimizes phrase queries to use n-grams when possible. */
  public static String[] optimizePhrase(Phrase phrase, String field) {
    //LOG.info("Optimizing " + phrase + " for " + field);
    ArrayList result = new ArrayList();
    TokenStream ts = getFilter(new ArrayTokens(phrase), field);
    Token token, prev=null;
    int position = 0;
    try {
      while ((token = ts.next()) != null) {
        if (token.getPositionIncrement() != 0 && prev != null)
          result.add(prev.termText());
        prev = token;
        position += token.getPositionIncrement();
        if ((position + arity(token.termText())) == phrase.getTerms().length)
          break;
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    if (prev != null)
      result.add(prev.termText());

//     LOG.info("Optimized: ");
//     for (int i = 0; i < result.size(); i++) {
//       LOG.info(result.get(i) + " ");
//     }

    return (String[])result.toArray(new String[result.size()]);
  }
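
  /** Returns the number of SEPARATOR characters in a gram, i.e. one less
   * than the number of single terms the gram spans. */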

  private static int arity(String gram) {
    int index = 0;
    int arity = 0;
    while ((index = gram.indexOf(SEPARATOR, index+1)) != -1) {
      arity++;
    }
    return arity;
  }

  /** For debugging. */
  public static void main(String[] args) throws Exception {
    StringBuffer text = new StringBuffer();
    for (int i = 0; i < args.length; i++) {
      text.append(args[i]);
      text.append(' ');
    }
    TokenStream ts =
      new NutchDocumentTokenizer(new StringReader(text.toString()));
    ts = getFilter(ts, "url");
    Token token;
    while ((token = ts.next()) != null) {
      System.out.println("Token: " + token);
    }
    String[] optimized = optimizePhrase(new Phrase(args), "url");
    System.out.print("Optimized: ");
    for (int i = 0; i < optimized.length; i++) {
      System.out.print(optimized[i] + " ");
    }
    System.out.println();
  }
  
}
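
For illustration, a sketch of a debug run via the main() method above, assuming the file named by analysis.common.terms.file lists "the" as a common term for the url field (a hypothetical setup; classpath and configuration details are omitted, and the exact Token formatting depends on the Lucene version):

  java net.nutch.analysis.CommonGrams the quick brown
  Token: ...                        (one line per token, grams included)
  Optimized: the-quick quick brown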
