📄 linkdbreader.java

📁 nutch0.8源码
💻 JAVA
字号:
/** * Copyright 2005 The Apache Software Foundation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * *     http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */package org.apache.nutch.crawl;import java.io.IOException;// Commons Logging importsimport org.apache.commons.logging.Log;import org.apache.commons.logging.LogFactory;import org.apache.hadoop.io.*;import org.apache.hadoop.fs.*;import org.apache.hadoop.mapred.*;import org.apache.hadoop.mapred.lib.HashPartitioner;import org.apache.hadoop.conf.Configuration;import org.apache.nutch.util.NutchConfiguration;import org.apache.nutch.util.NutchJob;import java.util.Iterator;/** . */public class LinkDbReader implements Closeable {  public static final Log LOG = LogFactory.getLog(LinkDbReader.class);  private static final Partitioner PARTITIONER = new HashPartitioner();  private FileSystem fs;  private Path directory;  private MapFile.Reader[] readers;  private Configuration conf;  public LinkDbReader(FileSystem fs, Path directory, Configuration conf) {    this.fs = fs;    this.directory = directory;    this.conf = conf;  }  public String[] getAnchors(UTF8 url) throws IOException {    Inlinks inlinks = getInlinks(url);    if (inlinks == null)      return null;    return inlinks.getAnchors();  }  public Inlinks getInlinks(UTF8 url) throws IOException {    synchronized (this) {      if (readers == null) {        readers = MapFileOutputFormat.getReaders          (fs, new Path(directory, LinkDb.CURRENT_NAME), this.conf);      }    }        return (Inlinks)MapFileOutputFormat.getEntry      (readers, PARTITIONER, url, new Inlinks());  }    public void close() throws IOException {    if (readers != null) {      for (int i = 0; i < readers.length; i++) {        readers[i].close();      }    }  }    public static void processDumpJob(String linkdb, String output, Configuration config) throws IOException {    if (LOG.isInfoEnabled()) {      LOG.info("LinkDb dump: starting");      LOG.info("LinkDb db: " + linkdb);    }    Path outFolder = new Path(output);    JobConf job = new NutchJob(config);    job.setJobName("read " + linkdb);    job.addInputPath(new Path(linkdb, LinkDb.CURRENT_NAME));    job.setInputFormat(SequenceFileInputFormat.class);    job.setInputKeyClass(UTF8.class);    job.setInputValueClass(Inlinks.class);    job.setOutputPath(outFolder);    job.setOutputFormat(TextOutputFormat.class);    job.setOutputKeyClass(UTF8.class);    job.setOutputValueClass(Inlinks.class);    JobClient.runJob(job);  }    public static void main(String[] args) throws Exception {    if (args.length < 2) {      System.err.println("LinkDbReader <linkdb> {-dump <out_dir> | -url <url>)");      System.err.println("\t-dump <out_dir>\tdump whole link db to a text file in <out_dir>");      System.err.println("\t-url <url>\tprint information about <url> to System.out");      return;    }    Configuration conf = NutchConfiguration.create();    if (args[1].equals("-dump")) {      LinkDbReader.processDumpJob(args[0], args[2], conf);    } else if (args[1].equals("-url")) {      LinkDbReader dbr = new LinkDbReader(FileSystem.get(NutchConfiguration.create()), new Path(args[0]), conf);      Inlinks links = dbr.getInlinks(new UTF8(args[2]));      if (links == null) {        System.out.println(" - no link information.");      } else {        Iterator it = links.iterator();        while (it.hasNext()) {          System.out.println(it.next().toString());        }      }    } else {      System.err.println("Error: wrong argument " + args[1]);      return;    }  }}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -