dbsectionreader.java

来自「爬虫数据的改进,并修正了一些bug」· Java 代码 · 共 319 行
JAVA
319 行
/* Copyright (c) 2003 The Nutch Organization.  All rights reserved.   */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.db;

import java.io.*;
import java.util.*;

import net.nutch.io.*;
import net.nutch.fs.*;
import net.nutch.util.*;

/**********************************************************
 * DBSectionReader reads a discrete portion of a WebDB.
 * It may implement its methods with either a local
 * MapFile.Reader object or (eventually) a remote-
 * machine network interface.  For the moment, we 
 * do only the MapFile.Reader implementation (much of
 * the code for this was moved from the earlier 
 * pre-distributed version of WebDBReadaer).
 *
 * @author Mike Cafarella
 ***********************************************/
public class DBSectionReader {
    NutchFileSystem nfs;
    File sectionFile;
    WritableComparator comparator;
    MapFile.Reader reader;

    /**
     * Right now we assume we're getting a File that is a 
     * MapFile.Reader directory.  But in the future we could
     * also check for existence of a "remote-network" file, similar
     * to the way we do now for distributed index reading.
     * Then, we would either create a MapFile.Reader or a network
     * client for one.
     */
    public DBSectionReader(NutchFileSystem nfs, File sectionFile, WritableComparator comparator) throws IOException {
        this.nfs = nfs;
        this.sectionFile = sectionFile;
        this.comparator = comparator;
        this.reader = new MapFile.Reader(nfs, sectionFile.getPath(), comparator);
    }

    /**
     * Fetch a Page with the given URL, and fill it into
     * the pre-allocated Page 'p'.
     */
    public Page getPage(UTF8 url, Page p) throws IOException {
        return (Page) reader.get(url, p);
    }

    /**
     * Get Pages from the db according to their
     * content hash.
     */
    public Vector getPages(MD5Hash md5) throws IOException {
        Vector records = new Vector(3);
        Page p = new Page();
        p.getMD5().set(md5);

        reader.seek(p);
        while (reader.next(p, NullWritable.get())) {
            if (p.getMD5().compareTo(md5) == 0) {
                records.add(p);
                p = new Page();
            } else {
                break;
            }
        }

        return records;
    }

    /**
     * Test whether a certain piece of content is in the 
     * db, but don't bother returning it.
     */
    public boolean pageExists(MD5Hash md5) throws IOException {
        Page p = new Page();
        p.getMD5().set(md5);
        reader.seek(p);
        if (reader.next(p, NullWritable.get()) && p.getMD5().compareTo(md5) == 0) {
            return true;
        } else {
            return false;
        }
    }

    /**
     * Iterate through all the Pages, sorted by URL
     */
    public Enumeration pages() throws IOException {
        return new TableEnumerator(new MapFile.Reader(nfs, sectionFile.getPath(), comparator));
    }

    //
    // The TableEnumerator goes through all the entries
    // in the Table (which is a MapFile).
    //
    class TableEnumerator implements Enumeration {
        MapFile.Reader reader;
        Page nextItem;

        /**
         * Start the cursor and find the first item.
         * Store it for later return.
         */
        public TableEnumerator(MapFile.Reader reader) {
            this.reader = reader;
            this.nextItem = new Page();
            try {
                if (! reader.next(new UTF8(), this.nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                ie.printStackTrace();
                this.nextItem = null;
            }
        }

        /**
         * If there's no item left in store, we've hit the end.
         */
        public boolean hasMoreElements() {
            return (nextItem != null);
        }

        /**
         * Set aside the item we have in store.  Then retrieve
         * another for the next time we're called.  Finally, return
         * the set-aside item.
         */
        public Object nextElement() {
            if (nextItem == null) {
                throw new NoSuchElementException("PageDB Enumeration");
            }
            Page toReturn = nextItem;
            this.nextItem = new Page();
            try {
                if (! reader.next(new UTF8(), nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
            return toReturn;
        }
    }

    /**
     * Iterate through all the Pages, sorted by MD5
     */
    public Enumeration pagesByMD5() throws IOException {
        return new IndexEnumerator(new SetFile.Reader(nfs, sectionFile.getPath(), comparator));
    }

    //
    // The IndexEnumerator goes through all the entries
    // in the index (which is a SequenceFile).
    //
    class IndexEnumerator implements Enumeration {
        SetFile.Reader reader;
        Page nextItem;

        /**
         * Start the cursor and find the first item.
         * Store it for later return.
         */
        public IndexEnumerator(SetFile.Reader reader) {
            this.reader = reader;
            this.nextItem = new Page();
            try {
                if (! reader.next(nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
        }

        /**
         * If there's no item left in store, we've hit the end.
         */
        public boolean hasMoreElements() {
            return (nextItem != null);
        }

        /**
         * Set aside the item we have in store.  Then retrieve
         * another for the next time we're called.  Finally, return
         * the set-aside item.
         */
        public Object nextElement() {
            if (nextItem == null) {
                throw new NoSuchElementException("PageDB Enumeration");
            }

            Page toReturn = nextItem;
            this.nextItem = new Page();
            try {
                if (! reader.next(nextItem)) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
            return toReturn;
        }
    }

    /**
     * Get all the hyperlinks that link TO the indicated URL.
     */     
    public Vector getLinks(UTF8 url) throws IOException {
        Vector records = new Vector(3);
        Link l = new Link();
        l.getURL().set(url);

        reader.seek(l);
        while (reader.next(l, NullWritable.get())) {
            if (url.equals(l.getURL())) {
                records.add(l);
                l = new Link();
            } else {
                break;
            }
        }
        
        return records;
    }

    /**
     * Grab all the links from the given MD5 hash.
     */
    public Vector getLinks(MD5Hash md5) throws IOException {
        Vector records = new Vector(3);
        Link l = new Link();
        l.getFromID().set(md5);

        reader.seek(l);
        while (reader.next(l, NullWritable.get())) {
            if (md5.equals(l.getFromID())) {
                records.add(l);
                l = new Link();
            } else {
                break;
            }
        }
        
        return records;
    }

    /**
     * Return all the links, by target URL
     */
    public Enumeration links() throws IOException {
        return new MapEnumerator(new MapFile.Reader(nfs, sectionFile.getPath(), comparator));
    }
    
    //
    // Here's the class for the above function
    //
    class MapEnumerator implements Enumeration {
        MapFile.Reader reader;
        Link nextItem;

        /**
         * Start the cursor and find the first item.
         * Store it for later return.
         */
        public MapEnumerator(MapFile.Reader reader) {
            this.reader = reader;
            this.nextItem = new Link();
            try {
                if (! reader.next(this.nextItem, NullWritable.get())) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
        }

        /**
         * If there's no item left in store, we've hit the end.
         */
        public boolean hasMoreElements() {
            return (nextItem != null);
        }

        /**
         * Set aside the item we have in store.  Then retrieve
         * another for the next time we're called.  Finally, return
         * the set-aside item.
         */
        public Object nextElement() {
            if (nextItem == null) {
                throw new NoSuchElementException("PageDB Enumeration");
            }

            Link toReturn = nextItem;
            this.nextItem = new Link();
            try {
                if (! reader.next(nextItem, NullWritable.get())) {
                    this.nextItem = null;
                }
            } catch (IOException ie) {
                this.nextItem = null;
            }
            return toReturn;
        }
    }

    /**
     */
    public void close() throws IOException {
        reader.close();
    }
}
dbsectionreader.java - 源码说明

本页面展示了「爬虫数据的改进,并修正了一些bug」中的 dbsectionreader.java 源码文件，采用 Java 编程语言编写，共 319 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与bug相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?