📄 distributedwebdbreader.java

📁 爬虫数据的改进,并修正了一些bug
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
    // EnumCall is how the code that builds a MetaEnumerator chooses
    // which enumeration gets pulled out of each section reader:
    // the pages, or the links.
    //
    abstract static class EnumCall {
        /**
         * No state to initialise.
         */
        public EnumCall() {
            super();
        }

        /**
         * Produces the enumeration that a MetaEnumerator should consume
         * for one section.  Each concrete subclass picks a different
         * enumeration of the given reader.
         *
         * @param reader the section reader to obtain an enumeration from
         * @return the enumeration chosen by the subclass
         * @throws IOException declared so implementations may propagate
         *         I/O failures raised by the reader
         */
        public abstract Enumeration getEnumeration(DBSectionReader reader) throws IOException;
    }

    //
    // EnumCall that selects the Page enumeration of a section
    //
    static class PageEnumCall extends EnumCall {
        /**
         * No state to initialise.
         */
        public PageEnumCall() {
            super();
        }

        /**
         * Hands back the result of calling pages() on the given
         * section reader.
         *
         * @param reader the section reader to query
         * @return the enumeration returned by reader.pages()
         * @throws IOException if reader.pages() throws it
         */
        public Enumeration getEnumeration(final DBSectionReader reader) throws IOException {
            final Enumeration pageEnum = reader.pages();
            return pageEnum;
        }
    }

    //
    // EnumCall that selects the by-MD5 Page enumeration of a section
    //
    static class PageByMD5EnumCall extends EnumCall {
        /**
         * No state to initialise.
         */
        public PageByMD5EnumCall() {
            super();
        }

        /**
         * Hands back the result of calling pagesByMD5() on the given
         * section reader.
         *
         * @param reader the section reader to query
         * @return the enumeration returned by reader.pagesByMD5()
         * @throws IOException if reader.pagesByMD5() throws it
         */
        public Enumeration getEnumeration(final DBSectionReader reader) throws IOException {
            final Enumeration md5PageEnum = reader.pagesByMD5();
            return md5PageEnum;
        }
    }

    //
    // EnumCall that selects the Link enumeration of a section
    //
    static class LinkEnumCall extends EnumCall {
        /**
         * No state to initialise.
         */
        public LinkEnumCall() {
            super();
        }

        /**
         * Hands back the result of calling links() on the given
         * section reader.
         *
         * @param reader the section reader to query
         * @return the enumeration returned by reader.links()
         * @throws IOException if reader.links() throws it
         */
        public Enumeration getEnumeration(final DBSectionReader reader) throws IOException {
            final Enumeration linkEnum = reader.links();
            return linkEnum;
        }
    }

    //
    // MetaEnumerator chains together the Enumerations obtained from
    // every DBSectionReader in the passed-in array and presents them
    // as one Enumeration, consumed in array order.
    //
    class MetaEnumerator implements Enumeration {
        // One Enumeration per section, in the same order as the
        // sections array given to the constructor.
        Enumeration enumerations[];
        // Index of the Enumeration currently being drained; it only
        // ever moves forward.
        int curEnum = 0;

        /**
         * Asks the EnumCall for one Enumeration per section and keeps
         * them all for later traversal.
         *
         * @param sections the section readers to enumerate over
         * @param enumCall decides which enumeration is taken from each reader
         * @throws IOException if obtaining any of the enumerations fails
         */
        public MetaEnumerator(DBSectionReader sections[], EnumCall enumCall) throws IOException {
            final int count = sections.length;
            Enumeration perSection[] = new Enumeration[count];

            for (int idx = 0; idx < count; idx++) {
                perSection[idx] = enumCall.getEnumeration(sections[idx]);
            }
            this.enumerations = perSection;
        }

        /**
         * Reports whether any element is left.  Starting at the current
         * Enumeration, skips forward past every one that is used up and
         * stops on the first that still has elements.
         *
         * @return true if some remaining Enumeration has more elements,
         *         false once all of them have been passed
         */
        public boolean hasMoreElements() {
            while (curEnum < enumerations.length) {
                if (enumerations[curEnum].hasMoreElements()) {
                    return true;
                }
                // This one is drained; move on to the next section.
                curEnum++;
            }
            return false;
        }

        /**
         * Returns the next non-null element, moving on to later
         * Enumerations as earlier ones run dry.  An Enumeration that
         * hands back null is abandoned in favour of the next one.
         *
         * @return the next element, or null when every Enumeration
         *         has been passed (no exception is thrown in that case)
         */
        public Object nextElement() {
            while (curEnum < enumerations.length) {
                Enumeration current = enumerations[curEnum];

                if (current.hasMoreElements()) {
                    Object candidate = current.nextElement();

                    if (candidate != null) {
                        return candidate;
                    }
                }
                // Either nothing left here or a null came back:
                // advance to the following Enumeration.
                curEnum++;
            }
            return null;
        }
    }

    /**
     * The DistributedWebDBReader.main() provides some handy utility methods
     * for looking through the contents of the webdb.
     *
     * Note this only works for a completely-NFS deployment.
     *
     * <p>The filesystem arguments are handed to
     * NutchFileSystem.parseArgs(); the next argument is the db root
     * directory, followed by exactly one command:</p>
     * <ul>
     * <li>-pageurl url: print the page stored for the (trimmed) URL</li>
     * <li>-pagemd5 md5: print every page with the given MD5</li>
     * <li>-dumppageurl: print all pages from reader.pages()</li>
     * <li>-dumppagemd5: print all pages from reader.pagesByMD5()</li>
     * <li>-toppages k: print the k highest-scoring pages, lowest score
     *     first; a k of zero or less prints nothing</li>
     * <li>-linkurl url: print the links found for the (trimmed) URL</li>
     * <li>-linkmd5 md5: print the links found for the given MD5</li>
     * <li>-dumplinks: print all links from reader.links()</li>
     * <li>-stats: print page, link and section counts</li>
     * </ul>
     *
     * <p>The reader is always closed before returning, including when a
     * command fails with an exception.</p>
     *
     * @param argv command-line arguments as described above
     * @throws FileNotFoundException declared for failures opening the db
     * @throws IOException if reading the db fails
     * @throws NumberFormatException if the -toppages count is not an integer
     */
    public static void main(String argv[]) throws FileNotFoundException, IOException {
        if (argv.length < 2) {
            System.out.println("Usage: java net.nutch.db.DistributedWebDBReader (-local | -ndfs <namenode:port>) <root> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
            return;
        }

        int i = 0;
        NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
        File root = new File(argv[i++]);
        DistributedWebDBReader reader = new DistributedWebDBReader(nfs, root);
        try {
            String cmd = argv[i++];

            if ("-pageurl".equals(cmd)) {
                String url = argv[i++];
                System.out.println(reader.getPage(url.trim()));
            } else if ("-pagemd5".equals(cmd)) {
                MD5Hash md5 = new MD5Hash(argv[i++]);
                Page pages[] = reader.getPages(md5);
                System.out.println("Found " + pages.length + " pages.");
                for (int j = 0; j < pages.length; j++) {
                    System.out.println("Page " + j + ": " + pages[j]);
                }
            } else if ("-dumppageurl".equals(cmd)) {
                int j = 1;
                for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + j + ": " + page);
                    System.out.println();
                }
            } else if ("-dumppagemd5".equals(cmd)) {
                int j = 1;
                for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) {
                    Page page = (Page) e.nextElement();
                    System.out.println("Page " + j + ": " + page);
                    System.out.println();
                }
            } else if ("-toppages".equals(cmd)) {
                int topSize = Integer.parseInt(argv[i++]);

                // Create a sorted list, ordered by ascending score so
                // that first() is always the weakest page kept so far.
                SortedSet topSet = new TreeSet(new Comparator() {
                    public int compare(Object o1, Object o2) {
                        Page p1 = (Page) o1;
                        Page p2 = (Page) o2;
                        if (p1.getScore() < p2.getScore()) {
                            return -1;
                        } else if (p1.getScore() == p2.getScore()) {
                            // If two scores are equal, we will
                            // use regular Page comparison (which
                            // uses URL as the primary key).  We
                            // don't want to uniquify by score!
                            return p1.compareTo(p2);
                        } else {
                            return 1;
                        }
                    }
                }
                    );

                // Find the top "topSize" elts.  A non-positive topSize
                // selects nothing, so the scan is skipped entirely:
                // without this guard the set never fills, lowestPage
                // stays null, and the replacement branch below would
                // throw a NullPointerException on the first page.
                if (topSize > 0) {
                    Page lowestPage = null;
                    for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                        Page curPage = (Page) e.nextElement();
                        if (topSet.size() < topSize) {
                            // Still filling up: keep everything.
                            topSet.add(curPage);
                            lowestPage = (Page) topSet.first();
                        } else if (lowestPage.getScore() < curPage.getScore()) {
                            // Full: evict the weakest page for a stronger one.
                            topSet.remove(lowestPage);
                            topSet.add(curPage);
                            lowestPage = (Page) topSet.first();
                        }
                    }
                }

                // Print them out
                int j = 0;
                for (Iterator it = topSet.iterator(); it.hasNext(); j++) {
                    System.out.println("Page " + j + ": " + (Page) it.next());
                    System.out.println();
                }
            } else if ("-linkurl".equals(cmd)) {
                String url = argv[i++];
                Link links[] = reader.getLinks(new UTF8(url.trim()));
                System.out.println("Found " + links.length + " links.");
                for (int j = 0; j < links.length; j++) {
                    System.out.println("Link " + j + ": " + links[j]);
                }
            } else if ("-linkmd5".equals(cmd)) {
                MD5Hash fromID = new MD5Hash(argv[i++]);
                Link links[] = reader.getLinks(fromID);
                System.out.println("Found " + links.length + " links.");
                for (int j = 0; j < links.length; j++) {
                    System.out.println("Link " + j + ": " + links[j]);
                }
            } else if ("-dumplinks".equals(cmd)) {
                int j = 1;
                for (Enumeration e = reader.links(); e.hasMoreElements(); j++) {
                    Link link = (Link) e.nextElement();
                    System.out.println("Link " + j + ": " + link);
                    System.out.println();
                }
            } else if ("-stats".equals(cmd)) {
                System.out.println("Stats for " + reader);
                System.out.println("-------------------------------");
                System.out.println("Number of pages: " + reader.numPages());
                System.out.println("Number of links: " + reader.numLinks());
                System.out.println("Number of machines (sections): " + reader.numMachines());
            } else {
                System.out.println("Sorry, no command with name " + cmd);
            }
        } finally {
            // Release the db whether or not the command succeeded.
            reader.close();
        }
    }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -