📄 standardclassifier.java
字号:
/* * WebSPHINX web crawling toolkit * Copyright (C) 1998,1999 Carnegie Mellon University * * This library is free software; you can redistribute it * and/or modify it under the terms of the GNU Library * General Public License as published by the Free Software * Foundation, version 2. * * WebSPHINX homepage: http://www.cs.cmu.edu/~rcm/websphinx/ */package websphinx;/** * Standard classifier, installed in every crawler by default. * <P>On the entire page, this classifier sets the following labels: * <UL> * <LI><B>root</B>: page is the root page of a Web site. For instance, * "http://www.digital.com/" and "http://www.digital.com/index.html" are both * marked as root, but "http://www.digital.com/about" is not. * </UL> * <P>Also sets one or more of the following labels on every link: * <UL> * <LI><B>hyperlink</B>: link is a hyperlink (A, AREA, or FRAME tags) to another page on the Web (using http, file, ftp, or gopher protocols) * <LI><B>image</B>: link is an inline image (IMG). * <LI><B>form</B>: link is a form (FORM tag). A form generally requires some parameters to use. * <LI><B>code</B>: link points to code (APPLET, EMBED, or SCRIPT). * <LI><B>remote</B>: link points to a different Web server. * <LI><B>local</B>: link points to the same Web server. * <LI><B>same-page</B>: link points to the same page (e.g., by an anchor reference like "#top") * <LI><B>sibling</B>: a local link that points to a page in the same directory (e.g. "sibling.html") * <LI><B>descendent</B>: a local link that points downwards in the directory structure (e.g., "deep/deeper/deepest.html") * <LI><B>ancestor</B>: a link that points upwards in the directory structure (e.g., "../..") * </UL> */public class StandardClassifier implements Classifier { /** * Make a StandardClassifier. */ public StandardClassifier () { } /** * Classify a page. * @param page Page to classify */ // FIX: use regular expressions throughout this method public void classify (Page page) { Link origin = page.getOrigin (); String pagePath = origin.getFile(); String pageFilename = origin.getFilename(); String pageDir = origin.getDirectory (); if (pageFilename.equals ("") || pageFilename.startsWith ("index.htm")) page.setLabel ("root"); // FIX: Link needs to resolve "foo/bar/.." and "foo/." to "foo" in order for this // stuff to work properly Link[] links = page.getLinks (); if (links != null) { for (int i=0; i<links.length; ++i) { Link link = links[i]; if (link.getHost().equals (origin.getHost()) && link.getPort() == origin.getPort()) { link.setLabel ("local"); String linkPath = link.getFile (); String linkDir = link.getDirectory (); if (linkPath.equals (pagePath)) link.setLabel ("same-page"); else if (linkDir.equals (pageDir)) link.setLabel ("sibling"); else if (linkDir.startsWith (pageDir)) link.setLabel ("descendent"); else if (pageDir.startsWith (linkDir)) link.setLabel ("ancestor"); // NIY: child, parent } else link.setLabel ("remote"); // Link tag kinds: resource, form, hyperlink String tagName = link.getTagName(); if (tagName == Tag.IMG) link.setLabel ("image"); else if (tagName == Tag.APPLET || tagName == Tag.EMBED || tagName == Tag.SCRIPT) link.setLabel ("code"); else if (tagName == Tag.FORM) link.setLabel ("form"); else if (tagName == Tag.A || tagName == Tag.AREA || tagName == Tag.FRAME) { String protocol = link.getProtocol (); if ((protocol.equals ("http") || protocol.equals ("ftp") || protocol.equals ("file") || protocol.equals ("gopher")) && link.getMethod() == Link.GET) link.setLabel ("hyperlink"); } } } } /** * Priority of this classifier. */ public static final float priority = 0.0F; /** * Get priority of this classifier. * @return priority. */ public float getPriority () { return priority; }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -