⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 linkextractor.java

📁 名称:基于JAVA语言提取网站内部URL的算法 程序名称:LinkExtractor
💻 JAVA
字号:
//package org.apache.lucene.index;import java.io.*;import org.htmlparser.Node;import org.htmlparser.Parser;import org.htmlparser.tags.LinkTag;import org.htmlparser.util.ParserException;import java.util.*;import java.net.*;/** * LinkExtractor extracts all the links from the given webpage * and prints them on standard output. */public class LinkExtractor {		private String location;	private Parser parser;	private static int b=0;	private static int tID;	private static int iNode;		public final int DEEP=3; //遍历的深度		public static Vector svecLink, svecOutlink;	public static String hostName;    public static boolean bl;		public LinkExtractor(String location) {		this.location = location;	    hostName=GetHostName(location);	    System.out.println("主机名称是 "+hostName);	     	    bl=false;		svecLink=new Vector();		svecOutlink=new Vector();        svecLink.add(location);			}	public void extractLinks(String loc) throws ParserException {	    System.out.println("Parsing "+loc+" for links...");	    	    Vector vecTemp=new Vector();	    		try {			this.parser   = new Parser(loc); // Create the parser object			parser.registerScanners(); // Register standard scanners (Very Important)	        bl=true;		}		catch (ParserException e) {			bl=false;		    	e.printStackTrace();		}								String ss,str1;        URL wwwurl;        boolean byes;         	    int a=0;        b++;				Node [] links = parser.extractAllNodesThatAre(LinkTag.class);		for (int i = 0;i < links.length;i++) {	      if(bl)	      {	      	byes=true;	        System.out.println("Total url is "+links.length+"This page has url "+i);	      	      	LinkTag linkTag = (LinkTag)links[i];            str1=linkTag.getLink();	            	           // System.out.println("the url is "+str1);&&!svecOutlink.contains(str1)            if(str1.equals("")) continue;                        if(str1.charAt(str1.length()-1)=='/'            ||str1.charAt(str1.length()-1)=='\\')             str1=str1.substring(0,str1.length()-1); 			if(!svecLink.contains(str1)) 			{   			    try			    {				   wwwurl=new URL(str1);				   wwwurl.getContent();			    }			    catch(MalformedURLException e)			    {			        byes=false;			    }			    catch(IOException e)			    {			        byes=false;			    }				if(GetHostName(str1).equals(hostName) && byes)				{			    	a++;			    	tID++;			    	svecLink.add(str1);			    	vecTemp.add(str1);			    	System.out.println("the url is "+str1);		    	}		    	else		    	{		    	   		    	    svecOutlink.add(str1);		    	}			} 	           }            		}    	String strNew;    	if(a>0&&b<=DEEP)    	{        	         	     for(int i=0;i<vecTemp.size();i++)	         {	           strNew=(String)vecTemp.get(i);	           System.out.println("this is "+strNew);	           extractLinks(strNew);	         }	    }	 	}	boolean linkAttribute(String strLink)	{	 return true;	}    static void printCol(Enumeration col)    {    	    	String str;        while(col.hasMoreElements())        {        str=(String)col.nextElement();        System.out.println(str);        }    }        public String GetHostName(String hostname)    {    	URL aurl;    	String ss=" ";    	try		{	    aurl=new URL(hostname);	    ss=aurl.getHost();    	}	    catch(MalformedURLException e)	    {	      e.printStackTrace();	     //return "null";	    }	    return ss;    }    	public static void main(String[] args) {		/*		if (args.length<0) {			System.err.println("Syntax Error : Please provide the location(URL or file) to parse");			System.exit(-1);		}*/	    Vector allLink=new Vector();		String strNew,strall1,strall2,str;		String ss="http://www.dlut.edu.cn/";		LinkExtractor linkExtractor = new LinkExtractor(ss);		try {	        linkExtractor.extractLinks(ss);					    Enumeration col=svecLink.elements();            while(col.hasMoreElements())            {             str=(String)col.nextElement();             System.out.println(str);            }            		}		catch (ParserException e) {			e.printStackTrace();		}	}}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -