📄 spider.java

📁 lucene 是 Java 版的搜索引擎公共模块
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
					}
				}

				if (url != null && isloopget)
				{
					if (url.startsWith("http://")
							|| (url.startsWith("https://") && groksHTTPS)) {

						// verify we're on the same host and port
						URL u = new URL(url);
						if (u.getHost().equals(summary.url.getHost())
								&& u.getPort() == summary.url.getPort()) {

							url = chopOffNamedAnchor(url);
							if (indexedURLs.get(url) == null)
								urls.add(url);
						}
					} else if (url.indexOf("://") == -1
							&& !url.startsWith("mailto:")
							&& !url.startsWith("#")
							&& !url.startsWith("javascript:")) {
						// parse relative url
						url = formURL(summary.url, url);
						url = chopOffNamedAnchor(url);
						if (indexedURLs.get(url) == null)
							urls.add(url);
					}
				}
				
				
			} else if (obj instanceof TextToken) {
				if(isIgnoreText) continue;
				TextToken t = (TextToken) obj;
				String text = t.getText();
				if (text != null && text.trim().length() > 0)
					desc.append(text.trim()).append(" ");
			}
		}

		if (desc.length() > descSize)
			desc.setLength(descSize);
		summary.desc = desc.toString();

		String list[] = new String[urls.size()];
		urls.toArray(list);
		return list;
	}

	/**
	 * Strips the named-anchor (fragment) part from a URL.
	 *
	 * @param url the URL text to trim; must not be null
	 * @return everything before the first '#', or {@code url} itself when it
	 *         contains no '#'
	 */
	private String chopOffNamedAnchor(String url) {
		int hashAt = url.indexOf('#');
		return (hashAt < 0) ? url : url.substring(0, hashAt);
	}

	/**
	 * Converts a relative URL to an absolute URL.
	 *
	 * The scheme, host and (when explicitly set) port are taken from
	 * {@code origURL}. A {@code newURL} starting with "/" is treated as
	 * host-relative; anything else is resolved against the directory of
	 * {@code origURL}'s path, with each leading "../" (or a bare "..")
	 * moving one directory up. Going up past the root is clamped at the
	 * root instead of failing.
	 *
	 * @param origURL the URL of the page the link was found on
	 * @param newURL  the relative link text
	 * @return the absolute URL as a string
	 */
	private String formURL(URL origURL, String newURL) {
		StringBuffer base = new StringBuffer(origURL.getProtocol());
		base.append("://").append(origURL.getHost());
		if (origURL.getPort() != -1) {
			base.append(":").append(origURL.getPort());
		}

		if (newURL.startsWith("/")) {
			// host-relative link: replaces the whole path
			base.append(newURL);
			return base.toString();
		}

		// Use getPath() rather than getFile(): getFile() includes the query
		// string, and a '/' inside the query must not be mistaken for a
		// directory separator.
		String dir = origURL.getPath();
		int pos = dir.lastIndexOf("/");
		dir = (pos != -1) ? dir.substring(0, pos) : "";

		// Each leading "../" climbs one directory. Previously a separate
		// branch caught every link starting with ".." and appended nothing,
		// so such links resolved to the bare host.
		while (newURL.startsWith("../") || newURL.equals("..")) {
			pos = dir.lastIndexOf("/");
			if (pos != -1)
				dir = dir.substring(0, pos); // already at the root otherwise: stay there
			newURL = newURL.equals("..") ? "" : newURL.substring(3);
		}

		base.append(dir).append("/").append(newURL);
		return base.toString();
	}
	
	/**
	 * Parses a Content-Type header value such as
	 * {@code text/html;charset=utf-8}.
	 *
	 * Only the {@code charset} parameter (matched case-insensitively) is
	 * used as the character set; other parameters such as {@code boundary}
	 * are ignored. Surrounding whitespace and a pair of enclosing double
	 * quotes around the charset value are removed.
	 *
	 * @param strcontenttype the header value, e.g. text/html;charset=utf-8;
	 *                       may be null
	 * @return a two-element array: ret[0] = media type (default
	 *         "text/html"), ret[1] = charset (default "gb2312")
	 */
	private String [] parseContentType(String strcontenttype)
	{
		String straret [] = new String[2];
		// default values, used for anything the header does not supply
		straret[0] ="text/html";
		straret[1] ="gb2312";
		if (strcontenttype == null)
			return straret;

		int semi = strcontenttype.indexOf(';');
		String mime = (semi == -1
				? strcontenttype
				: strcontenttype.substring(0, semi)).trim();
		if (mime.length() > 0)
			straret[0] = mime;

		// Walk the ';'-separated parameters and pick out charset only.
		// Taking everything after the first '=' would return other
		// parameters (or trailing ones) as part of the charset.
		int start = semi;
		while (start != -1) {
			int end = strcontenttype.indexOf(';', start + 1);
			String param = (end == -1
					? strcontenttype.substring(start + 1)
					: strcontenttype.substring(start + 1, end)).trim();
			int eq = param.indexOf('=');
			if (eq != -1
					&& param.substring(0, eq).trim().equalsIgnoreCase("charset")) {
				String value = param.substring(eq + 1).trim();
				if (value.length() >= 2 && value.charAt(0) == '"'
						&& value.charAt(value.length() - 1) == '"')
					value = value.substring(1, value.length() - 1).trim();
				if (value.length() > 0)
					straret[1] = value;
				break;
			}
			start = end;
		}
		return straret;
	}

	/**
	 * Fetches a page with a GET request (following redirects) and wraps
	 * its body in a {@link URLSummary}.
	 *
	 * When the response charset is reported as ISO-8859-1 the encoding is
	 * treated as uncertain: the raw bytes are run through
	 * {@link #getFileEncoding(InputStream)} and, if that reports big5 or
	 * gb2312, the body is re-decoded as GBK.
	 *
	 * @param url the absolute URL to download
	 * @return a summary holding the URL and body, or null when the status
	 *         code is not 200 or any exception occurs (e.g. a 404)
	 */
	private URLSummary loadURL(String url)  {
		GetMethod request = null;
		try {
			request = new GetMethod(url);
			request.setFollowRedirects(true);
			if (httpclient.executeMethod(request) != 200) {
				return null;
			}

			String charset = request.getResponseCharSet();
			String body = request.getResponseBodyAsString();
			if ("ISO-8859-1".equals(charset)) {
				// encoding is uncertain: recover the raw bytes and sniff them
				byte[] raw = body.getBytes("ISO-8859-1");
				String detected = getFileEncoding(new ByteArrayInputStream(raw));
				System.out.println("charset=="+charset+"strcharset="+detected);

				boolean chinese = "big5".equalsIgnoreCase(detected)
						|| "gb2312".equalsIgnoreCase(detected);
				if (chinese) {
					body = new String(raw, "GBK");
				}
			}

			URLSummary page = new URLSummary();
			page.url = new URL(url);
			page.body = body;
			return page;
		} catch (Exception e) {
			// any failure is reported to the caller as "no page"
			return null;
		} finally {
			if (request != null) {
				request.releaseConnection();
			}
		}
	}
	
	
	/**
	 * Guesses the character encoding of a byte stream using the
	 * nsDetector charset detector.
	 *
	 * The stream is read to the end and always closed, even when reading
	 * fails.
	 *
	 * @param imp the stream to inspect; consumed and closed by this method
	 * @return "ASCII" when every byte read is ASCII (including an empty
	 *         stream); otherwise the charset the detector reported to its
	 *         observer, or, when it reported none, the first entry of
	 *         {@code getProbableCharsets()}
	 * @throws Exception if reading or closing the stream fails
	 */
	public static String  getFileEncoding(InputStream imp) throws Exception 
	{
		// One-element holder so the anonymous observer can hand the result
		// back. The observer used to set a static flag on the unrelated
		// HtmlCharsetDetector class and drop the charset it was given.
		final String[] notified = new String[1];

		// NOTE(review): 2 is presumably the detector's "Chinese" language
		// hint - confirm against the nsPSMDetector constants.
		nsDetector det = new nsDetector(2) ;

		// Notify() is called when a matching charset is found.
		det.Init(new nsICharsetDetectionObserver() {
			public void Notify(String charset) {
				notified[0] = charset;
			}
		});

		boolean isAscii = true ;
		try {
			byte[] buf = new byte[1024] ;
			int len;
			boolean done = false ;

			while( (len=imp.read(buf,0,buf.length)) != -1) {

				// Check if the stream is only ascii.
				if (isAscii)
					isAscii = det.isAscii(buf,len);

				// DoIt if non-ascii and not done yet.
				if (!isAscii && !done)
					done = det.DoIt(buf,len, false);
			}
			det.DataEnd();
		} finally {
			imp.close();
		}

		if (isAscii)
			return "ASCII";

		if (notified[0] != null)
			return notified[0];

		// No definite answer: fall back to the detector's best guess.
		String prob[] = det.getProbableCharsets() ;
		if (prob.length > 0)
			return prob[0];
		return "ASCII";
	}
	
	

	/**
	 * Reads the command-line options into the spider's configuration
	 * fields.
	 *
	 * Options taking a value: -u (start URL), -d (index directory),
	 * -i (include entry), -e (exclude entry), -m (MIME type),
	 * -t (thread count), -s (description size). Switches: -v (verbose),
	 * -a (incremental). Unrecognised arguments are ignored. When no -m is
	 * given, text/html and text/plain are accepted by default.
	 *
	 * @param argv the raw command-line arguments
	 * @throws IllegalArgumentException if an option is missing its value,
	 *         -u or -d is absent, a numeric value is not a number, or the
	 *         thread count is below 1
	 */
	private void parseArgs(String argv[]) {
		for (int i = 0; i < argv.length; i++) {
			if (argv[i].equals("-u"))
				urls.add(optionValue(argv, ++i));
			else if (argv[i].equals("-d"))
				indexDir = optionValue(argv, ++i);
			else if (argv[i].equals("-i"))
				include.add(optionValue(argv, ++i));
			else if (argv[i].equals("-e"))
				exclude.add(optionValue(argv, ++i));
			else if (argv[i].equals("-v"))
				verbose = true;
			else if (argv[i].equals("-a"))
				incremental = true;
			else if (argv[i].equals("-m"))
				mimeTypes.put(optionValue(argv, ++i), Boolean.TRUE);
			else if (argv[i].equals("-t"))
				threads = Integer.parseInt(optionValue(argv, ++i));
			else if (argv[i].equals("-s"))
				descSize = Integer.parseInt(optionValue(argv, ++i));
		}

		if (urls.size() == 0)
			throw new IllegalArgumentException(
					"Missing required argument: -u [start url]");
		if (indexDir == null)
			throw new IllegalArgumentException(
					"Missing required argument: -d [index dir]");

		if (threads < 1)
			throw new IllegalArgumentException("Invalid number of threads: "
					+ threads);

		if (mimeTypes.size() == 0) {
			// add default MIME types
			mimeTypes.put("text/html", Boolean.TRUE);
			mimeTypes.put("text/plain", Boolean.TRUE);
		}
	}

	// Returns argv[index], the value of the option at index - 1, or fails
	// with IllegalArgumentException when the option was the last argument
	// (instead of a raw ArrayIndexOutOfBoundsException).
	private static String optionValue(String argv[], int index) {
		if (index >= argv.length)
			throw new IllegalArgumentException("Missing value for option: "
					+ argv[index - 1]);
		return argv[index];
	}

	/**
	 * Writes one line of text to standard output.
	 *
	 * @param str the text to print
	 */
	private void print(String str) {
		java.io.PrintStream console = System.out;
		console.println(str);
	}

	
	/**
	 * Entry point: creates a Spider and runs spiderURL on a fixed start
	 * page. The command-line arguments are not used.
	 *
	 * @param argv ignored
	 * @throws Exception whatever the Spider constructor or spiderURL throws
	 */
	public static void main(String argv[]) throws Exception
	{
		final String startUrl = "http://www.sina.com.cn/";
		new Spider().spiderURL(startUrl);
	}
	
}

/**
 * Plain data holder for one fetched page: its URL, raw body, a text
 * description and a title.
 */
class URLSummary {

	/** Address of the page; null until assigned. */
	URL url;

	/** Page content; null until assigned. */
	String body;

	/** Text description of the page; empty by default. */
	String desc = "";

	/** Page title; "Untitled" by default. */
	String title = "Untitled";

	/**
	 * Renders the URL, description and title on separate CRLF-terminated
	 * lines. An unset URL is printed as "null" rather than causing a
	 * NullPointerException.
	 */
	public String toString() {
		// plain concatenation is null-safe, unlike url.toString()
		return "URL=" + url + "\r\ndesc=" + desc + "\r\ntitle="
				+ title + "\r\n";
	}

}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -