📄 spider.java

📁 lucene 是java 的版的搜索引擎公共模块
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
12 下一页
package com.microvois.luence;


import cvu.html.HTMLTokenizer;
import cvu.html.TagToken;
import cvu.html.TextToken;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Enumeration;
import java.net.URL;
import java.net.HttpURLConnection;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.StringReader;
import java.io.FileNotFoundException;
import java.security.Security;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.mozilla.intl.chardet.HtmlCharsetDetector;
import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import java.io.*;

import com.microvois.luence.inputproxy;


public class Spider implements Runnable {

	// Platform line separator, read once from the "line.separator" system property.
	private static String lineSep = System.getProperty("line.separator");

	// Index directory; spiderURL() assigns it from Config.StrDataDirectroy and go() reports it in verbose mode.
	private String indexDir;

	// Queue of URLs waiting to be crawled: spiderURL()/enqueueURL() append, dequeueURL() removes from the head.
	private ArrayList urls;

	// Substrings used by indexURL() when filtering discovered links: a link is kept only while it contains each entry.
	private ArrayList include;

	// Substrings used by indexURL() when filtering discovered links: a link containing any entry is dropped.
	private ArrayList exclude;

	// Worker threads started by go().
	private ArrayList threadList;

	// When true, go() and indexURL() print progress messages.
	private boolean verbose;

	// Incremental-mode flag; in the code visible here it only changes the verbose message printed by go().
	private boolean incremental;

	// Set to true by the constructor; not read in the visible part of the file.
	// NOTE(review): presumably tells whether HTTPS URLs can be fetched -- confirm.
	private boolean groksHTTPS;


	// URLs that have already been queued (key = URL, value = Boolean.TRUE); enqueueURL() uses it to skip duplicates.
	private HashMap indexedURLs;

	// MIME types keyed by name with Boolean.TRUE values; spiderURL() adds text/html and text/plain when it is empty.
	private HashMap mimeTypes;

	// Number of worker threads go() starts; dequeueURL() and run() also adjust it as workers go idle or exit.
	private int threads;

	// Set to 1024 by the constructor; not read in the visible part of the file.
	// NOTE(review): presumably the maximum description length -- confirm.
	private int descSize;
	// Counter consulted by indexURL() to decide when to flush inputproxy (every 100).
	private int nInputcount = 0;

	// Running total of page body lengths added by indexURL(); go() reports it in KB.
	private int bytes;
	 // HTTP client created by the constructor on top of a MultiThreadedHttpConnectionManager.
	 private HttpClient httpclient;
     
	 // Whole-site crawl flag stored by setLoopSpider(); where it is consulted is not visible here.
	 private boolean isloopget = false;
	 // Arguments passed to the constructor (may be null); go() waits for the workers only when this is non-null.
	 private String mainargs[] = null;
	/**
	 * Stores the flag that says whether the whole site should be analysed
	 * and fed into the index.
	 *
	 * @param bvalue {@code true} to request a whole-site crawl, {@code false} otherwise
	 */
	public void setLoopSpider(boolean bvalue) {
		isloopget = bvalue;
	}
	
	
	
	/**
	 * Queues one entry-point URL and starts crawling from it.
	 *
	 * The URL and the index directory are validated before the queue is
	 * touched, so a rejected call leaves no stray entry behind. If no MIME
	 * types have been configured, text/html and text/plain are registered
	 * as defaults.
	 *
	 * @param url entry-point URL to crawl; must not be null or blank
	 * @throws IllegalArgumentException if {@code url} is null or blank, or if
	 *         no index directory is configured
	 * @throws Exception anything propagated by {@link #go()}
	 */
	public void spiderURL(String url) throws Exception 
	{
		inputproxy.flush();

		// Validate before queueing: checking the queue size after the add
		// could never fail and let a null/blank URL reach the workers.
		if (url == null || url.trim().length() == 0)
			throw new IllegalArgumentException(
					"缺少需要的URL");

		indexDir = Config.StrDataDirectroy;
		if (indexDir == null)
			throw new IllegalArgumentException(
					"Missing required argument: -d [index dir]");

		if (mimeTypes.size() == 0) {
			// add default MIME types
			mimeTypes.put("text/html", Boolean.TRUE);
			mimeTypes.put("text/plain", Boolean.TRUE);
		}

		// The queue is shared with worker threads that may still be running
		// from an earlier call, so it is modified under the same monitor
		// that enqueueURL()/dequeueURL() use.
		synchronized (this) {
			urls.add(url);
			notifyAll();
		}

		go();
	}
	 

	/**
	 * Creates a spider with the default settings and no command-line arguments.
	 */
	public Spider() {
		this((String[]) null);
	}
	
	/**
	 * Creates a spider: installs the defaults, hands any supplied arguments
	 * to parseArgs, and then builds the shared HTTP client.
	 *
	 * @param argv command-line style arguments, or {@code null} to keep the defaults
	 */
	public Spider(String argv[]) {
		// Empty collections.
		urls = new ArrayList();
		include = new ArrayList();
		exclude = new ArrayList();
		threadList = new ArrayList();
		indexedURLs = new HashMap();
		mimeTypes = new HashMap();

		// Scalar defaults.
		bytes = 0;
		descSize = 1024;
		threads = 1;
		incremental = false;
		verbose = false;
		groksHTTPS = true;

		// Arguments are processed only after every default is in place.
		mainargs = argv;
		if (argv != null) {
			parseArgs(argv);
		}

		// HTTP client backed by a multi-threaded connection manager,
		// with the connection timeout set to 30000.
		MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
		httpclient = new HttpClient(manager);
		httpclient.getHttpConnectionManager().getParams().setConnectionTimeout(30000);
	}

	/**
	 * Starts the worker threads that drain the URL queue.
	 *
	 * When the spider was constructed with an argument array, this method
	 * also waits for every worker to finish and prints a summary (number of
	 * URLs, KB read, elapsed seconds). Otherwise it returns as soon as the
	 * workers have been started.
	 *
	 * @throws Exception if the calling thread is interrupted while waiting
	 *         for a worker, or if printing fails
	 */
	public void go() throws Exception {
		// create the index directory -- or append to existing
		if (verbose) {
			print("Creating index in: " + indexDir);
			if (incremental)
				print("    - using incremental mode");
		}

		// 'threads' is also counted down by the workers themselves (see
		// dequeueURL()/run()). Two consequences are handled here:
		//  - once a crawl has finished the counter is <= 0, so a later call
		//    would start no worker at all; fall back to a single worker;
		//  - the counter can change while workers are being started, so the
		//    loop bound is a snapshot taken under the monitor.
		int workers;
		synchronized (this) {
			if (threads < 1)
				threads = 1;
			workers = threads;
		}

		// index each entry point URL
		long start = System.currentTimeMillis();
		for (int i = 0; i < workers; i++) {
			Thread t = new Thread(this, "Spindle Spider Thread #" + (i + 1));
			t.start();
			threadList.add(t);
		}

		if (mainargs != null) {
			// Command-line style run: block until every worker is done.
			while (threadList.size() > 0) {
				Thread child = (Thread) threadList.remove(0);
				child.join();
			}
			long elapsed = System.currentTimeMillis() - start;

			print("Indexed " + indexedURLs.size() + " URLs (" + (bytes / 1024)
					+ " KB) in " + (elapsed / 1000) + " seconds");
		}
	}

	/**
	 * Worker loop: takes URLs from the queue and indexes them until
	 * {@link #dequeueURL()} returns {@code null}.
	 *
	 * A failure while indexing one URL is logged and the worker moves on to
	 * the next URL, so a single bad page does not end the crawl. An
	 * interruption ends the loop and the thread's interrupt status is
	 * restored. On exit the input proxy is flushed and the worker counter is
	 * decremented under the monitor, waking any sibling blocked in
	 * {@link #dequeueURL()} so it can re-check whether the crawl is finished.
	 */
	public void run() {
		try {
			String url;
			while ((url = dequeueURL()) != null) {
				try {
					indexURL(url);
				} catch (Exception e) {
					if (e instanceof InterruptedException)
						throw e; // stop the worker; handled below
					// Keep crawling: one failing URL must not kill the worker.
					e.printStackTrace();
				}
			}
		} catch (InterruptedException e) {
			// Preserve the interrupt for whoever owns this thread.
			Thread.currentThread().interrupt();
			e.printStackTrace();
		} catch (Exception e) {
			e.printStackTrace();
		}
		inputproxy.flush();
		synchronized (this) {
			threads--;
			notifyAll();
		}
	}

	/**
	 * Removes and returns the URL at the head of the queue.
	 *
	 * If the queue is empty the caller is counted out of the worker total.
	 * While that total stays positive the caller waits on this object's
	 * monitor and counts itself back in when woken. Once the total is no
	 * longer positive, all waiters are notified and {@code null} is returned.
	 *
	 * @return the next URL, or {@code null} when there is nothing left to do
	 * @throws Exception if the wait is interrupted
	 */
	public synchronized String dequeueURL() throws Exception {
		for (;;) {
			if (!urls.isEmpty()) {
				return (String) urls.remove(0);
			}

			// Nothing queued: this worker is idle for now.
			threads--;
			if (threads <= 0) {
				// No worker left that could add more URLs.
				notifyAll();
				return null;
			}

			wait();
			threads++;
		}
	}

	/**
	 * Adds a URL to the queue unless it has been queued before, and wakes
	 * any workers waiting for work.
	 *
	 * @param url the URL to queue
	 */
	public synchronized void enqueueURL(String url) {
		// Already recorded: nothing to do.
		if (indexedURLs.get(url) != null) {
			return;
		}

		indexedURLs.put(url, Boolean.TRUE);
		urls.add(url);
		notifyAll();
	}

	/**
	 * Loads one URL, passes its title and description to the input proxy,
	 * and queues the links found on the page that pass the include/exclude
	 * filters.
	 *
	 * Loading is attempted up to three times while the result is
	 * {@code null}. A page with no summary or no body is skipped. The input
	 * proxy is flushed after every 100 successful inputs.
	 *
	 * @param url the URL to load and index
	 * @throws Exception anything raised while loading, parsing or storing the page
	 */
	private void indexURL(String url) throws Exception {
		if (verbose)
			print("  " + Thread.currentThread().getName() + ": Adding URL: "
					+ url);

		// A null summary is treated as a failed attempt; try a few times.
		final int maxAttempts = 3;
		URLSummary summary = null;
		for (int attempt = 0; summary == null && attempt < maxAttempts; attempt++) {
			summary = loadURL(url);
		}
		if (summary == null || summary.body == null)
			return;

		// Parse before storing: parseURLs also fills in fields of the summary.
		String[] links = parseURLs(summary);
		System.out.println(summary.toString());

		synchronized (this) {
			bytes += summary.body.length();
			if (inputproxy.inputData(summary.title, summary.desc, url)) {
				System.out.println(url + " input search engine sucess ...");
				// Count successful inputs so the periodic flush can fire;
				// without the increment the counter stayed at zero.
				nInputcount++;
				if (nInputcount % 100 == 0)
					inputproxy.flush();
			} else {
				System.out.println(url + " input search engine failed ...");
			}
		}

		if (links == null)
			return;

		for (int i = 0; i < links.length; i++) {
			String link = links[i];
			// A tag without a usable target yields no link to follow.
			if (link == null)
				continue;

			// check against the include/exclude list
			boolean add = true;
			for (int x = 0; add && x < include.size(); x++) {
				String inc = (String) include.get(x);
				add = (link.indexOf(inc) != -1);
			}
			for (int x = 0; add && x < exclude.size(); x++) {
				String ex = (String) exclude.get(x);
				add = (link.indexOf(ex) == -1);
			}

			if (add) {
				enqueueURL(link);
			}
		}
	}

	
	// Parse the content of the page.
	private String[] parseURLs(URLSummary summary) throws Exception {
		StringBuffer desc = new StringBuffer();
		ArrayList urls = new ArrayList();
		boolean isIgnoreText = false;
		HTMLTokenizer ht = new HTMLTokenizer(new StringReader(summary.body));
		for (Enumeration e = ht.getTokens(); e.hasMoreElements();) {
			Object obj = e.nextElement();
			if (obj instanceof TagToken) {
				TagToken tag = (TagToken) obj;
				String tagName = tag.getName().toLowerCase();
				//System.out.println("tag="+tag.toString()+"::"+tagName);
				String url = null;
				
				if(tagName.equals("meta") )
				{
					// 将keywords , description 加入到关键字中。 
					//System.out.println(tag.getAttribute("name")+"::"+ tag.getAttribute("content"));
					
					if(tag.getAttribute("name")!=null && "keywords".equalsIgnoreCase(tag.getAttribute("name")) && tag.getAttribute("content") !=null)
					{
						desc.append(tag.getAttribute("content"));
					}
					else if(tag.getAttribute("name")!=null && "description".equalsIgnoreCase(tag.getAttribute("name")) && tag.getAttribute("content") !=null)
					{
						desc.append(tag.getAttribute("content"));
					}
					
				}
				
				//过滤到script 里的内容， style
				
					if(tag.isEndTag()==false &&  tagName.equals("style"))
						isIgnoreText= true;
					else if(tag.isEndTag()==true &&  tagName.equals("style"))
					{
						isIgnoreText = false;
					}
					
				if(tag.isEndTag()==false &&  tagName.equals("script"))
					isIgnoreText= true;
				else if(tag.isEndTag()==true &&  tagName.equals("script"))
				{
					isIgnoreText = false;
				}
				else if (tagName.equals("a")) 
				{
					url = tag.getAttributes().get("href");
				} 
				else if (tagName.equals("frame")) 
				{
					url = tag.getAttributes().get("src");
				} 
				else if (tagName.equals("title") && e.hasMoreElements()
						&& !tag.isEndTag()) 
				{
					obj = e.nextElement();
					if (obj instanceof TextToken) {
						TextToken title = (TextToken) obj;
						summary.title = title.getText();
12 下一页
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -