google.java
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org. For further
   information, see the file `LICENSE' included with this distribution. */

/** Wrapper for Google API.
 *  @author Ron Bekkerman <A HREF="mailto:ronb@cs.umass.edu">ronb@cs.umass.edu</A>
 */
package edu.umass.cs.mallet.projects.dex.web;

import edu.umass.cs.mallet.projects.dex.types.*;
import edu.umass.cs.mallet.base.util.*;
import java.io.*;
import java.net.*;
import com.google.soap.search.*;
import java.util.regex.*;
import java.util.*;
import java.util.logging.*;

public class Google {

  private static Logger logger = MalletLogger.getLogger(Google.class.getName());

  public final static int MAX_URL_COUNTER = 30;

  public Google(String threadName, String outDirName, Person person, HashSet stopWords) {
    this.threadName = threadName;
    this.outDirName = outDirName;
    this.person = person;
    this.stopWords = stopWords;
    search = new GoogleSearch();
    urls = new HashMap(100);
    // Pool of Google SOAP API keys; changeKey() rotates through them when a search fails.
    keys = new Vector();
    keys.add("OAyQ4ahQFHKmk5c7LPR3Y6d4zaqs8OQw"); // ronb@cs.umass.edu
    keys.add("e0IL+KxQFHKEJDnNJUxed7dsmZT++bgg"); // ronb@cs.technion.ac.il
    keys.add("lLEx5MRQFHLdDIP41weVfsmwM+Gr6hcm"); // my hotmail
    keys.add("rQek+5JQFHJ+LWofOR0rc4iKWDasWBq8"); // culotta
    keys.add("LzOQzYBQFHIgtMEE9bdan57EINhcJosr"); // wellner
    keys.add("7YG4Si5QFHJ3LNChfpXzb6hHh7YBMuyP"); // mccallum
    keys.add("7KnKT6BQFHLKVp66Bb/2hlv9FFGsccrt"); // hough
    keys.add("9cQWFoBQFHIi4M2xoOeOD2WR4JuLHple"); // ghamrawi
    keys.add("o3ec5WtQFHLwR3QTRPkcznLgUzGy0KTF"); // ghuang
    keys.add("sP0y7O5QFHKdK1B1flUxQc59mRnexRfD"); // fuchun
    keys.add("WY9bFKJQFHKElRtRPWc3VapOJFRzxiC5"); // ronb@ciirsrv
    keys.add("t5M/rqZQFHL44q1mFaNPoyu2IuJrUmWq"); // ronb@dandenong
    keys.add("t+w15CBQFHJXho9EI5LtNf8bm0sYh7Za"); // ronb@loki
    keys.add("5irs6jVQFHI+UWHUe61bz48NmXG0GzDU"); // csanna
    keys.add("BU5CqHtQFHJ3Rd+iBb4O4uzyR3AC7Gjd"); // casutton
    keys.add("TwE5W55QFHK3nkRSVf24fyv/3YqkFNIn"); // Anna's hotmail
    keys.add("iIu6WmVQFHJOXMn6RLBgfXsxqUZvbwBQ"); // culotta@ciirsrv
    keys.add("tEAulR5QFHK+hVHrJhplzpQjDIYC0I7C"); // culotta@dandenong
    keys.add("0XOMkL9QFHJeiq2E3YYNWjlxMugtAOrH"); // culotta@loki
    keys.add("POyGUClQFHLr8gIR7wme0RdFZj1IqVYz"); // ronb@tx
    currentKeyIndex = 0;
    String key = (String) keys.get(currentKeyIndex);
    search.setKey(key);
  }

  /** Returns true if the URL ends with a non-HTML (binary or script) extension. */
  public static boolean binaryFile(String url) {
    Pattern pat = Pattern.compile("\\.(pdf|ps|doc|ppt|gz|zip|avi|gif|jpg|jpeg|mpg|mpeg|cfm|php)$",
                                  Pattern.CASE_INSENSITIVE);
    Matcher mat = pat.matcher(url);
    if (mat.find())
      return true;
    return false;
  }

  /** Returns true if the query contains a " site:" domain restriction. */
  public static boolean isDomainInQuery(String query) {
    Pattern pat = Pattern.compile(" site\\:.+");
    Matcher mat = pat.matcher(query);
    return mat.find();
  }

  /** Returns the directory part of a URL, always ending with a slash. */
  public static String getBase(String url) {
    // If ends with slash, the url is the base
    Pattern pat = Pattern.compile("\\/$");
    Matcher mat = pat.matcher(url);
    if (mat.find())
      return url;
    // if ends with file name, everything before the file name is the base
    pat = Pattern.compile("^(.*\\/)[^\\/]+\\.[^\\/]+$");
    mat = pat.matcher(url);
    if (mat.matches()) {
      return mat.group(1);
    }
    return url + "/";
  }

  public String getFileName(int num) {
    Integer intValue = new Integer(num);
    return outDirName + File.separator + intValue.toString() + ".html";
  }

  /** Extracts href targets from a line of HTML and queues unseen, same-base,
   *  non-binary links in the urls map (marked false, i.e. not yet retrieved). */
  public void extractURLs(String line, String base) {
    Pattern pat = Pattern.compile("href\\=\\\"([^\\\"]+)\\\"");
    Matcher mat = pat.matcher(line);
    while (mat.find()) {
      String url = mat.group();
      pat = Pattern.compile("\\\"([^\\\"]+)\\\"");
      Matcher mat1 = pat.matcher(url);
      if (mat1.find())
        url = mat1.group(1);
      pat = Pattern.compile("\\#"); // sharps
      mat1 = pat.matcher(url);
      if (mat1.find()) continue;
      if (binaryFile(url)) continue; // binary files
      if (url.regionMatches(0, base, 0, base.length()) == false) {
        pat = Pattern.compile("\\:"); // http: ftp: mailto:
        mat1 = pat.matcher(url);
        if (mat1.find()) continue;
        pat = Pattern.compile("^(\\/|\\.\\.)"); // If URL starts with slash or ..
        mat1 = pat.matcher(url);
        if (mat1.find()) continue;
        pat = Pattern.compile("[^\\w\\-\\/\\.]"); // not proper file name
        mat1 = pat.matcher(url);
        if (mat1.find()) continue;
        pat = Pattern.compile("^\\.\\/?$"); // only current file
        mat1 = pat.matcher(url);
        if (mat1.matches()) continue;
        pat = Pattern.compile("^\\.\\/"); // remove ./ from the beginning
        mat1 = pat.matcher(url);
        url = mat1.replaceAll("");
        url = base + url;
      }
      // if no such url in urls and list of urls is not too long
      if (urls.get(url) == null && urls.size() < MAX_URL_COUNTER) {
        urls.put(url, new Boolean(false));
        logger.fine(threadName + ") Found URL: " + url + " (false)");
      }
    }
  }

  /** Gets words from a line of HTML, filtered by a stop list.
   *  @param line line of HTML
   *  @return list of words
   */
  public ArrayList getWordsFromLine(String line) {
    Pattern pat = Pattern.compile("\\W+");
    ArrayList stoppedWords = new ArrayList();
    String[] words = pat.split(line);
    for (int i = 0; i < words.length; i++) {
      String word = words[i].toLowerCase();
      if (!(word.equals("") || stopWords.contains(word)))
        stoppedWords.add(word);
    }
    return stoppedWords;
  }

  /** Adds the words of every stored context page to the person's context model. */
  public void buildContextModelForPerson() {
    if (person.contextModel.numLocations() > 0)
      return; // We already have context model from email
    if (person.contextPages.size() == 0) {
      logger.warning("Cannot build context model for ");
      person.printPersonalInfo();
      System.exit(1);
    }
    for (int i = 0; i < person.contextPages.size(); i++) {
      String fileName = ((WebPage) person.contextPages.elementAt(i)).fileName;
      try {
        BufferedReader in = new BufferedReader(new FileReader(new File(fileName)));
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
          String[] words = (String[]) getWordsFromLine(inputLine).toArray(new String[]{});
          person.addWordsToContextModel(words);
        }
        in.close();
      } catch (FileNotFoundException e) {
        System.err.println(threadName + ") Cannot open input file " + fileName);
      } catch (IOException e) {
        System.err.println(threadName + ") Cannot close input file " + fileName);
      }
    }
  }

  /** Opens the URL with a read timeout and returns a reader over its content,
   *  or null if the page is too large or the connection fails. */
  public BufferedReader establishConnection(String stringURL) {
    try {
      HttpTimeoutHandler xHTH = new HttpTimeoutHandler(7000); // in milliseconds
      URL url = new URL((URL) null, stringURL, xHTH);
      //URL url = new URL(stringURL);
      URLConnection connection = url.openConnection();
      logger.fine(threadName + ") Connection opened");
      connection.connect();
      int contentLength = connection.getContentLength();
      if (contentLength > 100000) {
        logger.fine(threadName + ") Too long file - connection stopped");
        return null;
      }
      //System.out.println("Content Length of " + stringURL + " is " + contentLength);
      return new BufferedReader(new InputStreamReader(connection.getInputStream()));
    } catch (MalformedURLException u) {
      logger.warning(threadName + ") Malformed URL: " + stringURL);
    } catch (IOException e) {
      logger.warning(threadName + ") Failed to establish connection to " + stringURL + " error: " + e);
    }
    return null;
  }

  /** Downloads a page to outFileName, collects its words, and queues the links it contains. */
  public boolean retrievePage(String url, String outFileName, ArrayList words) {
    int MAX_NUMBER_OF_LINES = 5000;
    String base = getBase(url);
    logger.fine(threadName + ") Retrieving page: " + url);
    try {
      String inputLine;
      BufferedReader in = establishConnection(url);
      if (in == null)
        return false;
      BufferedWriter out = new BufferedWriter(new FileWriter(new File(outFileName)));
      int numberOfLines = 0;
      while ((inputLine = in.readLine()) != null) {
        numberOfLines++;
        if (numberOfLines == MAX_NUMBER_OF_LINES) {
          logger.fine(threadName + ") Too long file - stop reading");
          break;
        }
        extractURLs(inputLine, base);
        words.addAll(getWordsFromLine(inputLine));
        out.write(inputLine);
        out.newLine();
      }
      out.close();
      in.close();
      logger.fine(threadName + ") Content is written");
      person.addPage(new WebPage(url, outFileName));
      return true;
    } catch (FileNotFoundException e) {
      logger.warning(threadName + ") Cannot open output file " + outFileName);
    } catch (IOException e) {
      logger.warning(threadName + ") Cannot close output file " + outFileName);
    }
    return false;
  }

  /** Keeps retrieving queued URLs until none are left unvisited; returns the next file number. */
  public int processUnseenURLs(int fileNumber, ArrayList words) {
    boolean allProcessedURLs = false;
    while (allProcessedURLs == false) {
      allProcessedURLs = true;
      Object[] keys = urls.keySet().toArray();
      for (int i = 0; i < keys.length; i++) {
        String url = (String) keys[i];
        Boolean seen = (Boolean) urls.get(url);
        if (seen.booleanValue() == false) {
          allProcessedURLs = false;
          urls.put(url, new Boolean(true));
          String outFileName = getFileName(fileNumber);
          if (retrievePage(url, outFileName, words))
            fileNumber++;
        }
      }
    }
    return fileNumber;
  }

  /** Rotates to the next API key in the pool. */
  public void changeKey() {
    currentKeyIndex++;
    if (currentKeyIndex == keys.size()) {
      currentKeyIndex = 0;
    }
    logger.fine(threadName + ") Key changed");
    String key = (String) keys.get(currentKeyIndex);
    search.setKey(key);
  }

  /** Runs the currently configured search; on failure, logs the query and rotates the key. */
  public GoogleSearchResult doSearch(String query) {
    GoogleSearchResult r = null;
    try {
      r = search.doSearch();
      return r;
    } catch (GoogleSearchFault fault) {
      logger.warning(threadName + ") Search failed on query: " + query);
      changeKey();
    }
    return r;
  }

  /** Deletes the saved pages numbered beginIndex (inclusive) to endIndex (exclusive). */
  public void removeFiles(int beginIndex, int endIndex) {
    for (int i = beginIndex; i < endIndex; i++) {
      String outFileName = getFileName(i);
      File file = new File(outFileName);
      file.delete();
      person.removePage(outFileName);
    }
  }

  /** Runs a Google query, crawls results related to the person, and keeps a result's
   *  page set only if the query was site-restricted or the pages' word vector is close
   *  enough to the person's context model. */
  public boolean responseToQuery(String query) {
    String url = "";
    boolean returnValue = false;
    search.setQueryString(query);
    GoogleSearchResult r = doSearch(query);
    if (r == null)
      return false;
    GoogleSearchResultElement[] results = r.getResultElements();
    if (results.length == 0)
      return false;
    int fileNumber = 1;
    int oldFileNumber = fileNumber;
    if (isDomainInQuery(query) == false)
      buildContextModelForPerson();
    for (int i = 0; i < results.length; i++) {
      url = results[i].getURL();
      logger.fine(threadName + ") Found URL: " + url + " (true)");
      if (binaryFile(url)) { // URL is not an ASCII file
        logger.fine(threadName + ") URL " + url + " is binary");
        continue;
      }
      if (person.isLoginOrNameInURL(url, threadName) == false) {
        // URL is not related to person
        logger.fine(threadName + ") URL " + url + " is not related to person");
        continue;
      }
      logger.info(threadName + ") Found URL: " + url + " (true)");
      oldFileNumber = fileNumber;
      urls.put(url, new Boolean(true));
      String outFileName = getFileName(fileNumber);
      ArrayList wordsFromWeb = new ArrayList();
      if (retrievePage(url, outFileName, wordsFromWeb)) {
        fileNumber++;
        fileNumber = processUnseenURLs(fileNumber, wordsFromWeb);
        if (isDomainInQuery(query)) {
          returnValue = true;
          //continue;
          break;
        }
        double cosine = person.calculateCosineWithContextModel(
            (String[]) wordsFromWeb.toArray(new String[]{}));
        if (cosine > 0.008) {
        //if (cosine > -0.1) {
          returnValue = true;
          //continue;
          break;
        }
        logger.info(threadName + ") URL: " + url + " has cosine " + cosine
                    + " with true model...removing fileno " + oldFileNumber
                    + " to " + fileNumber + " for " + person.getFirstName());
        removeFiles(oldFileNumber, fileNumber);
        fileNumber = oldFileNumber;
      }
    }
    //person.removeAllWords();
    return returnValue;
  }

  // Fields
  String threadName;
  String outDirName;
  Person person;
  HashSet stopWords;
  GoogleSearch search;
  HashMap urls;
  Vector keys;
  int currentKeyIndex;
}
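
A minimal sketch of how this wrapper might be driven. The Person instance and stop-word set are assumed to come from the surrounding dex pipeline; the class name GoogleDemo, the thread name, the output directory, and the query below are illustrative, not part of the original project. Note that the Google SOAP search API this class targets has since been retired, so the sketch is for reading the code rather than running it against Google today.

package edu.umass.cs.mallet.projects.dex.web;

import edu.umass.cs.mallet.projects.dex.types.*;
import java.util.HashSet;

public class GoogleDemo {
  // Hypothetical driver: person and stopWords would be produced elsewhere in
  // the dex project; the paths and query here are placeholders.
  public static void crawlFor(Person person, HashSet stopWords) {
    Google google = new Google("thread-1", "out", person, stopWords);
    // Issues the query, crawls the result pages plus same-base links, and keeps
    // only page sets whose word vectors are close to the person's context model.
    boolean found = google.responseToQuery("\"Ron Bekkerman\" site:cs.umass.edu");
    System.out.println("Relevant pages kept: " + found);
  }
}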