google.java
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept.
   This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit).
   http://www.cs.umass.edu/~mccallum/mallet
   This software is provided under the terms of the Common Public License,
   version 1.0, as published by http://www.opensource.org. For further
   information, see the file `LICENSE' included with this distribution. */

/** Wrapper for Google API.
 *  @author Ron Bekkerman <A HREF="mailto:ronb@cs.umass.edu">ronb@cs.umass.edu</A>
 */
package edu.umass.cs.mallet.projects.dex.web;

import edu.umass.cs.mallet.projects.dex.types.*;
import edu.umass.cs.mallet.base.util.*;
import java.io.*;
import java.net.*;
import com.google.soap.search.*;
import java.util.regex.*;
import java.util.*;
import java.util.logging.*;

public class Google {

  private static Logger logger = MalletLogger.getLogger(Google.class.getName());

  public final static int MAX_URL_COUNTER = 30;

  public Google(String threadName, String outDirName, Person person, HashSet stopWords) {
    this.threadName = threadName;
    this.outDirName = outDirName;
    this.person = person;
    this.stopWords = stopWords;
    search = new GoogleSearch();
    urls = new HashMap(100);
    // Pool of Google SOAP API keys; changeKey() rotates through them when a search fails.
    keys = new Vector();
    keys.add("OAyQ4ahQFHKmk5c7LPR3Y6d4zaqs8OQw"); // ronb@cs.umass.edu
    keys.add("e0IL+KxQFHKEJDnNJUxed7dsmZT++bgg"); // ronb@cs.technion.ac.il
    keys.add("lLEx5MRQFHLdDIP41weVfsmwM+Gr6hcm"); // my hotmail
    keys.add("rQek+5JQFHJ+LWofOR0rc4iKWDasWBq8"); // culotta
    keys.add("LzOQzYBQFHIgtMEE9bdan57EINhcJosr"); // wellner
    keys.add("7YG4Si5QFHJ3LNChfpXzb6hHh7YBMuyP"); // mccallum
    keys.add("7KnKT6BQFHLKVp66Bb/2hlv9FFGsccrt"); // hough
    keys.add("9cQWFoBQFHIi4M2xoOeOD2WR4JuLHple"); // ghamrawi
    keys.add("o3ec5WtQFHLwR3QTRPkcznLgUzGy0KTF"); // ghuang
    keys.add("sP0y7O5QFHKdK1B1flUxQc59mRnexRfD"); // fuchun
    keys.add("WY9bFKJQFHKElRtRPWc3VapOJFRzxiC5"); // ronb@ciirsrv
    keys.add("t5M/rqZQFHL44q1mFaNPoyu2IuJrUmWq"); // ronb@dandenong
    keys.add("t+w15CBQFHJXho9EI5LtNf8bm0sYh7Za"); // ronb@loki
    keys.add("5irs6jVQFHI+UWHUe61bz48NmXG0GzDU"); // csanna
    keys.add("BU5CqHtQFHJ3Rd+iBb4O4uzyR3AC7Gjd"); // casutton
    keys.add("TwE5W55QFHK3nkRSVf24fyv/3YqkFNIn"); // Anna's hotmail
    keys.add("iIu6WmVQFHJOXMn6RLBgfXsxqUZvbwBQ"); // culotta@ciirsrv
    keys.add("tEAulR5QFHK+hVHrJhplzpQjDIYC0I7C"); // culotta@dandenong
    keys.add("0XOMkL9QFHJeiq2E3YYNWjlxMugtAOrH"); // culotta@loki
    keys.add("POyGUClQFHLr8gIR7wme0RdFZj1IqVYz"); // ronb@tx
    currentKeyIndex = 0;
    String key = (String) keys.get(currentKeyIndex);
    search.setKey(key);
  }

  /** Returns true if the URL ends with a non-HTML (binary or script) extension. */
  public static boolean binaryFile(String url) {
    Pattern pat = Pattern.compile("\\.(pdf|ps|doc|ppt|gz|zip|avi|gif|jpg|jpeg|mpg|mpeg|cfm|php)$",
                                  Pattern.CASE_INSENSITIVE);
    Matcher mat = pat.matcher(url);
    if (mat.find())
      return true;
    return false;
  }

  /** Returns true if the query contains a " site:" domain restriction. */
  public static boolean isDomainInQuery(String query) {
    Pattern pat = Pattern.compile(" site\\:.+");
    Matcher mat = pat.matcher(query);
    return mat.find();
  }

  /** Returns the directory part of a URL, always ending with a slash. */
  public static String getBase(String url) {
    // If ends with slash, the url is the base
    Pattern pat = Pattern.compile("\\/$");
    Matcher mat = pat.matcher(url);
    if (mat.find())
      return url;
    // if ends with file name, everything before the file name is the base
    pat = Pattern.compile("^(.*\\/)[^\\/]+\\.[^\\/]+$");
    mat = pat.matcher(url);
    if (mat.matches()) {
      return mat.group(1);
    }
    return url + "/";
  }

  public String getFileName(int num) {
    Integer intValue = new Integer(num);
    return outDirName + File.separator + intValue.toString() + ".html";
  }

  /** Extracts href targets from a line of HTML and queues unseen, same-base,
   *  non-binary links in the urls map (marked false, i.e. not yet retrieved). */
  public void extractURLs(String line, String base) {
    Pattern pat = Pattern.compile("href\\=\\\"([^\\\"]+)\\\"");
    Matcher mat = pat.matcher(line);
    while (mat.find()) {
      String url = mat.group();
      pat = Pattern.compile("\\\"([^\\\"]+)\\\"");
      Matcher mat1 = pat.matcher(url);
      if (mat1.find())
        url = mat1.group(1);
      pat = Pattern.compile("\\#"); // sharps
      mat1 = pat.matcher(url);
      if (mat1.find()) continue;
      if (binaryFile(url)) continue; // binary files
      if (url.regionMatches(0, base, 0, base.length()) == false) {
        pat = Pattern.compile("\\:"); // http: ftp: mailto:
        mat1 = pat.matcher(url);
        if (mat1.find()) continue;
        pat = Pattern.compile("^(\\/|\\.\\.)"); // If URL starts with slash or ..
        mat1 = pat.matcher(url);
        if (mat1.find()) continue;
        pat = Pattern.compile("[^\\w\\-\\/\\.]"); // not proper file name
        mat1 = pat.matcher(url);
        if (mat1.find()) continue;
        pat = Pattern.compile("^\\.\\/?$"); // only current file
        mat1 = pat.matcher(url);
        if (mat1.matches()) continue;
        pat = Pattern.compile("^\\.\\/"); // remove ./ from the beginning
        mat1 = pat.matcher(url);
        url = mat1.replaceAll("");
        url = base + url;
      }
      // if no such url in urls and list of urls is not too long
      if (urls.get(url) == null && urls.size() < MAX_URL_COUNTER) {
        urls.put(url, new Boolean(false));
        logger.fine(threadName + ") Found URL: " + url + " (false)");
      }
    }
  }

  /** Gets words from a line of HTML, filtered by a stop list.
   *  @param line line of HTML
   *  @return list of words
   */
  public ArrayList getWordsFromLine(String line) {
    Pattern pat = Pattern.compile("\\W+");
    ArrayList stoppedWords = new ArrayList();
    String[] words = pat.split(line);
    for (int i = 0; i < words.length; i++) {
      String word = words[i].toLowerCase();
      if (!(word.equals("") || stopWords.contains(word)))
        stoppedWords.add(word);
    }
    return stoppedWords;
  }

  /** Adds the words of every stored context page to the person's context model. */
  public void buildContextModelForPerson() {
    if (person.contextModel.numLocations() > 0)
      return; // We already have context model from email
    if (person.contextPages.size() == 0) {
      logger.warning("Cannot build context model for ");
      person.printPersonalInfo();
      System.exit(1);
    }
    for (int i = 0; i < person.contextPages.size(); i++) {
      String fileName = ((WebPage) person.contextPages.elementAt(i)).fileName;
      try {
        BufferedReader in = new BufferedReader(new FileReader(new File(fileName)));
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
          String[] words = (String[]) getWordsFromLine(inputLine).toArray(new String[]{});
          person.addWordsToContextModel(words);
        }
        in.close();
      } catch (FileNotFoundException e) {
        System.err.println(threadName + ") Cannot open input file " + fileName);
      } catch (IOException e) {
        System.err.println(threadName + ") Cannot close input file " + fileName);
      }
    }
  }

  /** Opens the URL with a read timeout and returns a reader over its content,
   *  or null if the page is too large or the connection fails. */
  public BufferedReader establishConnection(String stringURL) {
    try {
      HttpTimeoutHandler xHTH = new HttpTimeoutHandler(7000); // in milliseconds
      URL url = new URL((URL) null, stringURL, xHTH);
      //URL url = new URL(stringURL);
      URLConnection connection = url.openConnection();
      logger.fine(threadName + ") Connection opened");
      connection.connect();
      int contentLength = connection.getContentLength();
      if (contentLength > 100000) {
        logger.fine(threadName + ") Too long file - connection stopped");
        return null;
      }
      //System.out.println("Content Length of " + stringURL + " is " + contentLength);
      return new BufferedReader(new InputStreamReader(connection.getInputStream()));
    } catch (MalformedURLException u) {
      logger.warning(threadName + ") Malformed URL: " + stringURL);
    } catch (IOException e) {
      logger.warning(threadName + ") Failed to establish connection to " + stringURL + " error: " + e);
    }
    return null;
  }

  /** Downloads a page to outFileName, collects its words, and queues the links it contains. */
  public boolean retrievePage(String url, String outFileName, ArrayList words) {
    int MAX_NUMBER_OF_LINES = 5000;
    String base = getBase(url);
    logger.fine(threadName + ") Retrieving page: " + url);
    try {
      String inputLine;
      BufferedReader in = establishConnection(url);
      if (in == null)
        return false;
      BufferedWriter out = new BufferedWriter(new FileWriter(new File(outFileName)));
      int numberOfLines = 0;
      while ((inputLine = in.readLine()) != null) {
        numberOfLines++;
        if (numberOfLines == MAX_NUMBER_OF_LINES) {
          logger.fine(threadName + ") Too long file - stop reading");
          break;
        }
        extractURLs(inputLine, base);
        words.addAll(getWordsFromLine(inputLine));
        out.write(inputLine);
        out.newLine();
      }
      out.close();
      in.close();
      logger.fine(threadName + ") Content is written");
      person.addPage(new WebPage(url, outFileName));
      return true;
    } catch (FileNotFoundException e) {
      logger.warning(threadName + ") Cannot open output file " + outFileName);
    } catch (IOException e) {
      logger.warning(threadName + ") Cannot close output file " + outFileName);
    }
    return false;
  }

  /** Keeps retrieving queued URLs until none are left unvisited; returns the next file number. */
  public int processUnseenURLs(int fileNumber, ArrayList words) {
    boolean allProcessedURLs = false;
    while (allProcessedURLs == false) {
      allProcessedURLs = true;
      Object[] keys = urls.keySet().toArray();
      for (int i = 0; i < keys.length; i++) {
        String url = (String) keys[i];
        Boolean seen = (Boolean) urls.get(url);
        if (seen.booleanValue() == false) {
          allProcessedURLs = false;
          urls.put(url, new Boolean(true));
          String outFileName = getFileName(fileNumber);
          if (retrievePage(url, outFileName, words))
            fileNumber++;
        }
      }
    }
    return fileNumber;
  }

  /** Rotates to the next API key in the pool. */
  public void changeKey() {
    currentKeyIndex++;
    if (currentKeyIndex == keys.size()) {
      currentKeyIndex = 0;
    }
    logger.fine(threadName + ") Key changed");
    String key = (String) keys.get(currentKeyIndex);
    search.setKey(key);
  }

  /** Runs the currently configured search; on failure, logs the query and rotates the key. */
  public GoogleSearchResult doSearch(String query) {
    GoogleSearchResult r = null;
    try {
      r = search.doSearch();
      return r;
    } catch (GoogleSearchFault fault) {
      logger.warning(threadName + ") Search failed on query: " + query);
      changeKey();
    }
    return r;
  }

  /** Deletes the saved pages numbered beginIndex (inclusive) to endIndex (exclusive). */
  public void removeFiles(int beginIndex, int endIndex) {
    for (int i = beginIndex; i < endIndex; i++) {
      String outFileName = getFileName(i);
      File file = new File(outFileName);
      file.delete();
      person.removePage(outFileName);
    }
  }

  /** Runs a Google query, crawls results related to the person, and keeps a result's
   *  page set only if the query was site-restricted or the pages' word vector is close
   *  enough to the person's context model. */
  public boolean responseToQuery(String query) {
    String url = "";
    boolean returnValue = false;
    search.setQueryString(query);
    GoogleSearchResult r = doSearch(query);
    if (r == null)
      return false;
    GoogleSearchResultElement[] results = r.getResultElements();
    if (results.length == 0)
      return false;
    int fileNumber = 1;
    int oldFileNumber = fileNumber;
    if (isDomainInQuery(query) == false)
      buildContextModelForPerson();
    for (int i = 0; i < results.length; i++) {
      url = results[i].getURL();
      logger.fine(threadName + ") Found URL: " + url + " (true)");
      if (binaryFile(url)) { // URL is not an ASCII file
        logger.fine(threadName + ") URL " + url + " is binary");
        continue;
      }
      if (person.isLoginOrNameInURL(url, threadName) == false) {
        // URL is not related to person
        logger.fine(threadName + ") URL " + url + " is not related to person");
        continue;
      }
      logger.info(threadName + ") Found URL: " + url + " (true)");
      oldFileNumber = fileNumber;
      urls.put(url, new Boolean(true));
      String outFileName = getFileName(fileNumber);
      ArrayList wordsFromWeb = new ArrayList();
      if (retrievePage(url, outFileName, wordsFromWeb)) {
        fileNumber++;
        fileNumber = processUnseenURLs(fileNumber, wordsFromWeb);
        if (isDomainInQuery(query)) {
          returnValue = true;
          //continue;
          break;
        }
        double cosine = person.calculateCosineWithContextModel(
            (String[]) wordsFromWeb.toArray(new String[]{}));
        if (cosine > 0.008) {
        //if (cosine > -0.1) {
          returnValue = true;
          //continue;
          break;
        }
        logger.info(threadName + ") URL: " + url + " has cosine " + cosine
                    + " with true model...removing fileno " + oldFileNumber
                    + " to " + fileNumber + " for " + person.getFirstName());
        removeFiles(oldFileNumber, fileNumber);
        fileNumber = oldFileNumber;
      }
    }
    //person.removeAllWords();
    return returnValue;
  }

  // Fields
  String threadName;
  String outDirName;
  Person person;
  HashSet stopWords;
  GoogleSearch search;
  HashMap urls;
  Vector keys;
  int currentKeyIndex;
}
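
A minimal sketch of how this wrapper might be driven. The Person instance and stop-word set are assumed to come from the surrounding dex pipeline; the class name GoogleDemo, the thread name, the output directory, and the query below are illustrative, not part of the original project. Note that the Google SOAP search API this class targets has since been retired, so the sketch is for reading the code rather than running it against Google today.

package edu.umass.cs.mallet.projects.dex.web;

import edu.umass.cs.mallet.projects.dex.types.*;
import java.util.HashSet;

public class GoogleDemo {
  // Hypothetical driver: person and stopWords would be produced elsewhere in
  // the dex project; the paths and query here are placeholders.
  public static void crawlFor(Person person, HashSet stopWords) {
    Google google = new Google("thread-1", "out", person, stopWords);
    // Issues the query, crawls the result pages plus same-base links, and keeps
    // only page sets whose word vectors are close to the person's context model.
    boolean found = google.responseToQuery("\"Ron Bekkerman\" site:cs.umass.edu");
    System.out.println("Relevant pages kept: " + found);
  }
}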