📄 homepagefinder.java
字号:
/* Copyright (C) 2003 Univ. of Massachusetts Amherst, Computer Science Dept. This file is part of "MALLET" (MAchine Learning for LanguagE Toolkit). http://www.cs.umass.edu/~mccallum/mallet This software is provided under the terms of the Common Public License, version 1.0, as published by http://www.opensource.org. For further information, see the file `LICENSE' included with this distribution. *//** Generate queries and search for people's homepages on the web. * @author Ron Bekkerman <A HREF="mailto:ronb@cs.umass.edu">ronb@cs.umass.edu</A>*/package edu.umass.cs.mallet.projects.dex.web;import edu.umass.cs.mallet.projects.dex.types.*;import edu.umass.cs.mallet.base.util.MalletLogger;import java.util.logging.*;import java.util.*;import java.io.*;public class HomePageFinder { private static Logger logger = MalletLogger.getLogger(HomePageFinder.class.getName()); public final static int MIN_NUM_OCCURRENCES = 0; File outputDir; String userName; HashSet stopWords; public HomePageFinder (File outputDir, String userName, HashSet stopWords) { this.outputDir = outputDir; this.userName = userName; this.stopWords = stopWords; } public void findHomePagesFor (People people) { Vector threads = new Vector(); Iterator piter = people.iterator(); while (piter.hasNext()) { Person person = (Person)piter.next(); if(person.processedForWebPages) // Processed on previous stages continue; if(person.findLogin(userName)) // No need in looking for email directory user continue; Query q = new Query(MIN_NUM_OCCURRENCES); // Look for people who appear more than 0 times Vector queries = q.buildQueries(person); if(queries.size() == 0){ logger.info("No queries generated for "); person.printPersonalInfo(); continue; } String destDirName = generateOutDirName(outputDir.toString(), person); makeDir(destDirName); WebThread thread = new WebThread(queries, destDirName, person, stopWords); threads.addElement(thread); logger.fine(thread.getName() + " added; Threads size: " + threads.size()); logger.fine("Active count: " + Thread.activeCount()); person.processedForWebPages = true; if(Thread.activeCount() > 63) { logger.info("Too many threads in the system - wait"); waitForAllThreadsDead(threads); } logger.fine("Thread " + thread.getName() + " is about to start"); thread.start(); } waitForAllThreadsDead(threads); } private void waitForAllThreadsDead(Vector threads) { for (int i = 0; i < threads.size(); i++) { try{ WebThread thread = (WebThread) threads.elementAt(i); logger.fine (thread.getName() + ") Wait for thread to finish"); thread.join(); logger.fine (thread.getName() + ") Thread finished"); } catch (InterruptedException err){ logger.warning("Problem with interruptions"); } } } private void makeDir(String dir) { File outDir = new File(dir); try { if(outDir.exists() == false) outDir.mkdir(); } catch (SecurityException e) { logger.warning("No permission to make directory " + dir); } } public static String generateOutDirName(String outDir, Person person) { return outDir + File.separator + ((CountedString)person.names.elementAt(0)).str; } }
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -