📄 webdbinjector.java
字号:
* From time to time the Parser will set the "current location"
* by calling this function. It's useful for emitting locations
* for error messages.
*/
public void setDocumentLocator(Locator locator) {
// Keep the parser-supplied locator; fatalError() reads it to report the
// last known line and column. It stays unset if this is never called.
location = locator;
}
//
// Interface ErrorHandler
//
/**
 * Reports a recoverable parse error: the exception's string form and its
 * message go to LOG.severe, and the stack trace is written to System.out.
 *
 * @param spe the parse error reported by the SAX parser
 */
public void error(SAXParseException spe) {
  final String description = spe.toString();
  final String detail = spe.getMessage();
  LOG.severe("Error: " + description + ": " + detail);
  spe.printStackTrace(System.out);
}
/**
 * Reports a fatal parse error together with the best available position.
 *
 * The exception is logged through LOG.severe, followed by a line/column
 * pair, and the stack trace is written to System.out. The position comes
 * from the locator stored by setDocumentLocator when one was supplied;
 * otherwise it falls back to the position carried by the exception itself
 * (which reports -1 for values it does not know).
 *
 * @param spe the fatal parse error reported by the SAX parser
 */
public void fatalError(SAXParseException spe) {
  LOG.severe("Fatal error: " + spe.toString() + ": " + spe.getMessage());

  // A SAX parser is not obliged to hand over a Locator. Dereferencing a
  // missing one here would throw from inside the error handler and hide
  // the real parse failure, so fall back to the exception's own position.
  final int line;
  final int column;
  if (location != null) {
    line = location.getLineNumber();
    column = location.getColumnNumber();
  } else {
    line = spe.getLineNumber();
    column = spe.getColumnNumber();
  }
  LOG.severe("Last known line is " + line + ", column " + column);

  spe.printStackTrace(System.out);
}
/**
 * Reports a parser warning: the exception's string form and its message
 * go to LOG.warning, and the stack trace is written to System.out.
 *
 * @param spe the warning reported by the SAX parser
 */
public void warning(SAXParseException spe) {
  final StringBuffer text = new StringBuffer("Warning: ");
  text.append(spe.toString());
  text.append(": ");
  text.append(spe.getMessage());
  LOG.warning(text.toString());
  spe.printStackTrace(System.out);
}
}
// Writer that receives every injected page (see addPage); close() closes it.
private IWebDBWriter dbWriter;
/**
 * Creates an injector that adds pages to the given writer.
 *
 * @param dbWriter the writer pages are added to; it is closed when
 *                 close() is called on this injector
 */
public WebDBInjector(IWebDBWriter dbWriter) {
this.dbWriter = dbWriter;
}
/**
 * Closes the underlying writer. Per the original note on this method,
 * closing the writer is what saves the changes.
 *
 * @throws IOException if the writer fails to close
 */
public void close() throws IOException {
  this.dbWriter.close();
}
/**
 * Prints a small progress indicator based on the current page count.
 *
 * A dot is written to System.out whenever the page count is a multiple
 * of {@code small}; printStatus() is invoked whenever it is a multiple
 * of {@code big}.
 *
 * @param small page interval between dots
 * @param big   page interval between full status lines
 */
public void printStatusBar(int small, int big) {
  final boolean atDotInterval = (this.pages % small) == 0;
  if (atDotInterval) {
    System.out.print(".");
  }

  final boolean atReportInterval = (this.pages % big) == 0;
  if (atReportInterval) {
    printStatus();
  }
}
// Wall-clock time in milliseconds captured when this field is initialized;
// printStatus() measures elapsed time from it.
long startTime = System.currentTimeMillis();
// Running count of pages added; injectURLFile increments it for each URL
// that addPage accepts.
long pages = 0;
// Timestamp passed to every new Page in addPage(). The inject methods
// overwrite it with the last-modified time of their input file.
long nextFetch = System.currentTimeMillis();
/**
 * Logs the number of pages counted so far and the average throughput
 * (pages per second) since startTime. Nothing is logged while the page
 * count is still zero.
 */
public void printStatus() {
  if (this.pages == 0) {
    return;
  }

  // Clamp to at least one millisecond. An elapsed time of zero (a call in
  // the same millisecond as startTime, or a clock that stepped backwards)
  // would otherwise make the integer division below throw
  // ArithmeticException.
  final long elapsed = Math.max(1L, System.currentTimeMillis() - this.startTime);

  LOG.info("\t" + this.pages + "\t" +
      (int) ((1000 * pages) / elapsed) + " pages/second\t");
}
/**
 * Reads a flat text file with one URL per line and adds each URL to the db.
 *
 * Every line is trimmed and handed to addPage. For each URL that addPage
 * accepts, the page counter is incremented and the progress bar is
 * updated. The file's last-modified time becomes the nextFetch value used
 * for the pages created from it.
 *
 * Failures while reading or adding are logged and do not propagate; only
 * a failure to open the file, or to close the reader, reaches the caller.
 *
 * @param urlList the text file to read URLs from
 * @throws IOException if the file cannot be opened or the reader cannot
 *                     be closed
 */
public void injectURLFile(File urlList) throws IOException {
  nextFetch = urlList.lastModified();
  // NOTE(review): FileReader decodes with the platform default charset,
  // while the other readers in this class use UTF-8 - confirm whether
  // that difference is intended.
  BufferedReader reader = new BufferedReader(new FileReader(urlList));
  try {
    String curStr = null;
    LOG.info("Starting URL processing");
    while ((curStr = reader.readLine()) != null) {
      String url = curStr.trim();
      if (addPage(url)) {
        this.pages++;
        // Report progress only when the count actually moved. Doing it
        // for every line re-printed a dot (or the full status line) for
        // each rejected URL while the count sat on an interval boundary,
        // including the initial count of zero.
        printStatusBar(2000, 50000);
      }
    }
    LOG.info("Added " + pages + " pages");
  } catch (Exception e) {
    // Best effort: the failure is logged and the injection stops here.
    LOG.severe("error while injecting:" + e);
    e.printStackTrace();
  } finally {
    reader.close();
  }
}
/**
 * Parses a structured DMOZ file with a SAX parser, using an RDFProcessor
 * as both content handler and error handler.
 *
 * The file's last-modified time becomes the nextFetch value. The input is
 * decoded as UTF-8 and passed through an XMLCharFilter before it reaches
 * the parser. Any exception raised while parsing is logged, its stack
 * trace is written to System.out, and the JVM exits with status 0.
 *
 * @param dmozFile        the DMOZ file to read
 * @param subsetDenom     forwarded to the RDFProcessor
 * @param includeAdult    forwarded to the RDFProcessor
 * @param includeDmozDesc forwarded to the RDFProcessor
 * @param skew            forwarded to the RDFProcessor
 * @param topicPattern    forwarded to the RDFProcessor; may be null
 * @throws IOException                  if the file cannot be opened or the
 *                                      input cannot be closed
 * @throws SAXException                 if the XML reader cannot be obtained
 * @throws ParserConfigurationException if no SAX parser can be created
 */
public void injectDmozFile(File dmozFile, int subsetDenom, boolean includeAdult, boolean includeDmozDesc, int skew, Pattern topicPattern) throws IOException, SAXException, ParserConfigurationException {
  nextFetch = dmozFile.lastModified();

  SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
  XMLReader xmlReader = saxParser.getXMLReader();

  // One processor instance receives both the content and the error events.
  RDFProcessor processor =
      new RDFProcessor(xmlReader, subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
  xmlReader.setContentHandler(processor);
  xmlReader.setErrorHandler(processor);
  LOG.info("skew = " + processor.hashSkew);

  // Open the file as UTF-8 text and wrap it in the character filter. Per
  // the original note, the filter lets only XML-approved characters
  // through and silently skips the rest.
  BufferedInputStream rawBytes = new BufferedInputStream(new FileInputStream(dmozFile));
  BufferedReader utf8Text = new BufferedReader(new InputStreamReader(rawBytes, "UTF-8"));
  XMLCharFilter filtered = new XMLCharFilter(utf8Text);
  try {
    xmlReader.parse(new InputSource(filtered));
  } catch (Exception e) {
    LOG.severe(e.toString());
    e.printStackTrace(System.out);
    System.exit(0);
  } finally {
    filtered.close();
  }
}
/**
 * Runs the URL through the configured URL filter and, if the filter
 * returns a non-null URL, adds a new page for it unless one is already
 * present.
 *
 * @param url the candidate URL
 * @return true if the page was handed to the writer; false if the filter
 *         returned null or the URL was malformed (a warning is logged in
 *         that case)
 * @throws IOException if the writer fails
 */
private boolean addPage(String url) throws IOException {
  final String filtered = URLFilterFactory.getFilter().filter(url);
  if (filtered == null) {
    return false;
  }

  try {
    dbWriter.addPageIfNotPresent(new Page(filtered, NEW_INJECTED_PAGE_SCORE, nextFetch));
  } catch (MalformedURLException e) {
    LOG.warning("bad url: "+filtered);
    return false;
  }
  return true;
}
/**
 * Reads topics from a UTF-8 text file, one per line, and appends them to
 * the given list. Blank lines are skipped.
 *
 * Any failure is logged, its stack trace is written to System.out, and
 * the JVM exits with status 0.
 *
 * @param topicFile path of the topic list file
 * @param topics    list the topics are appended to
 * @throws IOException if the reader cannot be closed
 */
private static void addTopicsFromFile(String topicFile, Vector topics) throws IOException {
  BufferedReader in = null;
  try {
    in = new BufferedReader(new InputStreamReader(new FileInputStream(topicFile), "UTF-8"));
    String line = null;
    while ((line = in.readLine()) != null) {
      // main() joins the topics into one regex alternation. A blank line
      // would become an empty alternative, which matches every topic and
      // silently disables the topic filter.
      if (line.trim().length() == 0) {
        continue;
      }
      topics.addElement(line);
    }
  } catch (Exception e) {
    LOG.severe(e.toString());
    e.printStackTrace(System.out);
    // NOTE(review): exits with status 0 even though this is a failure -
    // confirm whether callers rely on that before changing it.
    System.exit(0);
  } finally {
    // The reader is still null when opening the file failed.
    if (in != null) {
      in.close();
    }
  }
}
/**
 * Command-line access. The user may add URLs from a flat text file
 * (-urlfile) or from the structured DMOZ file (-dmozfile). Adult material
 * is excluded unless -includeAdultMaterial is given, and DMOZ descriptions
 * are included unless -noDmozDesc is given.
 *
 * Topics given with -topic or -topicFile are joined into a single regular
 * expression of the form {@code ^(t1|t2|...).*}; they apply to -dmozfile
 * only. Unrecognized options are ignored. An option that requires a value
 * but has none makes the program print a message and return.
 *
 * @param argv the command-line arguments
 * @throws Exception if parsing a numeric option, opening the db, or the
 *                   injection itself fails
 */
public static void main(String argv[]) throws Exception {
  if (argv.length < 3) {
    System.out.println("Usage: WebDBInjector (-local | -ndfs <namenode:port>) <db_dir> (-urlfile <url_file> | -dmozfile <dmoz_file>) [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-noDmozDesc] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
    return;
  }

  //
  // Parse the command line, figure out what kind of
  // URL file we need to load
  //
  int subsetDenom = 1;
  int skew = 0;
  String command = null, loadfile = null;
  boolean includeAdult = false, includeDmozDesc = true;
  Pattern topicPattern = null;
  Vector topics = new Vector();

  int i = 0;
  NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
  try {
    File root = new File(argv[i++]);
    for (; i < argv.length; i++) {
      String option = argv[i];

      // Flags without a value.
      if ("-includeAdultMaterial".equals(option)) {
        includeAdult = true;
        continue;
      }
      if ("-noDmozDesc".equals(option)) {
        includeDmozDesc = false;
        continue;
      }

      boolean takesValue =
          "-urlfile".equals(option) || "-dmozfile".equals(option)
          || "-subset".equals(option) || "-topic".equals(option)
          || "-topicFile".equals(option) || "-skew".equals(option);
      if (!takesValue) {
        // Anything else is ignored.
        continue;
      }

      // Check that the value exists before reading it; an option given as
      // the last argument used to fail with ArrayIndexOutOfBoundsException.
      // NOTE(review): parseArgs is not visible here and may leave null
      // entries in argv - a null value is treated as missing too.
      if (i + 1 >= argv.length || argv[i + 1] == null) {
        System.out.println("Missing value for option " + option);
        return;
      }
      String value = argv[++i];

      if ("-urlfile".equals(option) || "-dmozfile".equals(option)) {
        command = option;
        loadfile = value;
      } else if ("-subset".equals(option)) {
        subsetDenom = Integer.parseInt(value);
      } else if ("-topic".equals(option)) {
        topics.addElement(value);
      } else if ("-topicFile".equals(option)) {
        addTopicsFromFile(value, topics);
      } else {
        // Only "-skew" remains.
        skew = Integer.parseInt(value);
      }
    }

    //
    // Create the webdbWriter, the injector, and then inject the
    // right kind of URL file.
    //
    IWebDBWriter writer = new WebDBWriter(nfs, root);
    WebDBInjector injector = new WebDBInjector(writer);
    try {
      if ("-urlfile".equals(command)) {
        if (!topics.isEmpty()) {
          // Only a notice: the URL file is still injected, without
          // topic filtering.
          System.out.println("You can't select URLs based on a topic when usin a URL-file");
        }
        injector.injectURLFile(new File(loadfile));
      } else if ("-dmozfile".equals(command)) {
        if (!topics.isEmpty()) {
          // Join the topics into "^(t1|t2|...).*". They are inserted
          // as-is, so each topic is interpreted as a regex fragment.
          StringBuffer joined = new StringBuffer("^(");
          for (int j = 0; j < topics.size(); j++) {
            if (j > 0) {
              joined.append("|");
            }
            joined.append((String) topics.get(j));
          }
          joined.append(").*");
          String regExp = joined.toString();
          LOG.info("Topic selection pattern = " + regExp);
          topicPattern = Pattern.compile(regExp);
        }
        injector.injectDmozFile(new File(loadfile), subsetDenom, includeAdult, includeDmozDesc, skew, topicPattern);
      } else {
        System.out.println("No command indicated.");
        return;
      }
    } finally {
      injector.close();
    }
  } finally {
    nfs.close();
  }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -