📄 distributedwebdbreader.java
字号:
// The EnumCall class allows the creator of MetaEnumerator
// to indicate how to get each enumeration. Will it be pages
// or links?
//
static abstract class EnumCall {

    /**
     * Nothing to initialize; provides the public no-argument
     * constructor that subclasses chain to.
     */
    public EnumCall() {
    }

    /**
     * Produces the Enumeration that a MetaEnumerator should draw from
     * for one section. Each concrete subclass decides which of the
     * reader's enumerations to hand back.
     *
     * @param reader the section to obtain an Enumeration from
     * @return the Enumeration selected by the subclass
     * @throws IOException declared so implementations can propagate
     *         failures raised while asking the reader
     */
    public abstract Enumeration getEnumeration(DBSectionReader reader) throws IOException;
}
//
// For enumerating Pages
//
static class PageEnumCall extends EnumCall {

    /**
     * Creates a stateless selector for the pages() enumeration.
     */
    public PageEnumCall() {
        super();
    }

    /**
     * Asks the given section for its pages() enumeration.
     *
     * @param reader the section to query
     * @return whatever reader.pages() returns
     * @throws IOException if reader.pages() throws it
     */
    public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
        Enumeration pages = reader.pages();
        return pages;
    }
}
//
// For enumerating Pages via DBSectionReader.pagesByMD5()
//
static class PageByMD5EnumCall extends EnumCall {

    /**
     * Creates a stateless selector for the pagesByMD5() enumeration.
     */
    public PageByMD5EnumCall() {
        super();
    }

    /**
     * Asks the given section for its pagesByMD5() enumeration.
     *
     * @param reader the section to query
     * @return whatever reader.pagesByMD5() returns
     * @throws IOException if reader.pagesByMD5() throws it
     */
    public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
        Enumeration pagesByMD5 = reader.pagesByMD5();
        return pagesByMD5;
    }
}
//
// For enumerating Links
//
static class LinkEnumCall extends EnumCall {

    /**
     * Creates a stateless selector for the links() enumeration.
     */
    public LinkEnumCall() {
        super();
    }

    /**
     * Asks the given section for its links() enumeration.
     *
     * @param reader the section to query
     * @return whatever reader.links() returns
     * @throws IOException if reader.links() throws it
     */
    public Enumeration getEnumeration(DBSectionReader reader) throws IOException {
        Enumeration links = reader.links();
        return links;
    }
}
//
// MetaEnumerator uses the Enumerations from each
// DBSectionReader in the passed-in DBSectionReader array.
//
class MetaEnumerator implements Enumeration {
    // One Enumeration per section, in the same order as the sections array.
    Enumeration[] enumerations;
    // Index of the Enumeration currently being drained; only ever advances.
    int curEnum = 0;

    /**
     * Obtains one Enumeration from every section up front, using
     * enumCall to decide which Enumeration each section contributes.
     *
     * @param sections the sections to enumerate, in iteration order
     * @param enumCall selects the Enumeration to take from each section
     * @throws IOException if enumCall.getEnumeration() throws it
     */
    public MetaEnumerator(DBSectionReader sections[], EnumCall enumCall) throws IOException {
        Enumeration[] perSection = new Enumeration[sections.length];
        for (int idx = 0; idx < perSection.length; idx++) {
            perSection[idx] = enumCall.getEnumeration(sections[idx]);
        }
        this.enumerations = perSection;
    }

    /**
     * Advances curEnum past every Enumeration that reports no more
     * elements, stopping at the first one that still has some.
     *
     * @return true if some Enumeration at or after curEnum has more
     *         elements; false once every Enumeration is used up
     */
    public boolean hasMoreElements() {
        while (curEnum < enumerations.length) {
            if (enumerations[curEnum].hasMoreElements()) {
                // Stay on this Enumeration; it still has something to give.
                return true;
            }
            curEnum++;
        }
        return false;
    }

    /**
     * Returns the next non-null element, moving on to later
     * Enumerations as needed.
     *
     * An Enumeration is abandoned (curEnum advances) either when it
     * reports no more elements or when its nextElement() yields null.
     *
     * @return the next non-null element, or null when every
     *         Enumeration has been passed over
     */
    public Object nextElement() {
        while (curEnum < enumerations.length) {
            Enumeration current = enumerations[curEnum];
            if (current.hasMoreElements()) {
                Object candidate = current.nextElement();
                if (candidate != null) {
                    return candidate;
                }
            }
            curEnum++;
        }
        return null;
    }
}
/**
* The DistributedWebDBReader.main() provides some handy utility methods
* for looking through the contents of the webdb. Hoo-boy!
*
* Note this only works for a completely-NFS deployment.
*/
public static void main(String argv[]) throws FileNotFoundException, IOException {
    // Command-line entry point: opens the db under <root> and runs one
    // inspection command against it, printing the results to stdout.
    if (argv.length < 2) {
        System.out.println("Usage: java net.nutch.db.DistributedWebDBReader (-local | -ndfs <namenode:port>) <root> [-pageurl url] | [-pagemd5 md5] | [-dumppageurl] | [-dumppagemd5] | [-toppages <k>] | [-linkurl url] | [-linkmd5 md5] | [-dumplinks] | [-stats]");
        return;
    }

    int i = 0;
    // NOTE(review): parseArgs presumably consumes the -local / -ndfs
    // arguments and leaves <root> at argv[i] -- confirm against
    // NutchFileSystem.parseArgs.
    NutchFileSystem nfs = NutchFileSystem.parseArgs(argv, i);
    File root = new File(argv[i++]);
    DistributedWebDBReader reader = new DistributedWebDBReader(nfs, root);
    try {
        String cmd = argv[i++];
        if ("-pageurl".equals(cmd)) {
            // Look up a single page by its URL.
            String url = argv[i++];
            System.out.println(reader.getPage(url.trim()));
        } else if ("-pagemd5".equals(cmd)) {
            // Look up every page stored under the given MD5.
            MD5Hash md5 = new MD5Hash(argv[i++]);
            Page pages[] = reader.getPages(md5);
            System.out.println("Found " + pages.length + " pages.");
            for (int j = 0; j < pages.length; j++) {
                System.out.println("Page " + j + ": " + pages[j]);
            }
        } else if ("-dumppageurl".equals(cmd)) {
            // Dump every page from reader.pages(), numbered from 1.
            int j = 1;
            for (Enumeration e = reader.pages(); e.hasMoreElements(); j++) {
                Page page = (Page) e.nextElement();
                System.out.println("Page " + j + ": " + page);
                System.out.println();
            }
        } else if ("-dumppagemd5".equals(cmd)) {
            // Dump every page from reader.pagesByMD5(), numbered from 1.
            int j = 1;
            for (Enumeration e = reader.pagesByMD5(); e.hasMoreElements(); j++) {
                Page page = (Page) e.nextElement();
                System.out.println("Page " + j + ": " + page);
                System.out.println();
            }
        } else if ("-toppages".equals(cmd)) {
            int topSize = Integer.parseInt(argv[i++]);

            // Sorted by ascending score, so first() is always the
            // lowest-scoring page currently retained.
            SortedSet topSet = new TreeSet(new Comparator() {
                public int compare(Object o1, Object o2) {
                    Page p1 = (Page) o1;
                    Page p2 = (Page) o2;
                    if (p1.getScore() < p2.getScore()) {
                        return -1;
                    } else if (p1.getScore() == p2.getScore()) {
                        // Equal scores fall back to the regular Page
                        // comparison so that distinct pages with the
                        // same score are not collapsed into one entry.
                        return p1.compareTo(p2);
                    } else {
                        return 1;
                    }
                }
            });

            // Keep only the "topSize" highest-scoring pages.
            Page lowestPage = null;
            for (Enumeration e = reader.pages(); e.hasMoreElements(); ) {
                Page curPage = (Page) e.nextElement();
                if (topSet.size() < topSize) {
                    topSet.add(curPage);
                    lowestPage = (Page) topSet.first();
                } else if (lowestPage != null && lowestPage.getScore() < curPage.getScore()) {
                    // The null check matters when topSize <= 0: nothing
                    // is ever added, lowestPage is never assigned, and
                    // the comparison used to throw NullPointerException.
                    topSet.remove(lowestPage);
                    topSet.add(curPage);
                    lowestPage = (Page) topSet.first();
                }
            }

            // Print the retained pages in ascending score order.
            int j = 0;
            for (Iterator it = topSet.iterator(); it.hasNext(); j++) {
                System.out.println("Page " + j + ": " + (Page) it.next());
                System.out.println();
            }
        } else if ("-linkurl".equals(cmd)) {
            // Look up links by URL.
            String url = argv[i++];
            Link links[] = reader.getLinks(new UTF8(url.trim()));
            System.out.println("Found " + links.length + " links.");
            for (int j = 0; j < links.length; j++) {
                System.out.println("Link " + j + ": " + links[j]);
            }
        } else if ("-linkmd5".equals(cmd)) {
            // Look up links by MD5.
            MD5Hash fromID = new MD5Hash(argv[i++]);
            Link links[] = reader.getLinks(fromID);
            System.out.println("Found " + links.length + " links.");
            for (int j = 0; j < links.length; j++) {
                System.out.println("Link " + j + ": " + links[j]);
            }
        } else if ("-dumplinks".equals(cmd)) {
            // Dump every link from reader.links(), numbered from 1.
            int j = 1;
            for (Enumeration e = reader.links(); e.hasMoreElements(); j++) {
                Link link = (Link) e.nextElement();
                System.out.println("Link " + j + ": " + link);
                System.out.println();
            }
        } else if ("-stats".equals(cmd)) {
            System.out.println("Stats for " + reader);
            System.out.println("-------------------------------");
            System.out.println("Number of pages: " + reader.numPages());
            System.out.println("Number of links: " + reader.numLinks());
            System.out.println("Number of machines (sections): " + reader.numMachines());
        } else {
            System.out.println("Sorry, no command with name " + cmd);
        }
    } finally {
        // Always release the reader, even if a command failed.
        reader.close();
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -