📄 luceneretrievalengine.java
字号:
DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
df.setMaximumFractionDigits(1);
// Preparing objects for the index:
HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(descriptions.length);
// in the first run we identify the semantic objects that we want to index and build
// a table were we can relate them to the documents (identified by their path)
for (int i = 0; i < descriptions.length; i++) {
try {
Element e = builder.build(descriptions[i]).getRootElement();
List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
for (Iterator iterator = l.iterator(); iterator.hasNext();) {
Element semanticElement = (Element) iterator.next();
String xmlString = outputter.outputString(semanticElement).trim().replaceAll("id=\"id_[0-9]*\"", "");
// check if element is already there, indicator is its string representation.
if (!elementMap.keySet().contains(xmlString)) {
// its not here, put it in.
elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
// System.out.println(xmlString);
}
// now get the unified element
semanticElement = elementMap.get(xmlString).semanticElement;
// and check if there is an entry in the table for where to find the element
if (!element2document.keySet().contains(semanticElement)) {
element2document.put(semanticElement, new LinkedList<String>());
}
// and add found document if not already there:
List documentList = element2document.get(semanticElement);
if (!documentList.contains(descriptions[i])) documentList.add(descriptions[i]);
}
if (statusBar != null) statusBar.setStatus("Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
} catch (JDOMException e1) {
System.err.println("Exception in document #" + i + ": " + e1.getMessage());
} catch (IOException e1) {
e1.printStackTrace();
}
}
// read stats:
// System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");
// Now we can add the nodes to a lucene index:
// fields: label, id, type, files (separated by '|'), xml, all
// -------------------------------------------
// opening the index for writing:
boolean createFlag = true;
String indexDir = parseSemanticIndexDirectory(pathToIndex);
Analyzer analyzer = new StandardAnalyzer();
IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
if (statusBar != null) statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");
// iterating through nodes and storing them:
for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
Element semElement = iterator.next();
// needed for later XPath :( otherwise everthing in the whole document is retrieved.
String fileList = getFileListFromNode(element2document.get(semElement));
Document idxDocument = new Document();
// adding the file itself ...
idxDocument.add(Field.UnIndexed("files", fileList));
// System.out.println(((Element) o).getTextTrim());
StringBuilder all = new StringBuilder(255);
// adding the label
// addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
String elementLabel = semElement.getChild("Label", semElement.getNamespace()).getChildTextTrim("Name", semElement.getNamespace());
idxDocument.add(Field.Text("label", elementLabel));
// adding the type:
String elementType = semElement.getAttribute("type", xsi).getValue().trim();
idxDocument.add(Field.UnIndexed("type", elementType));
// adding the XML contents:
String xmlString = outputter.outputString(semElement);
idxDocument.add(Field.UnIndexed("xml", xmlString));
// adding the id:
idxDocument.add(Field.UnIndexed("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + ""));
// adding all, unstored for retrieval only
List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
for (Iterator it3 = l.iterator(); it3.hasNext();) {
Element e = (Element) it3.next();
all.append(e.getTextTrim());
all.append(" ");
}
idxDocument.add(Field.UnStored("all", all.toString()));
writer.addDocument(idxDocument);
}
// now optimize and close the index:
// todo: open index for appending and/or updating
writer.optimize();
writer.close();
// Now we can create the powerset for each existing graph
// (based on sorted node ids) and store
// all resulting graphs within an index.
// ----------------------------------------------------------
if (statusBar != null) statusBar.setStatus("Creating and merging powersets of available graphs");
HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(descriptions.length);
for (int i = 0; i < descriptions.length; i++) {
try {
Element e = builder.build(descriptions[i]).getRootElement();
List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
LinkedList<Relation> relations = new LinkedList<Relation>();
LinkedList<Integer> nodes = new LinkedList<Integer>();
for (Iterator iterator = l.iterator(); iterator.hasNext();) {
Element semanticElement = (Element) iterator.next();
String xmlString = outputter.outputString(semanticElement);
int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
String docID = semanticElement.getAttribute("id").getValue();
docID2overallID.put(docID, id);
nodes.add(id);
}
// get all relations with global ids and eliminate inverse relations
l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
for (Iterator iterator = l.iterator(); iterator.hasNext();) {
Element relation = (Element) iterator.next();
int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
String type = relation.getAttribute("type").getValue();
type = type.substring(type.lastIndexOf(':') + 1);
Relation r = eliminateInverse(new Relation(source, target, type));
relations.add(r);
}
// now create a graph object
Collections.sort(nodes);
Collections.sort(relations);
LinkedList<Node> nodeList = new LinkedList<Node>();
for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
nodeList.add(new Node(iterator.next()));
}
Graph g = new Graph(nodeList, relations);
// List<Graph> powerSet = new LinkedList<Graph>();
// powerSet.add(g);
HashSet<String> docs = new HashSet<String>(1);
docs.add(descriptions[i]);
graph2document.put(g, docs);
/*
// add all these subgraphs and the reference to the document to
// a data structure:
for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
Graph graph = iterator.next();
// List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
// for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
// Graph graph1 = iterator1.next();
// }
// add graph if not trivial:
if (graph.getNodes().size() > 1) {
// containsKey for Graph does not match my needs -
// different graph objects reference the same graph!
if (string2graph.containsKey(graph.toString())) {
graph = string2graph.get(graph.toString());
graph2document.get(graph).add(descriptions[i]);
} else {
HashSet<String> docs = new HashSet<String>(1);
docs.add(descriptions[i]);
graph2document.put(graph, docs);
}
}
}
*/
} catch (JDOMException e1) {
System.err.println("Exception in document #" + i + ": " + e1.getMessage());
}
}
HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);
/*
For now we reduce the number of graphs by identifiying and merging duplicates and
remove redundant entries:
*/
for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
Graph g = iterator.next();
if (str2graph.containsKey(g.toString())) {
g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
} else {
str2graph.put(g.toString(), g);
g2d.put(g, graph2document.get(g));
}
}
graph2document = g2d;
System.out.println(graph2document.size() + " non trivial different graphs were found");
// now put all the available graphs into an index:
// -----------------------------------------------
// todo: create real fast storable index of subgraphs instead of file :-) possible candidate a trie
// for now we will store a simple text file:
if (statusBar != null) statusBar.setStatus("Storing powersets of available graphs as file");
String indexFile;
if (!pathToIndex.endsWith(File.separator)) {
indexFile = pathToIndex + File.separator + "idx_graphs.list";
} else {
indexFile = pathToIndex + "idx_graphs.list";
}
File f = new File(indexFile);
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
Graph g = iterator.next();
bw.write(g.toString());
for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
String s = iterator1.next();
bw.write("|" + s);
}
bw.write("\n");
}
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Searches for all available nodes with given query String
*
* @param queryString query like "Mathias Lux" or some text inside a node.
* @param whereToSearch defines the base directory for the search
* @return a List of Matching nodes with their associated weights
*/
/**
 * Searches for all available nodes with given query String.
 *
 * @param queryString   query like "Mathias Lux" or some text inside a node.
 * @param whereToSearch defines the base directory for the search
 * @return a List of matching nodes with their associated weights; empty if
 *         nothing matched or an error occurred (errors are printed, not thrown)
 */
public List<Node> getNodes(String queryString, String whereToSearch) {
    LinkedList<Node> result = new LinkedList<Node>();
    IndexSearcher searcher = null;
    try {
        searcher = new IndexSearcher(parseSemanticIndexDirectory(whereToSearch));
        // search the "all" field, which holds the concatenated text of each node
        Query query = QueryParser.parse(queryString, "all", new StandardAnalyzer());
        Hits hits = searcher.search(query);
        // cap the result count so huge hit lists do not flood the caller
        int hitsCount = hits.length();
        if (hitsCount > MAX_RESULTS) hitsCount = MAX_RESULTS;
        for (int i = 0; i < hitsCount; i++) {
            Document d = hits.doc(i);
            // "id" is the global node id stored at index creation time
            Node node = new Node(Integer.parseInt(d.get("id")), hits.score(i), d.get("label"));
            result.add(node);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        System.err.println("QueryString was: " + queryString);
        e.printStackTrace();
    } finally {
        // release the index files; the original version leaked the searcher
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}
/**
 * Joins the given document paths into the single '|'-separated string that
 * is stored in the "files" field of the node index.
 *
 * @param list the documents (file paths) a node occurs in
 * @return the entries concatenated with '|' as delimiter; empty string for an empty list
 */
private String getFileListFromNode(List<String> list) {
    StringBuilder joined = new StringBuilder(64);
    boolean first = true;
    for (String file : list) {
        // delimiter goes before every entry except the first one
        if (!first) {
            joined.append('|');
        }
        joined.append(file);
        first = false;
    }
    return joined.toString();
}
/**
* Eliminates all inverse relations to simplify retrieval
*
* @param relation
* @return the normalized relation
*/
/**
 * Eliminates all inverse relations to simplify retrieval: if the relation's
 * type has an entry in {@code relationMapping}, source and target are swapped
 * and the mapped (normalized) type is used instead.
 *
 * @param relation the relation to normalize
 * @return the normalized relation, or the given relation unchanged if its type has no mapping
 */
public static Relation eliminateInverse(Relation relation) {
    if (!relationMapping.containsKey(relation.getType())) {
        // no inverse defined for this type, nothing to do
        return relation;
    }
    return new Relation(relation.getTarget(), relation.getSource(), relationMapping.get(relation.getType()));
}
}
/**
 * Simple value holder relating a unified semantic element to the global
 * numeric id it was assigned when it was first encountered during indexing.
 */
class ElementEntry {
    /** the unified semantic element this entry stands for */
    public Element semanticElement;
    /** the global id assigned to the element at index-build time */
    public int id;

    /**
     * Creates an entry for the given element with the given id.
     *
     * @param semanticElement the semantic element to store
     * @param id              the global id of the element
     */
    public ElementEntry(Element semanticElement, int id) {
        this.id = id;
        this.semanticElement = semanticElement;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -