📄 luceneretrievalengine.java
字号:
DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
df.setMaximumFractionDigits(1);
// Preparing objects for the index:
HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(descriptions.length);
// in the first run we identify the semantic objects that we want to index and build
// a table were we can relate them to the documents (identified by their path)
for (int i = 0; i < descriptions.length; i++) {
try {
Element e = builder.build(descriptions[i]).getRootElement();
List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
for (Iterator iterator = l.iterator(); iterator.hasNext();) {
Element semanticElement = (Element) iterator.next();
String xmlString = outputter.outputString(semanticElement).trim().replaceAll("id=\"id_[0-9]*\"", "");
// check if element is already there, indicator is its string representation.
if (!elementMap.keySet().contains(xmlString)) {
// its not here, put it in.
elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
// System.out.println(xmlString);
}
// now get the unified element
semanticElement = elementMap.get(xmlString).semanticElement;
// and check if there is an entry in the table for where to find the element
if (!element2document.keySet().contains(semanticElement)) {
element2document.put(semanticElement, new LinkedList<String>());
}
// and add found document if not already there:
List documentList = element2document.get(semanticElement);
if (!documentList.contains(descriptions[i])) documentList.add(descriptions[i]);
}
if (statusBar != null) statusBar.setStatus("Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
} catch (JDOMException e1) {
System.err.println("Exception in document #" + i + ": " + e1.getMessage());
} catch (IOException e1) {
e1.printStackTrace();
}
}
// read stats:
// System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");
// Now we can add the nodes to a lucene index:
// fields: label, id, type, files (separated by '|'), xml, all
// -------------------------------------------
// opening the index for writing:
boolean createFlag = true;
String indexDir = parseSemanticIndexDirectory(pathToIndex);
Analyzer analyzer = new StandardAnalyzer();
IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);
if (statusBar != null) statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");
// iterating through nodes and storing them:
for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
Element semElement = iterator.next();
// needed for later XPath :( otherwise everthing in the whole document is retrieved.
String fileList = getFileListFromNode(element2document.get(semElement));
Document idxDocument = new Document();
// adding the file itself ...
idxDocument.add(Field.UnIndexed("files", fileList));
// System.out.println(((Element) o).getTextTrim());
StringBuilder all = new StringBuilder(255);
// adding the label
// addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
String elementLabel = semElement.getChild("Label", semElement.getNamespace()).getChildTextTrim("Name", semElement.getNamespace());
idxDocument.add(Field.Text("label", elementLabel));
// adding the type:
String elementType = semElement.getAttribute("type", xsi).getValue().trim();
idxDocument.add(Field.UnIndexed("type", elementType));
// adding the XML contents:
String xmlString = outputter.outputString(semElement);
idxDocument.add(Field.UnIndexed("xml", xmlString));
// adding the id:
idxDocument.add(Field.UnIndexed("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + ""));
// adding all, unstored for retrieval only
List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
for (Iterator it3 = l.iterator(); it3.hasNext();) {
Element e = (Element) it3.next();
all.append(e.getTextTrim());
all.append(" ");
}
idxDocument.add(Field.UnStored("all", all.toString()));
writer.addDocument(idxDocument);
}
// now optimize and close the index:
// todo: open index for appending and/or updating
writer.optimize();
writer.close();
// Now we can create the powerset for each existing graph
// (based on sorted node ids) and store
// all resulting graphs within an index.
// ----------------------------------------------------------
if (statusBar != null) statusBar.setStatus("Creating and merging powersets of available graphs");
HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(descriptions.length);
for (int i = 0; i < descriptions.length; i++) {
try {
Element e = builder.build(descriptions[i]).getRootElement();
List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
LinkedList<Relation> relations = new LinkedList<Relation>();
LinkedList<Integer> nodes = new LinkedList<Integer>();
for (Iterator iterator = l.iterator(); iterator.hasNext();) {
Element semanticElement = (Element) iterator.next();
String xmlString = outputter.outputString(semanticElement);
int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
String docID = semanticElement.getAttribute("id").getValue();
docID2overallID.put(docID, id);
nodes.add(id);
}
// get all relations with global ids and eliminate inverse relations
l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
for (Iterator iterator = l.iterator(); iterator.hasNext();) {
Element relation = (Element) iterator.next();
int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
String type = relation.getAttribute("type").getValue();
type = type.substring(type.lastIndexOf(':') + 1);
Relation r = eliminateInverse(new Relation(source, target, type));
relations.add(r);
}
// now create a graph object
Collections.sort(nodes);
Collections.sort(relations);
LinkedList<Node> nodeList = new LinkedList<Node>();
for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
nodeList.add(new Node(iterator.next()));
}
Graph g = new Graph(nodeList, relations);
// List<Graph> powerSet = new LinkedList<Graph>();
// powerSet.add(g);
HashSet<String> docs = new HashSet<String>(1);
docs.add(descriptions[i]);
graph2document.put(g, docs);
/*
// add all these subgraphs and the reference to the document to
// a data structure:
for (Iterator<Graph> iterator = powerSet.iterator(); iterator.hasNext();) {
Graph graph = iterator.next();
// List<Graph> relationsPowerSet = graph.getPowerSetOfRelations();
// for (Iterator<Graph> iterator1 = relationsPowerSet.iterator(); iterator1.hasNext();) {
// Graph graph1 = iterator1.next();
// }
// add graph if not trivial:
if (graph.getNodes().size() > 1) {
// containsKey for Graph does not match my needs -
// different graph objects reference the same graph!
if (string2graph.containsKey(graph.toString())) {
graph = string2graph.get(graph.toString());
graph2document.get(graph).add(descriptions[i]);
} else {
HashSet<String> docs = new HashSet<String>(1);
docs.add(descriptions[i]);
graph2document.put(graph, docs);
}
}
}
*/
} catch (JDOMException e1) {
System.err.println("Exception in document #" + i + ": " + e1.getMessage());
}
}
HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);
/*
For now we reduce the number of graphs by identifiying and merging duplicates and
remove redundant entries:
*/
for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
Graph g = iterator.next();
if (str2graph.containsKey(g.toString())) {
g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
} else {
str2graph.put(g.toString(), g);
g2d.put(g, graph2document.get(g));
}
}
graph2document = g2d;
System.out.println(graph2document.size() + " non trivial different graphs were found");
// now put all the available graphs into an index:
// -----------------------------------------------
// todo: create real fast storable index of subgraphs instead of file :-) possible candidate a trie
// for now we will store a simple text file:
if (statusBar != null) statusBar.setStatus("Storing powersets of available graphs as file");
String indexFile;
if (!pathToIndex.endsWith(File.separator)) {
indexFile = pathToIndex + File.separator + "idx_graphs.list";
} else {
indexFile = pathToIndex + "idx_graphs.list";
}
File f = new File(indexFile);
BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(f, false))));
for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
Graph g = iterator.next();
bw.write(g.toString());
for (Iterator<String> iterator1 = graph2document.get(g).iterator(); iterator1.hasNext();) {
String s = iterator1.next();
bw.write("|" + s);
}
bw.write("\n");
}
bw.close();
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* Searches for all available nodes with given query String
*
* @param queryString query like "Mathias Lux" or some text inside a node.
* @param whereToSearch defines the base directory for the search
* @return a List of Matching nodes with their associated weights
*/
/**
 * Searches for all available nodes with given query String.
 *
 * @param queryString   query like "Mathias Lux" or some text inside a node.
 * @param whereToSearch defines the base directory for the search
 * @return a List of matching nodes with their associated weights; empty if
 *         nothing matched or an error occurred (errors are printed, not thrown)
 */
public List<Node> getNodes(String queryString, String whereToSearch) {
    LinkedList<Node> result = new LinkedList<Node>();
    IndexSearcher searcher = null;
    try {
        searcher = new IndexSearcher(parseSemanticIndexDirectory(whereToSearch));
        // search the "all" field, which holds the concatenated text of each node
        Query query = QueryParser.parse(queryString, "all", new StandardAnalyzer());
        Hits hits = searcher.search(query);
        // cap the result count so huge hit lists do not flood the caller
        int hitsCount = hits.length();
        if (hitsCount > MAX_RESULTS) hitsCount = MAX_RESULTS;
        for (int i = 0; i < hitsCount; i++) {
            Document d = hits.doc(i);
            // "id" is the global node id stored at index creation time
            Node node = new Node(Integer.parseInt(d.get("id")), hits.score(i), d.get("label"));
            result.add(node);
        }
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        System.err.println("QueryString was: " + queryString);
        e.printStackTrace();
    } finally {
        // release the index files; the original version leaked the searcher
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}
/**
 * Joins the given document paths into the single '|'-separated string that
 * is stored in the "files" field of the node index.
 *
 * @param list the documents (file paths) a node occurs in
 * @return the entries concatenated with '|' as delimiter; empty string for an empty list
 */
private String getFileListFromNode(List<String> list) {
    StringBuilder joined = new StringBuilder(64);
    boolean first = true;
    for (String file : list) {
        // delimiter goes before every entry except the first one
        if (!first) {
            joined.append('|');
        }
        joined.append(file);
        first = false;
    }
    return joined.toString();
}
/**
* Eliminates all inverse relations to simplify retrieval
*
* @param relation
* @return the normalized relation
*/
/**
 * Eliminates all inverse relations to simplify retrieval: if the relation's
 * type has an entry in {@code relationMapping}, source and target are swapped
 * and the mapped (normalized) type is used instead.
 *
 * @param relation the relation to normalize
 * @return the normalized relation, or the given relation unchanged if its type has no mapping
 */
public static Relation eliminateInverse(Relation relation) {
    if (!relationMapping.containsKey(relation.getType())) {
        // no inverse defined for this type, nothing to do
        return relation;
    }
    return new Relation(relation.getTarget(), relation.getSource(), relationMapping.get(relation.getType()));
}
}
/**
 * Simple value holder relating a unified semantic element to the global
 * numeric id it was assigned when it was first encountered during indexing.
 */
class ElementEntry {
    /** the unified semantic element this entry stands for */
    public Element semanticElement;
    /** the global id assigned to the element at index-build time */
    public int id;

    /**
     * Creates an entry for the given element with the given id.
     *
     * @param semanticElement the semantic element to store
     * @param id              the global id of the element
     */
    public ElementEntry(Element semanticElement, int id) {
        this.id = id;
        this.semanticElement = semanticElement;
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -