
📄 lucenepathindexretrievalengine.java

📁 Based on the MPEG-7 standard and in line with the coming Semantic Web architecture; well worth a look as a reference
💻 JAVA
📖 Page 1 of 3
        } else {
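            // This branch builds single-node "paths": it keeps every result with a
            // weight of 1.0 (the loop below stops at the first node scoring less
            // than 1f) plus up to 'depth' further candidates, and wraps each of
            // them in a singleton list.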
            List<List<Node>> endResult = new LinkedList<List<Node>>();
            List<Node> firstNodesResults = nodeResults.get(0);
            int numLevels = 0;
            for (Iterator<Node> iterator = firstNodesResults.iterator(); iterator.hasNext();) {
                Node node = iterator.next();
                if (node.getWeight() < 1f) break;
                numLevels++;
            }
            numLevels += depth;
            for (int i = 0; i < numLevels && i < firstNodesResults.size(); i++) {
                List<Node> nodeList = new LinkedList<Node>();
                nodeList.add(firstNodesResults.get(i));
                endResult.add(nodeList);
            }
            return endResult;
        }
    }


    /**
     * Searches all available nodes for the given query string.
     *
     * @param queryString   a query like "Mathias Lux" or some text inside a node
     * @param whereToSearch defines the base directory for the search
     * @return a list of matching nodes with their associated weights
     */
    public List<Node> getNodes(String queryString, String whereToSearch) {
        LinkedList<Node> result = new LinkedList<Node>();
        try {
            IndexSearcher searcher = new IndexSearcher(parseSemanticIndexDirectory(whereToSearch));
            Query query = QueryParser.parse(queryString, "all", new StandardAnalyzer());
            Hits hits = searcher.search(query);
            int hitsCount = hits.length();
//            System.out.println("Found " + hits.length() + " matching nodes.");
            if (hitsCount > MAX_RESULTS) hitsCount = MAX_RESULTS;

            for (int i = 0; i < hitsCount; i++) {
                Document d = hits.doc(i);
                StringBuilder sb = new StringBuilder(20); // only used by the commented-out debug output below
                sb.append(hits.score(i));
                sb.append(": ");
                sb.append(d.get("label"));
//                System.out.println(sb.toString());
                Node node = new Node(Integer.parseInt(d.get("id")), hits.score(i), d.get("label"));
                result.add(node);
            }

        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            System.err.println("QueryString was: " + queryString);
            e.printStackTrace();
        }
        return result;
    }
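
    /**
     * Usage sketch for {@link #getNodes(String, String)}: an illustrative
     * addition, not part of the original engine. The index base directory
     * "c:/temp/mpeg7" is a hypothetical placeholder.
     */
    private void demoGetNodes() {
        List<Node> matches = getNodes("Mathias Lux", "c:/temp/mpeg7");
        for (Node node : matches) {
            // each result node carries its id, its weight (the Lucene score) and its label
            System.out.println(node.getWeight() + ": " + node);
        }
    }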


    public Vector getSimilarImages(Element visualDescriptor, String whereToSearch, boolean recursive, JProgressBar progress) {
        return null; // content based search is not supported by this engine
    }

    public Vector getImagesByXPathSearch(String xPath, String whereToSearch, boolean recursive, JProgressBar progress) {
        return null; // XPath based search is not supported by this engine
    }

    public void indexFilesSemantically(String pathToIndex, StatusBar statusBar) {
        if (statusBar != null) statusBar.setStatus("Creating index from semantic annotations");

        SAXBuilder builder = new SAXBuilder();
        XMLOutputter outputter = new XMLOutputter(Format.getRawFormat().setIndent("").setLineSeparator("").setExpandEmptyElements(false));

        try {
            String[] descriptions = FileOperations.getAllDescriptions(new File(pathToIndex), true);
            if (descriptions == null) return;
            float numAllDocsPercent = (float) descriptions.length / 100f; // one percent of the document count, used for the progress display
            DecimalFormat df = (DecimalFormat) NumberFormat.getInstance();
            df.setMaximumFractionDigits(1);

            // Preparing objects for the index:
            HashMap<String, ElementEntry> elementMap = new HashMap<String, ElementEntry>(descriptions.length);
            HashMap<Element, LinkedList<String>> element2document = new HashMap<Element, LinkedList<String>>(descriptions.length);

            // In the first run we identify the semantic objects that we want to index and build
            // a table where we can relate them to the documents (identified by their path).
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(descriptions[i]).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement).trim().replaceAll("id=\"id_[0-9]*\"", "");
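                        // Example of the canonicalisation above: <SemanticBase id="id_3" xsi:type="AgentObjectType">
                        // and <SemanticBase id="id_7" xsi:type="AgentObjectType"> with the same content collapse
                        // to one key once the local id is stripped, so each distinct semantic object is
                        // registered exactly once across all documents.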
                        // check if the element is already there; the indicator is its string representation.
                        if (!elementMap.containsKey(xmlString)) {
                            // it's not there yet, so put it in.
                            elementMap.put(xmlString, new ElementEntry(semanticElement, elementMap.size()));
//                            System.out.println(xmlString);
                        }
                        // now get the unified element
                        semanticElement = elementMap.get(xmlString).semanticElement;
                        // and check if there is an entry in the table for where to find the element
                        if (!element2document.containsKey(semanticElement)) {
                            element2document.put(semanticElement, new LinkedList<String>());
                        }
                        // and add found document if not already there:
                        List<String> documentList = element2document.get(semanticElement);
                        if (!documentList.contains(descriptions[i])) documentList.add(descriptions[i]);
                    }
                    if (statusBar != null) statusBar.setStatus("Parsing documents for nodes: " + df.format((float) i / numAllDocsPercent));
                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                } catch (IOException e1) {
                    e1.printStackTrace();
                }
            }
            // read stats:
            // System.out.println("Got " + countOverallElements + " Elements in " + descriptions.length + " descriptions, " + elementMap.size() + " elements are pairwise different.");

            // Now we can add the nodes to a lucene index:
            // fields: label, id, type, files (separated by '|'), xml, all
            // -------------------------------------------
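
            // In the pre-2.x Lucene API used here the Field factory methods map as follows:
            //   Field.Text(name, value)      - stored, indexed and tokenised ("label")
            //   Field.UnIndexed(name, value) - stored but not indexed ("id", "type", "files", "xml")
            //   Field.UnStored(name, value)  - indexed but not stored, for retrieval only ("all")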

            // opening the index for writing:
            boolean createFlag = true;
            String indexDir = parseSemanticIndexDirectory(pathToIndex);
            Analyzer analyzer = new StandardAnalyzer();
            IndexWriter writer = new IndexWriter(indexDir, analyzer, createFlag);

            if (statusBar != null) statusBar.setStatus("Creating index for " + element2document.size() + " different available nodes");

            // iterating through nodes and storing them:
            for (Iterator<Element> iterator = element2document.keySet().iterator(); iterator.hasNext();) {
                Element semElement = iterator.next();
                // needed for later XPath :( otherwise everything in the whole document is retrieved.

                String fileList = getFileListFromNode(element2document.get(semElement));
                Document idxDocument = new Document();
                // adding the file itself ...
                idxDocument.add(Field.UnIndexed("files", fileList));

//                System.out.println(((Element) o).getTextTrim());

                StringBuilder all = new StringBuilder(255);
                // adding the label
//                addToDocument(idxDocument, semElement, "//Label/Name", "label", all);
                String elementLabel = semElement.getChild("Label", semElement.getNamespace()).getChildTextTrim("Name", semElement.getNamespace());
                idxDocument.add(Field.Text("label", elementLabel));

                // adding the type:
                String elementType = semElement.getAttribute("type", xsi).getValue().trim();
                idxDocument.add(Field.UnIndexed("type", elementType));
                // adding the XML contents:
                String xmlString = outputter.outputString(semElement);
                idxDocument.add(Field.UnIndexed("xml", xmlString));
                // adding the id:
                idxDocument.add(Field.UnIndexed("id", elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id + ""));
                // adding all, unstored for retrieval only
                List l = RetrievalToolkit.xpathQuery(semElement, "*//*", null);
                for (Iterator it3 = l.iterator(); it3.hasNext();) {
                    Element e = (Element) it3.next();
                    all.append(e.getTextTrim());
                    all.append(" ");
                }
                idxDocument.add(Field.UnStored("all", all.toString()));

                writer.addDocument(idxDocument);

            }
            // now optimize and close the index:
            // todo: open index for appending and/or updating
            writer.optimize();
            writer.close();

            // Now we can create the powerset for each existing graph
            // (based on sorted node ids) and store
            // all resulting graphs within an index.
            // ----------------------------------------------------------
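            // Sorting node ids and relations (below) gives every graph a canonical
            // form, so its toString() value can later serve as an equality key
            // when duplicate graphs are merged.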
            if (statusBar != null) statusBar.setStatus("Creating and merging of available graphs");
            HashMap<Graph, HashSet<String>> graph2document = new HashMap<Graph, HashSet<String>>(descriptions.length);
            for (int i = 0; i < descriptions.length; i++) {
                try {
                    Element e = builder.build(descriptions[i]).getRootElement();
                    List l = RetrievalToolkit.xpathQuery(e, "//Semantic/SemanticBase", null);
                    HashMap<String, Integer> docID2overallID = new HashMap<String, Integer>(l.size());
                    LinkedList<Relation> relations = new LinkedList<Relation>();
                    LinkedList<Integer> nodes = new LinkedList<Integer>();
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element semanticElement = (Element) iterator.next();
                        String xmlString = outputter.outputString(semanticElement);
                        int id = elementMap.get(xmlString.trim().replaceAll("id=\"id_[0-9]*\"", "")).id;
                        String docID = semanticElement.getAttribute("id").getValue();
                        docID2overallID.put(docID, id);
                        nodes.add(id);
                    }
                    // get all relations with global ids and eliminate inverse relations
                    l = RetrievalToolkit.xpathQuery(e, "//Graph/Relation", null);
                    for (Iterator iterator = l.iterator(); iterator.hasNext();) {
                        Element relation = (Element) iterator.next();
                        // source and target reference local node ids (e.g. "#id_3"), so the leading '#' is stripped:
                        int source = docID2overallID.get(relation.getAttribute("source").getValue().substring(1));
                        int target = docID2overallID.get(relation.getAttribute("target").getValue().substring(1));
                        String type = relation.getAttribute("type").getValue();
                        // keep only the last segment of the (typically URN prefixed) relation type:
                        type = type.substring(type.lastIndexOf(':') + 1);
                        Relation r = eliminateInverse(new Relation(source, target, type));
                        relations.add(r);
                    }
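                    // eliminateInverse (not shown on this page) presumably maps a relation
                    // and its inverse (e.g. "agent" vs. "agentOf") to one canonical direction,
                    // so that equivalent graphs end up with identical relation lists.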

                    // now create a graph object
                    Collections.sort(nodes);
                    Collections.sort(relations);
                    LinkedList<Node> nodeList = new LinkedList<Node>();
                    for (Iterator<Integer> iterator = nodes.iterator(); iterator.hasNext();) {
                        nodeList.add(new Node(iterator.next()));
                    }
                    Graph g = new Graph(nodeList, relations);
                    HashSet<String> docs = new HashSet<String>(1);
                    docs.add(descriptions[i]);
                    graph2document.put(g, docs);

                } catch (JDOMException e1) {
                    System.err.println("Exception in document #" + i + ": " + e1.getMessage());
                }
            }

            HashMap<String, Graph> str2graph = new HashMap<String, Graph>(graph2document.size() / 2);
            HashMap<Graph, HashSet<String>> g2d = new HashMap<Graph, HashSet<String>>(descriptions.length);

            /*
                For now we reduce the number of graphs by identifying and merging duplicates,
                removing redundant entries:
            */
            for (Iterator<Graph> iterator = graph2document.keySet().iterator(); iterator.hasNext();) {
                Graph g = iterator.next();
                if (str2graph.containsKey(g.toString())) {
                    g2d.get(str2graph.get(g.toString())).addAll(graph2document.get(g));
                } else {
                    str2graph.put(g.toString(), g);
                    g2d.put(g, graph2document.get(g));
                }
            }
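            // Example: if a.xml and b.xml yield graphs with the same string form,
            // the merged map keeps one graph whose document set is {a.xml, b.xml}.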
            graph2document = g2d;
            System.out.println(graph2document.size() + " non-trivial, pairwise different graphs were found");
            // now put all the available graphs into an index:
            // -----------------------------------------------

            // for now we will store a simple text file:
            if (statusBar != null) statusBar.setStatus("Saving index of paths");

            boolean createPathIndexFlag = true;
            String pathIndexDir = parsePathIndexDirectory(pathToIndex);
