📄 testtermvectors.java
字号:
// NOTE(review): recovered fragment — the class declaration and the opening of the
// first test method (which builds testDoc1..testDoc4, test4 and the term->freq
// map test4Map) were cut off by the code-listing page this was extracted from.
    setupDoc(testDoc4, test4);

    Directory dir = new RAMDirectory();
    try {
      IndexWriter writer = new IndexWriter(dir, new SimpleAnalyzer(), true);
      // FIX: dropped "assertTrue(writer != null)" — 'new' can never yield null in Java.
      writer.addDocument(testDoc1);
      writer.addDocument(testDoc2);
      writer.addDocument(testDoc3);
      writer.addDocument(testDoc4);
      writer.close();

      IndexSearcher knownSearcher = new IndexSearcher(dir);
      TermEnum termEnum = knownSearcher.reader.terms();
      TermDocs termDocs = knownSearcher.reader.termDocs();
      Similarity sim = knownSearcher.getSimilarity();

      // For every term in the index, verify that the stored term vector of each
      // matching doc reports the same frequency as the postings list does.
      while (termEnum.next()) { // FIX: dropped redundant "== true"
        Term term = termEnum.term();
        termDocs.seek(term);
        while (termDocs.next()) {
          int docId = termDocs.doc();
          int freq = termDocs.freq();
          TermFreqVector vector = knownSearcher.reader.getTermFreqVector(docId, "field");
          // FIX: assert non-null BEFORE dereferencing vector.getTerms() below —
          // the original called lengthNorm(vector.getTerms().length) first, so a
          // missing vector would have surfaced as an NPE instead of a test failure.
          assertTrue(vector != null);
          // Exercise the Similarity API on real data; values are not asserted.
          // This is fine since we don't have stop words.
          float tf = sim.tf(freq);
          float idf = sim.idf(term, knownSearcher);
          float lNorm = sim.lengthNorm("field", vector.getTerms().length);
          String[] vTerms = vector.getTerms();
          int[] freqs = vector.getTermFrequencies();
          for (int i = 0; i < vTerms.length; i++) {
            if (term.text().equals(vTerms[i])) {
              assertTrue(freqs[i] == freq);
            }
          }
        }
      }

      // Known result set: "chocolate" occurs in exactly three of the four docs.
      Query query = new TermQuery(new Term("field", "chocolate"));
      Hits hits = knownSearcher.search(query);
      assertTrue(hits.length() == 3);
      // FIX: removed unused local "float score = hits.score(0);"
      // doc 2 should be the first hit b/c it is the shortest match.
      assertTrue(hits.id(0) == 2);
      assertTrue(hits.id(1) == 3);
      assertTrue(hits.id(2) == 0);

      // Hit 1 (doc 3) was built from test4: its vector must contain exactly the
      // 10 distinct terms of test4, each with the frequency recorded in test4Map.
      TermFreqVector vector = knownSearcher.reader.getTermFreqVector(hits.id(1), "field");
      assertTrue(vector != null);
      String[] terms = vector.getTerms();
      int[] freqs = vector.getTermFrequencies();
      assertTrue(terms != null && terms.length == 10);
      for (int i = 0; i < terms.length; i++) {
        String term = terms[i];
        int freq = freqs[i];
        assertTrue(test4.indexOf(term) != -1);
        Integer freqInt = (Integer) test4Map.get(term);
        assertTrue(freqInt != null);
        assertTrue(freqInt.intValue() == freq);
      }

      // SortedTermVectorMapper collapses all fields into one frequency-sorted set.
      SortedTermVectorMapper mapper =
          new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
      knownSearcher.reader.getTermFreqVector(hits.id(1), mapper);
      SortedSet vectorEntrySet = mapper.getTermVectorEntrySet();
      assertTrue("mapper.getTermVectorEntrySet() Size: " + vectorEntrySet.size() + " is not: " + 10,
                 vectorEntrySet.size() == 10);
      TermVectorEntry last = null;
      for (Iterator iterator = vectorEntrySet.iterator(); iterator.hasNext();) {
        TermVectorEntry tve = (TermVectorEntry) iterator.next();
        if (tve != null && last != null) {
          assertTrue("terms are not properly sorted",
                     last.getFrequency() >= tve.getFrequency());
          Integer expectedFreq = (Integer) test4Map.get(tve.getTerm());
          // we expect double the expectedFreq, since there are two fields with
          // the exact same text and we are collapsing all fields
          assertTrue("Frequency is not correct:",
                     tve.getFrequency() == 2 * expectedFreq.intValue());
        }
        last = tve;
      }

      // FieldSortedTermVectorMapper keeps the two fields ("field", "field2") apart.
      FieldSortedTermVectorMapper fieldMapper =
          new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
      knownSearcher.reader.getTermFreqVector(hits.id(1), fieldMapper);
      Map map = fieldMapper.getFieldToTerms();
      assertTrue("map Size: " + map.size() + " is not: " + 2, map.size() == 2);
      vectorEntrySet = (SortedSet) map.get("field");
      assertTrue("vectorEntrySet is null and it shouldn't be", vectorEntrySet != null);
      assertTrue("vectorEntrySet Size: " + vectorEntrySet.size() + " is not: " + 10,
                 vectorEntrySet.size() == 10);

      // FIX: also release the term enumerators, not just the searcher.
      termDocs.close();
      termEnum.close();
      knownSearcher.close();
    } catch (IOException e) {
      // FIX: fail with the exception's context instead of printStackTrace()
      // followed by a bare assertTrue(false).
      assertTrue("unexpected IOException: " + e, false);
    }
  }

  /**
   * Adds {@code text} to {@code doc} twice: as "field" with plain term vectors
   * (frequencies only) and as "field2" with positions and offsets as well.
   * Both fields are stored and tokenized.
   */
  private void setupDoc(Document doc, String text) {
    doc.add(new Field("field", text, Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.YES));
    doc.add(new Field("field2", text, Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS));
  }

  /**
   * Test only a few docs having vectors: 100 docs indexed without term vectors,
   * then 10 docs ("one hundred" .. "one hundred nine") with them. A search for
   * "hundred" must hit exactly the 10 vector-bearing docs, and each must report
   * a single term-vector field.
   */
  public void testRareVectors() throws IOException {
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
    for (int i = 0; i < 100; i++) {
      Document doc = new Document();
      doc.add(new Field("field", English.intToEnglish(i),
                        Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.NO));
      writer.addDocument(doc);
    }
    for (int i = 0; i < 10; i++) {
      Document doc = new Document();
      doc.add(new Field("field", English.intToEnglish(100 + i),
                        Field.Store.YES, Field.Index.TOKENIZED,
                        Field.TermVector.WITH_POSITIONS_OFFSETS));
      writer.addDocument(doc);
    }
    writer.close();

    searcher = new IndexSearcher(directory);
    Query query = new TermQuery(new Term("field", "hundred"));
    Hits hits = searcher.search(query);
    assertEquals(10, hits.length());
    for (int i = 0; i < hits.length(); i++) {
      TermFreqVector[] vector = searcher.reader.getTermFreqVectors(hits.id(i));
      assertTrue(vector != null);
      assertTrue(vector.length == 1);
    }
  }

  /**
   * In a single doc, for the same field, mix the term vectors up: "one" is added
   * five times with five different TermVector settings; the merged field must end
   * up with the most expressive setting (positions + offsets) and freq 5.
   * NOTE(review): the "Vectros" typo is kept deliberately — renaming a public
   * test method changes which tests the runner discovers.
   */
  public void testMixedVectrosVectors() throws IOException {
    IndexWriter writer = new IndexWriter(directory, new SimpleAnalyzer(), true);
    Document doc = new Document();
    doc.add(new Field("field", "one", Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.NO));
    doc.add(new Field("field", "one", Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.YES));
    doc.add(new Field("field", "one", Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS));
    doc.add(new Field("field", "one", Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_OFFSETS));
    doc.add(new Field("field", "one", Field.Store.YES, Field.Index.TOKENIZED,
                      Field.TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
    writer.close();

    searcher = new IndexSearcher(directory);
    Query query = new TermQuery(new Term("field", "one"));
    Hits hits = searcher.search(query);
    assertEquals(1, hits.length());

    TermFreqVector[] vector = searcher.reader.getTermFreqVectors(hits.id(0));
    assertTrue(vector != null);
    assertTrue(vector.length == 1);
    TermPositionVector tfv = (TermPositionVector) vector[0];
    assertTrue(tfv.getField().equals("field"));
    String[] terms = tfv.getTerms();
    assertEquals(1, terms.length);
    assertEquals(terms[0], "one");
    // "one" was added five times -> freq 5, positions 0..4.
    assertEquals(5, tfv.getTermFrequencies()[0]);
    int[] positions = tfv.getTermPositions(0);
    assertEquals(5, positions.length);
    for (int i = 0; i < 5; i++)
      assertEquals(i, positions[i]);
    // Each analyzed "one" token spans chars [4*i, 4*i+3) in the concatenated text.
    TermVectorOffsetInfo[] offsets = tfv.getOffsets(0);
    assertEquals(5, offsets.length);
    for (int i = 0; i < 5; i++) {
      assertEquals(4 * i, offsets[i].getStartOffset());
      assertEquals(4 * i + 3, offsets[i].getEndOffset());
    }
  }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -