DocumentWriter.java
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

final class DocumentWriter {
  private Analyzer analyzer;
  private Directory directory;
  private Similarity similarity;
  private FieldInfos fieldInfos;
  private int maxFieldLength;
  private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
  private PrintStream infoStream;

  /** This ctor used by test code only.
   *
   * @param directory The directory to write the document information to
   * @param analyzer The analyzer to use for the document
   * @param similarity The Similarity function
   * @param maxFieldLength The maximum number of tokens a field may have
   */
  DocumentWriter(Directory directory, Analyzer analyzer,
                 Similarity similarity, int maxFieldLength) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
  }

  DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = writer.getSimilarity();
    this.maxFieldLength = writer.getMaxFieldLength();
    this.termIndexInterval = writer.getTermIndexInterval();
  }

  final void addDocument(String segment, Document doc)
          throws CorruptIndexException, IOException {
    // create field infos
    fieldInfos = new FieldInfos();
    fieldInfos.add(doc);

    // invert doc into postingTable
    postingTable.clear();                          // clear postingTable
    fieldLengths = new int[fieldInfos.size()];     // init fieldLengths
    fieldPositions = new int[fieldInfos.size()];   // init fieldPositions
    fieldOffsets = new int[fieldInfos.size()];     // init fieldOffsets
    fieldStoresPayloads = new BitSet(fieldInfos.size());

    fieldBoosts = new float[fieldInfos.size()];    // init fieldBoosts
    Arrays.fill(fieldBoosts, doc.getBoost());

    try {
      // Before we write the FieldInfos we invert the Document. The reason is that
      // during inversion the TokenStreams of tokenized fields are being processed
      // and we might encounter tokens that have payloads associated with them. In
      // this case we have to update the FieldInfo of the particular field.
      invertDocument(doc);

      // sort postingTable into an array
      Posting[] postings = sortPostingTable();

      // write field infos
      fieldInfos.write(directory, segment + ".fnm");

      // write field values
      FieldsWriter fieldsWriter =
              new FieldsWriter(directory, segment, fieldInfos);
      try {
        fieldsWriter.addDocument(doc);
      } finally {
        fieldsWriter.close();
      }

      /*
      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];
        System.out.print(posting.term);
        System.out.print(" freq=" + posting.freq);
        System.out.print(" pos=");
        System.out.print(posting.positions[0]);
        for (int j = 1; j < posting.freq; j++)
          System.out.print("," + posting.positions[j]);
        System.out.println("");
      }
      */

      // write postings
      writePostings(postings, segment);

      // write norms of indexed fields
      writeNorms(segment);
    } finally {
      // close TokenStreams
      IOException ex = null;

      Iterator it = openTokenStreams.iterator();
      while (it.hasNext()) {
        try {
          ((TokenStream) it.next()).close();
        } catch (IOException e) {
          // remember the first exception so it can be rethrown below
          if (ex == null) {
            ex = e;
          }
        }
      }
      openTokenStreams.clear();

      if (ex != null) {
        throw ex;
      }
    }
  }

  // Keys are Terms, values are Postings.
  // Used to buffer a document before it is written to the index.
  private final Hashtable postingTable = new Hashtable();
  private int[] fieldLengths;
  private int[] fieldPositions;
  private int[] fieldOffsets;
  private float[] fieldBoosts;

  // If any of the tokens of a particular field carry a payload
  // then we enable payloads for that field.
  private BitSet fieldStoresPayloads;

  // Keep references of the token streams. We must close them after
  // the postings are written to the segment.
  private List openTokenStreams = new LinkedList();

  // Tokenizes the fields of a document into Postings.
  private final void invertDocument(Document doc)
          throws IOException {
    Iterator fieldIterator = doc.getFields().iterator();
    while (fieldIterator.hasNext()) {
      Fieldable field = (Fieldable) fieldIterator.next();
      String fieldName = field.name();
      int fieldNumber = fieldInfos.fieldNumber(fieldName);

      int length = fieldLengths[fieldNumber];     // length of field
      int position = fieldPositions[fieldNumber]; // position in field
      if (length > 0) position += analyzer.getPositionIncrementGap(fieldName);
      int offset = fieldOffsets[fieldNumber];     // offset field

      if (field.isIndexed()) {
        if (!field.isTokenized()) {               // un-tokenized field
          String stringValue = field.stringValue();
          if (field.isStoreOffsetWithTermVector())
            addPosition(fieldName, stringValue, position++, null,
                        new TermVectorOffsetInfo(offset, offset + stringValue.length()));
          else
            addPosition(fieldName, stringValue, position++, null, null);
          offset += stringValue.length();
          length++;
        } else {                                  // tokenized field
          TokenStream stream = field.tokenStreamValue();

          // the field does not have a TokenStream,
          // so we have to obtain one from the analyzer
          if (stream == null) {
            Reader reader;                        // find or make Reader
            if (field.readerValue() != null)
              reader = field.readerValue();
            else if (field.stringValue() != null)
              reader = new StringReader(field.stringValue());
            else
              throw new IllegalArgumentException
                      ("field must have either String or Reader value");

            // Tokenize field and add to postingTable
            stream = analyzer.tokenStream(fieldName, reader);
          }

          // remember this TokenStream, we must close it later
          openTokenStreams.add(stream);

          // reset the TokenStream to the first token
          stream.reset();

          Token lastToken = null;
          for (Token t = stream.next(); t != null; t = stream.next()) {
            position += (t.getPositionIncrement() - 1);

            Payload payload = t.getPayload();
            if (payload != null) {
              // enable payloads for this field
              fieldStoresPayloads.set(fieldNumber);
            }

            TermVectorOffsetInfo termVectorOffsetInfo;
            if (field.isStoreOffsetWithTermVector()) {
              termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(),
                                                              offset + t.endOffset());
            } else {
              termVectorOffsetInfo = null;
            }
            addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);

            lastToken = t;
            if (++length >= maxFieldLength) {
              if (infoStream != null)
                infoStream.println("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
              break;
            }
          }

          if (lastToken != null)
            offset += lastToken.endOffset() + 1;
        }

        fieldLengths[fieldNumber] = length;       // save field length
        fieldPositions[fieldNumber] = position;   // save field position
        fieldBoosts[fieldNumber] *= field.getBoost();
        fieldOffsets[fieldNumber] = offset;
      }
    }

    // update fieldInfos for all fields that have one or more tokens with payloads
    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i + 1)) {
      fieldInfos.fieldInfo(i).storePayloads = true;
    }
  }

  private final Term termBuffer = new Term("", ""); // avoid consing

  private final void addPosition(String field, String text, int position,
                                 Payload payload, TermVectorOffsetInfo offset) {
    termBuffer.set(field, text);
    //System.out.println("Offset: " + offset);
    Posting ti = (Posting) postingTable.get(termBuffer);
    if (ti != null) {                             // word seen before
      int freq = ti.freq;
      if (ti.positions.length == freq) {          // positions array is full
        int[] newPositions = new int[freq * 2];   // double size
        int[] positions = ti.positions;
        System.arraycopy(positions, 0, newPositions, 0, freq);
        ti.positions = newPositions;

        if (ti.payloads != null) {
          // the current field stores payloads
          Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array
          Payload[] payloads = ti.payloads;
          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
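The listing breaks off part-way through addPosition(). As a separate, minimal sketch of how the class is meant to be driven, the following uses the documented test-only constructor and addDocument(String, Document) shown above. It assumes a Lucene 2.2-era classpath; the caller class name DocumentWriterSketch and the segment name "_sketch" are made up for illustration, and the code must live in the org.apache.lucene.index package because DocumentWriter is package-private.

package org.apache.lucene.index; // DocumentWriter is package-private, so this hypothetical caller sits in the same package

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;

public class DocumentWriterSketch {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();

    // One document with a stored, tokenized field.
    Document doc = new Document();
    doc.add(new Field("body", "hello payload world",
                      Field.Store.YES, Field.Index.TOKENIZED));

    // Test-only constructor: directory, analyzer, similarity, max field length.
    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(),
                                               Similarity.getDefault(), 1000);

    // "_sketch" is an arbitrary segment name for this example.
    writer.addDocument("_sketch", doc);
  }
}

As the comments in addDocument show, this single call inverts the document into the posting table and then writes the field infos (.fnm), the stored field values, the postings, and the norms for that one document into the named segment.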