DocumentWriter.java
package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexOutput;

import java.io.IOException;
import java.io.PrintStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.BitSet;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;

final class DocumentWriter {
  private Analyzer analyzer;
  private Directory directory;
  private Similarity similarity;
  private FieldInfos fieldInfos;
  private int maxFieldLength;
  private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
  private PrintStream infoStream;

  /** This ctor used by test code only.
   *
   * @param directory The directory to write the document information to
   * @param analyzer The analyzer to use for the document
   * @param similarity The Similarity function
   * @param maxFieldLength The maximum number of tokens a field may have
   */
  DocumentWriter(Directory directory, Analyzer analyzer,
                 Similarity similarity, int maxFieldLength) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
  }

  DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = writer.getSimilarity();
    this.maxFieldLength = writer.getMaxFieldLength();
    this.termIndexInterval = writer.getTermIndexInterval();
  }

  final void addDocument(String segment, Document doc)
          throws CorruptIndexException, IOException {
    // create field infos
    fieldInfos = new FieldInfos();
    fieldInfos.add(doc);

    // invert doc into postingTable
    postingTable.clear();                          // clear postingTable
    fieldLengths = new int[fieldInfos.size()];     // init fieldLengths
    fieldPositions = new int[fieldInfos.size()];   // init fieldPositions
    fieldOffsets = new int[fieldInfos.size()];     // init fieldOffsets
    fieldStoresPayloads = new BitSet(fieldInfos.size());

    fieldBoosts = new float[fieldInfos.size()];    // init fieldBoosts
    Arrays.fill(fieldBoosts, doc.getBoost());

    try {
      // Before we write the FieldInfos we invert the Document. The reason is that
      // during inversion the TokenStreams of tokenized fields are being processed
      // and we might encounter tokens that have payloads associated with them. In
      // this case we have to update the FieldInfo of the particular field.
      invertDocument(doc);

      // sort postingTable into an array
      Posting[] postings = sortPostingTable();

      // write field infos
      fieldInfos.write(directory, segment + ".fnm");

      // write field values
      FieldsWriter fieldsWriter =
              new FieldsWriter(directory, segment, fieldInfos);
      try {
        fieldsWriter.addDocument(doc);
      } finally {
        fieldsWriter.close();
      }

      /*
      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];
        System.out.print(posting.term);
        System.out.print(" freq=" + posting.freq);
        System.out.print(" pos=");
        System.out.print(posting.positions[0]);
        for (int j = 1; j < posting.freq; j++)
          System.out.print("," + posting.positions[j]);
        System.out.println("");
      }
      */

      // write postings
      writePostings(postings, segment);

      // write norms of indexed fields
      writeNorms(segment);
    } finally {
      // close TokenStreams
      IOException ex = null;

      Iterator it = openTokenStreams.iterator();
      while (it.hasNext()) {
        try {
          ((TokenStream) it.next()).close();
        } catch (IOException e) {
          // remember the first exception so it can be rethrown below
          if (ex == null) {
            ex = e;
          }
        }
      }
      openTokenStreams.clear();

      if (ex != null) {
        throw ex;
      }
    }
  }

  // Keys are Terms, values are Postings.
  // Used to buffer a document before it is written to the index.
  private final Hashtable postingTable = new Hashtable();
  private int[] fieldLengths;
  private int[] fieldPositions;
  private int[] fieldOffsets;
  private float[] fieldBoosts;

  // If any of the tokens of a particular field carry a payload
  // then we enable payloads for that field.
  private BitSet fieldStoresPayloads;

  // Keep references of the token streams. We must close them after
  // the postings are written to the segment.
  private List openTokenStreams = new LinkedList();

  // Tokenizes the fields of a document into Postings.
  private final void invertDocument(Document doc)
          throws IOException {
    Iterator fieldIterator = doc.getFields().iterator();
    while (fieldIterator.hasNext()) {
      Fieldable field = (Fieldable) fieldIterator.next();
      String fieldName = field.name();
      int fieldNumber = fieldInfos.fieldNumber(fieldName);

      int length = fieldLengths[fieldNumber];     // length of field
      int position = fieldPositions[fieldNumber]; // position in field
      if (length > 0) position += analyzer.getPositionIncrementGap(fieldName);
      int offset = fieldOffsets[fieldNumber];     // offset field

      if (field.isIndexed()) {
        if (!field.isTokenized()) {               // un-tokenized field
          String stringValue = field.stringValue();
          if (field.isStoreOffsetWithTermVector())
            addPosition(fieldName, stringValue, position++, null,
                        new TermVectorOffsetInfo(offset, offset + stringValue.length()));
          else
            addPosition(fieldName, stringValue, position++, null, null);
          offset += stringValue.length();
          length++;
        } else {                                  // tokenized field
          TokenStream stream = field.tokenStreamValue();

          // the field does not have a TokenStream,
          // so we have to obtain one from the analyzer
          if (stream == null) {
            Reader reader;                        // find or make Reader
            if (field.readerValue() != null)
              reader = field.readerValue();
            else if (field.stringValue() != null)
              reader = new StringReader(field.stringValue());
            else
              throw new IllegalArgumentException
                      ("field must have either String or Reader value");

            // Tokenize field and add to postingTable
            stream = analyzer.tokenStream(fieldName, reader);
          }

          // remember this TokenStream, we must close it later
          openTokenStreams.add(stream);

          // reset the TokenStream to the first token
          stream.reset();

          Token lastToken = null;
          for (Token t = stream.next(); t != null; t = stream.next()) {
            position += (t.getPositionIncrement() - 1);

            Payload payload = t.getPayload();
            if (payload != null) {
              // enable payloads for this field
              fieldStoresPayloads.set(fieldNumber);
            }

            TermVectorOffsetInfo termVectorOffsetInfo;
            if (field.isStoreOffsetWithTermVector()) {
              termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(),
                                                              offset + t.endOffset());
            } else {
              termVectorOffsetInfo = null;
            }
            addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);

            lastToken = t;
            if (++length >= maxFieldLength) {
              if (infoStream != null)
                infoStream.println("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
              break;
            }
          }

          if (lastToken != null)
            offset += lastToken.endOffset() + 1;
        }

        fieldLengths[fieldNumber] = length;       // save field length
        fieldPositions[fieldNumber] = position;   // save field position
        fieldBoosts[fieldNumber] *= field.getBoost();
        fieldOffsets[fieldNumber] = offset;
      }
    }

    // update fieldInfos for all fields that have one or more tokens with payloads
    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i + 1)) {
      fieldInfos.fieldInfo(i).storePayloads = true;
    }
  }

  private final Term termBuffer = new Term("", ""); // avoid consing

  private final void addPosition(String field, String text, int position,
                                 Payload payload, TermVectorOffsetInfo offset) {
    termBuffer.set(field, text);
    //System.out.println("Offset: " + offset);
    Posting ti = (Posting) postingTable.get(termBuffer);
    if (ti != null) {                             // word seen before
      int freq = ti.freq;
      if (ti.positions.length == freq) {          // positions array is full
        int[] newPositions = new int[freq * 2];   // double size
        int[] positions = ti.positions;
        System.arraycopy(positions, 0, newPositions, 0, freq);
        ti.positions = newPositions;

        if (ti.payloads != null) {
          // the current field stores payloads
          Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array
          Payload[] payloads = ti.payloads;
          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
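The listing breaks off part-way through addPosition(). As a separate, minimal sketch of how the class is meant to be driven, the following uses the documented test-only constructor and addDocument(String, Document) shown above. It assumes a Lucene 2.2-era classpath; the caller class name DocumentWriterSketch and the segment name "_sketch" are made up for illustration, and the code must live in the org.apache.lucene.index package because DocumentWriter is package-private.

package org.apache.lucene.index; // DocumentWriter is package-private, so this hypothetical caller sits in the same package

import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.store.RAMDirectory;

public class DocumentWriterSketch {
  public static void main(String[] args) throws Exception {
    RAMDirectory dir = new RAMDirectory();

    // One document with a stored, tokenized field.
    Document doc = new Document();
    doc.add(new Field("body", "hello payload world",
                      Field.Store.YES, Field.Index.TOKENIZED));

    // Test-only constructor: directory, analyzer, similarity, max field length.
    DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(),
                                               Similarity.getDefault(), 1000);

    // "_sketch" is an arbitrary segment name for this example.
    writer.addDocument("_sketch", doc);
  }
}

As the comments in addDocument show, this single call inverts the document into the posting table and then writes the field infos (.fnm), the stored field values, the postings, and the norms for that one document into the named segment.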