DocumentsWriter.java
      }
      if (doVectorOffsets) {
        final int upto = vectorsPool.newSlice(firstSize);
        v.offsetStart = v.offsetUpto = vectorsPool.byteOffset + upto;
      }
      return v;
    }

    int offsetStartCode;
    int offsetStart;

    /** This is the hotspot of indexing: it's called once
     *  for every term of every document.  Its job is to
     *  update the postings byte stream (Postings hash)
     *  based on the occurrence of a single term. */
    private void addPosition(Token token) throws AbortException {

      final Payload payload = token.getPayload();

      // Get the text of this term.  Term can either
      // provide a String token or offset into a char[]
      // array
      final char[] tokenText = token.termBuffer();
      final int tokenTextLen = token.termLength();

      int code = 0;

      // Compute hashcode
      int downto = tokenTextLen;
      while (downto > 0)
        code = (code*31) + tokenText[--downto];

      // System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);

      int hashPos = code & postingsHashMask;

      assert !postingsCompacted;

      // Locate Posting in hash
      p = postingsHash[hashPos];

      if (p != null && !postingEquals(tokenText, tokenTextLen)) {
        // Conflict: keep searching different locations in
        // the hash table.
        final int inc = ((code>>8)+code)|1;
        do {
          code += inc;
          hashPos = code & postingsHashMask;
          p = postingsHash[hashPos];
        } while (p != null && !postingEquals(tokenText, tokenTextLen));
      }

      final int proxCode;

      // If we hit an exception below, it's possible the
      // posting list or term vectors data will be
      // partially written and thus inconsistent if
      // flushed, so we have to abort all documents
      // since the last flush:
      try {

        if (p != null) {                          // term seen since last flush

          if (docID != p.lastDocID) {             // term not yet seen in this doc

            // System.out.println(" seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto);

            assert p.docFreq > 0;

            // Now that we know doc freq for previous doc,
            // write it & lastDocCode
            freqUpto = p.freqUpto & BYTE_BLOCK_MASK;
            freq = postingsPool.buffers[p.freqUpto >> BYTE_BLOCK_SHIFT];
            if (1 == p.docFreq)
              writeFreqVInt(p.lastDocCode|1);
            else {
              writeFreqVInt(p.lastDocCode);
              writeFreqVInt(p.docFreq);
            }
            p.freqUpto = freqUpto + (p.freqUpto & BYTE_BLOCK_NOT_MASK);

            if (doVectors) {
              vector = addNewVector();
              if (doVectorOffsets) {
                offsetStartCode = offsetStart = offset + token.startOffset();
                offsetEnd = offset + token.endOffset();
              }
            }

            proxCode = position;

            p.docFreq = 1;

            // Store code so we can write this after we're
            // done with this new doc
            p.lastDocCode = (docID-p.lastDocID) << 1;
            p.lastDocID = docID;

          } else {                                // term already seen in this doc

            // System.out.println(" seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);

            p.docFreq++;

            proxCode = position-p.lastPosition;

            if (doVectors) {
              vector = p.vector;
              if (vector == null)
                vector = addNewVector();
              if (doVectorOffsets) {
                offsetStart = offset + token.startOffset();
                offsetEnd = offset + token.endOffset();
                offsetStartCode = offsetStart-vector.lastOffset;
              }
            }
          }
        } else {                                  // term not seen before

          // System.out.println(" never seen docID=" + docID);

          // Refill?
          if (0 == postingsFreeCount) {
            getPostings(postingsFreeList);
            postingsFreeCount = postingsFreeList.length;
          }

          final int textLen1 = 1+tokenTextLen;
          if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
            if (textLen1 > CHAR_BLOCK_SIZE) {
              // Just skip this term, to remain as robust as
              // possible during indexing.
              // A TokenFilter can be inserted into the analyzer
              // chain if other behavior is wanted (pruning the
              // term to a prefix, throwing an exception, etc).
              if (maxTermPrefix == null)
                maxTermPrefix = new String(tokenText, 0, 30);

              // Still increment position:
              position++;
              return;
            }
            charPool.nextBuffer();
          }

          final char[] text = charPool.buffer;
          final int textUpto = charPool.byteUpto;

          // Pull next free Posting from free list
          p = postingsFreeList[--postingsFreeCount];

          p.textStart = textUpto + charPool.byteOffset;
          charPool.byteUpto += textLen1;

          System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);

          text[textUpto+tokenTextLen] = 0xffff;

          assert postingsHash[hashPos] == null;

          postingsHash[hashPos] = p;
          numPostings++;

          if (numPostings == postingsHashHalfSize)
            rehashPostings(2*postingsHashSize);

          // Init first slice for freq & prox streams
          final int firstSize = levelSizeArray[0];

          final int upto1 = postingsPool.newSlice(firstSize);
          p.freqStart = p.freqUpto = postingsPool.byteOffset + upto1;

          final int upto2 = postingsPool.newSlice(firstSize);
          p.proxStart = p.proxUpto = postingsPool.byteOffset + upto2;

          p.lastDocCode = docID << 1;
          p.lastDocID = docID;
          p.docFreq = 1;

          if (doVectors) {
            vector = addNewVector();
            if (doVectorOffsets) {
              offsetStart = offsetStartCode = offset + token.startOffset();
              offsetEnd = offset + token.endOffset();
            }
          }

          proxCode = position;
        }

        proxUpto = p.proxUpto & BYTE_BLOCK_MASK;
        prox = postingsPool.buffers[p.proxUpto >> BYTE_BLOCK_SHIFT];
        assert prox != null;

        if (payload != null && payload.length > 0) {
          writeProxVInt((proxCode<<1)|1);
          writeProxVInt(payload.length);
          writeProxBytes(payload.data, payload.offset, payload.length);
          fieldInfo.storePayloads = true;
        } else
          writeProxVInt(proxCode<<1);

        p.proxUpto = proxUpto + (p.proxUpto & BYTE_BLOCK_NOT_MASK);

        p.lastPosition = position++;

        if (doVectorPositions) {
          posUpto = vector.posUpto & BYTE_BLOCK_MASK;
          pos = vectorsPool.buffers[vector.posUpto >> BYTE_BLOCK_SHIFT];
          writePosVInt(proxCode);
          vector.posUpto = posUpto + (vector.posUpto & BYTE_BLOCK_NOT_MASK);
        }

        if (doVectorOffsets) {
          offsetUpto = vector.offsetUpto & BYTE_BLOCK_MASK;
          offsets = vectorsPool.buffers[vector.offsetUpto >> BYTE_BLOCK_SHIFT];
          writeOffsetVInt(offsetStartCode);
          writeOffsetVInt(offsetEnd-offsetStart);
          vector.lastOffset = offsetEnd;
          vector.offsetUpto = offsetUpto + (vector.offsetUpto & BYTE_BLOCK_NOT_MASK);
        }
      } catch (Throwable t) {
        throw new AbortException(t, DocumentsWriter.this);
      }
    }
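    // Note on the encoding used above (an added, illustrative comment; not in
    // the original file): both postings streams pack a delta plus a one-bit
    // flag into the leading VInt they write.
    //
    //   freq stream:  lastDocCode = (docID - lastDocID) << 1
    //                 lastDocCode | 1             -> docFreq == 1, no separate freq VInt
    //                 lastDocCode, then docFreq   -> otherwise
    //
    //   prox stream:  proxCode = position delta
    //                 (proxCode << 1) | 1         -> payload length + bytes follow
    //                 proxCode << 1               -> no payload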
    /** Called when postings hash is too small (> 50%
     *  occupied) or too large (< 20% occupied). */
    void rehashPostings(final int newSize) {

      final int newMask = newSize-1;

      Posting[] newHash = new Posting[newSize];
      for(int i=0;i<postingsHashSize;i++) {
        Posting p0 = postingsHash[i];
        if (p0 != null) {
          final int start = p0.textStart & CHAR_BLOCK_MASK;
          final char[] text = charPool.buffers[p0.textStart >> CHAR_BLOCK_SHIFT];
          int pos = start;
          while(text[pos] != 0xffff)
            pos++;
          int code = 0;
          while (pos > start)
            code = (code*31) + text[--pos];
          int hashPos = code & newMask;
          assert hashPos >= 0;
          if (newHash[hashPos] != null) {
            final int inc = ((code>>8)+code)|1;
            do {
              code += inc;
              hashPos = code & newMask;
            } while (newHash[hashPos] != null);
          }
          newHash[hashPos] = p0;
        }
      }

      postingsHashMask = newMask;
      postingsHash = newHash;
      postingsHashSize = newSize;
      postingsHashHalfSize = newSize >> 1;
    }

    final ByteSliceReader vectorSliceReader = new ByteSliceReader();

    /** Called once per field per document if term vectors
     *  are enabled, to write the vectors to
     *  RAMOutputStream, which is then quickly flushed to
     *  the real term vectors files in the Directory. */
    void writeVectors(FieldInfo fieldInfo) throws IOException {

      assert fieldInfo.storeTermVector;

      vectorFieldNumbers[numVectorFields] = fieldInfo.number;
      vectorFieldPointers[numVectorFields] = tvfLocal.getFilePointer();
      numVectorFields++;

      final int numPostingsVectors = postingsVectorsUpto;

      tvfLocal.writeVInt(numPostingsVectors);
      byte bits = 0x0;
      if (doVectorPositions)
        bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
      if (doVectorOffsets)
        bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
      tvfLocal.writeByte(bits);

      doVectorSort(postingsVectors, numPostingsVectors);

      Posting lastPosting = null;

      final ByteSliceReader reader = vectorSliceReader;

      for(int j=0;j<numPostingsVectors;j++) {

        PostingVector vector = postingsVectors[j];
        Posting posting = vector.p;
        final int freq = posting.docFreq;

        final int prefix;
        final char[] text2 = charPool.buffers[posting.textStart >> CHAR_BLOCK_SHIFT];
        final int start2 = posting.textStart & CHAR_BLOCK_MASK;
        int pos2 = start2;

        // Compute common prefix between last term and
        // this term
        if (lastPosting == null)
          prefix = 0;
        else {
          final char[] text1 = charPool.buffers[lastPosting.textStart >> CHAR_BLOCK_SHIFT];
          final int start1 = lastPosting.textStart & CHAR_BLOCK_MASK;
          int pos1 = start1;
          while(true) {
            final char c1 = text1[pos1];
            final char c2 = text2[pos2];
            if (c1 != c2 || c1 == 0xffff) {
              prefix = pos1-start1;
              break;
            }
            pos1++;
            pos2++;
          }
        }
        lastPosting = posting;

        // Compute length
        while(text2[pos2] != 0xffff)
          pos2++;

        final int suffix = pos2 - start2 - prefix;
        tvfLocal.writeVInt(prefix);
        tvfLocal.writeVInt(suffix);
        tvfLocal.writeChars(text2, start2 + prefix, suffix);
        tvfLocal.writeVInt(freq);

        if (doVectorPositions) {
          reader.init(vectorsPool, vector.posStart, vector.posUpto);
          reader.writeTo(tvfLocal);
        }

        if (doVectorOffsets) {
          reader.init(vectorsPool, vector.offsetStart, vector.offsetUpto);
          reader.writeTo(tvfLocal);
        }
      }
    }
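    // Added, illustrative comment (not in the original file): writeVectors above
    // prefix-compresses the sorted term texts.  For each term it writes the length
    // of the prefix shared with the previous term, then the suffix length, then
    // only the suffix chars.  For example, with adjacent terms "apple" and "apply":
    //
    //   prefix = 4   ("appl" is shared with the previous term)
    //   suffix = 1   (only 'y' is written via tvfLocal.writeChars)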
  }

  private static final byte defaultNorm = Similarity.encodeNorm(1.0f);

  /** Write norms in the "true" segment format.  This is
   *  called only during commit, to create the .nrm file. */
  void writeNorms(String segmentName, int totalNumDoc) throws IOException {

    IndexOutput normsOut = directory.createOutput(segmentName + "." + IndexFileNames.NORMS_EXTENSION);

    try {
      normsOut.writeBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.length);

      final int numField = fieldInfos.size();

      for (int fieldIdx=0;fieldIdx<numField;fieldIdx++) {
        FieldInfo fi = fieldInfos.fieldInfo(fieldIdx);
        if (fi.isIndexed && !fi.omitNorms) {
          BufferedNorms n = norms[fieldIdx];
          final long v;
          if (n == null)
            v = 0;
          else {
            v = n.out.getFilePointer();
            n.out.writeTo(normsOut);
            n.reset();
          }
          if (v < totalNumDoc)
            fillBytes(normsOut, defaultNorm, (int) (totalNumDoc-v));
        }
      }
    } finally {
      normsOut.close();
    }
  }

  private DefaultSkipListWriter skipListWriter = null;

  private boolean currentFieldStorePayloads;

  /** Creates a segment from all Postings in the Postings
   *  hashes across all ThreadStates & FieldDatas. */
  private List writeSegment(
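  // Added, illustrative comment (not in the original, truncated file): the
  // writeVInt-style calls used throughout (writeFreqVInt, writeProxVInt,
  // tvfLocal.writeVInt) are assumed here to use Lucene's variable-length int
  // format: seven low-order bits per byte, least-significant group first, with
  // the high bit set on every byte except the last.  For example:
  //
  //   0     -> 0x00
  //   127   -> 0x7F
  //   128   -> 0x80 0x01
  //   16383 -> 0xFF 0x7F
  //   16384 -> 0x80 0x80 0x01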