DocumentsWriter.java
      }
      if (doVectorOffsets) {
        final int upto = vectorsPool.newSlice(firstSize);
        v.offsetStart = v.offsetUpto = vectorsPool.byteOffset + upto;
      }
      return v;
    }

    int offsetStartCode;
    int offsetStart;

    /** This is the hotspot of indexing: it's called once
     *  for every term of every document.  Its job is to
     *  update the postings byte stream (Postings hash)
     *  based on the occurrence of a single term. */
    private void addPosition(Token token) throws AbortException {

      final Payload payload = token.getPayload();

      // Get the text of this term.  Term can either
      // provide a String token or offset into a char[]
      // array
      final char[] tokenText = token.termBuffer();
      final int tokenTextLen = token.termLength();

      int code = 0;

      // Compute hashcode
      int downto = tokenTextLen;
      while (downto > 0)
        code = (code*31) + tokenText[--downto];

      // System.out.println(" addPosition: buffer=" + new String(tokenText, 0, tokenTextLen) + " pos=" + position + " offsetStart=" + (offset+token.startOffset()) + " offsetEnd=" + (offset + token.endOffset()) + " docID=" + docID + " doPos=" + doVectorPositions + " doOffset=" + doVectorOffsets);

      int hashPos = code & postingsHashMask;

      assert !postingsCompacted;

      // Locate Posting in hash
      p = postingsHash[hashPos];

      if (p != null && !postingEquals(tokenText, tokenTextLen)) {
        // Conflict: keep searching different locations in
        // the hash table.
        final int inc = ((code>>8)+code)|1;
        do {
          code += inc;
          hashPos = code & postingsHashMask;
          p = postingsHash[hashPos];
        } while (p != null && !postingEquals(tokenText, tokenTextLen));
      }

      final int proxCode;

      // If we hit an exception below, it's possible the
      // posting list or term vectors data will be
      // partially written and thus inconsistent if
      // flushed, so we have to abort all documents
      // since the last flush:
      try {

        if (p != null) {                          // term seen since last flush

          if (docID != p.lastDocID) {             // term not yet seen in this doc

            // System.out.println(" seen before (new docID=" + docID + ") freqUpto=" + p.freqUpto +" proxUpto=" + p.proxUpto);

            assert p.docFreq > 0;

            // Now that we know doc freq for previous doc,
            // write it & lastDocCode
            freqUpto = p.freqUpto & BYTE_BLOCK_MASK;
            freq = postingsPool.buffers[p.freqUpto >> BYTE_BLOCK_SHIFT];
            if (1 == p.docFreq)
              writeFreqVInt(p.lastDocCode|1);
            else {
              writeFreqVInt(p.lastDocCode);
              writeFreqVInt(p.docFreq);
            }
            p.freqUpto = freqUpto + (p.freqUpto & BYTE_BLOCK_NOT_MASK);

            if (doVectors) {
              vector = addNewVector();
              if (doVectorOffsets) {
                offsetStartCode = offsetStart = offset + token.startOffset();
                offsetEnd = offset + token.endOffset();
              }
            }

            proxCode = position;

            p.docFreq = 1;

            // Store code so we can write this after we're
            // done with this new doc
            p.lastDocCode = (docID-p.lastDocID) << 1;
            p.lastDocID = docID;

          } else {                                // term already seen in this doc

            // System.out.println(" seen before (same docID=" + docID + ") proxUpto=" + p.proxUpto);

            p.docFreq++;

            proxCode = position-p.lastPosition;

            if (doVectors) {
              vector = p.vector;
              if (vector == null)
                vector = addNewVector();
              if (doVectorOffsets) {
                offsetStart = offset + token.startOffset();
                offsetEnd = offset + token.endOffset();
                offsetStartCode = offsetStart-vector.lastOffset;
              }
            }
          }
        } else {                                  // term not seen before

          // System.out.println(" never seen docID=" + docID);

          // Refill?
          if (0 == postingsFreeCount) {
            getPostings(postingsFreeList);
            postingsFreeCount = postingsFreeList.length;
          }

          final int textLen1 = 1+tokenTextLen;
          if (textLen1 + charPool.byteUpto > CHAR_BLOCK_SIZE) {
            if (textLen1 > CHAR_BLOCK_SIZE) {
              // Just skip this term, to remain as robust as
              // possible during indexing.
              // A TokenFilter can be inserted into the analyzer
              // chain if other behavior is wanted (pruning the
              // term to a prefix, throwing an exception, etc).
              if (maxTermPrefix == null)
                maxTermPrefix = new String(tokenText, 0, 30);

              // Still increment position:
              position++;
              return;
            }
            charPool.nextBuffer();
          }

          final char[] text = charPool.buffer;
          final int textUpto = charPool.byteUpto;

          // Pull next free Posting from free list
          p = postingsFreeList[--postingsFreeCount];

          p.textStart = textUpto + charPool.byteOffset;
          charPool.byteUpto += textLen1;

          System.arraycopy(tokenText, 0, text, textUpto, tokenTextLen);

          text[textUpto+tokenTextLen] = 0xffff;

          assert postingsHash[hashPos] == null;

          postingsHash[hashPos] = p;
          numPostings++;

          if (numPostings == postingsHashHalfSize)
            rehashPostings(2*postingsHashSize);

          // Init first slice for freq & prox streams
          final int firstSize = levelSizeArray[0];

          final int upto1 = postingsPool.newSlice(firstSize);
          p.freqStart = p.freqUpto = postingsPool.byteOffset + upto1;

          final int upto2 = postingsPool.newSlice(firstSize);
          p.proxStart = p.proxUpto = postingsPool.byteOffset + upto2;

          p.lastDocCode = docID << 1;
          p.lastDocID = docID;
          p.docFreq = 1;

          if (doVectors) {
            vector = addNewVector();
            if (doVectorOffsets) {
              offsetStart = offsetStartCode = offset + token.startOffset();
              offsetEnd = offset + token.endOffset();
            }
          }

          proxCode = position;
        }

        proxUpto = p.proxUpto & BYTE_BLOCK_MASK;
        prox = postingsPool.buffers[p.proxUpto >> BYTE_BLOCK_SHIFT];
        assert prox != null;

        if (payload != null && payload.length > 0) {
          writeProxVInt((proxCode<<1)|1);
          writeProxVInt(payload.length);
          writeProxBytes(payload.data, payload.offset, payload.length);
          fieldInfo.storePayloads = true;
        } else
          writeProxVInt(proxCode<<1);

        p.proxUpto = proxUpto + (p.proxUpto & BYTE_BLOCK_NOT_MASK);

        p.lastPosition = position++;

        if (doVectorPositions) {
          posUpto = vector.posUpto & BYTE_BLOCK_MASK;
          pos = vectorsPool.buffers[vector.posUpto >> BYTE_BLOCK_SHIFT];
          writePosVInt(proxCode);
          vector.posUpto = posUpto + (vector.posUpto & BYTE_BLOCK_NOT_MASK);
        }

        if (doVectorOffsets) {
          offsetUpto = vector.offsetUpto & BYTE_BLOCK_MASK;
          offsets = vectorsPool.buffers[vector.offsetUpto >> BYTE_BLOCK_SHIFT];
          writeOffsetVInt(offsetStartCode);
          writeOffsetVInt(offsetEnd-offsetStart);
          vector.lastOffset = offsetEnd;
          vector.offsetUpto = offsetUpto + (vector.offsetUpto & BYTE_BLOCK_NOT_MASK);
        }
      } catch (Throwable t) {
        throw new AbortException(t, DocumentsWriter.this);
      }
    }
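    // Note on the encoding used above (an added, illustrative comment; not in
    // the original file): both postings streams pack a delta plus a one-bit
    // flag into the leading VInt they write.
    //
    //   freq stream:  lastDocCode = (docID - lastDocID) << 1
    //                 lastDocCode | 1             -> docFreq == 1, no separate freq VInt
    //                 lastDocCode, then docFreq   -> otherwise
    //
    //   prox stream:  proxCode = position delta
    //                 (proxCode << 1) | 1         -> payload length + bytes follow
    //                 proxCode << 1               -> no payload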
    /** Called when postings hash is too small (> 50%
     *  occupied) or too large (< 20% occupied). */
    void rehashPostings(final int newSize) {

      final int newMask = newSize-1;

      Posting[] newHash = new Posting[newSize];
      for(int i=0;i<postingsHashSize;i++) {
        Posting p0 = postingsHash[i];
        if (p0 != null) {
          final int start = p0.textStart & CHAR_BLOCK_MASK;
          final char[] text = charPool.buffers[p0.textStart >> CHAR_BLOCK_SHIFT];
          int pos = start;
          while(text[pos] != 0xffff)
            pos++;
          int code = 0;
          while (pos > start)
            code = (code*31) + text[--pos];
          int hashPos = code & newMask;
          assert hashPos >= 0;
          if (newHash[hashPos] != null) {
            final int inc = ((code>>8)+code)|1;
            do {
              code += inc;
              hashPos = code & newMask;
            } while (newHash[hashPos] != null);
          }
          newHash[hashPos] = p0;
        }
      }

      postingsHashMask = newMask;
      postingsHash = newHash;
      postingsHashSize = newSize;
      postingsHashHalfSize = newSize >> 1;
    }

    final ByteSliceReader vectorSliceReader = new ByteSliceReader();

    /** Called once per field per document if term vectors
     *  are enabled, to write the vectors to
     *  RAMOutputStream, which is then quickly flushed to
     *  the real term vectors files in the Directory. */
    void writeVectors(FieldInfo fieldInfo) throws IOException {

      assert fieldInfo.storeTermVector;

      vectorFieldNumbers[numVectorFields] = fieldInfo.number;
      vectorFieldPointers[numVectorFields] = tvfLocal.getFilePointer();
      numVectorFields++;

      final int numPostingsVectors = postingsVectorsUpto;

      tvfLocal.writeVInt(numPostingsVectors);
      byte bits = 0x0;
      if (doVectorPositions)
        bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
      if (doVectorOffsets)
        bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
      tvfLocal.writeByte(bits);

      doVectorSort(postingsVectors, numPostingsVectors);

      Posting lastPosting = null;

      final ByteSliceReader reader = vectorSliceReader;

      for(int j=0;j<numPostingsVectors;j++) {

        PostingVector vector = postingsVectors[j];
        Posting posting = vector.p;
        final int freq = posting.docFreq;

        final int prefix;
        final char[] text2 = charPool.buffers[posting.textStart >> CHAR_BLOCK_SHIFT];
        final int start2 = posting.textStart & CHAR_BLOCK_MASK;
        int pos2 = start2;

        // Compute common prefix between last term and
        // this term
        if (lastPosting == null)
          prefix = 0;
        else {
          final char[] text1 = charPool.buffers[lastPosting.textStart >> CHAR_BLOCK_SHIFT];
          final int start1 = lastPosting.textStart & CHAR_BLOCK_MASK;
          int pos1 = start1;
          while(true) {
            final char c1 = text1[pos1];
            final char c2 = text2[pos2];
            if (c1 != c2 || c1 == 0xffff) {
              prefix = pos1-start1;
              break;
            }
            pos1++;
            pos2++;
          }
        }
        lastPosting = posting;

        // Compute length
        while(text2[pos2] != 0xffff)
          pos2++;

        final int suffix = pos2 - start2 - prefix;
        tvfLocal.writeVInt(prefix);
        tvfLocal.writeVInt(suffix);
        tvfLocal.writeChars(text2, start2 + prefix, suffix);
        tvfLocal.writeVInt(freq);

        if (doVectorPositions) {
          reader.init(vectorsPool, vector.posStart, vector.posUpto);
          reader.writeTo(tvfLocal);
        }

        if (doVectorOffsets) {
          reader.init(vectorsPool, vector.offsetStart, vector.offsetUpto);
          reader.writeTo(tvfLocal);
        }
      }
    }
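    // Added, illustrative comment (not in the original file): writeVectors above
    // prefix-compresses the sorted term texts.  For each term it writes the length
    // of the prefix shared with the previous term, then the suffix length, then
    // only the suffix chars.  For example, with adjacent terms "apple" and "apply":
    //
    //   prefix = 4   ("appl" is shared with the previous term)
    //   suffix = 1   (only 'y' is written via tvfLocal.writeChars)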
  }

  private static final byte defaultNorm = Similarity.encodeNorm(1.0f);

  /** Write norms in the "true" segment format.  This is
   *  called only during commit, to create the .nrm file. */
  void writeNorms(String segmentName, int totalNumDoc) throws IOException {

    IndexOutput normsOut = directory.createOutput(segmentName + "." + IndexFileNames.NORMS_EXTENSION);

    try {
      normsOut.writeBytes(SegmentMerger.NORMS_HEADER, 0, SegmentMerger.NORMS_HEADER.length);

      final int numField = fieldInfos.size();

      for (int fieldIdx=0;fieldIdx<numField;fieldIdx++) {
        FieldInfo fi = fieldInfos.fieldInfo(fieldIdx);
        if (fi.isIndexed && !fi.omitNorms) {
          BufferedNorms n = norms[fieldIdx];
          final long v;
          if (n == null)
            v = 0;
          else {
            v = n.out.getFilePointer();
            n.out.writeTo(normsOut);
            n.reset();
          }
          if (v < totalNumDoc)
            fillBytes(normsOut, defaultNorm, (int) (totalNumDoc-v));
        }
      }
    } finally {
      normsOut.close();
    }
  }

  private DefaultSkipListWriter skipListWriter = null;

  private boolean currentFieldStorePayloads;

  /** Creates a segment from all Postings in the Postings
   *  hashes across all ThreadStates & FieldDatas. */
  private List writeSegment(
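  // Added, illustrative comment (not in the original, truncated file): the
  // writeVInt-style calls used throughout (writeFreqVInt, writeProxVInt,
  // tvfLocal.writeVInt) are assumed here to use Lucene's variable-length int
  // format: seven low-order bits per byte, least-significant group first, with
  // the high bit set on every byte except the last.  For example:
  //
  //   0     -> 0x00
  //   127   -> 0x7F
  //   128   -> 0x80 0x01
  //   16383 -> 0xFF 0x7F
  //   16384 -> 0x80 0x80 0x01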