📄 DocumentsWriter.java
            // Grow the parallel array of per-field data, and double the
            // hash table that maps field name -> FieldData:
            int newSize = (int) (allFieldDataArray.length*1.5);
            int newHashSize = fieldDataHash.length*2;

            FieldData newArray[] = new FieldData[newSize];
            FieldData newHashArray[] = new FieldData[newHashSize];
            System.arraycopy(allFieldDataArray, 0, newArray, 0, numAllFieldData);

            // Rehash: the mask must track the new hash-table size (a power
            // of two), not the size of the parallel array:
            fieldDataHashMask = newHashSize-1;
            for(int j=0;j<fieldDataHash.length;j++) {
              FieldData fp0 = fieldDataHash[j];
              while(fp0 != null) {
                hashPos = fp0.fieldInfo.name.hashCode() & fieldDataHashMask;
                FieldData nextFP0 = fp0.next;
                fp0.next = newHashArray[hashPos];
                newHashArray[hashPos] = fp0;
                fp0 = nextFP0;
              }
            }

            allFieldDataArray = newArray;
            fieldDataHash = newHashArray;
          }
          allFieldDataArray[numAllFieldData++] = fp;
        } else {
          assert fp.fieldInfo == fi;
        }

        if (thisFieldGen != fp.lastGen) {

          // First time we're seeing this field for this doc
          fp.lastGen = thisFieldGen;
          fp.fieldCount = 0;
          fp.doVectors = fp.doVectorPositions = fp.doVectorOffsets = false;
          fp.doNorms = fi.isIndexed && !fi.omitNorms;

          if (numFieldData == fieldDataArray.length) {
            int newSize = fieldDataArray.length*2;
            FieldData newArray[] = new FieldData[newSize];
            System.arraycopy(fieldDataArray, 0, newArray, 0, numFieldData);
            fieldDataArray = newArray;
          }
          fieldDataArray[numFieldData++] = fp;
        }

        if (field.isTermVectorStored()) {
          if (!fp.doVectors && numVectorFields++ == vectorFieldPointers.length) {
            final int newSize = (int) (numVectorFields*1.5);
            vectorFieldPointers = new long[newSize];
            vectorFieldNumbers = new int[newSize];
          }
          fp.doVectors = true;
          docHasVectors = true;

          fp.doVectorPositions |= field.isStorePositionWithTermVector();
          fp.doVectorOffsets |= field.isStoreOffsetWithTermVector();
        }

        if (fp.fieldCount == fp.docFields.length) {
          Fieldable[] newArray = new Fieldable[fp.docFields.length*2];
          System.arraycopy(fp.docFields, 0, newArray, 0, fp.docFields.length);
          fp.docFields = newArray;
        }

        // Lazily allocate arrays for postings:
        if (field.isIndexed() && fp.postingsHash == null)
          fp.initPostingArrays();

        fp.docFields[fp.fieldCount++] = field;
      }

      // Maybe init the local & global fieldsWriter
      if (localFieldsWriter == null) {
        if (fieldsWriter == null) {
          assert docStoreSegment == null;
          assert segment != null;
          docStoreSegment = segment;
          // If we hit an exception while init'ing the
          // fieldsWriter, we must abort this segment
          // because those files will be in an unknown
          // state:
          try {
            fieldsWriter = new FieldsWriter(directory, docStoreSegment, fieldInfos);
          } catch (Throwable t) {
            throw new AbortException(t, DocumentsWriter.this);
          }
          files = null;
        }
        localFieldsWriter = new FieldsWriter(null, fdtLocal, fieldInfos);
      }
      // First time we see a doc that has field(s) with
      // stored vectors, we init our tvx writer
      if (docHasVectors) {
        if (tvx == null) {
          assert docStoreSegment != null;
          // If we hit an exception while init'ing the term
          // vector output files, we must abort this segment
          // because those files will be in an unknown
          // state:
          try {
            tvx = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_INDEX_EXTENSION);
            tvx.writeInt(TermVectorsReader.FORMAT_VERSION);
            tvd = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_DOCUMENTS_EXTENSION);
            tvd.writeInt(TermVectorsReader.FORMAT_VERSION);
            tvf = directory.createOutput(docStoreSegment + "." + IndexFileNames.VECTORS_FIELDS_EXTENSION);
            tvf.writeInt(TermVectorsReader.FORMAT_VERSION);

            // We must "catch up" for all docs before us
            // that had no vectors:
            for(int i=0;i<numDocsInStore;i++) {
              tvx.writeLong(tvd.getFilePointer());
              tvd.writeVInt(0);
            }
          } catch (Throwable t) {
            throw new AbortException(t, DocumentsWriter.this);
          }
          files = null;
        }

        numVectorFields = 0;
      }
    }

    /** Do in-place sort of Posting array */
    void doPostingSort(Posting[] postings, int numPosting) {
      quickSort(postings, 0, numPosting-1);
    }

    void quickSort(Posting[] postings, int lo, int hi) {
      if (lo >= hi)
        return;

      int mid = (lo + hi) >>> 1;

      // Order postings[lo], postings[mid], postings[hi] so the median of
      // the three ends up in the middle slot, then use it as the pivot:
      if (comparePostings(postings[lo], postings[mid]) > 0) {
        Posting tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
      }

      if (comparePostings(postings[mid], postings[hi]) > 0) {
        Posting tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        if (comparePostings(postings[lo], postings[mid]) > 0) {
          Posting tmp2 = postings[lo];
          postings[lo] = postings[mid];
          postings[mid] = tmp2;
        }
      }

      int left = lo + 1;
      int right = hi - 1;

      if (left >= right)
        return;

      Posting partition = postings[mid];

      for (;;) {
        while (comparePostings(postings[right], partition) > 0)
          --right;

        while (left < right && comparePostings(postings[left], partition) <= 0)
          ++left;

        if (left < right) {
          Posting tmp = postings[left];
          postings[left] = postings[right];
          postings[right] = tmp;
          --right;
        } else {
          break;
        }
      }

      quickSort(postings, lo, left);
      quickSort(postings, left + 1, hi);
    }

    /** Do in-place sort of PostingVector array */
    void doVectorSort(PostingVector[] postings, int numPosting) {
      quickSort(postings, 0, numPosting-1);
    }

    void quickSort(PostingVector[] postings, int lo, int hi) {
      if (lo >= hi)
        return;

      int mid = (lo + hi) >>> 1;

      if (comparePostings(postings[lo].p, postings[mid].p) > 0) {
        PostingVector tmp = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp;
      }

      if (comparePostings(postings[mid].p, postings[hi].p) > 0) {
        PostingVector tmp = postings[mid];
        postings[mid] = postings[hi];
        postings[hi] = tmp;

        if (comparePostings(postings[lo].p, postings[mid].p) > 0) {
          PostingVector tmp2 = postings[lo];
          postings[lo] = postings[mid];
          postings[mid] = tmp2;
        }
      }

      int left = lo + 1;
      int right = hi - 1;

      if (left >= right)
        return;

      PostingVector partition = postings[mid];

      for (;;) {
        while (comparePostings(postings[right].p, partition.p) > 0)
          --right;

        while (left < right && comparePostings(postings[left].p, partition.p) <= 0)
          ++left;

        if (left < right) {
          PostingVector tmp = postings[left];
          postings[left] = postings[right];
          postings[right] = tmp;
          --right;
        } else {
          break;
        }
      }

      quickSort(postings, lo, left);
      quickSort(postings, left + 1, hi);
    }
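    // --- Illustrative sketch, not part of the original source ---
    // The two quickSort overloads above are the same median-of-three,
    // in-place quicksort, specialized to Posting[] and PostingVector[].
    // For reference, here is the identical control flow on a plain int[]
    // (quickSortInts is a hypothetical name used only for this sketch):
    void quickSortInts(int[] a, int lo, int hi) {
      if (lo >= hi)
        return;
      int mid = (lo + hi) >>> 1;
      // Sort a[lo], a[mid], a[hi] so a[mid] is the median of the three:
      if (a[lo] > a[mid]) { int t = a[lo]; a[lo] = a[mid]; a[mid] = t; }
      if (a[mid] > a[hi]) {
        int t = a[mid]; a[mid] = a[hi]; a[hi] = t;
        if (a[lo] > a[mid]) { int t2 = a[lo]; a[lo] = a[mid]; a[mid] = t2; }
      }
      int left = lo + 1;
      int right = hi - 1;
      if (left >= right)
        return;
      int partition = a[mid];
      for (;;) {
        // Scan from the right for an element <= pivot, from the left for
        // one > pivot, and swap until the scans cross:
        while (a[right] > partition)
          --right;
        while (left < right && a[left] <= partition)
          ++left;
        if (left < right) {
          int t = a[left]; a[left] = a[right]; a[right] = t;
          --right;
        } else
          break;
      }
      quickSortInts(a, lo, left);
      quickSortInts(a, left + 1, hi);
    }
    // e.g. quickSortInts(new int[]{3, 1, 2}, 0, 2) leaves the array sorted.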
    /** If there are fields we've seen but did not see again
     *  in the last run, then free them up.  Also reduce
     *  postings hash size. */
    void trimFields() {

      int upto = 0;
      for(int i=0;i<numAllFieldData;i++) {
        FieldData fp = allFieldDataArray[i];
        if (fp.lastGen == -1) {
          // This field was not seen since the previous
          // flush, so, free up its resources now

          // Unhash
          final int hashPos = fp.fieldInfo.name.hashCode() & fieldDataHashMask;
          FieldData last = null;
          FieldData fp0 = fieldDataHash[hashPos];
          while(fp0 != fp) {
            last = fp0;
            fp0 = fp0.next;
          }
          assert fp0 != null;

          if (last == null)
            fieldDataHash[hashPos] = fp.next;
          else
            last.next = fp.next;

          if (infoStream != null)
            infoStream.println("  remove field=" + fp.fieldInfo.name);

        } else {
          // Reset
          fp.lastGen = -1;
          allFieldDataArray[upto++] = fp;

          if (fp.numPostings > 0 && ((float) fp.numPostings) / fp.postingsHashSize < 0.2) {
            int hashSize = fp.postingsHashSize;

            // Reduce hash so it's between 25-50% full
            while (fp.numPostings < (hashSize>>1) && hashSize >= 2)
              hashSize >>= 1;
            hashSize <<= 1;

            if (hashSize != fp.postingsHash.length)
              fp.rehashPostings(hashSize);
          }
        }
      }

      // If we didn't see any norms for this field since
      // last flush, free it
      for(int i=0;i<norms.length;i++) {
        BufferedNorms n = norms[i];
        if (n != null && n.upto == 0)
          norms[i] = null;
      }

      numAllFieldData = upto;

      // Also pare back PostingsVectors if it's excessively
      // large
      if (maxPostingsVectors * 1.5 < postingsVectors.length) {
        final int newSize;
        if (0 == maxPostingsVectors)
          newSize = 1;
        else
          newSize = (int) (1.5*maxPostingsVectors);
        PostingVector[] newArray = new PostingVector[newSize];
        System.arraycopy(postingsVectors, 0, newArray, 0, newSize);
        postingsVectors = newArray;
      }
    }

    /** Tokenizes the fields of a document into Postings */
    void processDocument(Analyzer analyzer)
      throws IOException, AbortException {

      final int numFields = numFieldData;

      assert 0 == fdtLocal.length();

      if (tvx != null)
        // If we are writing vectors then we must visit
        // fields in sorted order so they are written in
        // sorted order.  TODO: we actually only need to
        // sort the subset of fields that have vectors
        // enabled; we could save [small amount of] CPU
        // here.
        Arrays.sort(fieldDataArray, 0, numFields);

      // We process the document one field at a time
      for(int i=0;i<numFields;i++)
        fieldDataArray[i].processField(analyzer);

      if (maxTermPrefix != null && infoStream != null)
        infoStream.println("WARNING: document contains at least one immense term (longer than the max length " + MAX_TERM_LENGTH + "), all of which were skipped.  Please correct the analyzer to not produce such terms.  The prefix of the first immense term is: '" + maxTermPrefix + "...'");

      if (ramBufferSize != IndexWriter.DISABLE_AUTO_FLUSH
          && numBytesUsed > 0.95 * ramBufferSize)
        balanceRAM();
    }

    final ByteBlockPool postingsPool = new ByteBlockPool(true);
    final ByteBlockPool vectorsPool = new ByteBlockPool(false);
    final CharBlockPool charPool = new CharBlockPool();

    // Current posting we are working on
    Posting p;
    PostingVector vector;

    // USE ONLY FOR DEBUGGING!
    /*
      public String getPostingText() {
        char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
        int upto = p.textStart & CHAR_BLOCK_MASK;
        while(text[upto] != 0xffff)
          upto++;
        return new String(text, p.textStart, upto-(p.textStart & BYTE_BLOCK_MASK));
      }
    */
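    // --- Illustrative sketch, not part of the original source ---
    // postingEquals and comparePostings below read term text straight out
    // of the shared charPool: a Posting's textStart is a single int address
    // whose high bits select a buffer (textStart >> CHAR_BLOCK_SHIFT) and
    // whose low bits are the offset inside it (textStart & CHAR_BLOCK_MASK),
    // and the text ends at the sentinel char 0xffff instead of carrying an
    // explicit length.  A hypothetical helper that decodes such an address:
    String termTextAt(int textStart) {
      final char[] buffer = charPool.buffers[textStart >> CHAR_BLOCK_SHIFT];
      final int offset = textStart & CHAR_BLOCK_MASK;
      int len = 0;
      while (buffer[offset + len] != 0xffff)  // scan to the sentinel
        len++;
      return new String(buffer, offset, len);
    }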
    /** Test whether the text for current Posting p equals
     *  current tokenText. */
    boolean postingEquals(final char[] tokenText, final int tokenTextLen) {

      final char[] text = charPool.buffers[p.textStart >> CHAR_BLOCK_SHIFT];
      assert text != null;
      int pos = p.textStart & CHAR_BLOCK_MASK;

      int tokenPos = 0;
      for(;tokenPos<tokenTextLen;pos++,tokenPos++)
        if (tokenText[tokenPos] != text[pos])
          return false;
      return 0xffff == text[pos];
    }

    /** Compares term text for two Posting instances and
     *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
    int comparePostings(Posting p1, Posting p2) {
      final char[] text1 = charPool.buffers[p1.textStart >> CHAR_BLOCK_SHIFT];
      int pos1 = p1.textStart & CHAR_BLOCK_MASK;
      final char[] text2 = charPool.buffers[p2.textStart >> CHAR_BLOCK_SHIFT];
      int pos2 = p2.textStart & CHAR_BLOCK_MASK;
      while(true) {
        final char c1 = text1[pos1++];
        final char c2 = text2[pos2++];
        if (c1 < c2)
          if (0xffff == c2)
            // p2 ended inside p1: p2 is a prefix of p1, so p1 sorts after
            return 1;
          else
            return -1;
        else if (c2 < c1)
          if (0xffff == c1)
            // p1 ended inside p2: p1 is a prefix of p2, so p1 sorts before
            return -1;
          else
            return 1;
        else if (0xffff == c1)
          // Both hit the sentinel together: the terms are equal
          return 0;
      }
    }
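    // --- Illustrative sketch, not part of the original source ---
    // comparePostings relies on 0xffff being the maximum char value: when a
    // term is a strict prefix of another, its sentinel is reached first and
    // it sorts lower, matching String.compareTo ordering.  The same loop
    // over two plain sentinel-terminated arrays (hypothetical helper):
    int compareSentinelTerminated(char[] a, char[] b) {
      int i = 0, j = 0;
      while (true) {
        final char c1 = a[i++];
        final char c2 = b[j++];
        if (c1 != c2) {
          if (c1 == 0xffff) return -1;  // a ended first: a is a prefix of b
          if (c2 == 0xffff) return 1;   // b ended first: b is a prefix of a
          return c1 < c2 ? -1 : 1;
        } else if (c1 == 0xffff)
          return 0;                     // both ended: equal terms
      }
    }
    // e.g. compareSentinelTerminated(new char[]{'a', 0xffff},
    //                                new char[]{'a', 'b', 0xffff}) == -1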