📄 bidiorder.java

📁 处理PDF
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
                ++limit;            }                        byte succLevel = limit < textLength ? resultLevels[limit] : paragraphEmbeddingLevel;            byte succType = typeForLevel(Math.max(succLevel, level));                        // 3) resolving weak types            // Rules W1-W7.            resolveWeakTypes(start, limit, level, prevType, succType);                        // 4) resolving neutral types            // Rules N1-N3.            resolveNeutralTypes(start, limit, level, prevType, succType);                        // 5) resolving implicit embedding levels            // Rules I1, I2.            resolveImplicitLevels(start, limit, level, prevType, succType);                        prevLevel = level;            start = limit;        }                // Reinsert explicit codes and assign appropriate levels to 'hide' them.        // This is for convenience, so the resulting level array maps 1-1        // with the initial array.        // See the implementation suggestions section of TR#9 for guidelines on        // how to implement the algorithm without removing and reinserting the codes.        textLength = reinsertExplicitCodes(textLength);    }        /**     * 1) determining the paragraph level.     * <p>     * Rules P2, P3.     * <p>     * At the end of this function, the member variable paragraphEmbeddingLevel is set to either 0 or 1.     */    private void determineParagraphEmbeddingLevel() {        byte strongType = -1; // unknown                // Rule P2.        for (int i = 0; i < textLength; ++i) {            byte t = resultTypes[i];            if (t == L || t == AL || t == R) {                strongType = t;                break;            }        }                // Rule P3.        if (strongType == -1) { // none found            // default embedding level when no strong types found is 0.            
paragraphEmbeddingLevel = 0;        } else if (strongType == L) {            paragraphEmbeddingLevel = 0;        } else { // AL, R            paragraphEmbeddingLevel = 1;        }    }        /**     * Process embedding format codes.     * <p>     * Calls processEmbeddings to generate an embedding array from the explicit format codes.  The     * embedding overrides in the array are then applied to the result types, and the result levels are     * initialized.     * @see #processEmbeddings     */    private void determineExplicitEmbeddingLevels() {        embeddings = processEmbeddings(resultTypes, paragraphEmbeddingLevel);                for (int i = 0; i < textLength; ++i) {            byte level = embeddings[i];            if ((level & 0x80) != 0) {                level &= 0x7f;                resultTypes[i] = typeForLevel(level);            }            resultLevels[i] = level;        }    }        /**     * Rules X9.     * Remove explicit codes so that they may be ignored during the remainder     * of the main portion of the algorithm.  The length of the resulting text     * is returned.     * @return the length of the data excluding explicit codes and BN.     */    private int removeExplicitCodes() {        int w = 0;        for (int i = 0; i < textLength; ++i) {            byte t = initialTypes[i];            if (!(t == LRE || t == RLE || t == LRO || t == RLO || t == PDF || t == BN)) {                embeddings[w] = embeddings[i];                resultTypes[w] = resultTypes[i];                resultLevels[w] = resultLevels[i];                w++;            }        }        return w; // new textLength while explicit levels are removed    }        /**     * Reinsert levels information for explicit codes.     * This is for ease of relating the level information     * to the original input data.  Note that the levels     * assigned to these codes are arbitrary, they're     * chosen so as to avoid breaking level runs.     
* @param textLength the length of the data after compression     * @return the length of the data (original length of     * types array supplied to constructor)     */    private int reinsertExplicitCodes(int textLength) {        for (int i = initialTypes.length; --i >= 0;) {            byte t = initialTypes[i];            if (t == LRE || t == RLE || t == LRO || t == RLO || t == PDF || t == BN) {                embeddings[i] = 0;                resultTypes[i] = t;                resultLevels[i] = -1;            } else {                --textLength;                embeddings[i] = embeddings[textLength];                resultTypes[i] = resultTypes[textLength];                resultLevels[i] = resultLevels[textLength];            }        }                // now propagate forward the levels information (could have        // propagated backward, the main thing is not to introduce a level        // break where one doesn't already exist).                if (resultLevels[0] == -1) {            resultLevels[0] = paragraphEmbeddingLevel;        }        for (int i = 1; i < initialTypes.length; ++i) {            if (resultLevels[i] == -1) {                resultLevels[i] = resultLevels[i-1];            }        }                // Embedding information is for informational purposes only        // so need not be adjusted.                return initialTypes.length;    }        /**     * 2) determining explicit levels     * Rules X1 - X8     *     * The interaction of these rules makes handling them a bit complex.     * This examines resultTypes but does not modify it.  It returns embedding and     * override information in the result array.  The low 7 bits are the level, the high     * bit is set if the level is an override, and clear if it is an embedding.     
*/
    private static byte[] processEmbeddings(byte[] resultTypes, byte paragraphEmbeddingLevel) {
        // Levels at or above this value are invalid; 61 is the deepest valid level.
        final int EXPLICIT_LEVEL_LIMIT = 62;

        int textLength = resultTypes.length;
        // Result: one entry per input type, (level | override flag) as described above.
        byte[] embeddings = new byte[textLength];

        // This stack will store the embedding levels and override status in a single byte
        // as described above.
        byte[] embeddingValueStack = new byte[EXPLICIT_LEVEL_LIMIT];
        int stackCounter = 0;

        // An LRE or LRO at level 60 is invalid, since the new level 62 is invalid.  But
        // an RLE at level 60 is valid, since the new level 61 is valid.  The current wording
        // of the rules requires that the RLE remain valid even if a previous LRE is invalid.
        // This keeps track of ignored LRE or LRO codes at level 60, so that the matching PDFs
        // will not try to pop the stack.
        int overflowAlmostCounter = 0;

        // This keeps track of ignored pushes at level 61 or higher, so that matching PDFs will
        // not try to pop the stack.
        int overflowCounter = 0;

        // Rule X1.

        // Keep the level separate from the value (level | override status flag) for ease of access.
        byte currentEmbeddingLevel = paragraphEmbeddingLevel;
        byte currentEmbeddingValue = paragraphEmbeddingLevel;

        // Loop through types, handling all remaining rules
        for (int i = 0; i < textLength; ++i) {

            // Default: the character takes the value in force before it is examined;
            // the cases below overwrite this for accepted pushes and for B.
            embeddings[i] = currentEmbeddingValue;

            byte t = resultTypes[i];

            // Rules X2, X3, X4, X5
            switch (t) {
                case RLE:
                case LRE:
                case RLO:
                case LRO:
                    // Only need to compute new level if current level is valid
                    if (overflowCounter == 0) {
                        byte newLevel;
                        if (t == RLE || t == RLO) {
                            newLevel = (byte)((currentEmbeddingLevel + 1) | 1); // least greater odd
                        } else { // t == LRE || t == LRO
                            newLevel = (byte)((currentEmbeddingLevel + 2) & ~1); // least greater even
                        }

                        // If the new level is valid, push old embedding level and override status
                        // No check for valid stack counter, since the level check suffices.
                        if (newLevel < EXPLICIT_LEVEL_LIMIT) {
                            embeddingValueStack[stackCounter] = currentEmbeddingValue;
                            stackCounter++;

                            currentEmbeddingLevel = newLevel;
                            if (t == LRO || t == RLO) { // override
                                currentEmbeddingValue = (byte)(newLevel | 0x80);
                            } else {
                                currentEmbeddingValue = newLevel;
                            }

                            // Adjust level of format mark (for expositional purposes only, this gets
                            // removed later).
                            embeddings[i] = currentEmbeddingValue;
                            break;
                        }

                        // Otherwise new level is invalid, but a valid level can still be achieved if this
                        // level is 60 and we encounter an RLE or RLO further on.  So record that we
                        // 'almost' overflowed.
                        if (currentEmbeddingLevel == 60) {
                            overflowAlmostCounter++;
                            break;
                        }
                    }

                    // Otherwise old or new level is invalid.
                    // (Reached when already overflowed, or when the push failed at a level other than 60.)
                    overflowCounter++;
                    break;

                case PDF:
                    // The only case where this did not actually overflow but may have almost overflowed
                    // is when there was an RLE or RLO on level 60, which would result in level 61.  So we
                    // only test the almost overflow condition in that case.
                    //
                    // Also note that there may be a PDF without any pushes at all.

                    if (overflowCounter > 0) {
                        --overflowCounter;
                    } else if (overflowAlmostCounter > 0 && currentEmbeddingLevel != 61) {
                        --overflowAlmostCounter;
                    } else if (stackCounter > 0) {
                        --stackCounter;
                        currentEmbeddingValue = embeddingValueStack[stackCounter];
                        // Mask off the override flag to recover the bare level.
                        currentEmbeddingLevel = (byte)(currentEmbeddingValue & 0x7f);
                    }
                    break;

                case B:
                    // Rule X8.

                    // These values are reset for clarity, in this implementation B can only
                    // occur as the last code in the array.
                    stackCounter = 0;
                    overflowCounter = 0;
                    overflowAlmostCounter = 0;
                    currentEmbeddingLevel = paragraphEmbeddingLevel;
                    currentEmbeddingValue = paragraphEmbeddingLevel;

                    embeddings[i] = paragraphEmbeddingLevel;
                    break;

                default:
                    break;
            }
        }

        return embeddings;
    }


    /**
     * 3) resolving weak types
     * Rules W1-W7.
     *
     * Note that some weak types (EN, AN) remain after this processing is complete.
     */
    private void resolveWeakTypes(int start, int limit, byte level, byte sor, byte eor) {

        // Rule W1.
        // Changes all NSMs.
        byte preceedingCharacterType = sor;
        for (int i = start; i < limit; ++i) {
            byte t = resultTypes[i];
            if (t == NSM) {
                resultTypes[i] = preceedingCharacterType;
            } else {
                preceedingCharacterType = t;
            }
        }

        // Rule W2.
        // EN does not change at the start of the run, because sor != AL.
        for (int i = start; i < limit; ++i) {
            if (resultTypes[i] == EN) {
                for (int j = i - 1; j >= start; --j) {
                    byte t = resultTypes[j];
                    if (t == L || t == R || t == AL) {
                        if (t == AL) {
                            resultTypes[i] = AN;
                        }
                        break;
                    }
                }
            }
        }

        // Rule W3.
        for (int i = start; i < limit; ++i) {
            if (resultTypes[i] == AL) {
                resultTypes[i] = R;
            }
        }

        // Rule W4.
        // Since there must be values on both sides for this rule to have an
        // effect, the scan skips the first and last value.
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -