📄 bidiorder.java
字号:
++limit;
}
byte succLevel = limit < textLength ? resultLevels[limit] : paragraphEmbeddingLevel;
byte succType = typeForLevel(Math.max(succLevel, level));
// 3) resolving weak types
// Rules W1-W7.
resolveWeakTypes(start, limit, level, prevType, succType);
// 4) resolving neutral types
// Rules N1-N3.
resolveNeutralTypes(start, limit, level, prevType, succType);
// 5) resolving implicit embedding levels
// Rules I1, I2.
resolveImplicitLevels(start, limit, level, prevType, succType);
prevLevel = level;
start = limit;
}
// Reinsert explicit codes and assign appropriate levels to 'hide' them.
// This is for convenience, so the resulting level array maps 1-1
// with the initial array.
// See the implementation suggestions section of TR#9 for guidelines on
// how to implement the algorithm without removing and reinserting the codes.
textLength = reinsertExplicitCodes(textLength);
}
/**
 * 1) Determining the paragraph level (rules P2 and P3).
 * <p>
 * Looks through the first {@code textLength} entries of {@code resultTypes} for the
 * first strong type (L, AL or R) and derives the paragraph level from it.
 * <p>
 * On return {@code paragraphEmbeddingLevel} is 0 when no strong type was found or the
 * first one is L, and 1 when the first one is AL or R.
 */
private void determineParagraphEmbeddingLevel() {
    final byte unknown = -1; // marker for "no strong type seen"
    byte firstStrong = unknown;

    // Rule P2: locate the first strong character.
    for (int pos = 0; pos < textLength; pos++) {
        final byte type = resultTypes[pos];
        final boolean strong = type == L || type == AL || type == R;
        if (strong) {
            firstStrong = type;
            break;
        }
    }

    // Rule P3: level 0 is the default and also the level for L; AL and R give level 1.
    final boolean leftToRight = firstStrong == unknown || firstStrong == L;
    paragraphEmbeddingLevel = (byte) (leftToRight ? 0 : 1);
}
/**
 * Processes the embedding format codes.
 * <p>
 * Asks {@link #processEmbeddings} for the embedding array derived from the explicit
 * format codes and stores it in {@code embeddings}. For each of the first
 * {@code textLength} positions the level (low 7 bits) is written to
 * {@code resultLevels}; where the override flag (high bit) is set, the entry in
 * {@code resultTypes} is replaced by the type that corresponds to that level.
 *
 * @see #processEmbeddings
 */
private void determineExplicitEmbeddingLevels() {
    final byte[] explicit = processEmbeddings(resultTypes, paragraphEmbeddingLevel);
    embeddings = explicit;

    for (int pos = 0; pos < textLength; pos++) {
        final byte packed = explicit[pos];
        // Masking is a no-op for plain embeddings, whose high bit is already clear.
        final byte plainLevel = (byte) (packed & 0x7f);

        // A negative byte has bit 7 set, i.e. the value describes an override.
        if (packed < 0) {
            resultTypes[pos] = typeForLevel(plainLevel);
        }
        resultLevels[pos] = plainLevel;
    }
}
/**
 * Rule X9.
 * <p>
 * Compacts {@code embeddings}, {@code resultTypes} and {@code resultLevels} in place so
 * that positions whose initial type is an explicit code (LRE, RLE, LRO, RLO, PDF) or BN
 * are dropped and can be ignored by the remainder of the main algorithm.
 *
 * @return the length of the data excluding explicit codes and BN
 */
private int removeExplicitCodes() {
    int kept = 0;
    for (int src = 0; src < textLength; src++) {
        final byte type = initialTypes[src];
        final boolean removable = type == LRE || type == RLE || type == LRO
                || type == RLO || type == PDF || type == BN;
        if (removable) {
            continue;
        }
        // kept never exceeds src, so this only ever moves entries toward the front.
        embeddings[kept] = embeddings[src];
        resultTypes[kept] = resultTypes[src];
        resultLevels[kept] = resultLevels[src];
        kept++;
    }
    // New text length for as long as the explicit codes stay removed.
    return kept;
}
/**
 * Reinserts level information for explicit codes.
 * <p>
 * This makes it easy to relate the level information to the original input data: after
 * the call, {@code embeddings}, {@code resultTypes} and {@code resultLevels} again have
 * one entry per entry of {@code initialTypes}. The levels assigned to the explicit codes
 * (LRE, RLE, LRO, RLO, PDF) and BN are arbitrary; they are chosen so as to avoid
 * breaking level runs.
 * <p>
 * An empty text is handled gracefully: nothing is touched and 0 is returned.
 *
 * @param textLength the length of the data after compression
 * @return the length of the data (original length of the types array supplied to the
 *         constructor)
 */
private int reinsertExplicitCodes(int textLength) {
    // Walk backwards: the compacted entries live in the first textLength slots, so
    // expanding from the end moves each one to its final position.
    for (int i = initialTypes.length; --i >= 0;) {
        byte t = initialTypes[i];
        if (t == LRE || t == RLE || t == LRO || t == RLO || t == PDF || t == BN) {
            embeddings[i] = 0;
            resultTypes[i] = t;
            resultLevels[i] = -1; // placeholder, resolved by the forward pass below
        } else {
            --textLength;
            embeddings[i] = embeddings[textLength];
            resultTypes[i] = resultTypes[textLength];
            resultLevels[i] = resultLevels[textLength];
        }
    }

    // Now propagate the level information forward (it could equally have been
    // propagated backward; the main thing is not to introduce a level break where one
    // doesn't already exist).
    // The length check keeps an empty text from failing on resultLevels[0].
    if (resultLevels.length > 0 && resultLevels[0] == -1) {
        resultLevels[0] = paragraphEmbeddingLevel;
    }
    for (int i = 1; i < initialTypes.length; ++i) {
        if (resultLevels[i] == -1) {
            resultLevels[i] = resultLevels[i - 1];
        }
    }

    // Embedding information is for informational purposes only, so it need not be
    // adjusted.
    return initialTypes.length;
}
/**
 * 2) Determining explicit levels (rules X1 - X8).
 * <p>
 * The rules interact, which makes handling them a bit involved. The method reads
 * {@code resultTypes} without modifying it and returns one byte per input position:
 * the low 7 bits hold the level, the high bit is set when the level comes from an
 * override and clear when it comes from an embedding.
 *
 * @param resultTypes the types to examine; not modified
 * @param paragraphEmbeddingLevel the level in effect before any explicit code
 * @return a new array, as long as {@code resultTypes}, of packed level/override values
 */
private static byte[] processEmbeddings(byte[] resultTypes, byte paragraphEmbeddingLevel) {
    final int EXPLICIT_LEVEL_LIMIT = 62;

    final int count = resultTypes.length;
    final byte[] result = new byte[count];

    // Stack of saved values; each entry packs level and override flag into one byte,
    // in the format described above.
    final byte[] savedValues = new byte[EXPLICIT_LEVEL_LIMIT];
    int depth = 0;

    // An LRE or LRO at level 60 is invalid because the resulting level 62 is invalid,
    // whereas an RLE at level 60 is valid because level 61 is. The rules require the RLE
    // to stay valid even after an earlier LRE was rejected, so rejected LRE/LRO codes at
    // level 60 are counted separately; their matching PDFs must not pop the stack.
    int almostOverflows = 0;

    // Pushes ignored at level 61 or above; their matching PDFs must not pop either.
    int overflows = 0;

    // Rule X1. The bare level is tracked next to the packed value for easy access.
    byte level = paragraphEmbeddingLevel;
    byte value = paragraphEmbeddingLevel;

    for (int pos = 0; pos < count; pos++) {
        final byte type = resultTypes[pos];
        result[pos] = value;

        if (type == RLE || type == LRE || type == RLO || type == LRO) {
            // Rules X2 - X5.
            if (overflows != 0) {
                // The current level is already invalid, so this push is ignored too.
                overflows++;
            } else {
                final boolean rightToLeft = type == RLE || type == RLO;
                final byte pushed = rightToLeft
                        ? (byte) ((level + 1) | 1)    // least greater odd
                        : (byte) ((level + 2) & ~1);  // least greater even

                if (pushed < EXPLICIT_LEVEL_LIMIT) {
                    // Valid new level: save the old level and override status. The
                    // level check also bounds the stack, so no separate test is needed.
                    savedValues[depth++] = value;
                    level = pushed;
                    final boolean override = type == LRO || type == RLO;
                    value = override ? (byte) (pushed | 0x80) : pushed;
                    // Give the format mark itself the new value (expository only; the
                    // mark is removed later).
                    result[pos] = value;
                } else if (level == 60) {
                    // Invalid, but a later RLE or RLO could still reach a valid level
                    // from 60, so only record that we 'almost' overflowed.
                    almostOverflows++;
                } else {
                    overflows++;
                }
            }
        } else if (type == PDF) {
            // An 'almost overflow' can only coexist with a real push when an RLE or RLO
            // at level 60 produced level 61, so the almost counter is consulted only
            // when the current level is not 61. A PDF may also appear with no pushes at
            // all, in which case nothing happens.
            if (overflows > 0) {
                overflows--;
            } else if (almostOverflows > 0 && level != 61) {
                almostOverflows--;
            } else if (depth > 0) {
                depth--;
                value = savedValues[depth];
                level = (byte) (value & 0x7f);
            }
        } else if (type == B) {
            // Rule X8. The state is reset for clarity; in this implementation B can
            // only occur as the last code in the array.
            depth = 0;
            overflows = 0;
            almostOverflows = 0;
            level = paragraphEmbeddingLevel;
            value = paragraphEmbeddingLevel;
            result[pos] = paragraphEmbeddingLevel;
        }
    }

    return result;
}
/**
* 3) resolving weak types
* Rules W1-W7.
*
* Note that some weak types (EN, AN) remain after this processing is complete.
*/
private void resolveWeakTypes(int start, int limit, byte level, byte sor, byte eor) {
// Rule W1.
// Changes all NSMs.
byte preceedingCharacterType = sor;
for (int i = start; i < limit; ++i) {
byte t = resultTypes[i];
if (t == NSM) {
resultTypes[i] = preceedingCharacterType;
} else {
preceedingCharacterType = t;
}
}
// Rule W2.
// EN does not change at the start of the run, because sor != AL.
for (int i = start; i < limit; ++i) {
if (resultTypes[i] == EN) {
for (int j = i - 1; j >= start; --j) {
byte t = resultTypes[j];
if (t == L || t == R || t == AL) {
if (t == AL) {
resultTypes[i] = AN;
}
break;
}
}
}
}
// Rule W3.
for (int i = start; i < limit; ++i) {
if (resultTypes[i] == AL) {
resultTypes[i] = R;
}
}
// Rule W4.
// Since there must be values on both sides for this rule to have an
// effect, the scan skips the first and last value.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -