📄 ucsreader.java
字号:
ch[offset + charsRead] = fCharBuf[--fCharCount];
charsRead++;
} else {
break;
}
}
// Reading remaining chars from InputStream.
if (0 != (length - charsRead)) {
/*
* Each output char (two for supplementary characters) will require
* us to read 4 input bytes. But as we cannot predict how many
* supplementary chars we will encounter, so we should try to read
* maximum possible number.
*/
int byteLength = (length - charsRead) << 2;
if (byteLength > fBuffer.length) {
byteLength = fBuffer.length;
}
int count = fInputStream.read(fBuffer, 0, byteLength);
if (-1 == count) {
return (0 == charsRead) ? (-1) : charsRead;
} else {
// try and make count be a multiple of the number of bytes we're
// looking for (simply reading 1 to 3 bytes from input stream to
// ensure the last code point is complete)
// this looks ugly, but it avoids an if at any rate...
int numToRead = ((4 - (count & 3)) & 3);
for (int i = 0; i < numToRead; i++) {
int charRead = fInputStream.read();
if (charRead == -1) {
// end of input; something likely went wrong! Pad buffer
// with zeros.
for (int j = i; j < numToRead; j++)
fBuffer[count + j] = 0;
break;
} else {
fBuffer[count + i] = (byte) charRead;
}
}
count += numToRead;
// now count is a multiple of the right number of bytes
int numChars = count >> 2;
int curPos = 0;
/*
* `i` is index of currently processed char from InputStream.
* `charsCount` also counts number of chars that were (possibly)
* read from internal char buffer.
*/
int charsCount = charsRead;
int i;
for (i = 0; (i < numChars) && (length >= charsCount); i++) {
int b0 = fBuffer[curPos++] & 0xff;
int b1 = fBuffer[curPos++] & 0xff;
int b2 = fBuffer[curPos++] & 0xff;
int b3 = fBuffer[curPos++] & 0xff;
int codepoint;
if (UCS4BE == fEncoding) {
codepoint = ((b0 << 24) + (b1 << 16) + (b2 << 8) + b3);
} else {
codepoint = ((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);
}
// Again, validity of this codepoint is never checked, this
// can yield problems sometimes.
if (!isSupplementaryCodePoint(codepoint)) {
ch[offset + charsCount] = (char) codepoint;
charsCount++;
} else {
// Checking if we can put another 2 chars in buffer.
if (2 <= (length - charsCount)) {
int cp1 = (codepoint - 0x10000) & 0xFFFFF;
ch[offset + charsCount] = (char) (0xD800 + (cp1 >>> 10));
ch[offset + charsCount + 1] = (char) (0xDC00 + (cp1 & 0x3FF));
charsCount += 2;
} else {
break; // END for
}
}
} // END for
// Storing data, that possibly remain in `fBuffer` into internal
// char buffer for future use :)
curPos = (numChars << 2) - 1;
for (int k = numChars; k > i; k--) {
// Reading bytes in reverse order
int b3 = fBuffer[curPos--] & 0xff;
int b2 = fBuffer[curPos--] & 0xff;
int b1 = fBuffer[curPos--] & 0xff;
int b0 = fBuffer[curPos--] & 0xff;
int codepoint;
if (UCS4BE == fEncoding) {
codepoint = ((b0 << 24) + (b1 << 16) + (b2 << 8) + b3);
} else {
codepoint = ((b3 << 24) + (b2 << 16) + (b1 << 8) + b0);
}
// Look if we need to increase buffer size
if (2 > (fCharBuf.length - k)) {
char[] newBuf = new char[fCharBuf.length << 1];
System.arraycopy(fCharBuf, 0, newBuf, 0, fCharBuf.length);
fCharBuf = newBuf;
}
if (!isSupplementaryCodePoint(codepoint)) {
fCharBuf[fCharCount++] = (char) codepoint;
} else {
int cp1 = (codepoint - 0x10000) & 0xFFFFF;
// In this case store low surrogate code unit first, so that
// it can be read back after high one.
fCharBuf[fCharCount++] = (char) (0xDC00 + ((char) cp1 & 0x3FF));
fCharBuf[fCharCount++] = (char) (0xD800 + (cp1 >>> 10));
}
} // END for
return charsCount;
} // END if (-1 == count) ELSE
} // END if (0 != (length - charsRead))
return charsRead;
} // read(char[],int,int)
/**
 * Reads <code>UCS-2</code> characters into a portion of an array.
 * This method will block until some input is available, an I/O
 * error occurs, or the end of the stream is reached.
 * <p>
 * In the original <code>UCSReader</code> this code was part of the
 * <code>read(char[], int, int)</code> method; it was moved out to
 * reduce the complexity of the latter.
 * </p>
 * <p>
 * Every character is assembled from two consecutive input bytes:
 * high byte first when the current encoding is <code>UCS2BE</code>,
 * low byte first otherwise. If the stream ends in the middle of a
 * code unit, the missing byte is treated as zero.
 * </p>
 *
 * @param ch     destination buffer
 * @param offset offset at which to start storing characters
 * @param length maximum number of characters to read
 *
 * @return The number of characters read, or <code>-1</code>
 *         if the end of the stream has been reached
 *
 * @exception IOException If an I/O error occurs
 */
protected int readUCS2(char[] ch, int offset, int length)
        throws IOException {
    // Two input bytes per output char. Capping in chars (rather than
    // after shifting) keeps `length << 1` from overflowing and makes the
    // request even, so there is always room for a completing byte.
    int maxChars = fBuffer.length >> 1;
    int byteLength = ((length > maxChars) ? maxChars : length) << 1;
    int count = fInputStream.read(fBuffer, 0, byteLength);
    if (count == -1) {
        return -1;
    }
    // Make count a multiple of two: when an odd number of bytes arrived,
    // read the single byte that completes the last code unit.
    if ((count & 1) != 0) {
        int charRead = fInputStream.read();
        // The completing byte belongs at index `count`, right after the
        // bytes already read; store it BEFORE bumping the count. On end
        // of input (something likely went wrong) pad with a zero byte.
        fBuffer[count] = (byte) ((charRead == -1) ? 0 : charRead);
        count++;
    }
    // now count is a multiple of the right number of bytes
    int numChars = count >> 1;
    int curPos = 0;
    for (int i = 0; i < numChars; i++) {
        int b0 = fBuffer[curPos++] & 0xff;
        int b1 = fBuffer[curPos++] & 0xff;
        if (fEncoding == UCS2BE) {
            ch[offset + i] = (char) ((b0 << 8) + b1);
        } else {
            ch[offset + i] = (char) ((b1 << 8) + b0);
        }
    }
    return numChars;
} // END readUCS2(char[], int, int)
/**
 * Skip characters. This method will block until some characters are
 * available, an I/O error occurs, or the end of the stream is reached.
 * <p>
 * Characters that were already decoded and are waiting in the internal
 * char buffer are discarded first, because they precede the current
 * position of the underlying byte stream. The rest of the request is
 * translated into a byte count (2 bytes per character when the encoding
 * value is below 4, 4 bytes otherwise) and delegated to the byte stream.
 * </p>
 * <p>
 * NOTE(review): in the 4-byte case every skipped code point is counted
 * as one character, even if decoding it would have produced a surrogate
 * pair; the bytes are not inspected while skipping.
 * </p>
 *
 * @param n The number of characters to skip
 *
 * @return The number of characters actually skipped
 *
 * @exception IOException If the stream has been closed or an I/O error
 *                        occurs
 */
public long skip(long n) throws IOException {
    if (fInputStream == null) {
        throw new IOException("Stream closed");
    }
    if (n <= 0) {
        return 0;
    }
    // Drop pending chars from the internal buffer first; they are consumed
    // from the top (highest index), so lowering the count discards them
    // in reading order.
    int fromBuffer = (int) Math.min((long) fCharCount, n);
    fCharCount -= fromBuffer;
    long remaining = n - fromBuffer;
    if (remaining == 0) {
        return fromBuffer;
    }
    /*
     * charWidth is the number of bits to shift a char count leftward to
     * get a byte count, and a byte count rightward to get a char count.
     * Shifting and masking is used instead of multiplication/division.
     */
    int charWidth = (fEncoding >= 4) ? 2 : 1;
    // Clamp so that the shift below cannot overflow into a negative value.
    long maxChars = Long.MAX_VALUE >> charWidth;
    if (remaining > maxChars) {
        remaining = maxChars;
    }
    long bytesSkipped = fInputStream.skip(remaining << charWidth);
    long mask = (1L << charWidth) - 1;
    long charsSkipped = bytesSkipped >>> charWidth;
    long partial = bytesSkipped & mask;
    if (partial != 0) {
        // The byte stream stopped in the middle of a code unit. Consume
        // the rest of it so that subsequent reads stay aligned, and count
        // the split code unit as skipped.
        for (long missing = (mask + 1) - partial; missing > 0; missing--) {
            if (fInputStream.read() == -1) {
                break;
            }
        }
        charsSkipped++;
    }
    return fromBuffer + charsSkipped;
} // skip(long):long
/**
 * Tell whether this stream is ready to be read.
 * <p>
 * This implementation never claims readiness: for an open stream it
 * always answers <code>false</code>, which only means that the next read
 * is not guaranteed to be non-blocking.
 * </p>
 *
 * @return True if the next read() is guaranteed not to block for input,
 * false otherwise. Note that returning false does not guarantee that the
 * next read will block.
 *
 * @exception IOException If the stream has been closed
 */
public boolean ready() throws IOException {
    // close() drops the reference to the byte stream; honour the contract
    // that ready() fails on a closed stream instead of answering silently.
    if (fInputStream == null) {
        throw new IOException("Stream closed");
    }
    return false;
} // ready()
/**
 * Tell whether this stream supports the mark() operation.
 * <p>
 * The answer is taken from the underlying byte stream, since marking is
 * delegated to it.
 * </p>
 *
 * @return <code>true</code> if the underlying byte stream supports
 *         <code>mark</code>; <code>false</code> if it does not or if
 *         this stream has already been closed
 */
public boolean markSupported() {
    // After close() the byte stream reference is null; report "not
    // supported" rather than failing with a NullPointerException.
    return (fInputStream != null) && fInputStream.markSupported();
} // markSupported()
/**
 * Mark the present position in the stream. Subsequent calls to
 * <code>reset</code> will attempt to reposition the stream to this point.
 * This reader has no marking logic of its own; it relies on the marking
 * facilities of the underlying byte stream.
 * <p>
 * The limit is given in characters, while the underlying stream counts
 * bytes, so it is scaled by the width of one code unit (2 bytes when the
 * encoding value is below 4, 4 bytes otherwise) before being passed on.
 * </p>
 * <p>
 * NOTE(review): characters that are already decoded and waiting in the
 * internal char buffer are not part of the mark - confirm whether callers
 * can mark while such characters are pending.
 * </p>
 *
 * @param readAheadLimit Limit on the number of characters that may be
 *                       read while still preserving the mark. After
 *                       reading this many characters, attempting to
 *                       reset the stream may fail.
 *
 * @exception IOException If the stream has been closed,
 *                        or if some other I/O error occurs
 */
public void mark(int readAheadLimit) throws IOException {
    if (fInputStream == null) {
        throw new IOException("Stream closed");
    }
    // Same width rule as skip(): shift by 1 for 2-byte, by 2 for 4-byte
    // code units.
    int charWidth = (fEncoding >= 4) ? 2 : 1;
    // Saturate instead of overflowing when the scaled limit does not fit.
    int byteLimit = (readAheadLimit > (Integer.MAX_VALUE >> charWidth))
            ? Integer.MAX_VALUE
            : (readAheadLimit << charWidth);
    fInputStream.mark(byteLimit);
} // mark(int)
/**
 * Reset the stream. If the stream has been marked, then attempt to
 * reposition it at the mark. If the stream has not been marked, then
 * attempt to reset it in some way appropriate to the particular stream,
 * for example by repositioning it to its starting point. This stream
 * implementation does not support <code>mark</code>/<code>reset</code>
 * by itself; it delegates to the underlying byte stream.
 * <p>
 * NOTE(review): the internal char buffer is left untouched, so characters
 * decoded before the reset may still be pending afterwards - confirm
 * whether that can happen for the callers of this class.
 * </p>
 *
 * @exception IOException If the stream has been closed,
 *                        or if the stream has not been marked,
 *                        or if the mark has been invalidated,
 *                        or if the stream does not support reset(),
 *                        or if some other I/O error occurs
 */
public void reset() throws IOException {
    // close() nulls the byte stream; fail with the documented exception
    // type instead of a NullPointerException.
    if (fInputStream == null) {
        throw new IOException("Stream closed");
    }
    fInputStream.reset();
} // reset()
/**
 * Close the stream. The underlying byte stream is closed and the
 * references to it and to the internal byte and char buffers are
 * released. Closing a previously-closed stream has no effect.
 *
 * @exception IOException If an I/O error occurs while closing the
 *                        underlying byte stream
 */
public void close() throws IOException {
    if (fInputStream == null) {
        // Already closed: a repeated close() must be a no-op.
        return;
    }
    try {
        fInputStream.close();
    } finally {
        // Release everything even if the underlying close() failed, so
        // the reader is never left half-closed.
        fInputStream = null;
        fCharBuf = null;
        fBuffer = null;
        // No pending characters may refer to the released char buffer.
        fCharCount = 0;
    }
} // close()
/**
 * Reports the name of the encoding this character stream decodes.
 * <p>
 * The name does not carry the byte order, although the byte order can
 * be critical; unlike UTF-16, it cannot be made part of the encoding
 * name here. Use <code>getByteOrder</code> to find it out.
 * </p>
 *
 * @return <code>ISO-10646-UCS-2</code> when the internal encoding value
 *         is below 4, <code>ISO-10646-UCS-4</code> otherwise.
 */
public String getEncoding() {
    // Encoding values below 4 denote the two-byte flavours.
    final boolean twoByteUnits = fEncoding < 4;
    return twoByteUnits ? "ISO-10646-UCS-2" : "ISO-10646-UCS-4";
}
/**
 * Reports the byte order ("endianness") of the encoding currently used
 * by this character stream, as one of two strings:
 * <code>LITTLE_ENDIAN</code> or <code>BIG_ENDIAN</code>. A named
 * constant might be a better alternative; feel free to change this if
 * that suits the callers better.
 *
 * @return <code>LITTLE_ENDIAN</code> when the internal encoding value is
 *         1 or 4, <code>BIG_ENDIAN</code> for any other value.
 */
public String getByteOrder() {
    // Values 1 and 4 are the little-endian encodings (presumably the
    // UCS-2 and UCS-4 variants respectively - verify against the
    // constant declarations).
    final boolean littleEndian = (fEncoding == 1) || (fEncoding == 4);
    return littleEndian ? "LITTLE_ENDIAN" : "BIG_ENDIAN";
}
/**
 * Tests whether the given Unicode code point lies in the supplementary
 * character range, i.e. between <code>MIN_SUPPLEMENTARY_CODE_POINT</code>
 * and <code>MAX_CODE_POINT</code>, both inclusive. According to the
 * original documentation this is equivalent to:
 * <blockquote><pre>
 * codePoint &gt;= 0x10000 &amp;&amp; codePoint &lt;= 0x10ffff
 * </pre></blockquote>
 *
 * Borrowed from the JDK 1.5 <code>java.lang.Character</code> class in
 * order to stay compatible with JDK 1.4.
 *
 * @param codePoint the character (Unicode code point) to be tested
 * @return <code>true</code> if the specified character is in the Unicode
 *         supplementary character range; <code>false</code> otherwise.
 */
protected boolean isSupplementaryCodePoint(int codePoint) {
    // Below the supplementary range: cannot qualify.
    if (codePoint < MIN_SUPPLEMENTARY_CODE_POINT) {
        return false;
    }
    // At or above the lower bound: qualifies unless past the last code point.
    return codePoint <= MAX_CODE_POINT;
}
} // class UCSReader
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -