📄 text.java

📁 hadoop:Nutch集群平台
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
上一页 12
  }

  static {
    // register this comparator
    WritableComparator.define(Text.class, new Comparator());
  }

  /// STATIC UTILITIES FROM HERE DOWN

  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If the input is malformed,
   * replace by a default value.
   *
   * @param utf8 the bytes to decode; the whole array is used
   * @return the decoded string
   * @throws CharacterCodingException if decoding fails
   */
  public static String decode(byte[] utf8) throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8), true);
  }

  /**
   * Converts <code>length</code> bytes of the provided array, beginning at
   * <code>start</code>, to a String using the UTF-8 encoding. Malformed
   * input is replaced rather than reported.
   *
   * @param utf8 the array holding the bytes to decode
   * @param start offset of the first byte to decode
   * @param length number of bytes to decode
   * @return the decoded string
   * @throws CharacterCodingException if decoding fails
   */
  public static String decode(byte[] utf8, int start, int length)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), true);
  }

  /**
   * Converts the provided byte array to a String using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input is replaced with the
   * substitution character, which is U+FFFD. Otherwise the
   * method throws a MalformedInputException.
   *
   * @param utf8 the array holding the bytes to decode
   * @param start offset of the first byte to decode
   * @param length number of bytes to decode
   * @param replace whether malformed input is replaced instead of reported
   * @return the decoded string
   * @throws CharacterCodingException if decoding fails
   */
  public static String decode(byte[] utf8, int start, int length, boolean replace)
    throws CharacterCodingException {
    return decode(ByteBuffer.wrap(utf8, start, length), replace);
  }

  /**
   * Decodes the remaining bytes of <code>utf8</code> with the shared
   * decoder. The decoder is locked for the duration of the call and, when
   * <code>replace</code> is set, temporarily switched to REPLACE.
   */
  private static String decode(ByteBuffer utf8, boolean replace)
    throws CharacterCodingException {
    synchronized (DECODER) {
      if (replace) {
        DECODER.onMalformedInput(CodingErrorAction.REPLACE);
        DECODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
      }
      try {
        return DECODER.decode(utf8).toString();
      } finally {
        // Set decoder back to its default value: REPORT. Done in finally so
        // that a failed decode cannot leave the shared decoder in REPLACE
        // mode for later callers.
        if (replace) {
          DECODER.onMalformedInput(CodingErrorAction.REPORT);
          DECODER.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
      }
    }
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If the input is malformed,
   * invalid chars are replaced by a default value.
   *
   * @param string the text to encode
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   * @throws CharacterCodingException if encoding fails
   */
  public static ByteBuffer encode(String string)
    throws CharacterCodingException {
    return encode(string, true);
  }

  /**
   * Converts the provided String to bytes using the
   * UTF-8 encoding. If <code>replace</code> is true, then
   * malformed input (for example an unpaired surrogate) is replaced with
   * the shared encoder's configured replacement. Otherwise the
   * method throws a MalformedInputException.
   *
   * @param string the text to encode
   * @param replace whether malformed input is replaced instead of reported
   * @return ByteBuffer: bytes stores at ByteBuffer.array()
   *                     and length is ByteBuffer.limit()
   * @throws CharacterCodingException if encoding fails
   */
  public static ByteBuffer encode(String string, boolean replace)
    throws CharacterCodingException {
    synchronized (ENCODER) {
      if (replace) {
        ENCODER.onMalformedInput(CodingErrorAction.REPLACE);
        ENCODER.onUnmappableCharacter(CodingErrorAction.REPLACE);
      }
      try {
        return ENCODER.encode(CharBuffer.wrap(string.toCharArray()));
      } finally {
        // Restore REPORT even when encoding throws, so the shared encoder
        // is never left in REPLACE mode.
        if (replace) {
          ENCODER.onMalformedInput(CodingErrorAction.REPORT);
          ENCODER.onUnmappableCharacter(CodingErrorAction.REPORT);
        }
      }
    }
  }

  /**
   * Read a UTF8 encoded string from in.
   * The string is expected as a vint byte count followed by that many
   * bytes, which is the layout {@link #writeString} produces.
   *
   * @param in the input to read from
   * @return the decoded string
   * @throws IOException if reading or decoding fails
   */
  public static String readString(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);
    byte [] bytes = new byte[length];
    in.readFully(bytes, 0, length);
    return decode(bytes);
  }

  /**
   * Write a UTF8 encoded string to out.
   * Writes the encoded byte count as a vint, followed by the bytes.
   *
   * @param out the output to write to
   * @param s the string to write
   * @return the number of encoded bytes written (excluding the vint prefix)
   * @throws IOException if encoding or writing fails
   */
  public static int writeString(DataOutput out, String s) throws IOException {
    ByteBuffer bytes = encode(s);
    int length = bytes.limit();
    WritableUtils.writeVInt(out, length);
    out.write(bytes.array(), 0, length);
    return length;
  }

  ////// states for validateUTF8

  /** Expecting the first byte of a character. */
  private static final int LEAD_BYTE = 0;

  /** Expecting the first trail byte, which has lead-byte-specific limits. */
  private static final int TRAIL_BYTE_1 = 1;

  /** Expecting a second or later trail byte. */
  private static final int TRAIL_BYTE = 2;

  /**
   * Check if a byte array contains valid utf-8
   * @param utf8 byte array
   * @exception MalformedInputException if the byte array contains invalid utf-8
   */
  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
    validateUTF8(utf8, 0, utf8.length);
  }

  /**
   * Check to see if a byte array is valid utf-8
   * @param utf8 the array of bytes
   * @param start the offset of the first byte in the array
   * @param len the length of the byte sequence
   * @throws MalformedInputException if the byte array contains invalid bytes,
   *         or if the range ends in the middle of a multi-byte sequence
   */
  public static void validateUTF8(byte[] utf8, int start, int len)
    throws MalformedInputException {
    int count = start;
    int leadByte = 0;
    int length = 0;   // trail bytes still expected for the current character
    int state = LEAD_BYTE;
    while (count < start+len) {
      int aByte = ((int) utf8[count] & 0xFF);

      switch (state) {
      case LEAD_BYTE:
        leadByte = aByte;
        length = bytesFromUTF8[aByte];

        switch (length) {
        case 0: // check for ASCII
          if (leadByte > 0x7F)
            throw new MalformedInputException(count);
          break;
        case 1:
          // 0xC0 and 0xC1 are excluded: they could only start overlong forms
          if (leadByte < 0xC2 || leadByte > 0xDF)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        case 2:
          if (leadByte < 0xE0 || leadByte > 0xEF)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        case 3:
          // lead bytes above 0xF4 would encode values beyond U+10FFFF
          if (leadByte < 0xF0 || leadByte > 0xF4)
            throw new MalformedInputException(count);
          state = TRAIL_BYTE_1;
          break;
        default:
          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
          // or if < 0 we got a trail byte in the lead byte position
          throw new MalformedInputException(count);
        } // switch (length)
        break;

      case TRAIL_BYTE_1:
        // The first trail byte has a narrower range for some lead bytes.
        if (leadByte == 0xF0 && aByte < 0x90)
          throw new MalformedInputException(count);
        if (leadByte == 0xF4 && aByte > 0x8F)
          throw new MalformedInputException(count);
        if (leadByte == 0xE0 && aByte < 0xA0)
          throw new MalformedInputException(count);
        if (leadByte == 0xED && aByte > 0x9F)
          throw new MalformedInputException(count);
        // falls through to regular trail-byte test!!
      case TRAIL_BYTE:
        if (aByte < 0x80 || aByte > 0xBF)
          throw new MalformedInputException(count);
        if (--length == 0) {
          state = LEAD_BYTE;
        } else {
          state = TRAIL_BYTE;
        }
        break;
      } // switch (state)
      count++;
    }

    // The range ended while trail bytes were still expected: the final
    // character is truncated, which is not valid UTF-8.
    if (state != LEAD_BYTE) {
      throw new MalformedInputException(count);
    }
  }

  /**
   * Magic numbers for UTF-8. These are the number of bytes
   * that <em>follow</em> a given lead byte. Trailing bytes
   * have the value -1. The values 4 and 5 are presented in
   * this table, even though valid UTF-8 cannot include the
   * five and six byte sequences.
   */
  static final int[] bytesFromUTF8 =
  {
    // 0x00 - 0x7F: single-byte (ASCII)
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x80 - 0xBF: trail bytes
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    // 0xC0 - 0xDF: one trail byte follows
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    // 0xE0 - 0xEF: two trail bytes follow
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    // 0xF0 - 0xF7: three; 0xF8 - 0xFB: four; 0xFC - 0xFF: five
    3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };

  /**
   * Returns the next code point at the current position in
   * the buffer. The buffer's position will be incremented.
   * Any mark set on this buffer will be changed by this method!
   *
   * @param bytes buffer positioned at the first byte of a character
   * @return the decoded code point, or -1 if the byte at the current
   *         position is a trail byte (the position is then left unchanged)
   */
  public static int bytesToCodePoint(ByteBuffer bytes) {
    // Peek at the lead byte without consuming it; the switch below reads it.
    bytes.mark();
    byte b = bytes.get();
    bytes.reset();
    int extraBytesToRead = bytesFromUTF8[(int)(b & 0xFF)];
    if (extraBytesToRead < 0) return -1; // trailing byte!
    int ch = 0;

    // Every case deliberately falls through: entering at case N consumes
    // N+1 bytes in total, shifting the accumulator 6 bits between bytes.
    switch (extraBytesToRead) {
    case 5: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 4: ch += (int)(bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
    case 3: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
    case 2: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
    case 1: ch += (int)(bytes.get() & 0xFF); ch <<= 6;
    case 0: ch += (int)(bytes.get() & 0xFF);
    }
    // Remove the lead/trail marker bits that were summed in with the payload.
    ch -= offsetsFromUTF8[extraBytesToRead];

    return ch;
  }

  /**
   * Values subtracted by {@link #bytesToCodePoint} after the raw bytes
   * have been accumulated, indexed by the number of trail bytes. Each entry
   * is the sum of the shifted marker bits for that length, for example
   * 0x3080 == (0xC0 << 6) + 0x80 for a two-byte sequence.
   */
  static final int offsetsFromUTF8[] =
  { 0x00000000, 0x00003080,
    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };

  /**
   * For the given string, returns the number of UTF-8 bytes
   * required to encode the string.
   * A valid surrogate pair counts as 4 bytes; an unpaired surrogate
   * counts as 3 bytes.
   *
   * @param string text to encode
   * @return number of UTF-8 bytes required to encode
   */
  public static int utf8Length(String string) {
    // Walk by index rather than with a CharacterIterator: the iterator's
    // DONE sentinel is '\uFFFF', so a string containing U+FFFF would
    // otherwise be cut short.
    final int end = string.length();
    int size = 0;
    for (int i = 0; i < end; i++) {
      char ch = string.charAt(i);
      if ((ch >= 0xD800) && (ch < 0xDC00)) {
        // high surrogate: look ahead for the matching low surrogate
        boolean paired = false;
        if (i + 1 < end) {
          char trail = string.charAt(i + 1);
          paired = (trail > 0xDBFF) && (trail < 0xE000);
        }
        if (paired) {
          // valid pair
          size += 4;
          i++; // the low surrogate has been accounted for
        } else {
          // invalid pair; the following char is measured on its own
          size += 3;
        }
      } else if (ch < 0x80) {
        size++;
      } else if (ch < 0x800) {
        size += 2;
      } else {
        // ch < 0x10000, that is, the largest char value
        size += 3;
      }
    }
    return size;
  }
}
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -