text.java

来自「hadoop:Nutch集群平台」· Java 代码 · 共 580 行 · 第 1/2 页
JAVA
580 行
/**
 * Copyright 2005 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.io;

import java.io.IOException;
import java.io.DataInput;
import java.io.DataOutput;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CodingErrorAction;
import java.nio.charset.MalformedInputException;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/** This class stores text using standard UTF8 encoding.  It provides methods
 * to serialize, deserialize, and compare texts at byte level.  The type of
 * length is integer and is serialized using zero-compressed format.  <p>In
 * addition, it provides methods for string traversal without converting the
 * byte array to a string.  <p>Also includes utilities for
 * serializing/deserializing a string, coding/decoding a string, checking if a
 * byte array contains valid UTF8 code, calculating the length of an encoded
 * string.
 */
public class Text implements WritableComparable {
  // Logger for this class, looked up by its fully qualified name.
  private static final Log LOG= LogFactory.getLog("org.apache.hadoop.io.Text");

  // Shared UTF-8 decoder configured to report (rather than replace) malformed
  // input and unmappable characters.
  // NOTE(review): the JDK documents CharsetDecoder instances as not safe for
  // concurrent use, and this one is a shared static - confirm how the
  // decoding code that uses it guards against concurrent callers.
  private static final CharsetDecoder DECODER = 
    Charset.forName("UTF-8").newDecoder().
    onMalformedInput(CodingErrorAction.REPORT).
onUnmappableCharacter(CodingErrorAction.REPORT);  private static final CharsetEncoder ENCODER =     Charset.forName("UTF-8").newEncoder().    onMalformedInput(CodingErrorAction.REPORT).    onUnmappableCharacter(CodingErrorAction.REPORT);  private static final byte [] EMPTY_BYTES = new byte[0];    private byte[] bytes;  private int length;  public Text() {    bytes = EMPTY_BYTES;  }  /** Construct from a string.    */  public Text(String string) {    set(string);  }  /** Construct from another text. */  public Text(Text utf8) {    set(utf8);  }  /** Construct from a byte array.   */  public Text(byte[] utf8)  {    set(utf8);  }    /** Retuns the raw bytes. */  public byte[] getBytes() {    return bytes;  }  /** Returns the number of bytes in the byte array */   public int getLength() {    return length;  }    /**   * Returns the Unicode Scalar Value (32-bit integer value)   * for the character at <code>position</code>. Note that this   * method avoids using the converter or doing String instatiation   * @returns the Unicode scalar value at position or -1   *          if the position is invalid or points to a   *          trailing byte   */  public int charAt(int position) {    if (position > this.length) return -1; // too long    if (position < 0) return -1; // duh.          ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);    return bytesToCodePoint(bb.slice());  }    public int find(String what) {    return find(what, 0);  }    /**   * Finds any occurence of <code>what</code> in the backing   * buffer, starting as position <code>start</code>. The starting   * position is measured in bytes and the return value is in   * terms of byte position in the buffer. The backing buffer is   * not converted to a string for this operation.   
* @return byte position of the first occurence of the search   *         string in the UTF-8 buffer or -1 if not found   */  public int find(String what, int start) {    try {      ByteBuffer src = ByteBuffer.wrap(this.bytes);      ByteBuffer tgt = encode(what);      byte b = tgt.get();      src.position(start);                while (src.hasRemaining()) {        if (b == src.get()) { // matching first byte          src.mark(); // save position in loop          tgt.mark(); // save position in target          boolean found = true;          int pos = src.position()-1;          while (tgt.hasRemaining()) {            if (!src.hasRemaining()) { // src expired first              tgt.reset();              src.reset();              found = false;              break;            }            if (!(tgt.get() == src.get())) {              tgt.reset();              src.reset();              found = false;              break; // no match            }          }          if (found) return pos;        }      }      return -1; // not found    } catch (CharacterCodingException e) {      // can't get here      e.printStackTrace();      return -1;    }  }    /** Set to contain the contents of a string.    */  public void set(String string) {    try {      ByteBuffer bb = encode(string, true);      bytes = bb.array();      length = bb.limit();    }catch(CharacterCodingException e) {      throw new RuntimeException("Should not have happened " + e.toString());     }  }  /** Set to a utf8 byte array   */  public void set(byte[] utf8) {    set(utf8, 0, utf8.length);  }    /** copy a text. 
*/
  public void set(Text other) {
    set(other.bytes, 0, other.length);
  }

  /**
   * Set the Text to range of bytes
   * @param utf8 the data to copy from
   * @param start the first position of the new string
   * @param len the number of bytes of the new string
   */
  public void set(byte[] utf8, int start, int len) {
    setCapacity(len);
    System.arraycopy(utf8, start, bytes, 0, len);
    this.length = len;
  }

  /*
   * Sets the capacity of this Text object to <em>at least</em>
   * <code>len</code> bytes. If the current buffer is already that
   * long, the buffer and its content are left unchanged. Otherwise
   * a new buffer of exactly <code>len</code> bytes is allocated and
   * the previous content is not copied into it.
   */
  private void setCapacity( int len ) {
    if (bytes == null || bytes.length < len)
      bytes = new byte[len];
  }

  /**
   * Convert text back to string
   * @return the decoded string, or <code>null</code> if the valid bytes
   *         cannot be decoded
   * @see java.lang.Object#toString()
   */
  public String toString() {
    try {
      return decode(bytes, 0, length);
    } catch (CharacterCodingException e) {
      // Decoding failure is reported to the caller as null.
      return null;
    }
  }

  /** deserialize
   * Reads a zero-compressed length followed by that many bytes.
   */
  public void readFields(DataInput in) throws IOException {
    length = WritableUtils.readVInt(in);
    setCapacity(length);
    in.readFully(bytes, 0, length);
  }

  /**
   * Skips over one Text in the input.
   * @param in the input positioned at the start of a serialized Text
   * @throws IOException if the length cannot be read or the input ends
   *         before the whole value has been skipped
   */
  public static void skip(DataInput in) throws IOException {
    int length = WritableUtils.readVInt(in);
    // DataInput.skipBytes may skip fewer bytes than requested, so keep
    // going until the whole value is consumed; otherwise the stream would
    // be left in the middle of a record.
    int skipped = 0;
    while (skipped < length) {
      int n = in.skipBytes(length - skipped);
      if (n <= 0) {
        throw new IOException("Not able to skip " + length
                              + " bytes, possibly due to end of input.");
      }
      skipped += n;
    }
  }

  /** serialize
   * write this object to out
   * length uses zero-compressed encoding
   * @see Writable#write(DataOutput)
   */
  public void write(DataOutput out) throws IOException {
    WritableUtils.writeVInt(out, length);
    out.write(bytes, 0, length);
  }

  /** Compare two Texts bytewise using standard UTF8 ordering.
*/
  public int compareTo(Object o) {
    Text other = (Text) o;
    if (other == this) {
      return 0; // same instance, nothing to compare
    }
    return WritableComparator.compareBytes(this.bytes, 0, this.length,
                                           other.bytes, 0, other.length);
  }

  /**
   * Tests for equality of content.
   * @param o the object to compare with
   * @return true iff <code>o</code> is a Text holding the same bytes
   */
  public boolean equals(Object o) {
    if (o == this) {
      return true;
    }
    if (!(o instanceof Text)) {
      return false;
    }
    Text other = (Text) o;
    // Different lengths can never be equal; only then compare the bytes.
    return this.length == other.length
        && WritableComparator.compareBytes(this.bytes, 0, this.length,
                                           other.bytes, 0, other.length) == 0;
  }

  /** Hash of the valid bytes of this text. */
  public int hashCode() {
    return WritableComparator.hashBytes(bytes, length);
  }

  /** A WritableComparator optimized for Text keys. */
  public static class Comparator extends WritableComparator {
    /** Creates a comparator registered for the Text class. */
    public Comparator() {
      super(Text.class);
    }

    /**
     * Compares two serialized Texts without deserializing them: each
     * record is a zero-compressed length followed by that many bytes.
     */
    public int compare(byte[] b1, int s1, int l1,
                       byte[] b2, int s2, int l2) {
      try {
        int len1 = readVInt(b1, s1);
        int len2 = readVInt(b2, s2);
        // The payload starts right after the variable-length size prefix.
        int off1 = s1 + WritableUtils.getVIntSize(len1);
        int off2 = s2 + WritableUtils.getVIntSize(len2);
        return compareBytes(b1, off1, len1, b2, off2, len2);
      } catch (IOException e) {
        LOG.warn(e);
        throw new RuntimeException(e);
      }
    }
text.java - 源码说明

本页面展示了「hadoop:Nutch集群平台」中的 text.java 源码文件，采用 Java 编程语言编写，共 580 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。
虫虫下载站收录了大量与hadoop相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。
⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?