encodingheuristics.scala

来自「JAVA 语言的函数式编程扩展」· SCALA 代码 · 共 178 行

SCALA
178
字号
/*                     __                                               *\**     ________ ___   / /  ___     Scala API                            ****    / __/ __// _ | / /  / _ |    (c) 2002-2007, LAMP/EPFL             ****  __\ \/ /__/ __ |/ /__/ __ |    http://scala-lang.org/               **** /____/\___/_/ |_/____/_/ | |                                         ****                          |/                                          **\*                                                                      */// $Id: EncodingHeuristics.scala 13032 2007-10-09 16:28:22Z michelou $package scala.xml.include.saximport java.io.{IOException, InputStreamReader, InputStream}/** * <p> * <code>EncodingHeuristics</code> reads from a stream * (which should be buffered) and attempts to guess * what the encoding of the text in the stream is. * Byte order marks are stripped from the stream. * If it fails to determine the type of the encoding, * it returns the default UTF-8.  * </p> * <p> *   Translated from Elliotte Rusty Harold's Java source * </p> * * @author Burak Emir */object EncodingHeuristics {  /**    * <p>    * This utility method ????.    * </p>    *    * @param in   <code>InputStream</code> to read from.     * @return String  The name of the encoding.    * @throws IOException if the stream cannot be reset back to where it was when    *                     the method was invoked.    */      def readEncodingFromStream(in: InputStream): String = {    //System.err.println("EncodingHeuristics::readEncodingFromStream");    // This may fail if there are a lot of space characters before the end    // of the encoding declaration    in.mark(1024)    var ret: String = null    try {      // lots of things can go wrong here. If any do, I just return null      // so that we'll fall back on the encoding declaration or the      // UTF-8 default      val byte1 = in.read()      val byte2 = in.read()      if (byte1 == 0xFE && byte2 == 0xFF) {        // don't reset because the byte order mark should not be included????        ret =  "UnicodeBig"; // name for big-endian????               }      else if (byte1 == 0xFF && byte2 == 0xFE) {        // don't reset because the byte order mark should not be included????        // will the reader throw away the byte order mark or will it return it????        ret =  "UnicodeLittle"        }      /* In accordance with the Character Model [Character Model],        when the text format is a Unicode encoding, the XInclude        processor must fail the inclusion when the text in the        selected range is non-normalized. When transcoding characters        to a Unicode encoding from a legacy encoding, a normalizing transcoder must be used. */            val byte3 = in.read()      // check for UTF-8 byte order mark      if (byte1 == 0xEF && byte2 == 0xBB && byte3 == 0xBF) {        // don't reset because the byte order mark should not be included????        // in general what happens if text document includes non-XML legal chars????        ret =  "UTF-8";                }            val byte4 = in.read();      if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0xFE && byte4 == 0xFF) {        // don't reset because the byte order mark should not be included????        ret =  "UCS-4"; // right name for big-endian UCS-4 in Java 1.4????               }      else if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0xFF && byte4 == 0xFE) {        // don't reset because the byte order mark should not be included????        ret =  "UCS-4"; // right name for little-endian UCS-4 in Java 1.4????               }      // no byte order mark present; first character must be       // less than sign or white space      // Let's look for less-than signs first      if (byte1 == 0x00 && byte2 == 0x00 && byte3 == 0x00 && byte4 == '<') {        in.reset()        ret =  "UCS-4"  // right name for big-endian UCS-4 in Java 1.4????      }      else if (byte1 == '<' && byte2 == 0x00 && byte3 == 0x00 && byte4 == 0x00) {        in.reset()        ret =  "UCS-4"  // right name for little-endian UCS-4 in Java 1.4????      }      else if (byte1 == 0x00 && byte2 == '<' && byte3 == 0x00 && byte4 == '?') {        in.reset()        ret =  "UnicodeBigUnmarked"      }      else if (byte1 == '<' && byte2 == 0x00 && byte3 == '?' && byte4 == 0x00) {        in.reset()        ret =  "UnicodeLittleUnmarked"       }      else if (byte1 == '<' && byte2 == '?' && byte3 == 'x' && byte4 == 'm') {        // ASCII compatible, must read encoding declaration         // 1024 bytes will be far enough to read most XML declarations        val data = new Array[Byte](1024)        data(0) = byte1.asInstanceOf[Byte]        data(1) = byte2.asInstanceOf[Byte]        data(2) = byte3.asInstanceOf[Byte]        data(3) = byte4.asInstanceOf[Byte]        val length = in.read(data, 4, 1020) + 4;        // Use Latin-1 (ISO-8859-1) because it's ASCII compatible and        // all byte sequences are legal Latin-1 sequences so I don't have        // to worry about encoding errors if I slip past the         // end of the XML/text declaration        val declaration = new String(data, 0, length, "8859_1");        // if any of these throw a StringIndexOutOfBoundsException        // we just fall into the catch bloclk and return null        // since this can't be well-formed XML        var position = declaration.indexOf("encoding") + 8;        var c: Char = '\0'  // bogus init value        // get rid of white space before equals sign        do {          c = declaration.charAt(position)          position += 1                } while (c == ' ' || c == '\t' || c == '\r' || c == '\n') ;        if (c != '=') { // malformed          in.reset()          ret =  "UTF-8"        }        // get rid of white space after equals sign        do {          c = declaration.charAt(position)          position += 1        } while (c == ' ' || c == '\t' || c == '\r' || c == '\n') ;        var delimiter: Char = c        if (delimiter != '\'' && delimiter != '"') { // malformed          in.reset()          ret =  "UTF-8"        }        // now positioned to read encoding name        val encodingName = new StringBuffer()        do {          c = declaration.charAt(position)          position += 1          encodingName.append(c)        } while(c != delimiter)        encodingName.setLength(encodingName.length() - 1)  // rm delim        in.reset()        ret =  encodingName.toString()      }        else if (byte1 == 0x4C && byte2 == 0x6F && byte3 == 0xA7 && byte4 == 0x94) {          // EBCDIC compatible, must read encoding declaration           // ????         }          } catch {       case e: Exception =>        in.reset()        ret = "UTF-8"    }    // no XML or text declaration present    //System.err.println("exit EncodingHeuristics::readEncodingFromStream");    if (ret != null)      ret    else {      in.reset()      "UTF-8"    }  }  }

⌨️ 快捷键说明

复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?