⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 htmldecoder.java

📁 网页采集系统 ================= 安装配置 ------- 1 程序我就不说了 2 配置文件 applicationContext.xml 里面有详细的注释 3 已经
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
package com.laozizhu.search.util;

import java.util.HashMap;

/**
 * Decodes HTML character references, e.g. {@code &lt; &gt; &quot; &aring; &#1048; &#x6C34;} become {@code < > " å И 水}.
 * 
 * @author 老紫竹(laozizhu.com)
 */
public class HTMLDecoder {

  /**
   * Lookup table from an entity name (the text between {@code &} and {@code ;}, for example
   * {@code "amp"}) to the single character it decodes to. It is filled in by this class's
   * static initializer and consulted by {@link #decode(String)} for named references.
   *
   * NOTE(review): the map is public and mutable, so any caller can change what
   * {@code decode} produces -- confirm that this is intended.
   */
  public static final HashMap<String, Character> charTable;

  /**
   * Replaces HTML character references in {@code s} with the characters they denote.
   *
   * <p>Three forms are recognised, and each must be terminated by {@code ';'}:
   * <ul>
   * <li>named references such as {@code &amp;}, looked up in {@link #charTable};</li>
   * <li>decimal references such as {@code &#1048;};</li>
   * <li>hexadecimal references such as {@code &#x6C34;} (the {@code x} may be upper case).</li>
   * </ul>
   * Numeric references may name any value up to {@link Character#MAX_CODE_POINT}; values
   * above {@code 0xFFFF} are written as a surrogate pair. Anything that is not a complete,
   * known reference (unknown name, missing {@code ';'}, value out of range) is copied to the
   * output unchanged.
   *
   * @param s the text to decode
   * @return {@code s} with every recognised reference replaced
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static String decode(String s) {
    final int length = s.length();
    final StringBuilder out = new StringBuilder(length);
    int pos = 0;
    while (pos < length) {
      final char c = s.charAt(pos++);
      // Ordinary characters, and an '&' that is the very last character, are copied verbatim.
      if (c != '&' || pos >= length) {
        out.append(c);
        continue;
      }

      // Classify the reference: radix 16 for "&#x", 10 for "&#", 0 for a named entity.
      // bodyStart is the index of the first digit or name character.
      final int radix;
      final int bodyStart;
      if (s.charAt(pos) != '#') {
        radix = 0;
        bodyStart = pos;
      } else if (pos + 1 < length && (s.charAt(pos + 1) == 'x' || s.charAt(pos + 1) == 'X')) {
        radix = 16;
        bodyStart = pos + 2;
      } else {
        radix = 10;
        bodyStart = pos + 1;
      }

      // Scan the body; bodyEnd ends up on the first character that cannot belong to it.
      int bodyEnd = bodyStart;
      if (radix == 16) {
        while (bodyEnd < length && isHexDigit(s.charAt(bodyEnd))) {
          bodyEnd++;
        }
      } else if (radix == 10) {
        while (bodyEnd < length && isDigit(s.charAt(bodyEnd))) {
          bodyEnd++;
        }
      } else if (isLetter(s.charAt(bodyEnd))) {
        // A name starts with a letter and continues with letters or digits (e.g. "frac14").
        bodyEnd++;
        while (bodyEnd < length && isLetterOrDigit(s.charAt(bodyEnd))) {
          bodyEnd++;
        }
      }

      // Resolve the body to a code point; -1 means "not a reference, keep the '&' literally".
      int codePoint = -1;
      if (bodyEnd > bodyStart && bodyEnd < length && s.charAt(bodyEnd) == ';') {
        final String body = s.substring(bodyStart, bodyEnd);
        if (radix == 0) {
          final Character named = charTable.get(body);
          if (named != null) {
            codePoint = named.charValue();
          }
        } else {
          try {
            final int value = Integer.parseInt(body, radix);
            if (value <= Character.MAX_CODE_POINT) {
              codePoint = value;
            }
          } catch (NumberFormatException ignored) {
            // Too many digits to fit in an int: deliberately left undecoded.
          }
        }
      }

      if (codePoint < 0) {
        // Only the '&' is emitted here; the following characters are rescanned normally.
        out.append(c);
      } else {
        out.appendCodePoint(codePoint);
        pos = bodyEnd + 1; // continue just past the terminating ';'
      }
    }
    return out.toString();
  }

  /**
   * Tells whether {@code c} may appear inside an entity name after its first character.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} passes {@code isLetter} or {@code isDigit}
   */
  private static boolean isLetterOrDigit(char c) {
    if (isLetter(c)) {
      return true;
    }
    return isDigit(c);
  }

  /**
   * Tells whether {@code c} is a hexadecimal digit.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} passes {@code isHexLetter} or {@code isDigit}
   */
  private static boolean isHexDigit(char c) {
    if (isHexLetter(c)) {
      return true;
    }
    return isDigit(c);
  }

  /**
   * Tells whether {@code c} is an ASCII letter.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is in {@code 'a'..'z'} or {@code 'A'..'Z'}
   */
  private static boolean isLetter(char c) {
    final boolean lowerCase = 'a' <= c && c <= 'z';
    final boolean upperCase = 'A' <= c && c <= 'Z';
    return lowerCase || upperCase;
  }

  /**
   * Tells whether {@code c} is one of the letters used as hexadecimal digits.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is in {@code 'a'..'f'} or {@code 'A'..'F'}
   */
  private static boolean isHexLetter(char c) {
    if (c >= 'a' && c <= 'f') {
      return true;
    }
    return c >= 'A' && c <= 'F';
  }

  /**
   * Tells whether {@code c} is an ASCII decimal digit.
   *
   * @param c the character to test
   * @return {@code true} if {@code c} is in {@code '0'..'9'}
   */
  private static boolean isDigit(char c) {
    // Distance from '0'; only the ten digits land in 0..9.
    final int offset = c - '0';
    return offset >= 0 && offset <= 9;
  }

  /**
   * Collapses every run of white space in {@code s} into a single space character.
   *
   * <p>White space is whatever {@link #isWhitespace(char)} accepts. Leading and trailing runs
   * are collapsed as well, not removed, so the result may start or end with one space.
   *
   * @param s the text to compact
   * @return {@code s} with each white-space run replaced by one {@code ' '}
   * @throws NullPointerException if {@code s} is {@code null}
   */
  public static String compact(String s) {
    final int length = s.length();
    // StringBuilder: the buffer never leaves this method, so no synchronisation is needed.
    final StringBuilder out = new StringBuilder(length);
    int pos = 0;
    while (pos < length) {
      final char c = s.charAt(pos++);
      if (!isWhitespace(c)) {
        out.append(c);
        continue;
      }
      // Skip the rest of the run, then emit one plain space for all of it.
      while (pos < length && isWhitespace(s.charAt(pos))) {
        pos++;
      }
      out.append(' ');
    }
    return out.toString();
  }

  /**
   * Tells whether {@code ch} counts as white space for this class. HTML is very particular
   * about what constitutes white space, so only these six characters qualify: space, tab,
   * line feed, form feed, carriage return and U+200B (zero width space).
   *
   * @param ch the character to test
   * @return {@code true} if {@code ch} is one of the six characters listed above
   */
  public static boolean isWhitespace(char ch) {
    switch (ch) {
      case ' ':
      case '\t':
      case '\n':
      case '\f':
      case '\r':
      case '\u200b':
        return true;
      default:
        return false;
    }
  }

  static {
    charTable = new HashMap<String, Character>();
    charTable.put("quot", new Character((char) 34));
    charTable.put("amp", new Character((char) 38));
    charTable.put("apos", new Character((char) 39));
    charTable.put("lt", new Character((char) 60));
    charTable.put("gt", new Character((char) 62));
    charTable.put("nbsp", new Character(' '));
    charTable.put("iexcl", new Character((char) 161));
    charTable.put("cent", new Character((char) 162));
    charTable.put("pound", new Character((char) 163));
    charTable.put("curren", new Character((char) 164));
    charTable.put("yen", new Character((char) 165));
    charTable.put("brvbar", new Character((char) 166));
    charTable.put("sect", new Character((char) 167));
    charTable.put("uml", new Character((char) 168));
    charTable.put("copy", new Character((char) 169));
    charTable.put("ordf", new Character((char) 170));
    charTable.put("laquo", new Character((char) 171));
    charTable.put("not", new Character((char) 172));
    charTable.put("shy", new Character((char) 173));
    charTable.put("reg", new Character((char) 174));
    charTable.put("macr", new Character((char) 175));
    charTable.put("deg", new Character((char) 176));
    charTable.put("plusmn", new Character((char) 177));
    charTable.put("sup2", new Character((char) 178));
    charTable.put("sup3", new Character((char) 179));
    charTable.put("acute", new Character((char) 180));
    charTable.put("micro", new Character((char) 181));
    charTable.put("para", new Character((char) 182));
    charTable.put("middot", new Character((char) 183));
    charTable.put("cedil", new Character((char) 184));
    charTable.put("sup1", new Character((char) 185));
    charTable.put("ordm", new Character((char) 186));
    charTable.put("raquo", new Character((char) 187));
    charTable.put("frac14", new Character((char) 188));
    charTable.put("frac12", new Character((char) 189));
    charTable.put("frac34", new Character((char) 190));
    charTable.put("iquest", new Character((char) 191));
    charTable.put("Agrave", new Character((char) 192));
    charTable.put("Aacute", new Character((char) 193));
    charTable.put("Acirc", new Character((char) 194));
    charTable.put("Atilde", new Character((char) 195));
    charTable.put("Auml", new Character((char) 196));
    charTable.put("Aring", new Character((char) 197));
    charTable.put("AElig", new Character((char) 198));
    charTable.put("Ccedil", new Character((char) 199));
    charTable.put("Egrave", new Character((char) 200));
    charTable.put("Eacute", new Character((char) 201));
    charTable.put("Ecirc", new Character((char) 202));
    charTable.put("Euml", new Character((char) 203));
    charTable.put("Igrave", new Character((char) 204));
    charTable.put("Iacute", new Character((char) 205));
    charTable.put("Icirc", new Character((char) 206));
    charTable.put("Iuml", new Character((char) 207));
    charTable.put("ETH", new Character((char) 208));
    charTable.put("Ntilde", new Character((char) 209));
    charTable.put("Ograve", new Character((char) 210));
    charTable.put("Oacute", new Character((char) 211));

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -