📄 charactertranslationtest.java
字号:
// HTMLParser Library $Name: v1_6_20051112 $ - A java-based parser for HTML
// http://sourceforge.org/projects/htmlparser
// Copyright (C) 2004 Derick Oswald
//
// Revision Control Information
//
// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/tests/utilTests/CharacterTranslationTest.java,v $
// $Author: derrickoswald $
// $Date: 2004/07/31 16:42:32 $
// $Revision: 1.46 $
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//

package org.htmlparser.tests.utilTests;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.io.PrintWriter;
import java.lang.reflect.Field;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Random;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.Remark;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tests.ParserTestCase;
import org.htmlparser.util.CharacterReference;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.Translate;
import org.htmlparser.util.sort.Sort;

public class 
CharacterTranslationTest extends ParserTestCase{ static { System.setProperty ("org.htmlparser.tests.utilTests.CharacterTranslationTest", "CharacterTranslationTest"); } /** * The list of references. */ protected static CharacterReference[] mReferences; public CharacterTranslationTest (String name) { super (name); } /** * Class loader to access the compiled character references. */ class SimpleClassLoader extends ClassLoader { /** * The class path for this class loader. */ String mRoot; public SimpleClassLoader (String root) { if (!root.endsWith (File.separator)) root += File.separator; mRoot = root; } public Class loadClass (String className) throws ClassNotFoundException { return (loadClass (className, true)); } public synchronized Class loadClass (String className, boolean resolveIt) throws ClassNotFoundException { byte data[]; FileInputStream in; Class ret; try { // try system class loader ret = super.findSystemClass (className); } catch (ClassNotFoundException e) { try { in = new FileInputStream (mRoot + className + ".class"); data = new byte[in.available ()]; in.read (data); in.close (); ret = defineClass (className, data, 0, data.length); if (null == ret) throw new ClassFormatError (); if (resolveIt) resolveClass (ret); } catch (IOException ioe) { throw new ClassNotFoundException (); } } return (ret); } } /** * Create a character reference translation class source file. * Usage: * <pre> * java -classpath .:lib/htmlparser.jar Generate > Translate.java * </pre> * Derived from HTMLStringFilter.java provided as an example with the * htmlparser.jar file available at * <a href="http://htmlparser.sourceforge.net">htmlparser.sourceforge.net</a> * written by Somik Raha ( * <a href='mailto:somik@industriallogic.com? * subject=htmlparser'>somik@industriallogic. com</a> * <a href="http://industriallogic.com">http://industriallogic.com</a>). 
* @author <a href='mailto:DerrickOswald@users.sourceforge.net?subject=Character Reference Translation class'>Derrick Oswald</a> */ public class Generate { /** * The working parser. */ protected Parser mParser; protected String nl = System.getProperty ("line.separator", "\n"); /** * Create a Generate object. * Sets up the generation by creating a new <code>Parser</code> pointed * at <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">http://www.w3.org/TR/REC-html40/sgml/entities.html</a> * with the standard scanners registered. */ public Generate () throws ParserException { mParser = new Parser ("http://www.w3.org/TR/REC-html40/sgml/entities.html"); } /** * Translate character references. * After generating the Translate class we could use it * to do this job, but that would involve a bootstrap * problem, so this method does the reference conversion * for a very tiny subset (enough to understand the w3.org * page). * @param string The raw string. * @return The string with character references fixed. 
*/
        public String translate (String string)
        {
            int index;
            int amp;
            StringBuffer ret;

            ret = new StringBuffer (4096);

            index = 0;
            while ((index < string.length ()) && (-1 != (amp = string.indexOf ('&', index))))
            {
                // include the part before the special character
                ret.append (string.substring (index, amp));
                // each branch skips exactly the length of the reference it matched
                if (string.startsWith ("&nbsp;", amp))
                {
                    ret.append (" ");
                    index = amp + 6;
                }
                else if (string.startsWith ("&lt;", amp))
                {
                    ret.append ("<");
                    index = amp + 4;
                }
                else if (string.startsWith ("&gt;", amp))
                {
                    ret.append (">");
                    index = amp + 4;
                }
                else if (string.startsWith ("&amp;", amp))
                {
                    ret.append ("&");
                    index = amp + 5;
                }
                else if (string.startsWith ("&quot;", amp))
                {
                    // the standard spelling of the quotation mark entity
                    ret.append ("\"");
                    index = amp + 6;
                }
                else if (string.startsWith ("&quote;", amp))
                {
                    // non-standard spelling, still accepted for compatibility
                    ret.append ("\"");
                    index = amp + 7;
                }
                else if (string.startsWith ("&divide;", amp))
                {
                    // deliberately left untranslated: emit the ampersand and
                    // let the rest of the reference pass through as text
                    ret.append ("&");
                    index = amp + 1;
                }
                else if (string.startsWith ("&copy;", amp))
                {
                    // deliberately left untranslated, as for &divide;
                    ret.append ("&");
                    index = amp + 1;
                }
                else
                {
                    // clamp the excerpt so an ampersand near the end of the
                    // string cannot push substring() out of bounds
                    System.out.println ("unknown special character starting with " + string.substring (amp, Math.min (amp + 7, string.length ())));
                    ret.append ("&");
                    index = amp + 1;
                }
            }
            ret.append (string.substring (index));

            return (ret.toString ());
        }

        /**
         * Collect the text of a node into a buffer.
         * <ul>
         * <li>text nodes contribute their text</li>
         * <li>link tags contribute their link text</li>
         * <li>BR and P tags contribute a line separator</li>
         * <li>any other tag is processed recursively through its children</li>
         * <li>remarks are ignored</li>
         * <li>anything else is printed to standard output instead</li>
         * </ul>
         * @param node The node to collect text from.
         * @param buffer The buffer to append the text to.
         */
        public void gather (Node node, StringBuffer buffer)
        {
            NodeList children;

            if (node instanceof Text)
            {
                // Node is a plain string
                Text stringNode = (Text)node;
                // Retrieve the data from the object
                buffer.append (stringNode.getText ());
            }
            else if (node instanceof LinkTag)
            {
                // Node is a link
                LinkTag linkNode = (LinkTag)node;
                // Retrieve the data from the object
                buffer.append (linkNode.getLinkText ());
            }
            else if (node instanceof Tag)
            {
                String name = ((Tag)node).getTagName ();
                if (name.equals ("BR") || name.equals ("P"))
                    buffer.append (nl);
                else
                {
                    children = ((Tag)node).getChildren ();
                    if (null != children)
                        for (int i = 0; i < children.size (); i++)
                            gather (children.elementAt (i), buffer);
                }
            }
            else if (node instanceof Remark)
            {
                // remarks contribute nothing
            }
            else
            {
                System.out.println ();
                System.out.println(node.toString());
            }
        }

        /**
         * Find the lowest index of whitespace (space or newline).
         * A newline here means the line separator held in <code>nl</code>.
         * @param string The string to look in.
         * @param index Where to start looking.
         * @return -1 if there is no whitespace, the minimum index otherwise.
         */
        public int indexOfWhitespace (String string, int index)
        {
            int space;
            int cr;
            int ret;

            space = string.indexOf (" ", index);
            cr = string.indexOf (nl, index);
            if (-1 == space)
                ret = cr;
            else if (-1 == cr)
                ret = space;
            else
                ret = Math.min (space, cr);

            return (ret);
        }

        /**
         * Rewrite the comment string.
         * In the sgml table, the comments are of the form:
         * <pre>
         * -- latin capital letter I with diaeresis,
         *     U+00CF ISOlat1
         * </pre>
         * so we just want to make a one-liner without the spaces and newlines.
         * A leading <code>"-- "</code> is dropped and each run of whitespace
         * is replaced by a single space.
         * @param string The raw comment.
         * @return The single line comment.
         */
        public String pack (String string)
        {
            int index;
            int spaces;
            StringBuffer ret;

            ret = new StringBuffer (string.length ());

            if (string.startsWith ("-- "))
                string = string.substring (3);
            // remove doublespaces
            index = 0;
            while ((index < string.length ()) && (-1 != (spaces = indexOfWhitespace (string, index))))
            {
                ret.append (string.substring (index, spaces));
                ret.append (" ");
                // skip the rest of this whitespace run
                while ((spaces < string.length ()) && Character.isWhitespace (string.charAt (spaces)))
                    spaces++;
                index = spaces;
            }
            if (index < string.length ())
                ret.append (string.substring (index));

            return (ret.toString ());
        }

        /**
         * Pretty up a comment string.
         * @param string The comment to operate on.
         * @return The beautiful comment string.
         */
        public String pretty (String string)
        {
            int index;
            int spaces;
            StringBuffer ret;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -