⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parserutils.java

📁 html 解析处理代码
💻 JAVA
📖 第 1 页 / 共 4 页
字号:
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v $// $Author: derrickoswald $// $Date: 2005/05/15 11:49:05 $// $Revision: 1.47 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.util;import java.io.UnsupportedEncodingException;import java.util.ArrayList;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.Tag;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.tags.CompositeTag;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;public class ParserUtils{    public static String removeChars(String s, char occur) {        StringBuffer newString = new StringBuffer();        char ch;        for (int i = 0; i < s.length(); i++) {            ch = s.charAt(i);            if (ch != occur)                newString.append(ch);        }        return newString.toString();    }    public static String removeEscapeCharacters(String inputString) {        inputString = ParserUtils.removeChars(inputString, '\r');        inputString = ParserUtils.removeChars(inputString, '\n');        inputString = ParserUtils.removeChars(inputString, '\t');        return inputString;    }    public static String removeTrailingBlanks(String text) {        char ch = ' ';        while (ch == ' ') {            ch = text.charAt(text.length() - 1);            if (ch == ' ')                text = text.substring(0, text.length() - 1);        }        return text;    }    /**     * Search given node and pick up any objects of given type.     * @param node The node to search.     * @param type The class to search for.     * @return A node array with the matching nodes.     */    public static Node[] findTypeInNode(Node node, Class type)    {        NodeFilter filter;        NodeList ret;                ret = new NodeList ();        filter = new NodeClassFilter (type);        node.collectInto (ret, filter);        return (ret.toNodeArray ());    }    /**     * Split the input string considering as string separator     * all the not numerical characters     * with the only exception of the characters specified in charsDoNotBeRemoved param.     * <BR>For example if you call splitButDigits(&quot;&lt;DIV&gt;  +12.5, +3.4 &lt;/DIV&gt;&quot;, &quot;+.&quot;),     * <BR>you obtain an array of strings {&quot;+12.5&quot;, &quot;+3.4&quot;} as output (1,2,3,4 and 5 are digits and +,. are chars that do not be removed).     * @param input The string in input.     * @param charsDoNotBeRemoved The chars that do not be removed.     * @return The array of strings as output.    */    public static String[] splitButDigits (String input, String charsDoNotBeRemoved)    { 	        ArrayList output = new ArrayList();        int minCapacity = 0;        StringBuffer str = new StringBuffer();        boolean charFound = false;        boolean toBeAdd = false;        for (int index=0; index<input.length(); index++)        {                charFound=false;            for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++)                if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index))                    charFound=true;            if ((Character.isDigit(input.charAt(index))) || (charFound))            {                str.append(input.charAt(index));                toBeAdd=false;            }            else                if (!toBeAdd)                    toBeAdd=true;            // finished to parse one string            if (toBeAdd && (str.length()!=0)) {                minCapacity++;                output.ensureCapacity(minCapacity);                if (output.add(str.toString()))                    str = new StringBuffer();                else                    minCapacity--;            }        }        // add the last string        if (str.length()!=0) {            minCapacity++;            output.ensureCapacity(minCapacity);            if (output.add(str.toString()))                str = new StringBuffer();            else                minCapacity--;        }        output.trimToSize();        Object[] outputObj = output.toArray();        String[] outputStr = new String[output.size()];        for (int i=0; i<output.size(); i++)            outputStr[i] = new String((String) outputObj[i]);        return outputStr;            }        /**     * Remove from the input string all the not numerical characters     * with the only exception of the characters specified in charsDoNotBeRemoved param.     * <BR>For example if you call trimButDigits(&quot;&lt;DIV&gt;  +12.5 &lt;/DIV&gt;&quot;, &quot;+.&quot;),     * <BR>you obtain a string &quot;+12.5&quot; as output (1,2 and 5 are digits and +,. are chars that do not be removed).     * <BR>For example if you call trimButDigits(&quot;&lt;DIV&gt;  +1 2 . 5 &lt;/DIV&gt;&quot;, &quot;+.&quot;),     * <BR>you obtain a string &quot;+12.5&quot; as output (the spaces between 1 and 2, 2 and ., . and 5 are removed).     * @param input The string in input.     * @param charsDoNotBeRemoved The chars that do not be removed.     * @return The string as output.    */    public static String trimButDigits (String input, String charsDoNotBeRemoved)    { 	        StringBuffer output = new StringBuffer();        boolean charFound=false;        for (int index=0; index<input.length(); index++)        {                charFound=false;            for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++)                if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index))                    charFound=true;            if ((Character.isDigit(input.charAt(index))) || (charFound))                output.append(input.charAt(index));        }        return output.toString();            }        /**     * Remove from the beginning and the end of the input string all the not numerical characters     * with the only exception of the characters specified in charsDoNotBeRemoved param.     * <BR>The removal process removes only chars at the beginning and at the end of the string.     * <BR>For example if you call trimButDigitsBeginEnd(&quot;&lt;DIV&gt;  +12.5 &lt;/DIV&gt;&quot;, &quot;+.&quot;),     * <BR>you obtain a string &quot;+12.5&quot; as output (1,2 and 5 are digits and +,. are chars that do not be removed).     * <BR>For example if you call trimButDigitsBeginEnd(&quot;&lt;DIV&gt;  +1 2 . 5 &lt;/DIV&gt;&quot;, &quot;+.&quot;),     * <BR>you obtain a string &quot;+1 2 . 5&quot; as output (the spacess inside the string are not removed).     * @param input - The string in input.     * @param charsDoNotBeRemoved - The chars that do not be removed.     * @return The string as output.    */    public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved)    { 	        String output = new String();        int begin=0;        int end=input.length()-1;        boolean charFound=false;        boolean ok=true;        for (int index=begin; (index<input.length()) && ok; index++)        {                            charFound=false;            for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++)                if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index))                    charFound=true;            if ( (Character.isDigit(input.charAt(index))) || (charFound) )            {                begin=index;                ok=false;            }        }        ok=true;        for (int index=end; (index>=0) && ok; index--)        {            charFound=false;            for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++)                if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index))                    charFound=true;            if ( (Character.isDigit(input.charAt(index))) || (charFound) )            {                end=index;                ok=false;            }        }        output=input.substring(begin,end+1);        return output;            }        /**     * Split the input string considering as string separator     * all the spaces and tabs like chars and     * the chars specified in the input variable charsToBeRemoved.     * <BR>For example if you call splitSpaces(&quot;&lt;DIV&gt;  +12.5, +3.4 &lt;/DIV&gt;&quot;, &quot;&lt;>DIV/,&quot;),     * &lt;BR>you obtain an array of strings {&quot;+12.5&quot;, &quot;+3.4&quot;} as output (space chars and &lt;,&gt;,D,I,V,/ and the comma are chars that must be removed).     * @param input The string in input.     * @param charsToBeRemoved The chars to be removed.     * @return The array of strings as output.    */    public static String[] splitSpaces (String input, String charsToBeRemoved)    { 	        ArrayList output = new ArrayList();        int minCapacity = 0;        StringBuffer str = new StringBuffer();        boolean charFound = false;        boolean toBeAdd = false;        for (int index=0; index<input.length(); index++)        {                charFound=false;            for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++)                if (charsToBeRemoved.charAt(charsCount)==input.charAt(index))                    charFound=true;            if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound)))            {                str.append(input.charAt(index));                toBeAdd=false;            }            else                if (!toBeAdd)                    toBeAdd=true;            // finished to parse one string            if (toBeAdd && (str.length()!=0)) {                minCapacity++;                output.ensureCapacity(minCapacity);                if (output.add(str.toString()))                    str = new StringBuffer();                else                    minCapacity--;            }        }        // add the last string        if (str.length()!=0) {            minCapacity++;            output.ensureCapacity(minCapacity);            if (output.add(str.toString()))                str = new StringBuffer();            else                minCapacity--;        }        output.trimToSize();        Object[] outputObj = output.toArray();        String[] outputStr = new String[output.size()];        for (int i=0; i<output.size(); i++)            outputStr[i] = new String((String) outputObj[i]);        return outputStr;            }    /**     * Remove from the input string all the spaces and tabs like chars.     * Remove also the chars specified in the input variable charsToBeRemoved.     * <BR>For example if you call trimSpaces(&quot;&lt;DIV&gt;  +12.5 &lt;/DIV&gt;&quot;, &quot;&lt;>DIV/&quot;),     * <BR>you obtain a string &quot;+12.5&quot; as output (space chars and &lt;,&gt;,D,I,V,/ are chars that must be removed).     * <BR>For example if you call trimSpaces(&quot;&lt;DIV&gt;  Trim All Spaces Also The Ones Inside The String &lt;/DIV&gt;&quot;, &quot;&lt;>DIV/&quot;),     * <BR>you obtain a string &quot;TrimAllSpacesAlsoTheOnesInsideTheString&quot; as output (all the spaces inside the string are removed).     * @param input The string in input.     * @param charsToBeRemoved The chars to be removed.     * @return The string as output.    */    public static String trimSpaces (String input, String charsToBeRemoved)    {

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -