📄 parserutils.java
字号:
// HTMLParser Library $Name: v1_6 $ - A java-based parser for HTML// http://sourceforge.org/projects/htmlparser// Copyright (C) 2004 Somik Raha//// Revision Control Information//// $Source: /cvsroot/htmlparser/htmlparser/src/org/htmlparser/util/ParserUtils.java,v $// $Author: derrickoswald $// $Date: 2005/05/15 11:49:05 $// $Revision: 1.47 $//// This library is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This library is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU// Lesser General Public License for more details.//// You should have received a copy of the GNU Lesser General Public// License along with this library; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA//package org.htmlparser.util;import java.io.UnsupportedEncodingException;import java.util.ArrayList;import org.htmlparser.Node;import org.htmlparser.NodeFilter;import org.htmlparser.Parser;import org.htmlparser.Tag;import org.htmlparser.filters.NodeClassFilter;import org.htmlparser.filters.TagNameFilter;import org.htmlparser.lexer.Lexer;import org.htmlparser.lexer.Page;import org.htmlparser.tags.CompositeTag;import org.htmlparser.util.NodeList;import org.htmlparser.util.ParserException;public class ParserUtils{ public static String removeChars(String s, char occur) { StringBuffer newString = new StringBuffer(); char ch; for (int i = 0; i < s.length(); i++) { ch = s.charAt(i); if (ch != occur) newString.append(ch); } return newString.toString(); } public static String removeEscapeCharacters(String inputString) { inputString = ParserUtils.removeChars(inputString, '\r'); inputString = ParserUtils.removeChars(inputString, '\n'); inputString = ParserUtils.removeChars(inputString, '\t'); return inputString; } public static String removeTrailingBlanks(String text) { char ch = ' '; while (ch == ' ') { ch = text.charAt(text.length() - 1); if (ch == ' ') text = text.substring(0, text.length() - 1); } return text; } /** * Search given node and pick up any objects of given type. * @param node The node to search. * @param type The class to search for. * @return A node array with the matching nodes. */ public static Node[] findTypeInNode(Node node, Class type) { NodeFilter filter; NodeList ret; ret = new NodeList (); filter = new NodeClassFilter (type); node.collectInto (ret, filter); return (ret.toNodeArray ()); } /** * Split the input string considering as string separator * all the not numerical characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>For example if you call splitButDigits("<DIV> +12.5, +3.4 </DIV>", "+."), * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (1,2,3,4 and 5 are digits and +,. are chars that do not be removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The array of strings as output. */ public static String[] splitButDigits (String input, String charsDoNotBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuffer str = new StringBuffer(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ((Character.isDigit(input.charAt(index))) || (charFound)) { str.append(input.charAt(index)); toBeAdd=false; } else if (!toBeAdd) toBeAdd=true; // finished to parse one string if (toBeAdd && (str.length()!=0)) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } } // add the last string if (str.length()!=0) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } output.trimToSize(); Object[] outputObj = output.toArray(); String[] outputStr = new String[output.size()]; for (int i=0; i<output.size(); i++) outputStr[i] = new String((String) outputObj[i]); return outputStr; } /** * Remove from the input string all the not numerical characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>For example if you call trimButDigits("<DIV> +12.5 </DIV>", "+."), * <BR>you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). * <BR>For example if you call trimButDigits("<DIV> +1 2 . 5 </DIV>", "+."), * <BR>you obtain a string "+12.5" as output (the spaces between 1 and 2, 2 and ., . and 5 are removed). * @param input The string in input. * @param charsDoNotBeRemoved The chars that do not be removed. * @return The string as output. */ public static String trimButDigits (String input, String charsDoNotBeRemoved) { StringBuffer output = new StringBuffer(); boolean charFound=false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ((Character.isDigit(input.charAt(index))) || (charFound)) output.append(input.charAt(index)); } return output.toString(); } /** * Remove from the beginning and the end of the input string all the not numerical characters * with the only exception of the characters specified in charsDoNotBeRemoved param. * <BR>The removal process removes only chars at the beginning and at the end of the string. * <BR>For example if you call trimButDigitsBeginEnd("<DIV> +12.5 </DIV>", "+."), * <BR>you obtain a string "+12.5" as output (1,2 and 5 are digits and +,. are chars that do not be removed). * <BR>For example if you call trimButDigitsBeginEnd("<DIV> +1 2 . 5 </DIV>", "+."), * <BR>you obtain a string "+1 2 . 5" as output (the spacess inside the string are not removed). * @param input - The string in input. * @param charsDoNotBeRemoved - The chars that do not be removed. * @return The string as output. */ public static String trimButDigitsBeginEnd (String input, String charsDoNotBeRemoved) { String output = new String(); int begin=0; int end=input.length()-1; boolean charFound=false; boolean ok=true; for (int index=begin; (index<input.length()) && ok; index++) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ( (Character.isDigit(input.charAt(index))) || (charFound) ) { begin=index; ok=false; } } ok=true; for (int index=end; (index>=0) && ok; index--) { charFound=false; for (int charsCount=0; charsCount<charsDoNotBeRemoved.length(); charsCount++) if (charsDoNotBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if ( (Character.isDigit(input.charAt(index))) || (charFound) ) { end=index; ok=false; } } output=input.substring(begin,end+1); return output; } /** * Split the input string considering as string separator * all the spaces and tabs like chars and * the chars specified in the input variable charsToBeRemoved. * <BR>For example if you call splitSpaces("<DIV> +12.5, +3.4 </DIV>", "<>DIV/,"), * <BR>you obtain an array of strings {"+12.5", "+3.4"} as output (space chars and <,>,D,I,V,/ and the comma are chars that must be removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The array of strings as output. */ public static String[] splitSpaces (String input, String charsToBeRemoved) { ArrayList output = new ArrayList(); int minCapacity = 0; StringBuffer str = new StringBuffer(); boolean charFound = false; boolean toBeAdd = false; for (int index=0; index<input.length(); index++) { charFound=false; for (int charsCount=0; charsCount<charsToBeRemoved.length(); charsCount++) if (charsToBeRemoved.charAt(charsCount)==input.charAt(index)) charFound=true; if (!((Character.isWhitespace(input.charAt(index))) || (Character.isSpaceChar(input.charAt(index))) || (charFound))) { str.append(input.charAt(index)); toBeAdd=false; } else if (!toBeAdd) toBeAdd=true; // finished to parse one string if (toBeAdd && (str.length()!=0)) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } } // add the last string if (str.length()!=0) { minCapacity++; output.ensureCapacity(minCapacity); if (output.add(str.toString())) str = new StringBuffer(); else minCapacity--; } output.trimToSize(); Object[] outputObj = output.toArray(); String[] outputStr = new String[output.size()]; for (int i=0; i<output.size(); i++) outputStr[i] = new String((String) outputObj[i]); return outputStr; } /** * Remove from the input string all the spaces and tabs like chars. * Remove also the chars specified in the input variable charsToBeRemoved. * <BR>For example if you call trimSpaces("<DIV> +12.5 </DIV>", "<>DIV/"), * <BR>you obtain a string "+12.5" as output (space chars and <,>,D,I,V,/ are chars that must be removed). * <BR>For example if you call trimSpaces("<DIV> Trim All Spaces Also The Ones Inside The String </DIV>", "<>DIV/"), * <BR>you obtain a string "TrimAllSpacesAlsoTheOnesInsideTheString" as output (all the spaces inside the string are removed). * @param input The string in input. * @param charsToBeRemoved The chars to be removed. * @return The string as output. */ public static String trimSpaces (String input, String charsToBeRemoved) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -