📄 parsefilestring.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Toni Volkmer (volkmer@prudsys.com)
* @version 1.0
*/
package com.prudsys.pdm.Utils;
import java.util.Vector;
import java.util.Hashtable;
import java.util.Enumeration;
import java.util.NoSuchElementException;
/**
 * Splits one line of a transactional (delimited) text file into tokens.
 *
 * <p>A line is split at the {@link #columnSeperator separator} character; an
 * optional {@link #columnQuote quote} character protects separators inside a
 * field. The tokens of the most recently parsed line can be read either
 * sequentially ({@link #hasMoreTokens()} / {@link #nextToken()}) or by
 * 1-based index ({@link #getToken(int)}).
 *
 * <p>The separator and quote characters can also be guessed from sample
 * lines with {@link #findParameters(String[])}.
 */
public class ParseFileString {

    /**
     * Candidate separator characters, tried in this order by the automatic
     * detection.
     */
    protected char testChars[] = {',', ';', '\t', ' ', ':', '.'};

    /**
     * Character the string is split by; <code>'\0'</code> means the line is
     * not split at all.
     */
    protected char columnSeperator = '\0';

    /**
     * Quote character; <code>'\0'</code> disables quote handling.
     */
    protected char columnQuote = '\0';

    /**
     * Zero-based index of the token that {@link #nextToken()} returns next.
     */
    protected int pos = 0;

    /**
     * Tokens of the most recently parsed line; <code>null</code> after
     * <code>parseString(null)</code>.
     */
    private Vector<String> tokens = new Vector<String>();

    /**
     * Candidate quote characters tried by the automatic detection.
     */
    char automaticQuoteC[] = { '\"', '\'', '`' };

    /**
     * Empty constructor: no separator, no quote character, nothing parsed.
     */
    public ParseFileString() {
    }

    /**
     * Parses <code>str</code> without a quote character by calling
     * the {@link #init(String, char, char) init method}.
     *
     * @param str The string that shall be parsed.
     * @param seperator The character the string shall be split by.
     */
    public ParseFileString(String str, char seperator) {
        this.init(str, seperator, '\0');
    }

    /**
     * Parses <code>str</code> by calling
     * the {@link #init(String, char, char) init method}.
     *
     * @param str The string that shall be parsed.
     * @param seperator The character the string shall be split by.
     * @param quotes Quotation character.
     */
    public ParseFileString(String str, char seperator, char quotes) {
        this.init(str, seperator, quotes);
    }

    /**
     * Returns the character which the string is split by.
     *
     * @return separator character.
     */
    public char getColumnSeperator() {
        return columnSeperator;
    }

    /**
     * Sets the character which the string is split by. Already parsed tokens
     * are not affected.
     *
     * @param seperator new separator.
     */
    public void setColumnSeperator(char seperator) {
        this.columnSeperator = seperator;
    }

    /**
     * Returns the quotation character.
     *
     * @return quotation character.
     */
    public char getColumnQuotes() {
        return columnQuote;
    }

    /**
     * Sets the quotation character. Already parsed tokens are not affected.
     *
     * @param quotes new quotation character.
     */
    public void setColumnQuotes(char quotes) {
        this.columnQuote = quotes;
    }

    /**
     * Guesses the separator and quote character from a set of sample lines.
     *
     * <p>Every combination of a candidate separator ({@link #testChars}) and
     * a candidate quote character (plus "no quote") is used to split all
     * non-empty sample lines. For each combination the most frequent token
     * count is determined; the combination whose most frequent token count is
     * greater than one and is shared by the largest number of lines wins.
     * Earlier candidates win ties between combinations.
     *
     * <p>On success the separator and quote character of this instance are
     * replaced. The tokens of the currently parsed line are left untouched.
     *
     * @param testArray sample lines; <code>null</code> or empty entries are ignored.
     *
     * @return <b>true</b> if a separator was found and set; otherwise <b>false</b>.
     */
    public boolean findParameters(String[] testArray) {
        if (testArray == null || testArray.length < 1) {
            return false;
        }
        char bestDelim = '\0';
        char bestQuote = '\0';
        int bestCount = 0;
        for (int d = 0; d < testChars.length; d++) {
            char testDelim = testChars[d];
            // One extra round past the quote candidates tries "no quote character".
            for (int q = 0; q <= automaticQuoteC.length; q++) {
                char testQuote = (q < automaticQuoteC.length) ? automaticQuoteC[q] : '\0';
                int[] modal = mostFrequentTokenCount(testArray, testDelim, testQuote);
                int tokenCount = modal[0];
                int lineCount = modal[1];
                // A single column means the separator did not split anything.
                if (lineCount > bestCount && tokenCount > 1) {
                    bestDelim = testDelim;
                    bestQuote = testQuote;
                    bestCount = lineCount;
                }
            }
        }
        if (bestDelim != '\0') {
            this.columnSeperator = bestDelim;
            this.columnQuote = bestQuote;
            return true;
        }
        return false;
    }

    /**
     * Splits every non-empty line with the given characters and determines
     * which token count occurs most often.
     *
     * @return two-element array: the most frequent token count (-1 if no line
     *         was evaluated) and the number of lines having that count.
     */
    private static int[] mostFrequentTokenCount(String[] lines, char delim, char quote) {
        Hashtable<Integer, Integer> histogram = new Hashtable<Integer, Integer>(lines.length);
        for (int i = 0; i < lines.length; i++) {
            String line = lines[i];
            if (line == null || line.equals("")) {
                continue;
            }
            Integer key = Integer.valueOf(tokenize(line.trim(), delim, quote).size());
            Integer seen = histogram.get(key);
            histogram.put(key, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
        }
        int bestKey = -1;
        int bestKeyVal = 0;
        // Strict '>' means that equally frequent counts are resolved by the
        // enumeration order of the Hashtable.
        Enumeration<Integer> keys = histogram.keys();
        while (keys.hasMoreElements()) {
            Integer key = keys.nextElement();
            int occurrences = histogram.get(key).intValue();
            if (occurrences > bestKeyVal) {
                bestKey = key.intValue();
                bestKeyVal = occurrences;
            }
        }
        return new int[] {bestKey, bestKeyVal};
    }

    /**
     * Parses the specified string by using the {@link #columnSeperator separator}
     * and the {@link #columnQuote quotation} character and resets the token
     * cursor to the first token.
     *
     * <p>The string is trimmed before parsing. Passing <code>null</code>
     * discards all tokens.
     *
     * @param str String that shall be parsed.
     */
    public void parseString(String str) {
        tokens = null;
        if (str == null) {
            return;
        }
        pos = 0;
        tokens = tokenize(str.trim(), columnSeperator, columnQuote);
    }

    /**
     * Splits an already trimmed line into tokens.
     *
     * <p>Rules:
     * <ul>
     * <li>With separator <code>'\0'</code> the whole line is the only token.</li>
     * <li>Unquoted fields are trimmed; quoted fields keep their content as is.</li>
     * <li>A quote character right after a closed quoted section re-opens the
     *     quote and adds one literal quote character to the field.</li>
     * <li>A quote character inside an unquoted, non-empty field is an ordinary
     *     character.</li>
     * <li>Spaces directly after a closed quoted section are ignored.</li>
     * <li>A trailing separator does not produce a trailing empty token.</li>
     * <li>A field whose quote is never closed is dropped; if that leaves no
     *     token at all, the whole non-empty line becomes the only token.</li>
     * </ul>
     *
     * @param line trimmed, non-null line.
     * @param separator separator character or <code>'\0'</code>.
     * @param quote quote character or <code>'\0'</code>.
     *
     * @return the tokens in order of appearance; never <code>null</code>.
     */
    private static Vector<String> tokenize(String line, char separator, char quote) {
        Vector<String> result = new Vector<String>();
        if (separator == '\0') {
            result.add(line);
            return result;
        }
        StringBuilder field = new StringBuilder();
        boolean inQuote = false;      // currently between an opening and closing quote
        boolean wasInQuote = false;   // the current field has had a quoted section
        boolean fieldPending = false; // the current field must be emitted at end of line
        int length = line.length();
        for (int i = 0; i < length; i++) {
            char c = line.charAt(i);
            if (quote != '\0' && c == quote) {
                if (inQuote) {
                    inQuote = false;
                } else if (wasInQuote) {
                    // Quote directly following a closed quote: literal quote character.
                    inQuote = true;
                    field.append(quote);
                    fieldPending = true;
                } else if (field.length() < 1) {
                    // Opening quote at the start of a field.
                    inQuote = true;
                    wasInQuote = true;
                    fieldPending = true;
                } else {
                    // Quote in the middle of an unquoted field is plain text.
                    field.append(c);
                }
            } else if (c == separator && !inQuote) {
                result.add(wasInQuote ? field.toString() : field.toString().trim());
                field.setLength(0);
                fieldPending = false;
                wasInQuote = false;
            } else if (!inQuote && c == ' ' && (wasInQuote || field.length() < 1)) {
                // Skip leading spaces of a field and spaces after a closed quote.
            } else {
                field.append(c);
                fieldPending = true;
            }
        }
        if (fieldPending && !inQuote) {
            result.add(wasInQuote ? field.toString() : field.toString().trim());
        }
        if (result.isEmpty() && length > 0) {
            result.add(line);
        }
        return result;
    }

    /**
     * Determines whether the specified character occurs inside one of the strings.
     * The first and the last character of each string are not examined.
     *
     * @param str String array that contains one or more lines; <code>null</code> entries are skipped.
     * @param c Character that will be searched inside the string array.
     *
     * @return <b>true</b> if one String of the String array <code>str</code> contains the character <code>c</code>; otherwise <b>false</b>.
     */
    private boolean isSeperatorIn(String[] str, char c) {
        for (int i = 0; i < str.length; i++) {
            String line = str[i];
            if (line == null) {
                continue;
            }
            // Don't search for separators at the beginning or end.
            for (int j = 1; j < line.length() - 1; j++) {
                if (line.charAt(j) == c) {
                    return true;
                }
            }
        }
        return false;
    }

    /**
     * Searches for the separator in the specified String array. The first
     * candidate of {@link #testChars} that occurs inside any line is returned.
     *
     * @param str String array for which the separator is searched for.
     *
     * @return the separator character; if none could be found <b>'\0'</b>.
     *
     * @throws IllegalArgumentException if <code>str</code> is <code>null</code>.
     */
    char searchForSeperator(String[] str) throws IllegalArgumentException, UnknownError {
        if (str == null) {
            throw new IllegalArgumentException("Parameter must not be null");
        }
        for (int i = 0; i < testChars.length; i++) {
            if (isSeperatorIn(str, testChars[i])) {
                return testChars[i];
            }
        }
        return '\0';
    }

    /**
     * Determines whether sequential reading has more tokens left.
     *
     * @return <b>true</b> if more elements left; otherwise <b>false</b>
     */
    public boolean hasMoreTokens() {
        return tokens != null && pos < tokens.size();
    }

    /**
     * Returns the next token and advances the cursor.
     *
     * @return next element.
     *
     * @throws NoSuchElementException if no token is left.
     */
    public String nextToken() throws NoSuchElementException {
        if (!hasMoreTokens()) {
            throw new NoSuchElementException();
        }
        String token = tokens.elementAt(pos);
        pos++;
        if (token == null) {
            throw new NoSuchElementException();
        }
        return token;
    }

    /**
     * Returns the number of tokens of the parsed line.
     *
     * @return number of elements; 0 if nothing has been parsed.
     */
    public int countTokens() {
        return (tokens != null) ? tokens.size() : 0;
    }

    /**
     * Returns the token with the specified index. The sequential cursor is
     * not moved.
     *
     * @param element index of the element; '1' is the first index.
     *
     * @return element as String value.
     *
     * @throws NoSuchElementException if the index is out of range or nothing has been parsed.
     */
    public String getToken(int element) throws NoSuchElementException {
        // tokens is null after parseString(null); report that as "no such element".
        if (tokens == null || element > tokens.size() || element <= 0) {
            throw new NoSuchElementException();
        }
        String token = tokens.elementAt(element - 1);
        if (token == null) {
            throw new NoSuchElementException();
        }
        return token;
    }

    /**
     * Sets the separator and quotation character and runs parsing.
     *
     * @param str Line string that shall be parsed.
     * @param seperator Separator character.
     * @param quotes Quotation character.
     *
     * @see #setColumnSeperator(char)
     * @see #setColumnQuotes(char)
     * @see #parseString(String)
     */
    public void init(String str, char seperator, char quotes) {
        this.setColumnSeperator(seperator);
        this.setColumnQuotes(quotes);
        this.parseString(str);
    }

    /**
     * Main routine for testing: detects a separator from sample lines, then
     * parses a tab-separated, quoted line and prints its tokens.
     */
    public final static void main(String argv[]) {
        ParseFileString ps = new ParseFileString();
        ps.setColumnSeperator(ps.searchForSeperator(new String[]{"test","\"Dies\" \"ist\" \"ein\" \"Test b\"", "Test"}));
        System.out.println(":"+ps.getColumnSeperator()+":");
        ps.setColumnQuotes('\"');
        ps.parseString("\"Dies\"\t\"ist\"\t\"ein\"\t\"Test\tb\"");
        System.out.println(ps.countTokens() + " tokens found.");
        for (int i = 1; i <= ps.countTokens(); i++) {
            System.out.println(":" + ps.getToken(i) + ":");
        }
    }
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -