⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 parsefilestring.java

📁 一个数据挖掘软件ALPHAMINER的整个过程的JAVA版源代码
💻 JAVA
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

 /**
  * Title: XELOPES Data Mining Library
  * Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
  * Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
  * Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
  * @author Toni Volkmer (volkmer@prudsys.com)
  * @version 1.0
  */


package com.prudsys.pdm.Utils;

import java.util.Vector;
import java.util.Hashtable;
import java.util.Enumeration;
import java.util.NoSuchElementException;

/**
 * Parses one line of a delimited (transactional) text file into tokens.
 *
 * <p>A line is split at the {@link #columnSeperator separator} character.
 * Fields may optionally be enclosed in a {@link #columnQuote quote}
 * character, in which case separators inside the quotes belong to the field
 * and a doubled quote stands for one literal quote character. After parsing,
 * the tokens can be read sequentially ({@link #hasMoreTokens()} /
 * {@link #nextToken()}) or by 1-based index ({@link #getToken(int)}).
 *
 * <p>Instances carry mutable parse state; every call to
 * {@link #parseString(String)} replaces the previously parsed tokens.
 */
public class ParseFileString {
  /**
   * Candidate separator characters, tried in this order by the automatic
   * detection ({@link #findParameters(String[])} and
   * {@link #searchForSeperator(String[])}).
   */
  protected char testChars[] = {',', ';', '\t', ' ', ':', '.'};

  /**
   * Character the string is split by; <code>'\0'</code> means "no separator",
   * i.e. the whole line is one token.
   */
  protected char columnSeperator = '\0';

  /**
   * Quote character; <code>'\0'</code> disables quote handling.
   */
  protected char columnQuote = '\0';

  /**
   * Zero-based index of the token the next {@link #nextToken()} call returns.
   */
  protected int pos = 0;

  /**
   * Tokens of the most recently parsed line; <code>null</code> after
   * <code>parseString(null)</code>.
   */
  private Vector<String> tokens = new Vector<String>();

  /**
   * The trimmed line that was parsed most recently.
   */
  private String m_parseString = "";

  /**
   * Candidate quote characters, tried in this order by
   * {@link #findParameters(String[])} before the "no quote" setting.
   */
  char automaticQuoteC[] = { '\"', '\'', '`' };

  /**
   * Empty constructor; separator and quote stay unset (<code>'\0'</code>).
   */
  public ParseFileString() {
  }

  /**
   * Constructor runs parsing by calling
   * the {@link #init(String, char, char) init method} without a quote
   * character.
   *
   * @param str The string that shall be parsed.
   * @param seperator The character the string shall be split by.
   */
  public ParseFileString(String str, char seperator) {
    this.init(str, seperator, '\0');
  }

  /**
   * Constructor runs parsing by calling
   * the {@link #init(String, char, char) init method}.
   *
   * @param str The string that shall be parsed.
   * @param seperator The character the string shall be split by.
   * @param quotes Quotation character.
   */
  public ParseFileString(String str, char seperator, char quotes) {
    this.init(str, seperator, quotes);
  }

  /**
   * Returns the character which the string is split by.
   *
   * @return separator character.
   */
  public char getColumnSeperator() {
    return columnSeperator;
  }

  /**
   * Sets the character which the string is split by.
   *
   * @param seperator new separator.
   */
  public void setColumnSeperator(char seperator) {
    this.columnSeperator = seperator;
  }

  /**
   * Returns the quotation character.
   *
   * @return quotation character.
   */
  public char getColumnQuotes() {
    return columnQuote;
  }

  /**
   * Sets the quotation character.
   *
   * @param quotes new quotation character.
   */
  public void setColumnQuotes(char quotes) {
    this.columnQuote = quotes;
  }

  /**
   * Tries to detect separator and quote character from sample lines.
   *
   * <p>Every combination of a candidate separator ({@link #testChars}) and a
   * candidate quote ({@link #automaticQuoteC}, followed by "no quote") is
   * used to parse all non-empty sample lines. For each combination the most
   * frequent token count is determined; the combination whose most frequent
   * token count occurs on the largest number of lines wins, provided that
   * token count is greater than 1. On ties the combination tried first is
   * kept.
   *
   * <p>On success the detected characters are stored as separator and quote
   * of this instance; otherwise both are left unchanged. In either case the
   * token state of this instance is overwritten by the trial parses.
   *
   * @param testArray sample lines; <code>null</code> or empty entries are
   *        ignored.
   *
   * @return <b>true</b> if a separator was detected and stored; <b>false</b>
   *         if <code>testArray</code> is <code>null</code>/empty or no
   *         combination splits the lines into more than one token.
   */
  public boolean findParameters(String[] testArray) {
    if (testArray == null || testArray.length < 1) {
      return false;
    }

    final int quoteCandidates = this.automaticQuoteC.length;
    char bestDelim = '\0';
    char bestQuote = '\0';
    int bestCount = 0;

    for (int iTestC = 0; iTestC < this.testChars.length; iTestC++) {
      final char testDelim = this.testChars[iTestC];

      // One extra round after the real quote candidates: '\0' = no quoting.
      for (int iTestQ = 0; iTestQ <= quoteCandidates; iTestQ++) {
        final char testQuote =
            (iTestQ < quoteCandidates) ? this.automaticQuoteC[iTestQ] : '\0';

        // token count -> number of sample lines producing that count
        Hashtable<Integer, Integer> histogram =
            new Hashtable<Integer, Integer>(testArray.length);

        for (int iTestL = 0; iTestL < testArray.length; iTestL++) {
          String line = testArray[iTestL];
          if (line == null || line.equals("")) {
            continue;
          }
          Integer key = Integer.valueOf(countTokensWith(line, testDelim, testQuote));
          Integer seen = histogram.get(key);
          histogram.put(key, Integer.valueOf(seen == null ? 1 : seen.intValue() + 1));
        }

        // Find the token count shared by the most lines.
        int bestKey = -1;
        int bestKeyVal = 0;
        Enumeration<Integer> keys = histogram.keys();
        while (keys.hasMoreElements()) {
          Integer localKey = keys.nextElement();
          int localVal = histogram.get(localKey).intValue();
          if (localVal > bestKeyVal) {
            bestKey = localKey.intValue();
            bestKeyVal = localVal;
          }
        }

        // A token count of 1 means the separator did not split anything.
        if (bestKeyVal > bestCount && bestKey > 1) {
          bestDelim = testDelim;
          bestQuote = testQuote;
          bestCount = bestKeyVal;
        }
      }
    }

    if (bestDelim != '\0') {
      this.columnSeperator = bestDelim;
      this.columnQuote = bestQuote;
      return true;
    }
    return false;
  }

  /**
   * Parses <code>line</code> with a temporary separator/quote pair and
   * returns the resulting token count; the configured separator and quote are
   * restored afterwards, even if parsing throws.
   */
  private int countTokensWith(String line, char delim, char quote) {
    final char savedDelim = this.columnSeperator;
    final char savedQuote = this.columnQuote;
    this.columnSeperator = delim;
    this.columnQuote = quote;
    try {
      this.parseString(line);
    } finally {
      this.columnSeperator = savedDelim;
      this.columnQuote = savedQuote;
    }
    return this.countTokens();
  }

  /**
   * Parses the specified string by using the {@link #columnSeperator separator}
   * and the {@link #columnQuote quotation} character.
   *
   * <p>Behaviour in detail:
   * <ul>
   * <li><code>null</code> input discards the token list; afterwards
   *     {@link #countTokens()} is 0.</li>
   * <li>The line is trimmed before splitting, so leading and trailing
   *     whitespace (including a whitespace separator such as a tab) is
   *     removed first.</li>
   * <li>If no separator is set (<code>'\0'</code>) the trimmed line is the
   *     only token.</li>
   * <li>Unquoted fields are trimmed; quoted fields keep their content
   *     verbatim. A quote directly following a closed quote re-opens the
   *     field and contributes one literal quote character. Spaces after a
   *     closing quote are skipped.</li>
   * <li>Empty fields between separators are kept, but a trailing separator
   *     does not produce a trailing empty token.</li>
   * <li>If a quote is still open at the end of the line, the field being
   *     read is dropped; if no token was produced at all, the whole trimmed
   *     line becomes the single token.</li>
   * </ul>
   *
   * @param str String that shall be parsed.
   */
  public void parseString(String str) {
    tokens = null;
    if (str == null) {
      return;
    }

    str = str.trim();
    tokens = new Vector<String>();
    pos = 0;
    m_parseString = str;

    if (columnSeperator == '\0') {
      tokens.add(str);
      return;
    }

    final int strLength = m_parseString.length();
    final StringBuilder token = new StringBuilder();
    boolean inQuote = false;     // currently between an opening and closing quote
    boolean newField = false;    // current field has content that must be emitted
    boolean wasInQuote = false;  // current field was opened with a quote

    for (int i = 0; i < strLength; i++) {
      final char c = m_parseString.charAt(i);

      if (columnQuote != '\0' && c == columnQuote) {
        if (inQuote) {
          // closing quote
          inQuote = false;
        } else if (wasInQuote) {
          // quote right after a closed quote: escaped quote, field continues
          inQuote = true;
          token.append(columnQuote);
          newField = true;
        } else if (token.length() < 1) {
          // opening quote at the start of a field
          inQuote = true;
          newField = true;
          wasInQuote = true;
        } else {
          // quote in the middle of an unquoted field is ordinary text
          token.append(c);
        }
      } else if (c == columnSeperator && !inQuote) {
        tokens.add(finishToken(token, wasInQuote));
        token.setLength(0);
        newField = false;
        wasInQuote = false;
      } else if (!inQuote && c == ' ' && (wasInQuote || token.length() < 1)) {
        // skip spaces after a closing quote and leading spaces of a field
      } else {
        token.append(c);
        newField = true;
      }
    }

    if (newField && !inQuote) {
      tokens.add(finishToken(token, wasInQuote));
    }

    // Nothing could be split off: fall back to the whole line as one token.
    if (tokens.size() < 1 && m_parseString.length() > 0) {
      tokens.add(m_parseString);
    }
  }

  /**
   * Returns the collected field text; unquoted fields are trimmed, quoted
   * fields are returned verbatim.
   */
  private static String finishToken(StringBuilder token, boolean wasInQuote) {
    String text = token.toString();
    return wasInQuote ? text : text.trim();
  }

  /**
   * Determines whether the specified character occurs inside one of the
   * strings, ignoring the first and the last character of each string.
   *
   * @param str String array that contains one or more lines;
   *        <code>null</code> entries are skipped.
   * @param c Character that will be searched inside the string array.
   *
   * @return <b>true</b> if one String of the String array <code>str</code>
   *         contains the character <code>c</code> at an inner position;
   *         otherwise <b>false</b>.
   */
  private boolean isSeperatorIn(String[] str, char c) {
    for (int i = 0; i < str.length; i++) {
      String line = str[i];
      if (line == null) {
        continue;
      }
      // A separator at the very beginning or end does not separate anything.
      for (int j = 1; j < line.length() - 1; j++) {
        if (line.charAt(j) == c) {
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Searches for the separator in the specified String array: the first
   * character of {@link #testChars} that occurs at an inner position of any
   * line is returned.
   *
   * @param str String array for which the separator is searched for.
   *
   * @return the separator character; if no one could be found <b>'\0'</b>.
   *
   * @throws IllegalArgumentException if <code>str</code> is <code>null</code>.
   */
  char searchForSeperator(String[] str) throws IllegalArgumentException, UnknownError {
    if (str == null) {
      throw new IllegalArgumentException("Parameter must not be null");
    }

    for (int i = 0; i < testChars.length; i++) {
      if (isSeperatorIn(str, testChars[i])) {
        return testChars[i];
      }
    }
    return '\0';
  }

  /**
   * Determines whether sequential reading has more tokens left.
   *
   * @return <b>true</b> if more elements left; otherwise <b>false</b>
   */
  public boolean hasMoreTokens() {
    return tokens != null && pos < tokens.size();
  }

  /**
   * Returns the next token and advances the read position.
   *
   * @return next element.
   *
   * @throws NoSuchElementException if no token is left.
   */
  public String nextToken() throws NoSuchElementException {
    if (!hasMoreTokens()) {
      throw new NoSuchElementException();
    }

    String token = tokens.elementAt(pos);
    pos++;
    if (token == null) {
      throw new NoSuchElementException();
    }
    return token;
  }

  /**
   * Returns the number of tokens of the last parsed line.
   *
   * @return number of elements; 0 if nothing was parsed.
   */
  public int countTokens() {
    return (tokens != null) ? tokens.size() : 0;
  }

  /**
   * Returns the token with the specified index.
   *
   * @param element index of the element; '1' is the first index.
   *
   * @return element as String value.
   *
   * @throws NoSuchElementException if the index is out of range or no token
   *         list exists (e.g. after parsing <code>null</code>).
   */
  public String getToken(int element) throws NoSuchElementException {
    // tokens is null after parseString(null); report that as "no such element"
    // like an out-of-range index instead of failing with a NullPointerException.
    if (tokens == null || element <= 0 || element > tokens.size()) {
      throw new NoSuchElementException();
    }

    String token = tokens.elementAt(element - 1);
    if (token == null) {
      throw new NoSuchElementException();
    }
    return token;
  }

  /**
   * Sets the separator and quotation character and runs parsing.
   *
   * @param str Line string that shall be parsed.
   * @param seperator Separator character.
   * @param quotes Quotation character.
   *
   * @see #setColumnSeperator(char)
   * @see #setColumnQuotes(char)
   * @see #parseString(String)
   */
  public void init(String str, char seperator, char quotes) {
    this.setColumnSeperator(seperator);
    this.setColumnQuotes(quotes);
    this.parseString(str);
  }

  /**
   * Main routine for testing: detects a separator from sample lines, parses
   * a quoted sample line and prints the resulting tokens.
   */
  public final static void main(String argv[]) {
    ParseFileString ps = new ParseFileString();
    ps.setColumnSeperator(ps.searchForSeperator(new String[]{"test","\"Dies\" \"ist\" \"ein\" \"Test b\"", "Test"}));
    System.out.println(":"+ps.getColumnSeperator()+":");
    ps.setColumnQuotes('\"');
    ps.parseString("\"Dies\"\t\"ist\"\t\"ein\"\t\"Test\tb\"");
    System.out.println(ps.countTokens() + " tokens found.");
    for (int i = 1; i <= ps.countTokens(); i++) {
      System.out.println(":" + ps.getToken(i) + ":");
    }
  }

}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -