⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 miningexcelstream.java

📁 一个数据挖掘软件ALPHAMINERR的整个过程的JAVA版源代码
💻 JAVA
📖 第 1 页 / 共 2 页
字号:
/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/**
 * Title: XELOPES Data Mining Library
 * Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
 * Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
 * Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
 * @author Michael Thess
 * @version 1.2
 */

package com.prudsys.pdm.Input.Records.Excel;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;

import org.apache.poi.hssf.record.RecordFormatException;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.omg.cwm.objectmodel.core.CorePackage;
import org.omg.cwm.objectmodel.core.DataType;
import org.omg.cwm.resource.record.Field;
import org.omg.cwm.resource.record.RecordDef;
import org.omg.cwm.resource.record.RecordFile;
import org.omg.cwm.resource.record.RecordPackage;

import com.prudsys.pdm.Common.Constants;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningArrayStream;
import com.prudsys.pdm.Input.MiningFileStream;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Utils.IntVector;

/**
 * Extends MiningFileStream for Excel files.
 */
public class MiningExcelStream extends MiningFileStream {
	// -----------------------------------------------------------------------
	//  Variables declarations
	// -----------------------------------------------------------------------

	//<<Frank J. Xu, 16/02/2005
	//Add function to update the type of categorical data.
	//protected int m_BoundedThreshold = 20;
	//>>Frank J. Xu, 16/02/2005

	/** Input stream for Excel file. */
	protected InputStream input;

	/** POI file system. */
	protected POIFSFileSystem fs;

	/** Excel workbook. */
	protected HSSFWorkbook wb;

	/** Excel sheet. */
	protected HSSFSheet sheet;

	/** Array stream containing the Excel table data. */
	protected MiningArrayStream dataTable;

	// -----------------------------------------------------------------------
	//  Constructors
	// -----------------------------------------------------------------------
	/**
	 * Creates an Excel stream that is not yet bound to any file.
	 * <p>
	 * Nothing is opened here: input stream, POI file system, workbook and
	 * sheet all keep their default values.
	 */
	public MiningExcelStream() {
		super();
	}

	/**
	 * Mining Excel stream for a given Excel file and given meta data. The
	 * first sheet (index 0) of the workbook is selected.
	 * <p>
	 * The meta data is stored exactly as passed. If it is <code>null</code>,
	 * this constructor does NOT recognize it automatically; the stream is
	 * left without meta data.
	 * 
	 * @param dataFileName
	 *            path of Excel file to access
	 * @param metaData
	 *            meta data of file data, may be <code>null</code>
	 * @exception MiningException
	 *                if the file cannot be opened or loaded as an Excel
	 *                workbook
	 */
	public MiningExcelStream(String dataFileName,
			MiningDataSpecification metaData) throws MiningException {
		// Open Excel file:
		boolean opened = false;
		try {
			input = new FileInputStream(dataFileName);
			fs = new POIFSFileSystem(input);
			wb = new HSSFWorkbook(fs);
			sheet = wb.getSheetAt(0);
			opened = true;
		} catch (Exception ex) {
			// Covers I/O errors as well as runtime failures raised by POI
			// (e.g. RecordFormatException for files with filters enabled,
			// 06Jun2005 Mark Li). All are reported to the caller the same way.
			throw new MiningException("Can't read from the file: "
					+ dataFileName);
		} finally {
			// Do not leak the file handle if the workbook could not be loaded.
			if (!opened && input != null) {
				try {
					input.close();
				} catch (IOException ignored) {
					// The original failure is what gets reported.
				}
				input = null;
			}
		}
		this.fileName = dataFileName;

		// Store meta data; automatic recognition is disabled in this
		// constructor, so a null argument stays null.
		this.metaData = metaData;
	}

	/**
	 * Mining Excel stream for a given sheet of an Excel file and given meta
	 * data.
	 * <p>
	 * If no meta data is passed, it is recognized from the sheet by calling
	 * {@link #recognize(int)}.
	 * 
	 * @param dataFileName
	 *            path of Excel file to access
	 * @param metaData
	 *            meta data of file data, or <code>null</code> to have it
	 *            recognized
	 * @param a_sheetIndex
	 *            index of the workbook sheet to read
	 * @exception MiningException
	 *                if the file cannot be opened or loaded as an Excel
	 *                workbook, the sheet cannot be selected, or the meta data
	 *                cannot be recognized
	 */
	public MiningExcelStream(String dataFileName,
			MiningDataSpecification metaData, int a_sheetIndex)
			throws MiningException {
		// Open Excel file:
		boolean opened = false;
		try {
			input = new FileInputStream(dataFileName);
			fs = new POIFSFileSystem(input);
			wb = new HSSFWorkbook(fs);
			sheet = wb.getSheetAt(a_sheetIndex);
			opened = true;
		} catch (Exception ex) {
			// Covers I/O errors as well as runtime failures raised by POI
			// (e.g. RecordFormatException for files with filters enabled,
			// 06Jun2005 Mark Li) and a sheet index that cannot be resolved.
			throw new MiningException("Can't read from the file: "
					+ dataFileName);
		} finally {
			// Do not leak the file handle if the workbook could not be loaded.
			if (!opened && input != null) {
				try {
					input.close();
				} catch (IOException ignored) {
					// The original failure is what gets reported.
				}
				input = null;
			}
		}
		this.fileName = dataFileName;

		// Get meta data (and read data matrix):
		this.metaData = metaData;
		if (metaData == null) {
			// recognize() fills the metaData field itself when it is null;
			// its return value is therefore not needed here.
			recognize(a_sheetIndex);
		}
	}

	/**
	 * Mining Excel stream for a given Excel file without meta data.
	 * <p>
	 * Delegates to the two-argument constructor, passing <code>null</code>
	 * as meta data.
	 * 
	 * @param dataFileName
	 *            path of Excel file to access
	 * @exception MiningException
	 *                couldn't create excel stream
	 */
	public MiningExcelStream(String dataFileName) throws MiningException {
		this(dataFileName, (MiningDataSpecification) null);
	}

	// -----------------------------------------------------------------------
	//  Getter and setter methods
	// -----------------------------------------------------------------------
	/**
	 * Lists the names of the stream methods this stream supports.
	 * 
	 * @return enumeration of the method names "recognize", "reset" and
	 *         "move", in that order
	 */
	public Enumeration getSupportedStreamMethods() {
		final String[] methodNames = { "recognize", "reset", "move" };

		final Vector supported = new Vector(methodNames.length);
		for (int i = 0; i < methodNames.length; i++) {
			supported.add(methodNames[i]);
		}
		return supported.elements();
	}

	/**
	 * Finds physical file model (CWM Resource Package "Record").
	 * <p>
	 * Creates one RecordFile named after the Excel file and adds one
	 * RecordDef per workbook sheet, named after the sheet. Every cell of a
	 * sheet's first row becomes a Field of that RecordDef; the fields are
	 * added in ascending column order and columns without a cell in the first
	 * row are skipped. A sheet without rows keeps an empty RecordDef. The
	 * result is stored in <code>physicalModel</code>.
	 * 
	 * @exception MiningException
	 *                couldn't obtain physical model: no workbook is opened or
	 *                a header cell reports a negative column index
	 */
	public void findPhysicalModel() throws MiningException {
		// The empty constructor leaves the workbook unset; report that as a
		// MiningException instead of failing with a NullPointerException.
		if (wb == null)
			throw new MiningException("no Excel workbook opened");

		// Factory methods:
		com.prudsys.pdm.Cwm.CWMCompletePackage cwmFactory = com.prudsys.pdm.Cwm.CWMCompletePackage
				.getCWMCompletePackage();
		CorePackage cpg = cwmFactory.getCore();
		RecordPackage rpg = cwmFactory.getRecord();

		// Create RecordFile:
		RecordFile rfile = rpg.getRecordFile().createRecordFile();
		rfile.setName(fileName);

		// Iterate over all sheets (==> RecordDef's):
		int nsheet = wb.getNumberOfSheets();
		for (int i = 0; i < nsheet; i++) {
			// Get sheet and create RecordDef:
			HSSFSheet sh = wb.getSheetAt(i);
			RecordDef recdef = rpg.getRecordDef().createRecordDef();
			recdef.setName(wb.getSheetName(i));
			rfile.addRecord(recdef);

			// Get first row for determining meta data:
			Iterator rows = sh.rowIterator();
			if (!rows.hasNext())
				continue;
			HSSFRow firstRow = (HSSFRow) rows.next();

			// Iterate over all cells of first row. header collects the fields
			// in iteration order, ih the column index of each of them.
			Vector header = new Vector();
			IntVector ih = new IntVector();
			int cmax = 0;
			Iterator cells = firstRow.cellIterator();
			while (cells.hasNext()) {
				HSSFCell cell = (HSSFCell) cells.next();
				int icell = cell.getCellNum();
				if (icell < 0)
					throw new MiningException("illegal index of cell");
				if (icell > cmax)
					cmax = icell;

				// NOTE(review): name and type are both taken from the header
				// cell itself, not from the data rows below it. Whether
				// getStringCellValue() accepts a numeric header cell depends
				// on the POI version -- confirm.
				Field field = rpg.getField().createField();
				field.setName(cell.getStringCellValue());
				DataType dataType = cpg.getDataType().createDataType();
				String dtname = (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) ? "numeric"
						: "categorical";
				dataType.setName(dtname);
				field.setType(dataType);
				header.addElement(field);
				ih.addElement(icell);
			}

			// Calculate inverse mapping: column index -> position in header,
			// -1 for columns that have no cell in the first row.
			int ncell = cmax + 1;
			int[] ihInv = new int[ncell];
			for (int j = 0; j < ncell; j++)
				ihInv[j] = -1;
			for (int j = 0; j < ih.size(); j++)
				ihInv[ih.IntegerAt(j)] = j;

			// Add fields to RecordDef in column order (ignore empty fields):
			for (int j = 0; j < ncell; j++) {
				int ifield = ihInv[j];
				if (ifield > -1)
					recdef.addFeature((Field) header.elementAt(ifield));
			}
		}

		physicalModel = rfile;
	}

	// -----------------------------------------------------------------------
	//  General stream methods
	// -----------------------------------------------------------------------
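	/**
	 * Reads and stores meta data of Excel table.
	 * 
	 * @param a_sheetIndex
	 *            index of the workbook sheet whose name is used as relation
	 *            name of the meta data
	 * @return meta data of Excel table
	 * @exception MiningException
	 *                if the information is not read successfully
	 */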
	public MiningDataSpecification recognize(int a_sheetIndex) throws MiningException {
		if (metaData == null) {
			// Ceate new meta data object:
			metaData = new MiningDataSpecification();
			metaData.setRelationName(wb.getSheetName(a_sheetIndex));

			// Get first row to check field names:
			Iterator rows = sheet.rowIterator();
			if (!rows.hasNext())
				throw new MiningException("sheet '"
						+ this.getSheetName(a_sheetIndex) + "' has no rows");
						//+ metaData.getRelationName() + "' has no rows"); remarked by Joyce 2005/04/12
			HSSFRow firstRow = (HSSFRow) rows.next();

			// Iterate through all cells of first row which are the attributes:
			Iterator cells = firstRow.cellIterator();
			Vector header = new Vector();
			IntVector ih = new IntVector();

			cells = firstRow.cellIterator();
			int cmax = 0;

			/*
			 * Commented by Franky Chan 23/02/05 Fix a bug in xelopes while (
			 * cells.hasNext() ) { HSSFCell cell = (HSSFCell) cells.next(); int
			 * icell = cell.getCellNum(); if (icell < 0) throw new
			 * MiningException("illegal index of cell"); if (icell > cmax) cmax =
			 * icell;
			 * 
			 * System.out.println("string value="+cell.getStringCellValue());
			 * 
			 * MiningAttribute matt; if ( cell.getCellType() ==
			 * HSSFCell.CELL_TYPE_NUMERIC ) { matt = new NumericAttribute(); }
			 * else { matt = new CategoricalAttribute();
			 * ((CategoricalAttribute)matt).setUnboundedCategories(true); };
			 * matt.setName( cell.getStringCellValue() );
			 * header.addElement(matt); ih.addElement(icell); };
			 */

			/*
			 * Added by Franky Chan 23/02/05 Fix a bug in xelopes
			 */
			Vector colName = new Vector();
			Hashtable colType = new Hashtable();

			Vector mycellidex = new Vector(); // added by Joyce 2005/02/24
			// Check 1st row to determine the field name
			while (cells.hasNext()) {
				HSSFCell cell = (HSSFCell) cells.next();
				int icell = cell.getCellNum();
				if (icell < 0)
					throw new MiningException("illegal index of cell");
				if (icell > cmax)
					cmax = icell;
				mycellidex.add(new Integer(icell)); // added by Joyce 2005/02/24
				colName.add(new Integer(icell));
				colName.add(cell.getStringCellValue());
				ih.addElement(icell);
			}

			MiningAttribute matt;
			// Check data type and boundary condition using 2nd row
			//cell.getCellNum();

			if (rows.hasNext()) {
				HSSFRow secondRow = (HSSFRow) rows.next();
				cells = secondRow.cellIterator();

				if (cells.hasNext()) {

					int lastcell = 0;
					//  Vector addedcell = new Vector();
					while (cells.hasNext()) {

						HSSFCell cell = (HSSFCell) cells.next();
						int icell = cell.getCellNum();
						if (icell < 0)
							throw new MiningException("illegal index of cell");
						//mycellidex.add(new Integer(icell)); // added by Joyce
						// 2005/02/24
						if (icell > cmax)
							cmax = icell;
						
						if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) 
						{
							// Added by Kelvin Jor to recognize date cell
							if (HSSFDateUtil.isCellDateFormatted(cell))
							{
								colType.put(new Integer(icell), new Integer(
									Constants.CELL_TYPE_DATE));
							}else
							{
							//                    	colType.put()
								colType.put(new Integer(icell), new Integer(
									HSSFCell.CELL_TYPE_NUMERIC));
							}
							//                        matt = new NumericAttribute();
						} else {
							colType.put(new Integer(icell), new Integer(
									HSSFCell.CELL_TYPE_STRING));
						}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -