// File: miningexcelstream.java
// (Residue of the web code viewer this copy was taken from: "Font size:" control label.)
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Michael Thess
* @version 1.2
*/
package com.prudsys.pdm.Input.Records.Excel;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Vector;
import org.apache.poi.hssf.record.RecordFormatException;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFDateUtil;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.omg.cwm.objectmodel.core.CorePackage;
import org.omg.cwm.objectmodel.core.DataType;
import org.omg.cwm.resource.record.Field;
import org.omg.cwm.resource.record.RecordDef;
import org.omg.cwm.resource.record.RecordFile;
import org.omg.cwm.resource.record.RecordPackage;
import com.prudsys.pdm.Common.Constants;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Core.NumericAttribute;
import com.prudsys.pdm.Input.MiningArrayStream;
import com.prudsys.pdm.Input.MiningFileStream;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Utils.IntVector;
/**
* Extends MiningFileStream for Excel files.
*/
public class MiningExcelStream extends MiningFileStream {
// -----------------------------------------------------------------------
// Variables declarations
// -----------------------------------------------------------------------
//<<Frank J. Xu, 16/02/2005
//Add function to update the type of categorical data.
//protected int m_BoundedThreshold = 20;
//>>Frank J. Xu, 16/02/2005
/** Input stream for Excel file; the constructors open it as a FileInputStream on the data file. */
protected InputStream input;
/** POI file system created from {@link #input}. */
protected POIFSFileSystem fs;
/** Excel workbook created from {@link #fs}. */
protected HSSFWorkbook wb;
/** Excel sheet of {@link #wb} selected by the constructor (sheet 0 or the requested sheet index). */
protected HSSFSheet sheet;
/** Array stream containing the Excel table data. */
protected MiningArrayStream dataTable;
// -----------------------------------------------------------------------
// Constructors
// -----------------------------------------------------------------------
/**
 * Creates an Excel stream that is not bound to any file yet.
 * This constructor performs no initialization of its own: no file is
 * opened and no workbook, sheet or meta data is assigned.
 */
public MiningExcelStream() {
    // Intentionally left empty.
}
/**
 * Mining Excel stream for a given Excel file and given meta data.
 * <p>
 * Opens the file, creates the POI file system and workbook from it and
 * selects the first sheet (index 0). The supplied meta data is stored as
 * given; no recognition is performed by this constructor, even when
 * <code>metaData</code> is <code>null</code>.
 *
 * @param dataFileName
 *            path of Excel file to access
 * @param metaData
 *            meta data of file data, may be <code>null</code>
 * @exception MiningException
 *                couldn't create excel stream (any failure while opening
 *                the file, parsing the workbook or selecting the sheet)
 */
public MiningExcelStream(String dataFileName,
        MiningDataSpecification metaData) throws MiningException {
    // Open Excel file:
    try {
        input = new FileInputStream(dataFileName);
        fs = new POIFSFileSystem(input);
        wb = new HSSFWorkbook(fs);
        sheet = wb.getSheetAt(0);
    } catch (Exception ex) {
        // A single handler replaces the former IOException /
        // RecordFormatException / Exception handlers, which were identical.
        // (06Jun2005 Mark Li: Excel files with filters enabled used to let
        // an exception escape here.)
        //
        // Release the file handle: the object under construction is
        // discarded, so nobody else could ever close this stream.
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Nothing more can be done; the original failure is reported below.
            }
        }
        // NOTE(review): the underlying cause is dropped here; if MiningException
        // offers a cause-taking constructor, pass 'ex' along - confirm.
        throw new MiningException("Can't read from the file: "
                + dataFileName);
    }
    this.fileName = dataFileName;

    // Store the meta data as given. No recognition is triggered for null
    // meta data: the former recognize() call was already commented out.
    this.metaData = metaData;
}
/**
 * Mining Excel stream for a given sheet of an Excel file and given meta data.
 * <p>
 * Opens the file, creates the POI file system and workbook from it and
 * selects the sheet at <code>a_sheetIndex</code>. When <code>metaData</code>
 * is <code>null</code>, {@link #recognize(int)} is called for that sheet.
 *
 * @param dataFileName
 *            path of Excel file to access
 * @param metaData
 *            meta data of file data; if <code>null</code>, recognition is
 *            run on the selected sheet
 * @param a_sheetIndex
 *            index of the workbook sheet to read
 * @exception MiningException
 *                couldn't create excel stream (any failure while opening
 *                the file, parsing the workbook or selecting the sheet),
 *                or recognition failed
 */
public MiningExcelStream(String dataFileName,
        MiningDataSpecification metaData, int a_sheetIndex)
        throws MiningException {
    // Open Excel file:
    try {
        input = new FileInputStream(dataFileName);
        fs = new POIFSFileSystem(input);
        wb = new HSSFWorkbook(fs);
        sheet = wb.getSheetAt(a_sheetIndex);
    } catch (Exception ex) {
        // A single handler replaces the former IOException /
        // RecordFormatException / Exception handlers, which were identical.
        // (06Jun2005 Mark Li: Excel files with filters enabled used to let
        // an exception escape here.)
        //
        // Release the file handle: the object under construction is
        // discarded, so nobody else could ever close this stream.
        if (input != null) {
            try {
                input.close();
            } catch (IOException ignored) {
                // Nothing more can be done; the original failure is reported below.
            }
        }
        // NOTE(review): the underlying cause is dropped here; if MiningException
        // offers a cause-taking constructor, pass 'ex' along - confirm.
        throw new MiningException("Can't read from the file: "
                + dataFileName);
    }
    this.fileName = dataFileName;

    // Get meta data:
    this.metaData = metaData;
    if (metaData == null) {
        // recognize() fills this.metaData itself when the field is null.
        // The return value used to be stored into the parameter only,
        // which was a dead store, so it is simply not kept.
        recognize(a_sheetIndex);
    }
}
/**
 * Mining Excel stream for a given Excel file without predefined meta data.
 * <p>
 * Delegates to the (file name, meta data) constructor, passing
 * <code>null</code> as meta data.
 *
 * @param dataFileName
 *            path of Excel file to access
 * @exception MiningException
 *                couldn't create excel stream
 */
public MiningExcelStream(String dataFileName) throws MiningException {
    this(dataFileName, (MiningDataSpecification) null);
}
// -----------------------------------------------------------------------
// Getter and setter methods
// -----------------------------------------------------------------------
/**
 * Lists the names of the stream methods this stream supports.
 *
 * @return enumeration over the supported method names, in the order
 *         "recognize", "reset", "move"
 */
public Enumeration getSupportedStreamMethods() {
    final String[] methodNames = { "recognize", "reset", "move" };
    Vector supported = new Vector();
    for (int k = 0; k < methodNames.length; k++) {
        supported.addElement(methodNames[k]);
    }
    return supported.elements();
}
/**
 * Finds physical file model (CWM Resource Package "Record").
 * <p>
 * Creates one RecordFile named after the Excel file and, for every sheet
 * of the workbook, one RecordDef named after the sheet. Each cell of a
 * sheet's first row becomes a Field of that RecordDef, added in ascending
 * column order; the field type is named "numeric" for numeric cells and
 * "categorical" otherwise. A sheet without rows yields a RecordDef without
 * fields. The result is stored in <code>physicalModel</code>.
 *
 * @exception MiningException
 *                couldn't obtain physical model (a first-row cell reported
 *                a negative column index)
 */
public void findPhysicalModel() throws MiningException {
    // Factory methods:
    com.prudsys.pdm.Cwm.CWMCompletePackage cwmFactory =
            com.prudsys.pdm.Cwm.CWMCompletePackage.getCWMCompletePackage();
    CorePackage cpg = cwmFactory.getCore();
    RecordPackage rpg = cwmFactory.getRecord();

    // Create RecordFile:
    RecordFile rfile = rpg.getRecordFile().createRecordFile();
    rfile.setName(fileName);

    // Iterate over all sheets (==> RecordDef's):
    int nsheet = wb.getNumberOfSheets();
    for (int i = 0; i < nsheet; i++) {
        // Get sheet and create RecordDef:
        HSSFSheet sh = wb.getSheetAt(i);
        RecordDef recdef = rpg.getRecordDef().createRecordDef();
        recdef.setName(wb.getSheetName(i));
        rfile.addRecord(recdef);

        // Get first row for determining meta data; a sheet without rows
        // keeps its (empty) RecordDef.
        Iterator rows = sh.rowIterator();
        if (!rows.hasNext()) {
            continue;
        }
        HSSFRow firstRow = (HSSFRow) rows.next();

        // Iterate over all cells of first row:
        Vector header = new Vector();   // fields, in iteration order
        IntVector ih = new IntVector(); // column index of each field in 'header'
        int cmax = 0;                   // largest column index seen
        Iterator cells = firstRow.cellIterator();
        while (cells.hasNext()) {
            HSSFCell cell = (HSSFCell) cells.next();
            int icell = cell.getCellNum();
            if (icell < 0) {
                throw new MiningException("illegal index of cell");
            }
            if (icell > cmax) {
                cmax = icell;
            }

            Field field = rpg.getField().createField();
            // NOTE(review): depending on the POI version, getStringCellValue()
            // may reject non-string cells; if so, the "numeric" branch below
            // can never be reached for such a cell - confirm.
            field.setName(cell.getStringCellValue());

            DataType dataType = cpg.getDataType().createDataType();
            String dtname;
            if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC) {
                dtname = "numeric";
            } else {
                dtname = "categorical";
            }
            dataType.setName(dtname);
            field.setType(dataType);

            header.addElement(field);
            ih.addElement(icell);
        }

        // Calculate inverse mapping: column index -> position in 'header'
        // (-1 marks a column that has no cell in the first row).
        int ncell = cmax + 1;
        int[] ihInv = new int[ncell];
        for (int j = 0; j < ncell; j++) {
            ihInv[j] = -1;
        }
        for (int j = 0; j < ih.size(); j++) {
            ihInv[ih.IntegerAt(j)] = j;
        }

        // Add fields to RecordDef in column order (ignore empty fields):
        for (int j = 0; j < ncell; j++) {
            int headerPos = ihInv[j];
            if (headerPos > -1) {
                recdef.addFeature((Field) header.elementAt(headerPos));
            }
        }
    }
    physicalModel = rfile;
}
// -----------------------------------------------------------------------
// General stream methods
// -----------------------------------------------------------------------
/**
* Reads and stores meta data of Excel table.
*
* @return meta data of Excel table
* @exception MiningException
* if the information is not read successfully
*/
public MiningDataSpecification recognize(int a_sheetIndex) throws MiningException {
if (metaData == null) {
// Ceate new meta data object:
metaData = new MiningDataSpecification();
metaData.setRelationName(wb.getSheetName(a_sheetIndex));
// Get first row to check field names:
Iterator rows = sheet.rowIterator();
if (!rows.hasNext())
throw new MiningException("sheet '"
+ this.getSheetName(a_sheetIndex) + "' has no rows");
//+ metaData.getRelationName() + "' has no rows"); remarked by Joyce 2005/04/12
HSSFRow firstRow = (HSSFRow) rows.next();
// Iterate through all cells of first row which are the attributes:
Iterator cells = firstRow.cellIterator();
Vector header = new Vector();
IntVector ih = new IntVector();
cells = firstRow.cellIterator();
int cmax = 0;
/*
* Commented by Franky Chan 23/02/05 Fix a bug in xelopes while (
* cells.hasNext() ) { HSSFCell cell = (HSSFCell) cells.next(); int
* icell = cell.getCellNum(); if (icell < 0) throw new
* MiningException("illegal index of cell"); if (icell > cmax) cmax =
* icell;
*
* System.out.println("string value="+cell.getStringCellValue());
*
* MiningAttribute matt; if ( cell.getCellType() ==
* HSSFCell.CELL_TYPE_NUMERIC ) { matt = new NumericAttribute(); }
* else { matt = new CategoricalAttribute();
* ((CategoricalAttribute)matt).setUnboundedCategories(true); };
* matt.setName( cell.getStringCellValue() );
* header.addElement(matt); ih.addElement(icell); };
*/
/*
* Added by Franky Chan 23/02/05 Fix a bug in xelopes
*/
Vector colName = new Vector();
Hashtable colType = new Hashtable();
Vector mycellidex = new Vector(); // added by Joyce 2005/02/24
// Check 1st row to determine the field name
while (cells.hasNext()) {
HSSFCell cell = (HSSFCell) cells.next();
int icell = cell.getCellNum();
if (icell < 0)
throw new MiningException("illegal index of cell");
if (icell > cmax)
cmax = icell;
mycellidex.add(new Integer(icell)); // added by Joyce 2005/02/24
colName.add(new Integer(icell));
colName.add(cell.getStringCellValue());
ih.addElement(icell);
}
MiningAttribute matt;
// Check data type and boundary condition using 2nd row
//cell.getCellNum();
if (rows.hasNext()) {
HSSFRow secondRow = (HSSFRow) rows.next();
cells = secondRow.cellIterator();
if (cells.hasNext()) {
int lastcell = 0;
// Vector addedcell = new Vector();
while (cells.hasNext()) {
HSSFCell cell = (HSSFCell) cells.next();
int icell = cell.getCellNum();
if (icell < 0)
throw new MiningException("illegal index of cell");
//mycellidex.add(new Integer(icell)); // added by Joyce
// 2005/02/24
if (icell > cmax)
cmax = icell;
if (cell.getCellType() == HSSFCell.CELL_TYPE_NUMERIC)
{
// Added by Kelvin Jor to recognize date cell
if (HSSFDateUtil.isCellDateFormatted(cell))
{
colType.put(new Integer(icell), new Integer(
Constants.CELL_TYPE_DATE));
}else
{
// colType.put()
colType.put(new Integer(icell), new Integer(
HSSFCell.CELL_TYPE_NUMERIC));
}
// matt = new NumericAttribute();
} else {
colType.put(new Integer(icell), new Integer(
HSSFCell.CELL_TYPE_STRING));
}
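// NOTE: the lines below are residue of the web code viewer this copy was taken
// from (its keyboard-shortcut help), not Java source; the class body appears to
// be truncated at this point.
// Keyboard shortcuts:
// Copy code: Ctrl + C
// Search code: Ctrl + F
// Full-screen mode: F11
// Toggle theme: Ctrl + Shift + D
// Show shortcuts: ?
// Increase font size: Ctrl + =
// Decrease font size: Ctrl + -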