📄 multidimensionalstream.java
字号:
/*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/**
* Title: XELOPES Data Mining Library
* Description: The XELOPES library is an open platform-independent and data-source-independent library for Embedded Data Mining.
* Copyright: Copyright (c) 2002 Prudential Systems Software GmbH
* Company: ZSoft (www.zsoft.ru), Prudsys (www.prudsys.com)
* @author Carsten Weisse
* @author Michael Thess
* @version 1.2
*/
package com.prudsys.pdm.Input.Multidimensional;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.Vector;
import org.omg.cwm.analysis.transformation.ClassifierFeatureMap;
import org.omg.cwm.analysis.transformation.ClassifierMap;
import org.omg.cwm.analysis.transformation.TransformationMap;
import org.omg.cwm.analysis.transformation.TransformationPackage;
import org.omg.cwm.objectmodel.core.CorePackage;
import org.omg.cwm.resource.multidimensional.Dimension;
import org.omg.cwm.resource.multidimensional.DimensionedObject;
import org.omg.cwm.resource.multidimensional.MultidimensionalPackage;
import org.omg.cwm.resource.multidimensional.Schema;
import com.prudsys.pdm.Core.CategoricalAttribute;
import com.prudsys.pdm.Core.Category;
import com.prudsys.pdm.Core.MiningAttribute;
import com.prudsys.pdm.Core.MiningDataSpecification;
import com.prudsys.pdm.Core.MiningException;
import com.prudsys.pdm.Input.MiningArrayStream;
import com.prudsys.pdm.Input.MiningInputStream;
import com.prudsys.pdm.Input.MiningVector;
import com.prudsys.pdm.Input.Predicates.CompoundPredicate;
import com.prudsys.pdm.Input.Predicates.Predicate;
import com.prudsys.pdm.Input.Predicates.SimplePredicate;
import com.prudsys.pdm.Input.Predicates.SimpleSetPredicate;
import com.prudsys.pdm.Input.Records.Csv.MiningCsvStream;
import com.prudsys.pdm.Utils.IntVector;
/**
* Multidimensional stream implementation. It takes an arbitrary
* mining input stream as input and creates data structures that
* allow fast selections and/or orderings to be run. The resulting stream
* works on the selected and ordered data. The ordering is applied
* to the selected data (or complete data if no selections defined). <p>
*
* The Multidimensional stream was created for three purposes:
* <ul>
* <li> <i>Fast selection/ordering of data and results</i>: The data sources
* of algorithms could be restricted through various selection conditions.
* On the other hand, selections and orderings could also be applied to
* mining models like association rules or classification rule sets.
* This brings OLAP flexibility into Data Mining.
* <li> <i>Dynamic selection/ordering inside algorithms</i>: Some algorithms
* like decision tree methods can be efficiently implemented using
* selections of the data. That is, decision tree algorithms can be
* directly combined with OLAP data sources. This can be realized
* via the Multidimensional stream; any algorithm that takes
* MultidimensionalStream as input can use its selection mechanisms.
* <li> <i>Physical model of OLAP package</i>: The new OLAP package
* of XELOPES (from JOLAP which extends the CWM OLAP package) defines
* the logical OLAP model. It requires a physical model for implementation.
* This could be based on a relational database (ROLAP) or on memory-based
* multidimensional systems (MOLAP). Next XELOPES Version 1.2.3 will
* contain an MOLAP implementation where the MultidimensionalStream will
* serve as physical model.
* </ul>
* The use of MultidimensionalStream is very easy. Its constructor takes
* a mining input stream as source. Then you have to call the method
* readMultidimensionalStreamData which creates the internal structure
* for fast selections. Next you can define selection conditions over
* categorical attributes using the SelectAttribute class which are
* passed to the method runSelections as argument. The method runs the
* selection and returns the set of mining vectors which meet the selection
* conditions. After that, all stream methods like reset, next, and read
* apply only to the selected data. <p>
* The method addSelection adds a new selection condition and runs the
* enlarged selection like runSelections. On the contrary, the method
* removeSelection removes a selection condition and runs the reduced
* selection. <p>
* Alternatively, you can define the selection conditions through a Predicate
* and use the method runSelectionsPredicate. This method works like
* runSelections but does not affect the selection conditions defined
* through SelectAttribute. Thus, it cannot be combined with the other
* selection methods described above. <p>
* Notice that just categorical attributes which are not of unstoredCategories
* type can be used for selections. Categorical attributes of unstoredCategories
* type and numeric attributes are taken "as is". <p>
* Ordering can be applied to the results of selections (or the complete
* stream if no selections are defined). The OrderAttribute class defines
* an attribute and its direction of ordering. It is passed through the
* setOrdering, addOrdering, removeOrdering methods to an internal list
* of ordered attributes (note that unlike the corresponding selection
* methods these methods don't run the ordering). Note further that the
* order of the ordering attributes is crucial for ordering, which starts
* from the first, through the second (for equal first attribute values)
* until the last one. The runOrdering method runs the actual ordering.
* After that, the stream method next corresponds to the new order of
* the mining vectors. <p>
* Ordering always works on the selected data. In turn, all selection methods
* call runOrdering at the end thus automatically applying the ordering
* to the selected vectors. If no ordering conditions were defined, no
* ordering is applied inside the selection methods. <p>
* Ordering can be applied to all types of attributes including numeric
* ones. However, ordering can only be applied to streams which support
* the move method. <p>
* About efficiency. Though this is a relatively simple implementation - which
* uses ideas from association rules algorithms - the selections run very
* fast even on large data, i.e. in linear time. More problematic is memory:
* The stream requires O(N*D) integers for its internal data structure
* where N is the number of mining vectors of the source stream and D
* is the number of categorical attributes (which are not of unstoredCategories
* type). For example, if an input stream contains 1 million mining vectors
* and 50 categorical attributes, then 1 million * 50 * 4 bytes = 200 MB are
* required for storing the selection structures; 10 million vectors require
* 10 million * 50 * 4 bytes = 2 GB! Although RAM may be cheap today, the
* MultidimensionalStream is not able to handle very large datasets. Here,
* ROLAP mechanisms must
* be applied which, however, are much slower in running the selections. <p>
* Ordering scales like O(N*log(N)) and requires only O(N) integers to be
* stored.
*/
public class MultidimensionalStream extends MiningInputStream
{
// -----------------------------------------------------------------------
// Variables declarations
// -----------------------------------------------------------------------
/** Source input stream; assigned in the constructor. */
protected MiningInputStream miningInputStream;
/**
* Move operation supported on source input stream. Set to true by the
* constructor if the source lists "move" among its supported stream methods.
*/
protected boolean allowsMove = false;
/** Number of all data vectors of source stream. */
protected int nVec = 0;
/**
* Names of attributes to be used for selections. If null, no attribute
* is excluded from selection by name.
*/
protected Vector selectionAttributes = null;
/** Array of attribute types (0 - categ., 1 - unstored categ., 2 - numeric, 3 - excluded). */
protected byte[] attTypes = null;
/**
* Vector containing all vector indexes for all dimensions and categories.
* getVectorsNumber treats a null value as "readMultidimensionalStreamData
* has not been run yet".
*/
protected Vector[] dimVec = null;
/** Number of selection attributes. Reset to 0 at the start of readMultidimensionalStreamData. */
protected int nSelAtt = 0;
/** Sums of selected vectors. Reset to null at the start of readMultidimensionalStreamData. */
protected int[] vecSums = null;
/**
* Contains numbers of all vectors after selection, null - if all.
* Reset to null at the start of readMultidimensionalStreamData.
*/
protected IntVector selVec = null;
// -----------------------------------------------------------------------
// Constructor
// -----------------------------------------------------------------------
/**
* Constructor with source mining input stream.
*
* @param miningInputStream source mining input stream
*/
public MultidimensionalStream(MiningInputStream miningInputStream) {
    this.miningInputStream = miningInputStream;
    // The cursor position starts at -1.
    cursorPosition = -1;

    // Walk over every method name the source stream advertises and
    // remember whether "move" is one of them.
    boolean moveSupported = false;
    for (Enumeration methods = miningInputStream.getSupportedStreamMethods();
         methods.hasMoreElements(); ) {
        String methodName = (String) methods.nextElement();
        // Non-short-circuit OR: the comparison is evaluated for each name.
        moveSupported |= methodName.equals("move");
    }
    allowsMove = moveSupported;
}
// -----------------------------------------------------------------------
// Getter and setter methods
// -----------------------------------------------------------------------
/**
* Returns supported stream methods. Taken from the source input stream;
* if there are update methods they are removed.
*
* @return supported stream methods
*/
public Enumeration getSupportedStreamMethods() {
    Vector keptMethods = new Vector();
    for (Enumeration sourceMethods = miningInputStream.getSupportedStreamMethods();
         sourceMethods.hasMoreElements(); ) {
        String methodName = (String) sourceMethods.nextElement();
        // These three update methods of the source stream are not passed on.
        boolean isUpdateMethod = methodName.equals("updateSetMetaData")
            || methodName.equals("updateRemoveAllVector")
            || methodName.equals("updateAppendVector");
        if (!isUpdateMethod) {
            keptMethods.addElement(methodName);
        }
    }
    return keptMethods.elements();
}
/**
* Finds physical multidimensional model (CWM Resource Package "Multidimensional").
* Delivers multidimensional schema. <p>
*
* Here we simply associate one dimension with every attribute of the source
* mining input stream. Notice that Dimension is of the Classifier type
* whereas MiningAttribute extends Attribute. (Like in CWM OLAP Package this
* reflects the fact that OLAP data is more than simply a matrix of attributes -
* in contrast to Data Mining where MetaData is actually a collection of
* MiningAttributes. Nevertheless basic properties of OLAP data are matrix-like,
* so it is useful to extend this class from MiningInputStream.) Thus we
* simply add the associated mining attribute to the dimension. The
* DimensionedObject, which represents an attribute of Dimension, currently
* does not contain further information. <p>
*
* Although this definition of the physical model is very simple - and to some
* extent arbitrary - it precisely reflects the underlying representation.
* As emphasized in the CWM specification, multidimensional databases tend
* to be proprietary in structure. This is what we see here.
*
* @exception MiningException couldn't obtain physical model
* @see #getPhysicalToLogicalModelTransformation
*/
public void findPhysicalModel() throws MiningException {
    // Every dimension is derived from a mining attribute of the meta data.
    // Without meta data, report the problem through the declared
    // MiningException instead of failing with a NullPointerException.
    if (metaData == null)
        throw new MiningException("no meta data available; run readMultidimensionalStreamData method first");

    com.prudsys.pdm.Cwm.CWMCompletePackage cwmFactory =
        com.prudsys.pdm.Cwm.CWMCompletePackage.getCWMCompletePackage();
    MultidimensionalPackage mpg = cwmFactory.getMultidimensional();

    Schema schema = mpg.getSchema().createSchema();
    schema.setName("MultidimensionalStream");

    // One Dimension (plus one DimensionedObject) per mining attribute.
    // The attribute count is re-read on each pass, as in the original loop.
    for (int i = 0; i < metaData.getAttributesNumber(); i++) {
        MiningAttribute ma = metaData.getMiningAttribute(i);

        // The dimension carries the attribute's name and the attribute
        // itself as a feature.
        Dimension dim = mpg.getDimension().createDimension();
        dim.setName( ma.getName() );
        dim.addFeature(ma);

        // Link dimension and dimensioned object in both directions.
        DimensionedObject dimObj = mpg.getDimensionedObject().createDimensionedObject();
        dimObj.setDimension(dim);
        dim.setDimensionedObject(dimObj);

        schema.addDimension(dim);
        schema.addDimensionedObject(dimObj);
    }

    // Publish the schema only after it has been built completely.
    physicalModel = schema;
}
/**
* Returns the CWM mapping from the physical to the logical data model.
* Uses ClassifierFeatureMaps to map the Dimensions of the Multidimensional
* model to the MiningAttributes of the MetaData.
*
* @return transformation of physical to logical data model
* @throws MiningException couldn't get transformation
* @see #findPhysicalModel
*/
public org.omg.cwm.analysis.transformation.TransformationMap getPhysicalToLogicalModelTransformation()
throws MiningException {
    // The mapping pairs each dimension with a mining attribute of the meta
    // data, so missing meta data is reported as a MiningException rather
    // than surfacing as a NullPointerException.
    if (metaData == null)
        throw new MiningException("no meta data available; run readMultidimensionalStreamData method first");

    com.prudsys.pdm.Cwm.CWMCompletePackage cwmFactory =
        com.prudsys.pdm.Cwm.CWMCompletePackage.getCWMCompletePackage();
    TransformationPackage tpg = cwmFactory.getTransformation();

    // The transformation map owns one classifier map, which collects the
    // per-attribute classifier-feature maps built below.
    TransformationMap tm = tpg.getTransformationMap().createTransformationMap();
    ClassifierMap cm = tpg.getClassifierMap().createClassifierMap();
    tm.addOwnedElement(cm);

    Schema schema = (Schema) getPhysicalModel();
    if (schema == null)
        throw new MiningException("physical model not available");

    // The i-th dimension delivered by the schema is paired with the i-th
    // mining attribute.
    Iterator it = schema.getDimension().iterator();
    for (int i = 0; i < metaData.getAttributesNumber(); i++) {
        // Guard against a schema with fewer dimensions than the meta data
        // has attributes; an unchecked next() would throw
        // NoSuchElementException here.
        if (!it.hasNext())
            throw new MiningException("physical model has fewer dimensions than meta data has attributes");

        ClassifierFeatureMap cfm = tpg.getClassifierFeatureMap().createClassifierFeatureMap();
        cfm.addClassifier( (Dimension) it.next() );
        cfm.addFeature( metaData.getMiningAttribute(i) );
        cfm.setClassifierToFeature(true);
        cm.addCfMap(cfm);
    }

    // Register the finished map with both the logical and the physical model.
    metaData.addOwnedElement(tm);
    physicalModel.addOwnedElement(tm);
    return tm;
}
/**
* Returns number of vectors where the selection condition is applied.
*
* @return number of selected vectors
* @exception MiningException method readMultidimensionalStreamData not called before
*/
public int getVectorsNumber() throws MiningException {
    // A null dimVec is taken to mean that readMultidimensionalStreamData
    // has not been run yet.
    if (dimVec == null) {
        throw new MiningException("run readMultidimensionalStreamData method first");
    }
    // Without a recorded selection the count of all source vectors applies.
    return (selVec != null) ? selVec.size() : nVec;
}
/**
* Returns list of names of all attributes that can be used for selection.
*
* @return selection attributes, null if all attributes are used
*/
public Vector getSelectionAttributes() {
    // Hands out the internal list itself (no copy); may be null.
    return this.selectionAttributes;
}
/**
* Add name of attribute to selection list.
*
* @param selectionAttribute name of attribute to be added
*/
public void addSelectionAttribute(String selectionAttribute) {
    // Create the name list lazily on first use.
    Vector names = selectionAttributes;
    if (names == null) {
        names = new Vector();
        selectionAttributes = names;
    }
    names.addElement(selectionAttribute);
}
/**
* Sets list of names of all attributes that can be used for selection.
*
* @param selectionAttributes new selection attributes, null if all
* attributes are used
*/
public void setSelectionAttributes(Vector selectionAttributes) {
    // The given reference is stored as is (no copy). It is consulted by
    // readMultidimensionalStreamData when the attribute types are determined.
    this.selectionAttributes = selectionAttributes;
}
// -----------------------------------------------------------------------
// Loads data from input stream
// -----------------------------------------------------------------------
/**
* Reads data matrix from source input stream and builds
* Multidimensional structure. <p>
*
* The multidimensional structure is only built for the categorical
* attributes which are not of unstoredCategories type. Categorical
* attributes of unstoredCategories type and numeric attributes are
* accepted but cannot be used for selections.
*
* @exception MiningException can't read data from stream
*/
public void readMultidimensionalStreamData() throws MiningException {
// Init:
nSelAtt = 0;
selVec = null;
vecSums = null;
// Get meta data and check for attribute types:
metaData = miningInputStream.getMetaData();
int nAtt = metaData.getAttributesNumber();
attTypes = new byte[nAtt];
for (int i = 0; i < nAtt; i++) {
MiningAttribute ma = metaData.getMiningAttribute(i);
if ( selectionAttributes != null && !selectionAttributes.contains( ma.getName() ) )
attTypes[i] = 3; // excluded from selection
else if (!(ma instanceof CategoricalAttribute))
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -