📄 miningcsvstream.java
字号:
// first try both ways
MiningDataSpecification mds_create = recognize(MiningCsvStream.COLUMN_NAME_CREATE);
MiningDataSpecification mds_first = recognize(MiningCsvStream.COLUMN_NAME_FIRST_LINE);
if(mds_create==null || mds_first==null || mds_create.getAttributesNumber()!=mds_first.getAttributesNumber()) {
// something really went wrong
reset();
usedColumnNameType = MiningCsvStream.COLUMN_NAME_CREATE;
return mds_create;
}
for(int i=0; i < mds_create.getAttributesNumber() && i < mds_first.getAttributesNumber(); i++) {
MiningAttribute att_c = mds_create.getMiningAttribute(i);
MiningAttribute att_f = mds_first.getMiningAttribute(i);
// find a clue for using the first line for column names
if(NumericAttribute.class.isInstance(att_f) && CategoricalAttribute.class.isInstance(att_c)) {
reset();
usedColumnNameType = MiningCsvStream.COLUMN_NAME_FIRST_LINE;
return mds_first;
}
}
reset();
usedColumnNameType = MiningCsvStream.COLUMN_NAME_CREATE;
return mds_create;
}
reset();
return recognize(usedColumnNameType);
}
/**
 * Gets new meta data using the first {@link #nLines} lines.
 * Called by {@link #recognize}.
 *
 * The stream is reset first. Every column starts out undecided; a column becomes
 * numeric as soon as one of its values parses as a double, and is switched to
 * categorical (and stays categorical) as soon as one non-empty, non-missing value
 * fails to parse. Columns that are still undecided after scanning are created as
 * categorical attributes.
 *
 * Scanning stops after {@link #nLines} lines (or never, if {@link #nLines} is -1),
 * at the end of the input, or on the first I/O error.
 *
 * @param m_usedColumnNameType Specifies either to use the first line of the input file as the attributes names or to create them.
 * @return the recognized meta data
 * @throws MiningException if {@link #reset()} or one of the parsing / meta data calls fails
 */
protected MiningDataSpecification recognize(short m_usedColumnNameType) throws MiningException {
    reset();
    final boolean namesFromFirstLine = (m_usedColumnNameType == MiningCsvStream.COLUMN_NAME_FIRST_LINE);
    // Only filled when the header line is consumed, i.e. only in first-line mode.
    String[] columnNames = null;
    // Slot j holds the attribute recognized so far for column j, or null while undecided.
    Vector attrs = new Vector();

    for (int i = 0; this.nLines == -1 || i < this.nLines; i++) {
        try {
            if (curLine == 0 && namesFromFirstLine) {
                // Get column names and continue with second line
                String header = inReader.readLine();
                if (header == null)
                    break; // empty input: nothing to recognize
                parse.parseString(header);
                curLine++;
                columnNames = new String[parse.countTokens()];
                for (int j = 0; j < columnNames.length; j++) {
                    String name = parse.getToken(j + 1);
                    // An empty header cell falls back to the generated name.
                    columnNames[j] = name.equals("") ? "field" + String.valueOf(j + 1) : name;
                }
            }
            String line = inReader.readLine();
            if (line == null)
                break; // end of input reached before nLines lines were seen
            parse.parseString(line);
            curLine++;
        }
        catch (java.io.IOException ex) {
            break;
        }

        final int tokenCount = parse.countTokens();
        if (tokenCount < 1)
            continue;

        for (int j = 0; j < tokenCount; j++) {
            Object known = (attrs.size() > j) ? attrs.get(j) : null;
            // Parsing already failed once for this column, so it stays categorical.
            if (known instanceof CategoricalAttribute)
                continue;

            boolean numeric;
            try {
                Double.parseDouble(parse.getToken(j + 1));
                numeric = true;
            }
            catch (NumberFormatException ex) {
                numeric = false;
            }

            if (!numeric) {
                String token = parse.getToken(j + 1).trim();
                // A missing value says nothing about the column type: leave it untouched.
                if (token.equals("") || isMissingValue(token))
                    continue;
            }
            else if (known != null) {
                // Column is already numeric; nothing to update.
                continue;
            }

            if (attrs.size() <= j)
                attrs.setSize(j + 1); // Expand vector
            String attName = recognizedAttributeName(columnNames, j);
            if (numeric)
                attrs.setElementAt(new NumericAttribute(attName), j);
            else
                attrs.setElementAt(createRecognizedCategoricalAttribute(attName), j);
        }
    }

    MiningDataSpecification m_metaData = new MiningDataSpecification();
    for (int i = 0; i < attrs.size(); i++) {
        if (attrs.elementAt(i) == null) {
            // The attribute for this field has not been set yet:
            // use a CategoricalAttribute (most secure).
            attrs.setElementAt(createRecognizedCategoricalAttribute(recognizedAttributeName(columnNames, i)), i);
        }
        m_metaData.addMiningAttribute((MiningAttribute) attrs.elementAt(i));
    }
    if (this.fileName != null)
        m_metaData.setRelationName(this.fileName);
    return m_metaData;
}

/**
 * Returns the attribute name for a column: the header name when one was read
 * for that column, otherwise the generated name "field" + (index + 1).
 *
 * @param columnNames header names, or null when no header line was consumed
 * @param index zero-based column index
 * @return name to use for the attribute of that column
 */
private static String recognizedAttributeName(String[] columnNames, int index) {
    if (columnNames != null && columnNames.length > index)
        return columnNames[index];
    return "field" + String.valueOf(index + 1);
}

/**
 * Creates a categorical attribute configured according to {@link #categoriesType}:
 * unstored categories for CATEGORIES_UNSTORED, unbounded categories otherwise.
 *
 * @param attName name of the new attribute
 * @return the newly created categorical attribute
 */
private CategoricalAttribute createRecognizedCategoricalAttribute(String attName) {
    CategoricalAttribute catt = new CategoricalAttribute(attName);
    if (this.categoriesType == CATEGORIES_UNSTORED)
        catt.setUnstoredCategories(true);
    else
        catt.setUnboundedCategories(true);
    return catt;
}
// -----------------------------------------------------------------------
// Methods of cursor positioning
// -----------------------------------------------------------------------
/**
 * Places the cursor before the first row.
 *
 * Delegates to the superclass reset, wraps {@code reader} in a fresh
 * {@link BufferedReader} and clears the line counter and cursor position.
 * NOTE(review): presumably the superclass reset is what closes and reopens the
 * underlying reader - confirm against the superclass.
 *
 * @throws MiningException if the superclass reset fails
 */
public void reset() throws MiningException
{
    super.reset();
    // Re-wrap only after the superclass has reset its own state.
    this.inReader = new BufferedReader(this.reader);
    this.curLine = 0;
    this.cursorPosition = -1;
}
/**
 * Advances cursor by one position.
 *
 * Reads the next non-empty line (skipping the header line first when column
 * names come from the first line), converts it into a {@link MiningVector} and
 * makes it the current vector. Meta data is recognized on demand when none has
 * been set. Lines without tokens are skipped in a loop, so arbitrarily long
 * runs of empty lines cannot exhaust the call stack.
 *
 * Value handling per attribute:
 * categorical values are mapped to their key, adding a new category when the
 * value is unknown; numeric values that cannot be parsed, missing values and
 * columns absent from the line are all stored as {@code Category.MISSING_VALUE}.
 *
 * I/O errors are printed and reported as "no further vector" (false).
 *
 * @return true if next vector exists, else false
 * @exception MiningException if the meta data does not match the current line or is invalid.
 */
public boolean next() throws MiningException {
    cursorVector = null;
    if (inReader == null)
        return false;
    if (metaData == null)
        this.metaData = this.recognize();

    String nextLine;
    do {
        try {
            if (curLine == 0 && usedColumnNameType == MiningCsvStream.COLUMN_NAME_FIRST_LINE) {
                // The first line holds column names, not data.
                inReader.readLine();
                curLine++;
            }
            nextLine = inReader.readLine();
            curLine++;
        }
        catch (java.io.IOException ex) {
            ex.printStackTrace();
            return false;
        }
        if (nextLine == null)
            return false; // end of input
        parse.parseString(nextLine);
    } while (parse.countTokens() < 1); // skip lines without tokens

    final int attributeCount = metaData.getAttributesNumber();
    double[] instance = new double[attributeCount];
    // Get values for all attributes.
    for (int i = 0; i < attributeCount; i++) {
        if (parse.countTokens() < i + 1) {
            // File contains less columns than attributes defined in metaData
            instance[i] = Category.MISSING_VALUE;
            continue;
        }
        String token = parse.getToken(i + 1);
        MiningAttribute attribute = metaData.getMiningAttribute(i);
        if (attribute instanceof CategoricalAttribute) {
            CategoricalAttribute categorical = (CategoricalAttribute) attribute;
            if (isMissingValue(token)) {
                instance[i] = Category.MISSING_VALUE;
            }
            else {
                Category cat = new Category(token);
                double key = categorical.getKey(cat);
                // Unknown category: register it and use the newly assigned key.
                if (Category.isMissingValue(key))
                    key = categorical.addCategory(cat);
                instance[i] = key;
            }
        }
        else if (attribute instanceof NumericAttribute) {
            if (isMissingValue(token)) {
                instance[i] = Category.MISSING_VALUE;
            }
            else {
                try {
                    instance[i] = Double.parseDouble(token);
                }
                catch (NumberFormatException ex) {
                    // Deliberately lenient: an unparsable number is treated as missing.
                    instance[i] = Category.MISSING_VALUE;
                }
            }
        }
        else {
            throw new MiningException("Unknown attribute type");
        }
    }

    // Publish the new vector as current cursor content.
    cursorVector = new MiningVector(instance);
    cursorVector.setMetaData(metaData);
    cursorPosition++;
    return true;
}
// -----------------------------------------------------------------------
// Methods of reading from the stream
// -----------------------------------------------------------------------
/**
 * Returns the vector produced by the most recent call to {@link #next()}.
 * The value is null when that call returned false, because {@link #next()}
 * clears the current vector before reading.
 *
 * @return data vector at current cursor position
 * @exception MiningException never thrown.
 */
public MiningVector read() throws MiningException
{
    final MiningVector current = cursorVector;
    return current;
}
// -----------------------------------------------------------------------
// Methods of writing into the stream
// -----------------------------------------------------------------------
/**
 * Replaces the meta data of this stream.
 * The specification must be non-null and contain at least one attribute.
 *
 * @param metaData new meta data of stream
 * @exception MiningException if the specification is null or has no attributes
 */
public void updateSetMetaData(MiningDataSpecification metaData) throws MiningException
{
    if (metaData == null) {
        throw new MiningException("Invalid MiningDataSpecification: null");
    }
    final boolean hasAttributes = metaData.getAttributesNumber() >= 1;
    if (!hasAttributes) {
        throw new MiningException("Invalid MiningDataSpecification: Specification does not contain attribute(s)");
    }
    this.metaData = metaData;
}
/**
 * Would remove all mining vectors from this stream; the operation is not
 * implemented for this stream and every call fails.
 *
 * @exception MiningException always, with the message "not supported yet"
 */
public void updateRemoveAllVectors() throws MiningException {
    final String reason = "not supported yet";
    throw new MiningException(reason);
}
/**
 * Would append a new mining vector to this stream; the operation is not
 * implemented for this stream and every call fails.
 *
 * @param vector new mining vector to append (ignored)
 * @exception MiningException always, with the message "not supported yet"
 */
public void updateAppendVector(MiningVector vector) throws MiningException {
    final String reason = "not supported yet";
    throw new MiningException(reason);
}
//<<Frank J. Xu, 16/02/2005
//Add method to reset the categorical attributes' type of csv stream.
/**
 * Re-scans the whole stream and refreshes every categorical attribute of the
 * meta data with the distinct categories actually present in the data.
 *
 * For each categorical attribute the distinct categories are collected in
 * display-name order (see {@link #insertCatValByOrder}); missing values are
 * ignored. An attribute with at least one and fewer than
 * {@code MiningInputStream.CATEGORICAL_ATTRIBUTE_BOUND} distinct categories is
 * marked as bounded, and the collected categories are set as its values.
 * A stream without rows leaves every attribute with an empty value list.
 *
 * A MiningException raised while scanning is printed and not propagated.
 *
 * @throws MiningException if resetting the stream or obtaining the meta data fails
 */
public void updateCategoricalAttrsType() throws MiningException {
    reset();
    MiningDataSpecification spec = this.getMetaData();
    MiningAttribute[] attrs = spec.getAttributesArray();

    Vector categoricalAttrs = new Vector();
    for (int i = 0; i < attrs.length; i++) {
        if (attrs[i] instanceof CategoricalAttribute)
            categoricalAttrs.add(attrs[i]);
    }
    if (categoricalAttrs.isEmpty())
        return;

    // Allocate every bucket up front so that a stream without rows cannot
    // leave a null entry behind.
    final int categoricalCount = categoricalAttrs.size();
    Vector[] categoricalVal = new Vector[categoricalCount];
    for (int j = 0; j < categoricalCount; j++)
        categoricalVal[j] = new Vector();

    try {
        while (this.next()) {
            MiningVector mv = this.read();
            for (int j = 0; j < categoricalCount; j++) {
                Category catObj = mv.getValueCategory((CategoricalAttribute) categoricalAttrs.get(j));
                // The order of categorical values affects the correctness of
                // assessment operations, so keep them ordered while importing.
                // Null (missing) and already known categories are ignored.
                insertCatValByOrder(categoricalVal[j], catObj);
            }
        }

        // Update categorical attributes type based on the threshold.
        for (int j = 0; j < categoricalCount; j++) {
            CategoricalAttribute attribute = (CategoricalAttribute) categoricalAttrs.get(j);
            int distinct = categoricalVal[j].size();
            if (distinct > 0 && distinct < MiningInputStream.CATEGORICAL_ATTRIBUTE_BOUND)
                attribute.setUnboundedCategories(false);
            attribute.setValues(new ArrayList(categoricalVal[j]));
        }

        // The attribute objects were modified in place, so attrs already refers
        // to the updated instances; hand the array back to the meta data.
        spec.setAttributesArray(attrs);
        this.metaData = spec;
    }
    catch (MiningException ex) {
        ex.printStackTrace();
    }
}
/**
 * Inserts a category into a list that is kept ordered by display value.
 *
 * Nothing happens when the category is null or an equal category is met
 * while scanning. Otherwise the category is placed in front of the first
 * element whose display value is greater, or at the end if there is none.
 *
 * @param categoricalVal ordered list of {@link Category} objects, modified in place
 * @param catObj category to insert, may be null
 */
private void insertCatValByOrder(Vector categoricalVal, Category catObj)
{
    if (catObj == null)
        return;

    final String newName = catObj.getDisplayValue();
    final int count = categoricalVal.size();
    for (int pos = 0; pos < count; pos++) {
        Object existing = categoricalVal.get(pos);
        if (existing.equals(catObj))
            return; // already present
        String existingName = ((Category) existing).getDisplayValue();
        if (newName.compareTo(existingName) < 0) {
            categoricalVal.insertElementAt(catObj, pos);
            return;
        }
    }
    // No element with a greater display value: append.
    categoricalVal.addElement(catObj);
}
//Frank J. Xu, 16/02/2005>>
// -----------------------------------------------------------------------
// Test
// -----------------------------------------------------------------------
/**
 * Test of CSV stream: opens "data/csv/vowel.csv" with column names taken from
 * the first line, recognizes the meta data from 20 test lines, then prints the
 * meta data followed by every vector of the stream.
 *
 * @param args arguments (ignored)
 */
public static void main(String[] args) {
    try {
        final MiningCsvStream stream = new MiningCsvStream("data/csv/vowel.csv");
        stream.setColumnNameType(MiningCsvStream.COLUMN_NAME_FIRST_LINE);
        // Optional parser settings, disabled for this test file:
        // stream.setSeparator(';');
        // stream.setQuotationMark('\"');
        stream.setNumberTestLines(20);
        stream.open();
        System.out.println(stream.getMetaData());
        for (boolean hasRow = stream.next(); hasRow; hasRow = stream.next()) {
            MiningVector row = stream.read();
            System.out.println(row);
        }
    }
    catch (MiningException ex) {
        ex.printStackTrace();
    }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -