📄 instancelist.java
字号:
*
* @param mask Boolean array of the same length as the number of attributes. TRUE values
* indicate that attribute should have a RealAndLabelColumn object created for it,
* FALSE otherwise.
* @return An array of RealAndLabelColumns generated from the attribute values for the
* Instances stored in this InstanceList.
*/
public RealAndLabelColumn[] transpose(boolean[] mask) {
    // One slot per schema attribute. Slots of attributes that are not
    // selected below stay null and are skipped while the data is copied.
    final int attrCount = schema.num_attr();
    final RealAndLabelColumn[] columns = new RealAndLabelColumn[attrCount];

    // The instance count is handed to every column constructor. The original
    // author notes that obtaining it costs a pass over the list, but lets
    // each column be allocated as a single piece for later splitting.
    final int instanceCount = num_instances();

    // Phase 1: create a column for each attribute that is castable to real
    // and enabled in the mask.
    int selected = 0;
    for (int attr = 0; attr < attrCount; attr++) {
        AttrInfo info = schema.attr_info(attr);
        if (!info.can_cast_to_real() || !mask[attr]) {
            continue;
        }
        columns[attr] = schema.is_labelled()
            ? new RealAndLabelColumn(instanceCount,
                                     schema.num_label_values(),
                                     info.cast_to_real(),
                                     nominal_label_info())
            : new RealAndLabelColumn(instanceCount, info.cast_to_real());
        selected++;
    }

    // Phase 2: walk the instances once and append each value (plus weight,
    // and label index when the schema is labelled) to its column.
    if (!schema.is_labelled()) {
        ListIterator cursor = instances.listIterator();
        while (cursor.hasNext()) {
            Instance current = (Instance) cursor.next();
            double w = current.get_weight();
            for (int attr = 0; attr < attrCount; attr++) {
                RealAndLabelColumn target = columns[attr];
                if (target != null) {
                    RealAttrInfo realInfo = target.attr_info();
                    AttrValue value = current.get_value(attr);
                    if (!realInfo.is_unknown(value)) {
                        target.add_known(
                            (float) (realInfo.get_real_val(value)), w);
                    } else {
                        target.add_unknown(w);
                    }
                }
            }
        }
    } else {
        NominalAttrInfo labelInfo = schema.label_info().cast_to_nominal();
        ListIterator cursor = instances.listIterator();
        while (cursor.hasNext()) {
            Instance current = (Instance) cursor.next();
            AttrValue labelValue = current.get_label();
            MLJ.ASSERT(!labelInfo.is_unknown(labelValue),"InstanceList.transpose: label is unknown");
            // The "+ 1" is the schema offset added in the Java port (-JL);
            // the two Globals constants are then subtracted to obtain the
            // label index passed to the column.
            int labelIndex = labelInfo.get_nominal_val(labelValue) + 1
                - Globals.FIRST_CATEGORY_VAL - Globals.FIRST_NOMINAL_VAL;
            double w = current.get_weight();
            for (int attr = 0; attr < attrCount; attr++) {
                RealAndLabelColumn target = columns[attr];
                if (target != null) {
                    RealAttrInfo realInfo = target.attr_info();
                    AttrValue value = current.get_value(attr);
                    if (!realInfo.is_unknown(value)) {
                        target.add_known(
                            (float) (realInfo.get_real_val(value)),
                            labelIndex, w);
                    } else {
                        target.add_unknown(labelIndex, w);
                    }
                }
            }
        }
    }

    logOptions.LOG(6, "Instance list transposed into " + selected + " columns."
                   + '\n');
    return columns;
}
/** Tells whether the total weight of this InstanceList is approximately 0.
 * The cached total weight is narrowed to float first and then compared
 * against 0.0 with MLJ.approx_equal, which decides what "approximately" means.
 * @return TRUE if the total weight is approximately equal to 0, FALSE
 * otherwise.
 */
public boolean no_weight() {
    // Narrow before comparing, exactly as the one-line form did.
    float narrowedWeight = (float) totalWeight;
    return MLJ.approx_equal(narrowedWeight, 0.0);
}
/** Returns the category of the label carrying the most weight in this
 * InstanceList.
 * When bag counters are present their label counts are used directly;
 * otherwise the counts are accumulated here by adding each instance's weight
 * to the slot of its label. The choice of the winning category, including
 * how ties are resolved, is delegated to CatDist.majority_category.
 *
 * The earlier documentation of this method additionally states that: the
 * counter-based path is considerably faster; the method is only meaningful
 * for labels whose AttrInfo derives from NominalAttrInfo (a fatal error
 * otherwise); and the cost is proportional to the number of categories plus
 * the number of instances.
 *
 * @param tieBreakingOrder Ordering used to break ties, passed unchanged to
 * CatDist.majority_category. Lower values mark more favourable entries.
 * NOTE(review): the earlier documentation describes one element per
 * attribute, although ties occur between label categories - confirm the
 * expected length.
 * @return The category chosen by CatDist.majority_category; per the earlier
 * documentation, UNKNOWN_CATEGORY_VAL if there are no instances.
 */
public int majority_category(int[] tieBreakingOrder) {
    double[] labelCounts;
    if (bagCounters == null) {
        NominalAttrInfo labelInfo = nominal_label_info();
        // +1 because of Globals.UNKNOWN_CATEGORY_VAL instead of
        // Globals.FIRST_CATEGORY_VAL (one slot beyond the declared values).
        labelCounts = new double[labelInfo.num_values() + 1];
        ListIterator cursor = instance_list().listIterator();
        while (cursor.hasNext()) {
            Instance current = (Instance) cursor.next();
            labelCounts[labelInfo.get_nominal_val(current.get_label())] += current.get_weight();
        }
    } else {
        labelCounts = bagCounters.label_counts();
    }
    return CatDist.majority_category(labelCounts, tieBreakingOrder);
}
/** Builds a new InstanceList restricted to the attributes selected by a mask.
 * The schema is projected first, then every instance is projected onto the
 * new schema and added to the new list. The earlier documentation gives the
 * cost as O(num attributes * (num instances + num attributes)).
 * @param attrMask A boolean array with one element per attribute, in the
 * order the attributes were input. True marks an attribute that is kept.
 * Its length is asserted to equal num_attr().
 * @return An InstanceList whose Schema contains only the attributes with a
 * true mask value, or null if a CloneNotSupportedException occurs (the stack
 * trace is printed in that case).
 */
public InstanceList project(boolean[] attrMask) {
    MLJ.ASSERT(attrMask.length == num_attr(),"InstanceList.project: attrMask's length does not match number of attributes");
    // Stays null unless the whole projection completes, so a failure part-way
    // through never hands back a partially filled list.
    InstanceList result = null;
    try {
        Schema reducedSchema = get_schema().project(attrMask);
        InstanceList projected = new InstanceList(reducedSchema);
        ListIterator cursor = instances.listIterator();
        while (cursor.hasNext()) {
            Instance source = (Instance) cursor.next();
            projected.add_instance(source.project(reducedSchema, attrMask));
        }
        result = projected;
    } catch (CloneNotSupportedException e) {
        e.printStackTrace();
    }
    return result;
}
/** Switches number parsing on or off for the given StreamTokenizer.
 * When switched off, the digits '0'..'9' are first reset to ordinary
 * characters and then registered as word characters, so digit runs are
 * tokenized as words.
 * NOTE(review): only the digits are changed when switching off; '.' and '-'
 * keep whatever syntax they already had on the tokenizer - confirm that is
 * intended.
 * @param stream StreamTokenizer tokenizing information.
 * @param ifYes TRUE if numbers should be parsed as double values, FALSE otherwise.
 */
private void parseNumbers(StreamTokenizer stream, boolean ifYes) {
    if (!ifYes) {
        // Clear the digits' current attributes before marking them as word
        // constituents; wordChars alone only adds to what is already set.
        stream.ordinaryChars('0', '9');
        stream.wordChars('0', '9');
        return;
    }
    stream.parseNumbers();
}
/** Returns a String representation of this InstanceList object.
 * The result is the concatenation, in list order, of each instance's own
 * out(is_weighted(), normalizeReal) text. If no_instances() reports an empty
 * list, the message "InstanceList.display: No instances" is appended.
 * @param normalizeReal TRUE if real values in an Instance object should be normalized.
 * @return A String representation of this InstanceList object.
 */
public String out(boolean normalizeReal) {
    // Accumulate in a buffer: repeated String concatenation re-copies the
    // whole text on every instance and is quadratic in the output size.
    // StringBuffer is used because the visible code sticks to pre-Java-5 API.
    StringBuffer text = new StringBuffer();
    ListIterator cursor = instances.listIterator(0);
    while (cursor.hasNext()) {
        Instance inst = (Instance) cursor.next();
        text.append(inst.out(is_weighted(), normalizeReal));
    }
    if (no_instances()) {
        text.append("InstanceList.display: No instances");
    }
    return text.toString();
}
/** Returns a clone of this InstanceList object, built with the
 * InstanceList(InstanceList, boolean) constructor.
 * @param preserveCounters TRUE if counters of values should be copied, FALSE
 * otherwise; forwarded unchanged to the constructor.
 * @return A new InstanceList constructed from this one.
 */
public Object clone(boolean preserveCounters) {
    InstanceList duplicate = new InstanceList(this, preserveCounters);
    return duplicate;
}
/** Returns a clone of this InstanceList object without preserving counters:
 * the InstanceList(InstanceList, boolean) constructor is called with FALSE.
 * @return A new InstanceList constructed from this one.
 */
public Object clone() {
    // Counters are never carried over by the no-argument form.
    final boolean preserveCounters = false;
    InstanceList duplicate = new InstanceList(this, preserveCounters);
    return duplicate;
}
/** Checks integrity constraints at checking level 0 by delegating to
 * OK(int).
 * Comments : The documented intent is to verify that all instances share
 * the same schema. Because the schema has attrinfo's that
 * are updated, everyone must share the EXACT representation,
 * not just logical equivalence: if the schema is updated,
 * all instances must see the exact same min/max for
 * RealAttrInfo's.
 *
 */
public void OK() {
    // Level of checking is fixed at 0 for the no-argument form.
    final int defaultLevel = 0;
    OK(defaultLevel);
}
/** Intended to check integrity constraints; currently performs no checks.
 * The entire body is the original C++ implementation, left commented out,
 * so calling this method has no effect and the level argument is not read.
 *
 * The commented-out original describes these checks:
 * - at level 0, every instance shares the list's exact schema
 * representation (not just a logically equivalent one), so that all
 * instances see the same min/max for RealAttrInfo's when the schema is
 * updated;
 * - if counters exist, their weight matches the list's total weight;
 * - for weighted lists, the summed instance weights match totalWeight;
 * - for unweighted lists, totalWeight matches the number of instances;
 * - totalWeight is not negative.
 *
 * @param level Level of checking done (unused while the body is commented out).
 */
public void OK(int level) {
// NOTE: nothing below is executable - it is the unported C++ body kept for reference.
/* if (level < 1 && schema != null && instances) {
Schema schemaRep = get_schema().read_rep();
for (ILPix pix(*this); pix; ++pix)
if ((*pix).get_schema().read_rep() != schemaRep)
err << "InstanceList.OK mismatch in schemas for list and "
" instance. Instance is:\n" << *pix <<
" with schema " << (void *)(*pix).get_schema().read_rep() <<
" = " << (*pix).get_schema() <<
"\nList schema is: " << (void *)schemaRep <<
" = " << *schemaRep << fatal_error;
}
// Check that the counters agree with the actual number of instances
// if we have counters
// WARNING: we need to use totalWeight here instead of total_weight().
// total_weight() calls OK() in DBG level 2, which would lead
// to infinite recursion!!!
if(bagCounters) {
double num = counters().OK(); // @@ Dan, change num to weight?
MLJ.verify_approx_equal(StoredReal(num), StoredReal(totalWeight),
"InstanceList.OK: "
"Counters claim of weight does not match "
"list's total weight");
}
ASSERT(instances);
// Check for numerical inaccuracy in the totalWeight cache.
// DO NOT refresh the cache here because we don't want results
// to differ in high debug levels!
// Check that totalWeight is correct (if we're weighted)
// Passing StoredReals to approx_equal means using coarse granularity.
if(is_weighted()) {
Real compTotalWeight = 0;
for (ILPix pix(*this); pix; ++pix)
compTotalWeight += get_weight(pix);
mlc.verify_approx_equal(StoredReal(compTotalWeight),
StoredReal(totalWeight),
"InstanceList.OK: computed weight "
"fails to match totalWeight");
}
// Check that totalWeight is close to number of instances
// (if we're not weighted)
// Passing StoredReals to approx_equal means using coarse granularity.
if(!is_weighted()) {
mlc.verify_approx_equal((StoredReal) num_instances(),
(StoredReal) totalWeight,
"InstanceList.OK: List is unweighted, "
"but total weight fails to match whole "
"number of instances");
}
// Total weight may not be negative
if (totalWeight < -MLC.stored_real_epsilon())
err << "InstanceList.OK: total weight (" << totalWeight
<< ") is negative" << fatal_error;
*/
}
/** Displays the names file associated with the InstanceList.
* @param stream Writer object to which the names file will be displayed.
* @param protectChars TRUE if protected characters are used, FALSE otherwise.
* @param header A String to use for the header to the display.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -