📄 instancelist.java
字号:
/** Adds a new instance to the list, using the structures maintained by
* InstanceReader. Properly updates both schemas so that automatic instance
* removal will work.
* @param vals The values of the instance to be added.
* @param labelVal The label value of the instance to be added.
* @param weight The weight of the instance to be added.
* @param allowUnknownLabels TRUE if unknown label values are allowed for the instance to be added.
* @return A new Instance object containing the supplied information.
*/
public Instance reader_add_instance(AttrValue[] vals, AttrValue labelVal, double weight, boolean allowUnknownLabels) {
    // The supplied values must line up one-to-one with the schema's attributes.
    if (schema.num_attr() != vals.length)
        Error.err("InstanceList.reader_add_instance: "
            +"schema has "+schema.num_attr()+" attributes, while supplied "
            +"array has "+vals.length+" -->fatal_error");

    // Build the instance from the supplied attribute values and optional label.
    Instance newInst = new Instance(schema);
    for (int i = 0; i < vals.length; i++)
        newInst.values[i] = vals[i];
    if (labelVal != null)
        newInst.set_label(labelVal);

    // Set the weight. Near-zero weights are silently clamped to exactly zero.
    // Negative weights are never added: the instance is returned with weight
    // zero and a warning is printed, up to MAX_NEG_WEIGHT_WARNINGS times.
    if (MLJ.approx_equal(weight,0))
        weight = 0; //silent clamp
    else if (weight < 0) {
        newInst.set_weight(0.0);
        // The counter lives outside the method so the cap applies across
        // calls; it stops incrementing once the cap is reached.
        if (readerNegWeightWarnings < MAX_NEG_WEIGHT_WARNINGS) {
            readerNegWeightWarnings++;
            System.out.println("Instance has a negative weight and will "
                +"be ignored");
            if (readerNegWeightWarnings == MAX_NEG_WEIGHT_WARNINGS)
                System.out.println("There have been max amount of warning on "
                    +"negative weights. Further warnings will be suppressed");
        }
        return newInst;
    }
    newInst.set_weight(weight);

    // Do not add the instance if the weight is zero and removeZeroWeights is
    // set. Exact comparison is safe because near-zero weights were clamped
    // to 0 above.
    if (weight == 0.0 && removeZeroWeights)
        return newInst;

    // Unlabelled instances are added without any label checks.
    if (labelVal == null) {
        add_instance(newInst);
        return newInst;
    }

    // Report a label that has more values than get_max_label_vals() allows.
    // NOTE(review): the message ends in "-->fatal_error" but is only printed,
    // unlike the attribute-count check above which goes through Error.err.
    // Confirm whether this is meant to abort.
    if (schema.nominal_label_info().num_values() > get_max_label_vals()) {
        if (mineset)
            System.out.println("MINESET clause encountered in InstanceList");
        else
            System.out.println("InstanceList.reader_add_instance: the"
                +" selected label '"+schema.label_info().name() +"' has more "
                +"than the current limit of "+get_max_label_vals() +" label "
                +"values. It is highly recommeded that you do not use an "
                +"attribute with many label values, but you may increase "
                +"the paramter MAX_LABEL_VALS to allow this operation "
                +"-->fatal_error");
    }

    // Instances with an unknown label are skipped (with a capped warning)
    // unless the caller explicitly allows unknown labels.
    if (schema.label_info().is_unknown(labelVal) && !allowUnknownLabels) {
        if (readerUnknownLabelWarnings < MAX_UNKNOWN_LABEL_WARNING) {
            readerUnknownLabelWarnings++;
            System.out.println("Warning: instance has an unknown label"
                +" value and will be ignored!");
            if (readerUnknownLabelWarnings == MAX_UNKNOWN_LABEL_WARNING)
                System.out.println("There have been max amount of label "
                    +"warnings on unknown labels, further warnings will be "
                    +"suppressed ");
        }
    } else
        add_instance(newInst);
    return newInst;
}
/** Count of negative-weight warnings already printed by reader_add_instance;
 * shared by all lists so the warning cap holds across calls.
 */
private static int readerNegWeightWarnings = 0;
/** Count of unknown-label warnings already printed by reader_add_instance;
 * shared by all lists so the warning cap holds across calls.
 */
private static int readerUnknownLabelWarnings = 0;
/** Adds the specified Instance to this InstanceList.
* @return A ListIterator of all Instances in this InstanceList.
* @param instance The Instance to be added.
*/
public ListIterator add_instance(Instance instance) {
    // Clone first. Cloning is the only step here that can fail with a checked
    // exception, so doing it before touching totalWeight, weighted, or the
    // counters keeps those caches consistent with the stored instances if
    // Error.err returns instead of aborting.
    Object copy;
    try {
        copy = instance.clone();
    } catch (CloneNotSupportedException e) {
        Error.err("InstanceList.add_instance:CloneNotSupportedException caught");
        return instances.listIterator(0);
    }

    // Update the cached total weight; any weight other than 1.0 marks the
    // list as weighted.
    double wt = instance.get_weight();
    if (wt != 1.0)
        weighted = true;
    totalWeight += wt;

    // NOTE: the original source has a disabled step, instance.set_schema(schema),
    // that would make the instance share this list's schema object. Its comment
    // warns that OK() may occasionally fail for this list without it.

    // Update the counters if we have them.
    if (bagCounters != null)
        bagCounters.add_instance(instance);

    // The list stores the clone, not the caller's object.
    instances.add(copy);

    // The returned iterator is positioned at the start of the list.
    return instances.listIterator(0);
}
/** Updates the list by removing specified attributes. This is similar to
* the project() call, except that it is designed to be used WHILE READING.
* The size of the projMask may be larger than the number of attributes in
* the schema. This is to allow InstanceReader to maintain a single copy of
* the projMask even as the schema shrinks.
* @param projMask A boolean array with at least as many values as there are
* attributes in the schema. Each boolean element corresponds to
* an attribute, in the order the attributes were input. True
* values represent attributes that are used.
*/
public void update_for_overflows(boolean[] projMask) {
    final int attrCount = schema.num_attr();

    // Look for the first attribute the mask marks as unused.
    int firstUnused = 0;
    while (firstUnused < attrCount && projMask[firstUnused])
        firstUnused++;

    // Every attribute is still in use: no projection is required.
    if (firstUnused == attrCount)
        return;

    // The mask may be longer than the schema, so project with a copy cut
    // down to exactly one entry per attribute.
    boolean[] truncProjMask = new boolean[attrCount];
    System.arraycopy(projMask, 0, truncProjMask, 0, attrCount);

    try {
        project_in_place(truncProjMask);
    } catch (CloneNotSupportedException e) {
        Error.err("InstanceList.update_for_overflows:"
            +" clone not supported exception was caught");
    }
}
/** Returns the tiebreaking distribution order stored in the CatDist object
* for this InstanceList.
* @return The tiebreaking order.
*/
public int[] get_distribution_order() {
    // The order is derived from the label counts held by this list's counters.
    final int[] tiebreakOrder =
        CatDist.tiebreaking_order(counters().label_counts());
    return tiebreakOrder;
}
/** Returns the sum of the weights of all Instances in the InstanceList.
* This value is cached for faster access.
* @return The sum of weights for all Instances stored in this InstanceList.
*/
public double total_weight() {
    // Return the cached sum without recalculating it.
    final boolean recalculate = false;
    return total_weight(recalculate);
}
/** Returns the sum of the weights of all Instances in the InstanceList.
* This value is cached for faster access, but can be recalculated to
* avoid the numerical instabilities involved in weight updates.
* @return The sum of weights for all Instances stored in this InstanceList.
* @param recalculate TRUE if the sum should be recalculated, FALSE if
* the cached value should be used.
*/
public double total_weight(boolean recalculate) {
    if (recalculate) {
        // Re-sum every stored instance's weight and replace the cached value.
        // Each element is fetched with next() before it is used; advancing in
        // the for-loop update clause would hand a null to get_weight() on the
        // first pass and never count the last instance.
        double newTotalWeight = 0;
        ListIterator pix = instances.listIterator();
        while (pix.hasNext()) {
            Instance inst = (Instance) pix.next();
            newTotalWeight += get_weight(inst);
        }
        totalWeight = newTotalWeight;
    }
    return totalWeight;
}
/** Returns the weight for the specified Instance.
* @return The weight for the Instance supplied.
* @param instance The Instance for which weight is questioned.
*/
public double get_weight(Instance instance) {
    final double instanceWeight = instance.get_weight();
    // Debug-only check: a list not flagged as weighted may only hold
    // instances whose weight is exactly 1.0.
    if (Globals.DBG) {
        final boolean weightIsConsistent = weighted || instanceWeight == 1.0;
        MLJ.ASSERT(weightIsConsistent, "InstanceList.get_weight: InstanceList is not weighted");
    }
    return instanceWeight;
}
/** Deletes the counters stored for Instances in this InstanceList.
*/
public void drop_counters() {
    // Clearing the reference is all that is needed to discard the counters.
    if (bagCounters != null) {
        bagCounters = null;
    }
}
/** Normalize all weights by the number of instances in the list.
* After this operation, totalWeight should equal the number of instances.
* The normalization factor is 1 and zeros are allowed for Instance weights.
*/
public void normalize_weights() {
    // Defaults: normalization factor of 1, zero weights permitted.
    final double defaultNormFactor = 1.0;
    final boolean zerosPermitted = true;
    normalize_weights(defaultNormFactor, zerosPermitted);
}
/** Normalize all weights by the number of instances in the list, times
* an optional normalization factor. After this operation, totalWeight
* should equal the number of instances * the normalization factor. Zeros are
* allowed for Instance weights.
* @param normFactor The normalization factor.
*/
public void normalize_weights(double normFactor) {
    // Use the caller's factor; zero weights are permitted by default.
    final boolean zerosPermitted = true;
    normalize_weights(normFactor, zerosPermitted);
}
/** Normalize all weights by the number of instances in the list, times
* an optional normalization factor. After this operation, totalWeight
* should equal the number of instances * the normalization factor.
* @param normFactor The normalization factor.
* @param allowZeros TRUE if zeros are allowed for Instance weights. If FALSE,
* any Instance weight that is approximately equal to 0 is
* automatically reset to a lower bound.
*/
public void normalize_weights(double normFactor,
                              boolean allowZeros) {
    // Drop counters when calling this--you get too many precision
    // errors otherwise.
    drop_counters();

    // Set the weighted flag before any get_weight() call: get_weight()
    // asserts (in debug mode) that an unweighted list only holds weights
    // of exactly 1.0.
    weighted = true;

    // Scale factor that makes the weights sum to num_instances() * normFactor.
    // If the cached total is zero the quotient is undefined and would turn
    // every weight into NaN, so the weights are left unscaled in that case.
    // The lower-bound substitution below still applies when zeros are not
    // allowed.
    double r = 1.0;
    if (totalWeight != 0.0)
        r = normFactor * num_instances() / totalWeight;

    // Replacement for near-zero weights when zeros are not allowed.
    double lbound = MLJ.storedRealEpsilon * 2;

    // Instance.set_weight() can be used directly here because the cached
    // total is rebuilt from scratch at the end.
    double newTotalWeight = 0;
    for (ListIterator li = instances.listIterator(); li.hasNext(); ) {
        Instance p = (Instance) li.next();
        double newWeight = get_weight(p) * r;
        if (!allowZeros && MLJ.approx_equal(newWeight,0.0))
            newWeight = lbound;
        p.set_weight(newWeight);
        newTotalWeight += get_weight(p);
    }

    // The original source had a check that the total is near
    // num_instances() * normFactor, plus an OK() call; both were disabled
    // there because of precision problems introduced in this function.

    // Reset the cached total weight.
    totalWeight = newTotalWeight;
}
//PtrArray<RealAndLabelColumn*>*
/** Splits the InstanceList into several RealAndLabelColumn structures for the
* parallel discretization.
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -