📄 addnoise.java
字号:
/**
 * Selects the attribute whose values will be altered when noise is added.
 *
 * @param attIndex the attribute index, handed unchanged to
 * {@code m_AttIndex.setSingleIndex}
 */
public void setAttributeIndex(String attIndex) {

  this.m_AttIndex.setSingleIndex(attIndex);
}
/**
 * Sets the format of the input instances and checks that the chosen
 * attribute can take noise.
 *
 * The chosen attribute must be nominal. Unless missing values are used
 * as a value of their own (m_UseMissing), it must also have at least
 * two values. The output format is set to the input format.
 *
 * @param instanceInfo an Instances object containing the input
 * instance structure (any instances contained in the object are
 * ignored - only the structure is required).
 * @return false in every case that does not throw
 * @exception Exception if the input format can't be set
 * successfully, or if the chosen attribute fails the checks above
 */
public boolean setInputFormat(Instances instanceInfo)
    throws Exception {

  super.setInputFormat(instanceInfo);

  final Instances format = getInputFormat();

  // give the index its upper bound (position of the last attribute)
  // before reading the resolved index
  m_AttIndex.setUpper(format.numAttributes() - 1);
  final int chosen = m_AttIndex.getIndex();

  // only nominal attributes are accepted
  if (!format.attribute(chosen).isNominal()) {
    throw new Exception("Adding noise is not possible:"
                        + "Chosen attribute is numeric.");
  }

  // without the missing value as an extra value, at least two
  // values are required
  if (!m_UseMissing) {
    if (format.attribute(chosen).numValues() < 2) {
      throw new Exception("Adding noise is not possible:"
                          + "Chosen attribute has less than two values.");
    }
  }

  setOutputFormat(format);
  m_NewBatch = true;
  return false;
}
/**
 * Input an instance for filtering. The instance is only buffered here
 * (added to the input format); the buffered instances are processed
 * and pushed to the output queue in batchFinished().
 *
 * @param instance the input instance
 * @return false in every case that does not throw, since no filtered
 * instance is made available by this method
 * @exception Exception if the input format was not set
 */
public boolean input(Instance instance) throws Exception {

  // check if input format is defined
  if (getInputFormat() == null) {
    throw new Exception("No input instance format defined");
  }

  // the first instance of a new batch discards the old output queue
  if (m_NewBatch) {
    resetQueue();
    m_NewBatch = false;
  }

  // Buffer the instance. An explicit copy that was made here before
  // was never used, so it has been dropped.
  // NOTE(review): presumably Instances.add stores its own copy of the
  // instance - confirm against the Weka version in use.
  getInputFormat().add(instance);
  return false;
}
/**
 * Signify that this batch of input to the filter is finished.
 * Noise is added to the buffered input instances and a copy of each
 * of them is pushed to the output queue, so output() may now be
 * called to retrieve the filtered instances.
 *
 * @return true if there are instances pending output
 * @exception Exception if no input structure has been defined
 */
public boolean batchFinished() throws Exception {

  if (getInputFormat() == null) {
    throw new Exception("No input instance format defined");
  }

  // add noise to the buffered input instances
  addNoise (getInputFormat(), m_RandomSeed, m_Percent, m_AttIndex.getIndex(),
            m_UseMissing);

  // NOTE(review): the buffered input instances are not cleared in this
  // method; verify whether they should be before a further batch
  // is fed in.
  for (int i = 0; i < getInputFormat().numInstances(); i++) {
    push (new Instance(getInputFormat().instance(i)));
  }

  m_NewBatch = true;
  return (numPendingOutput() != 0);
}
/**
 * Adds noise to one attribute of the dataset.
 *
 * The instances are visited in a random order derived from seed. For
 * every value of the attribute, the given percentage of the instances
 * carrying that value is changed to a different, randomly drawn value.
 * Because the percentage is applied to each value separately, every
 * value gives up the same share of its instances.
 * If useMissing is true, the missing value is treated as a value of
 * its own: the percentage is applied to the instances with a missing
 * value as well, and missing may be drawn as a new value. If it is
 * false, instances with a missing value are left untouched.
 *
 * @param instances the dataset, modified in place
 * @param seed seed for the random number generators
 * @param percent percentage of instances that are changed
 * @param attIndex index of the attribute changed
 * @param useMissing if true missing values are treated as extra value
 */
public void addNoise (Instances instances,
                      int seed,
                      int percent,
                      int attIndex,
                      boolean useMissing) {

  final int numInstances = instances.numInstances();
  final double splitPercent = (double) percent; // percentage used for splits

  // fill array with the indexes
  final int[] indexList = new int[numInstances];
  for (int i = 0; i < numInstances; i++) {
    indexList[i] = i;
  }

  // randomize list of indexes
  // NOTE(review): for i >= 1 the swap partner is drawn from [0, i-1],
  // never i itself, so this is not the textbook Fisher-Yates shuffle.
  // Left unchanged so that results for a given seed stay the same.
  Random random = new Random(seed);
  for (int i = numInstances - 1; i >= 0; i--) {
    int hValue = indexList[i];
    int hIndex = (int)(random.nextDouble() * (double) i);
    indexList[i] = indexList[hIndex];
    indexList[hIndex] = hValue;
  }

  // per attribute value: how many instances have been changed so far
  // (changedPerValue) and how many have to be changed (quotaPerValue);
  // the same pair for the missing value is kept in missingChanged and
  // missingQuota. New int arrays start out filled with zeros.
  final int numValues = instances.attribute(attIndex).numValues();
  final int[] changedPerValue = new int[numValues];
  final int[] quotaPerValue = new int[numValues];
  int missingChanged = 0;
  int missingQuota = 0;

  // count the occurrences of every value and of the missing value,
  // temporarily using the quota variables as counters
  for (int i = 0; i < numInstances; i++) {
    Instance instance = instances.instance(i);
    if (instance.isMissing(attIndex)) {
      missingQuota++;
    } else {
      quotaPerValue[(int) instance.value(attIndex)]++;
    }
  }

  // turn the counts into quotas: the given percentage of each count,
  // rounded to the nearest integer; missing values are only changed
  // if they are treated as a value of their own
  if (!useMissing) {
    missingQuota = 0;
  } else {
    missingQuota = (int) (((double) missingQuota / 100) * splitPercent + 0.5);
  }
  int totalQuota = missingQuota;
  for (int i = 0; i < numValues; i++) {
    quotaPerValue[i] = (int) (((double) quotaPerValue[i] / 100) * splitPercent
                              + 0.5);
    totalQuota += quotaPerValue[i];
  }

  // number of instances changed so far; used to stop as soon as all
  // quotas are filled
  int totalChanged = 0;

  // add noise, walking through the instances in the shuffled order;
  // the value generator is seeded with the same seed as the shuffle
  Random randomValue = new Random (seed);
  for (int i = 0; i < numInstances; i++) {
    if (totalChanged >= totalQuota) {
      break; // finished
    }
    Instance currInstance = instances.instance(indexList[i]);

    if (currInstance.isMissing(attIndex)) {
      // missing value: change it while the missing quota is not filled
      if (missingChanged < missingQuota) {
        changeValueRandomly (randomValue,
                             numValues,
                             attIndex,
                             currInstance,
                             useMissing);
        missingChanged++;
        totalChanged++;
      }
    } else {
      // regular value: change it while the quota of that value is not filled
      int vIndex = (int) currInstance.value(attIndex);
      if (changedPerValue[vIndex] < quotaPerValue[vIndex]) {
        changeValueRandomly (randomValue,
                             numValues,
                             attIndex,
                             currInstance,
                             useMissing);
        changedPerValue[vIndex]++;
        totalChanged++;
      }
    }
  }
}
/**
 * Replaces the value of one attribute of an instance by a different,
 * randomly drawn value.
 *
 * The missing value is represented internally by the pseudo value
 * numOfValues, i.e. the highest regular value index plus one. It can
 * only be drawn as the new value if useMissing is true.
 * If the attribute has exactly two values and the current value is not
 * missing, the value is simply flipped to the other one and no random
 * number is drawn.
 * If no value different from the current one can be drawn at all, the
 * instance is left unchanged (the search for a new value would never
 * end otherwise).
 *
 * @param r random number generator the new value is drawn from
 * @param numOfValues number of values of the attribute
 * @param indexOfAtt index of the attribute that is changed
 * @param instance the instance that is changed in place
 * @param useMissing if true the missing value may be drawn as new value
 */
private void changeValueRandomly(Random r, int numOfValues,
                                 int indexOfAtt,
                                 Instance instance,
                                 boolean useMissing) {

  // get current value; a missing value is mapped to numOfValues,
  // which is the highest possible value plus one
  final boolean wasMissing = instance.isMissing(indexOfAtt);
  final int currValue =
    wasMissing ? numOfValues : (int) instance.value(indexOfAtt);

  // new values are drawn from 0 .. candidates-1; the extra candidate
  // numOfValues stands for the missing value
  final int candidates = useMissing ? numOfValues + 1 : numOfValues;

  // number of candidates that differ from the current value; with none
  // available there is nothing to change to
  final int alternatives =
    (currValue < candidates) ? candidates - 1 : candidates;
  if (alternatives < 1) {
    return;
  }

  // with only two possible values it is easier: take the other one
  if ((numOfValues == 2) && !wasMissing) {
    instance.setValue(indexOfAtt, (double) ((currValue + 1) % 2));
    return;
  }

  // draw until a value not equal to the current value comes up
  while (true) {
    int newValue = (int) (r.nextDouble() * (double) candidates);
    if (newValue != currValue) {
      // the value 1 above the highest possible value (=numOfValues)
      // is used as missing value
      if (newValue == numOfValues) {
        instance.setMissing(indexOfAtt);
      } else {
        instance.setValue(indexOfAtt, (double) newValue);
      }
      break;
    }
  }
}
/**
 * Command line entry point for testing this class.
 *
 * If the flag -b is present the filter is run through
 * Filter.batchFilterFile, otherwise through Filter.filterFile. The
 * message of any exception raised is printed to standard output.
 *
 * @param argv should contain arguments to the filter:
 * use -h for help
 */
public static void main(String [] argv) {

  try {
    final boolean batchMode = Utils.getFlag('b', argv);
    if (!batchMode) {
      Filter.filterFile(new AddNoise(), argv);
    } else {
      Filter.batchFilterFile(new AddNoise(), argv);
    }
  } catch (Exception ex) {
    System.out.println(ex.getMessage());
  }
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -