📄 conceptdriftsimulator.java.old
字号:
* @version $Id: ConceptDriftSimulator.java,v 2.6 2003/04/04 11:59:30 fischer Exp $ */public class ConceptDriftSimulator extends ValidationChain { // History: // -> RK/2001 / RK/2002: re-implementation of the concept drift simulator of the text classification // experiment environment DyCE (e.g. used in [Klinkenberg/1998a] and // [Klinkenberg/Joachims/2000a]) int Yale; // -> RK/2002/05/21: adaption to Yale extensions related to the parameter service etc.; // -> RK/2003/03/21: merger of RK's Yale 1.0 version into Yale 2.0; private static final Class[] INPUT_CLASSES = { ExampleSet.class }; // exactly like a ValidationChain private static final Class[] OUTPUT_CLASSES = { PerformanceVector.class, RunVector.class }; /** label to be used for examples considered interesting to the user (relevant) in the concept drift simulation */ // private static final String POS_LABEL = "+1"; // label as String -- //// private static final double POS_LABEL = +1.0; // label as double ++ /** label to be used for examples considered not interesting to the user (non-relevant) in the concept drift simulation */ // private static final String NEG_LABEL = "-1"; // label as String -- //// private static final double NEG_LABEL = -1.0; // label as double ++ /** type of enclosed learner: static learner to be used on all old data (= full memory approach). */ private static final int STATIC_LEARNER = 0; /** type of enclosed learner: static learner to be used on a fixe time window on the old data * (= no memory approach for window size 1, or other fixed window size approach otherwise). */ private static final int STATIC_WINDOW_LEARNER = 1; /** type of enclosed learner: adaptive learner that maintains an adaptive time window or example weighting by itself. */ private static final int ADAPTIVE_LEARNER = 2; /** names of the learner types as they may be specified in the Yale configuration file. */ private static final String[] LEARNER_TYPE_NAMES = {"static","static_window","adaptive"}; /** default time window size of a fixed window (= 3 batches). */ private static final int DEFAULT_TIME_WINDOW_SIZE = 3; // private PerformanceCriterion lastPerformance; // in super class // private IOContainer learningResult; // in super class private ExampleSet inputSet; // input example set passed to this operator private ExampleSet exampleSet; // copy of this example set with additional attributes for internal use // (weight, time_index, batch_index, user_interest (= new class label) // private Attribute weightAttribute; // weight attribute (weight of an example) //// <- obsolete ///// private Attribute timeIndexAttribute; // time index attribute (describing simulated order of examples in time) private Attribute batchIndexAttribute; // batch index attribute (number of the batch an example is assigned to) private Attribute streamNameAttribute; // original class label: name of the stream an example comes from private Attribute userInterestAttribute; // new class label: simulated user interest in an example private int noOfRuns, currentRun; // (= XVal.number, XVal.iteration) private int noOfBatches, currentBatch; private int noOfStreams; // no. of input data streams (e.g. no. of topic of text documents) private int noOfExamples; // total no. of examples (e.g. documents) in the input example set private int batchSize; // no. of documents per batch (evenly distributed, left overs discarded) private String[] streamNames; private double[][] streamRelevance; // streamRelevance[s][b] = probability for a example from stream 's' // to be relevant in batch 'b' private int[] timeIndex2example; // maps a time point to the example (e.g. document) at that time point private int[] example2timeIndex; // maps an example (e.g. document) to a time point private int[] example2batchIndex; // maps an example (e.g. document) to a batch private int[] example2label; // maps an example (e.g. document) to its (user interest) label private int[] example2streamIndex; // example2streamIndex[e] = index of the stream the example 'e' comes from private int[][] noOfStreamExamplesInBatch; // noOfStreamExamplesInBatch[s][b] = no. of examples from stream 's' // in batch 'b' private int[] noOfExamplesInBatch; // noOfExamplesInBatch[b] = no. of examples in batch 'b' private int learnerType; // type of enclosed learner (static vs. static window vs. adaptive) private int timeWindowSize; // window size in case of a static window learner (window of fixed size) private static final int ILLEGAL_STREAM_INDEX = -1; public ConceptDriftSimulator() { // Make the number of the current run of this operator, which starts with 1 and // goes up to 'noOfRuns' (= parameter 'number_of_runs'), and which can for example // be used to monitor the progress of this operator, externally accessible: addValue(new Value("run", "The number of the current run.") { public double getValue() { return currentRun; } }); } /** returns the the classes this operator expects as input. */ public Class[] getInputClasses() { return INPUT_CLASSES; } /** returns the the classes this operator provides as output. */ public Class[] getOutputClasses() { return OUTPUT_CLASSES; } // ==== apply() : Concept Drift Simulation ==== // public IOObject[] apply() throws OperatorException { // #### initialize #### LogService.logMessage("ConceptDriftSimulator '" + getName() + "': prepare concept drift simulation", LogService.TASK); // ---- retrieve the operator input objects ---- IOContainer input = getInput(); // operator input objects inputSet = (ExampleSet)input.getInput(ExampleSet.class); // input example set exampleSet = (ExampleSet)inputSet.clone(); // internal copy of the input example set // ---- retrieve the operator parameters (no. of runs, no. of batches, data streams) ---- noOfRuns = getParameterAsInt("number_of_runs"); // min. 2 runs, default 10 runs noOfBatches = getParameterAsInt("number_of_batches"); // min. 2 batches, default 10 batches noOfStreams = getParameterAsInt("number_of_streams"); // min. 2 streams, default 2 streams // data_stream_relevance[s][b] = probability of a document of stream 's' (= Topic t) to be relevant in batch 'b') streamNames = new String[noOfStreams]; streamRelevance = new double[noOfStreams][noOfBatches]; String streamNamesString = getParameterAsString("data_stream_names"); scanStreamNames (streamNames, streamNamesString); String streamRelevanceString = getParameterAsString("data_stream_relevance"); scanStreamRelevanceSpecification (streamRelevance, streamRelevanceString, streamNames, exampleSet); // ---- compute further variables ---- noOfExamples = (exampleSet.getSize()); batchSize = ((int) (((double) noOfExamples) / ((double) noOfBatches))); // sanity check: min. 2 batches, min. 1 example per batch (=> min. 2 examples): if (noOfBatches < 2) { throw new FatalException("ConceptDriftSimulator '"+getName()+"': There must be at least 2 batches " + "for a concept drift simulation (here "+noOfBatches+" batches)."); } if (batchSize < 1) { throw new FatalException("ConceptDriftSimulator '"+getName()+"': There must be at least as many examples as " + "batches for a concept drift simulation (here "+noOfExamples+" examples are to be " + "distributed to " + noOfBatches + " batches)."); } // ---- retrieve the operator parameters (learner type, window size) ---- learnerType = getParameterAsInt ("learner_type"); timeWindowSize = getParameterAsInt ("window_size"); // #### add new attributes to example set: [stream[_index]], time_index, batch_index, weight, user_interest #### // #### (and set 'stream', 'time_index', and 'batch_index' to 'unused') #### // keep old class labels (= data stream names) in a new attribute before dropping the old class label: streamNameAttribute = (Attribute) (exampleSet.getLabel()).clone(); streamNameAttribute.setName (Attribute.createName("stream_name")); // exampleSet.appendAttributeReference (new AttributeReference (streamNameAttribute,false)); //// create special weight attribute (get /set values via Example.getWeight /.setWeight): exampleSet.createWeightAttribute(); //// create new attributes for the time index and the batch index of an example: timeIndexAttribute = new Attribute (Attribute.createName("time_index"), // create new attribute Ontology.INTEGER, Ontology.SINGLE_VALUE, Attribute.UNDEFINED_BLOCK_NR, null); (exampleSet.getExampleTable()).addAttribute(timeIndexAttribute); // create example table column // exampleSet.appendAttributeReference (new AttributeReference (timeIndexAttribute,false)); // register ref. in example set batchIndexAttribute = new Attribute (Attribute.createName("batch_index"), Ontology.INTEGER, Ontology.SINGLE_VALUE, Attribute.UNDEFINED_BLOCK_NR, null); (exampleSet.getExampleTable()).addAttribute(batchIndexAttribute); // exampleSet.appendAttributeReference (new AttributeReference (batchIndexAttribute,false)); //// create new attribute for the new class labels (= simulated user interest): userInterestAttribute = new Attribute (Attribute.createName("user_interest"), Ontology.CLASSIFICATION, Ontology.SINGLE_VALUE, Attribute.UNDEFINED_BLOCK_NR, null); (exampleSet.getExampleTable()).addAttribute(userInterestAttribute); //// exampleSet.appendAttributeReference (new AttributeReference (userInterestAttribute,true)); ////// change class label attribute from stream name (original) to user interest (new): exampleSet.setLabel (userInterestAttribute); LogService.logMessage("ConceptDriftSimulator '" + getName() + "': start concept drift simulation", LogService.TASK); // #### loop for runs #### BatchedExampleSet currentExampleSet; // current ExampleSet // performance result data structures: SeriesVector performanceResultMatrix = new SeriesVector(); // results of all batches for all runs RunVector singleRunResults; // result time series of a single run RunVector averagedTimeSeriesResults; // average result time series, averaged over all runs PerformanceVector averagedOverallResults; // overall result averaged over all runs and batches for (int run = 0; run < noOfRuns; run++) { singleRunResults = new RunVector(); // #### sample example2batch and example2class #### sampleExamples2Batches(); sampleExamples2Labels(); // #### loop for batches #### // Training set: batches 0..b-1 (for induction) (vs.: for transduction also batch b, but without labels) // Test set: batch b for (int batch = 1; batch < noOfBatches; batch++) { LogService.logMessage("ConceptDriftSimulator '" + getName() + "': current run = " + (run+1) + ", current batch = " + batch + "\n", LogService.TASK); // ---- prepare training set ---- // induction: int firstBatch = 0; switch (learnerType) { case STATIC_WINDOW_LEARNER : firstBatch = maximumInt (0, batch-timeWindowSize); break; case STATIC_LEARNER : case ADAPTIVE_LEARNER : default : firstBatch = 0; } currentExampleSet = new BatchedExampleSet (exampleSet, batchIndexAttribute, firstBatch, batch-1); // // // transduction: // currentExampleSet = new BatchedExampleSet (exampleSet, batchIndexAttribute, firstBatch, batch); // // ... for transduction: clone label and set to unlabeled for last batch // ---- call learning chain ---- // ((SVMLearner)((OperatorChain)this.getOperator(0)).getOperator(0)).setPositiveLabelIndex((int)userInterestAttribute.mapString(Attribute.POSITIVE_CLASS)); // TMP: HACK !!! learn((ExampleSet)currentExampleSet); // if (learnerType == ADAPTIVE_LEARNER) : BatchLearner < Learner, // learn(currentExampleSet,batchIndexAttribute,firstBatch,currentBatch,lastBatch) // or learn(currentExampleSet) with getBatchIndexAttribute(), getFirstBatch(), getLastBatch(); // ---- prepare test set ---- currentExampleSet = new BatchedExampleSet (exampleSet, batchIndexAttribute, batch, batch); // ---- call applier and performance evaluator chain and get its output ---- IOContainer evalOutput = evaluate(currentExampleSet); // read results PerformanceVector currentResults = (PerformanceVector)evalOutput.getInput(PerformanceVector.class); setLastPerformance(currentResults.get(0)); singleRunResults.add(currentResults); inApplyLoop(); // for GNU-Plot-Service } performanceResultMatrix.add(singleRunResults); } // end of 'run' loop // #### average over runs and batches #### averagedTimeSeriesResults = performanceResultMatrix.getTimePointWiseAverageRunVector(); averagedOverallResults = performanceResultMatrix.getOverallAveragePerformanceVector(); // equivalent to line below // averagedOverallResults = RunVector.average(averagedTimeSeriesResults); // equivalent to line above // #### construct operator output #### IOObject[] outputArray = new IOObject[2]; outputArray[0] = averagedOverallResults; outputArray[1] = averagedTimeSeriesResults;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -