⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 instancelist.java

📁 java数据挖掘算法
💻 JAVA
📖 第 1 页 / 共 5 页
字号:
        fileSchema = new FileSchema(trainList.get_original_schema());
        weighted = false;
        bagCounters = null;
        init_max_vals();
        read_data(testName, true);
    }
    
    
    
    /** Reports whether bagCounters has been built for this InstanceList.
     * @return True when bagCounters is non-null, False while it is still null.
     */
    public boolean has_counters() {
        if (bagCounters == null) {
            return false;
        }
        return true;
    }
    
    /** Gives access to the BagCounters of this list, calling
     * ensure_counters() first so that they exist.
     * @return The bagCounters field as it stands after ensure_counters() ran.
     */
    public BagCounters counters() {
        // Build the counters on demand before handing them out.
        ensure_counters();
        final BagCounters current = bagCounters;
        return current;
    }
    
    /** Builds bagCounters on first use by adding every Instance currently in
     * the list to a fresh BagCounters created from get_schema(). Does nothing
     * when bagCounters already exists.
     */
    public void ensure_counters() {
        if (bagCounters != null) {
            return;
        }

        // Counters are created even for an empty list; they just stay empty.
        bagCounters = new BagCounters(get_schema());
        if (no_instances()) {
            return;
        }

        ListIterator cursor = instances.listIterator();
        while (cursor.hasNext()) {
            Instance current = (Instance) cursor.next();
            bagCounters.add_instance(current);
        }
    }
    
    /** Reads the data set stored in the supplied file into this InstanceList,
     * replacing whatever instances and counters it currently holds.
     * <p>
     * In order, the method: reads the REMOVE_UNKOWN_INST and
     * CORRUPT_UNKOWN_RATE options; empties the list and drops bagCounters;
     * tokenizes the file and passes each non-empty line to read_data_line();
     * optionally removes instances with unknown attributes; optionally
     * corrupts values to unknown; projects out unknown attributes; applies the
     * loss specification of the FileSchema; and finally re-sets the schema.
     * <p>
     * Failures are reported through Error.err() rather than thrown to the
     * caller. The underlying reader is always closed before returning.
     * @param file		The name of the file containing the data set.
     * @param isTest	Indicator of whether this is a test data set. True
     * indicates this is a test data set, False otherwise.
     */
    public void read_data(String file, boolean isTest) {
        GetEnv getenv = new GetEnv();

        removeUnknownInstances = getenv.get_option_bool("REMOVE_UNKOWN_INST");
        corruptToUnknownRate = getenv.get_option_real_range("CORRUPT_UNKOWN_RATE");

        remove_all_instances();
        bagCounters = null;

        // Declared outside the try so the finally block can close it.
        BufferedReader dataFile = null;
        try {
            dataFile = new BufferedReader(new FileReader(file));

            // Tokenizer setup: line ends separate instances, '|' starts a
            // comment, and '?', ',' and '.' are delivered as single-character
            // tokens. '_' and ' ' are treated as parts of words.
            StreamTokenizer dataStream = new StreamTokenizer(dataFile);
            dataStream.eolIsSignificant(true);
            dataStream.commentChar((int)'|');
            dataStream.ordinaryChar((int)'?');
            dataStream.ordinaryChar((int)',');
            dataStream.ordinaryChar((int)'.');
            dataStream.wordChars((int)'_',(int)'_');
            dataStream.wordChars((int)' ',(int)' ');
            // The flag passed to parseNumbers depends on whether the first
            // attribute of the file schema is real-valued.
            parseNumbers(dataStream, fileSchema.attrInfos[0] instanceof RealAttrInfo);

            InstanceReader reader = new InstanceReader(this, maxAttrVals, isTest);

            fileSchema.skip_white_comments_same_line(dataFile);

            try {
                while (dataStream.nextToken() != StreamTokenizer.TT_EOF) {
                    // Bare end-of-line tokens carry no instance; skip them.
                    if (dataStream.ttype != StreamTokenizer.TT_EOL) {
                        read_data_line(dataStream, isTest, reader);
                    }
                }

                //done reading; release the list
                reader.release_list();

                if (removeUnknownInstances) {
                    remove_inst_with_unknown_attr();
                }

                if (no_instances()) {
                    System.out.println("InstanceList.read_data WARNING: no"
                            + " instances in file");
                }

                // The seed and generator are reset on every read, so the seed
                // option is looked up each time corruption is requested.
                unknownSeed = -1;
                mrandomForUnknowns = null;
                if (corruptToUnknownRate > 0) {
                    unknownSeed = getenv.get_option_int("UNKOWN_RATE_SEED");
                    mrandomForUnknowns = new Random(unknownSeed);
                    corrupt_values_to_unknown(corruptToUnknownRate, mrandomForUnknowns);
                }

                //remove any nominals which have no values other than unknowns here
                try {
                    remove_unknown_attributes();  //causes problems!
                } catch (CloneNotSupportedException e) {
                    Error.err("Clone not supported exception caught");
                }

                //apply the loss matrix (from the FileSchema) now
                fileSchema.apply_loss_spec(schema);

                // Re-set the (possibly projected) schema on this list.
                Schema newSchema = schema;
                try {
                    set_schema(newSchema);
                } catch (CloneNotSupportedException e) {
                    Error.err("Clone not supported exception caught");
                }
            } catch (IOException e) {
                Error.err("InstanceList.read_data"
                        + " ERROR");
            }
        } catch (FileNotFoundException e) {
            Error.err("-"
                    + " Data file NOT found");
        } finally {
            // Release the file handle on every path, including failures.
            if (dataFile != null) {
                try {
                    dataFile.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
    }
    
    /** Deletes from the list every Instance that has at least one attribute
     * value which its AttrInfo reports as unknown.
     * <p>
     * Visibility note: this method used to be private and was made public as
     * a change for C45.
     */
    public void remove_inst_with_unknown_attr() {
        ListIterator cursor = instances.listIterator(0);
        while (cursor.hasNext()) {
            Instance candidate = (Instance) cursor.next();

            // Stop scanning attributes as soon as one unknown value is found.
            boolean foundUnknown = false;
            for (int idx = 0; idx < num_attr(); idx++) {
                AttrInfo info = attr_info(idx);
                if (info.is_unknown(candidate.get_value(idx))) {
                    foundUnknown = true;
                    break;
                }
            }

            if (foundUnknown) {
                // Removes the element last returned by cursor.next().
                remove_instance(cursor, candidate);
            }
        }
    }
    
    /** Removes an Instance from the list through the supplied iterator and
     * keeps the counters and the cached total weight consistent with it.
     * <p>
     * The element actually removed from the list is the one last returned by
     * pix; the instance argument is what gets subtracted from the counters
     * and the weight cache, so callers are expected to pass that same element.
     * @param pix		The ListIterator whose last returned element is removed.
     * @param instance 	The Instance to be removed. A null value is reported
     * through Error.err().
     */
    public void remove_instance(ListIterator pix,Instance instance) {
        if(instance==null)
            Error.err("InstanceList.remove_instance: tried "
            +"to dereference a null instance -->fatal_error");
        pix.remove();
        //Remove from counters if we have them
        if(bagCounters!=null)
            bagCounters.del_instance(instance);

        //Update totalWeight cache: subtract the weight that just left the
        //list instead of overwriting the running total.
        totalWeight -= instance.get_weight();
    }
    
    /** Empties this InstanceList and resets the cached total weight to zero.
     * The counters are left untouched here.
     */
    public void remove_all_instances() {
        MLJ.ASSERT(instances != null,"InstanceList.remove_all_instances: instance is null");
        instances.clear();
        totalWeight = 0;
    }
    
    /** Reports how many Instances this InstanceList currently holds, as given
     * by size() of the underlying list.
     * @return The number of Instances contained in this list.
     */
    public int num_instances() {
        final int count = instances.size();
        return count;
    }
    
    /** Reports the number of values of the label attribute, obtained from
     * nominal_label_info(); this relies on the label being castable to a
     * nominal attribute.
     * @return The number of values of the nominal label.
     */
    public int num_categories() {
        NominalAttrInfo labelInfo = nominal_label_info();
        return labelInfo.num_values();
    }
    
    /** Fetches the label information of this InstanceList's schema and casts
     * it to its nominal form.
     * @return The result of cast_to_nominal() on label_info().
     */
    public NominalAttrInfo nominal_label_info() {
        AttrInfo labelInfo = label_info();
        return labelInfo.cast_to_nominal();
    }
    
    /** Looks up the label information held by the schema that get_schema()
     * returns for this InstanceList.
     * @return The label AttrInfo of the schema.
     */
    public AttrInfo label_info() {
        AttrInfo labelInfo = get_schema().label_info();
        return labelInfo;
    }
    
    /** Tells whether this InstanceList is empty.
     * @return True if the list holds no Instances, False if it holds at
     * least one.
     */
    public boolean no_instances() {
        return instances.isEmpty();
    }
    
    /** Projects out every attribute that can be cast to nominal and whose
     * nominal information reports zero values; all other attributes are kept.
     * The projection itself is delegated to project_in_place().
     *
     * @throws CloneNotSupportedException If project_in_place() throws it.
     */
    private void remove_unknown_attributes() throws CloneNotSupportedException {
        final int attrCount = num_attr();
        boolean[] keep = new boolean[attrCount];
        for (int idx = 0; idx < attrCount; idx++) {
            // An attribute is dropped only when it is nominal-castable AND empty.
            boolean emptyNominal = schema.attr_info(idx).can_cast_to_nominal()
                    && schema.nominal_attr_info(idx).num_values() == 0;
            keep[idx] = !emptyNominal;
        }
        project_in_place(keep);
    }
    
    /** Exposes the underlying list of Instances. The list object itself is
     * returned, not a copy.
     * @return The LinkedList containing the Instances stored in this InstanceList.
     */
    public LinkedList instance_list() {
        return this.instances;
    }
    
    /** This function is very similar to project(), except that the list is
     * projected "in place"--attributes are removed directly from the list
     * and the schema is updated.
     * @param projMask An array of boolean values indicating which attributes shall be used in this
     * InstanceList object. The values of projMask correspond, in order, to the attributes.
     * A value of TRUE indicates that the attribute will be used; FALSE indicates that the
     * attribute will not be used.
     * @throws CloneNotSupportedException if the cloning process in Schema encounters an exception.
     */
    public void project_in_place(boolean[] projMask) throws CloneNotSupportedException {
        MLJ.ASSERT(schema != null,"InstanceList.project_in_place: schema is null");
        Schema newSchema = new Schema(schema.project(projMask));
        
        //Project all instances in the list "in place" --we cheat a bit
        // here because we have instances in the list with different
        // schemas.  However, we clean everything up at the end and check
        // the schemas carefully.
        
        int numInstBefore = num_instances();
        //ListIterator temp = instances.listIterator(0);
        int index = 0;
        for(int i=0;i<numInstBefore;i++) {
            //Work ona  temporary pix; otherwise we'll remove an instance
            //  before advancing the pix which is bad.
            

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -