⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 owdatasampler.py

📁 orange源码 数据挖掘技术
💻 PY
字号:
"""
<name>Data Sampler</name>
<description>Selects a subset of instances from the data set.</description>
<icon>icons/DataSampler.png</icon>
<contact>Aleksander Sadikov (aleksander.sadikov(@at@)fri.uni-lj.si)</contact> 
<priority>1125</priority>
"""
from OWWidget import *
import OWGUI
import random

class OWDataSampler(OWWidget):
    
    def __init__(self, parent=None, signalManager=None):
        """Set up signals, default option values and the widget's GUI.

        parent        -- forwarded unchanged to OWWidget.__init__
        signalManager -- forwarded unchanged to OWWidget.__init__

        The controls are created top to bottom in the order they appear in
        the control area, so the statement order below matters.
        """
        OWWidget.__init__(self, parent, signalManager, 'SampleData')
        
        # one input table; the sample and its complement are each offered on
        # a plain and a "classified" channel
        self.inputs = [("Data", ExampleTable, self.cdata)]
        self.outputs = [("Examples", ExampleTable), ("Classified Examples", ExampleTableWithClass), ("Remaining Examples", ExampleTable), ("Remaining Classified Examples", ExampleTableWithClass)]

        # initialization of variables
        self.data = None                        # dataset (incoming stream)
        self.indices = None                     # index generator object built in process()
        self.ind = None                         # indices produced by calling self.indices on the data
        
        self.Stratified = 1                     # use stratified sampling if possible?
        self.Repeat = 0                         # can elements repeat in a sample?
        self.UseSpecificSeed = 0                # use a specific random seed?
        self.RandomSeed = 1                     # specific seed used
        self.GroupSeed = 1                      # current seed for multiple group selection
        self.outFold = 1                        # folder/group to output
        self.Folds = 1                          # total number of folds/groups

        self.SelectType = 0                     # sampling type (LOO, CV, ...)
        self.useCases = 0                       # use a specific number of cases?
        self.nCases = 25                        # number of cases to use
        self.selPercentage = 30                 # sample size in %
        self.LOO = 1                            # use LOO?
        self.CVFolds = 10                       # number of CV folds
        self.CVFoldsInternal = 10               # number of CV folds (for internal use)
        self.nGroups = 3                        # number of groups
        self.pGroups = [0.1,0.25,0.5]           # sizes of groups
        self.GroupText = '0.1,0.25,0.5'         # assigned to Groups Control (for internal use)
 
        # GUI
        # Info Box
        box1 = QVGroupBox("Information", self.controlArea)
        self.infoa = QLabel('No data on input.', box1)
        self.infob = QLabel('', box1)
        self.infoc = QLabel('', box1)
        OWGUI.separator(self.controlArea)

        # Options Box
        box2 = QVGroupBox('Options', self.controlArea)
        OWGUI.checkBox(box2, self, 'Stratified', 'Stratified (if possible)')
        OWGUI.checkWithSpin(box2, self, 'Set random seed:', 0, 32767, 'UseSpecificSeed', 'RandomSeed')
        OWGUI.separator(self.controlArea)

        # Sampling Type Box
        # self.s holds one radio button per sampling type; the position in
        # the list is the value stored in self.SelectType
        self.s = [None, None, None, None]
        self.sBox = QVButtonGroup("Sampling Type", self.controlArea)        

        # Random Sampling
        self.s[0] = QRadioButton('Random sampling', self.sBox)
        # repeat checkbox (the fixed-size empty QWidget indents the control)
        self.h1Box = QHBox(self.sBox)
        QWidget(self.h1Box).setFixedSize(19, 8)
        OWGUI.checkBox(self.h1Box, self, 'Repeat', 'Repeated sampling')
        # specified number of elements checkbox
        self.h2Box = QHBox(self.sBox)
        QWidget(self.h2Box).setFixedSize(19, 8)
        OWGUI.checkWithSpin(self.h2Box, self, 'Sample size (instances):', 1, 1000000000, 'useCases', 'nCases', checkCallback=self.uCases)
        # percentage slider
        self.h3Box = QHBox(self.sBox)
        QWidget(self.h3Box).setFixedSize(19, 8)
        QLabel("Sample size:", self.h3Box)
        self.slidebox = QHBox(self.sBox)
        QWidget(self.slidebox).setFixedSize(19, 8)
        OWGUI.hSlider(self.slidebox, self, 'selPercentage', minValue=1, maxValue=100, step=1, ticks=10, labelFormat="   %d%%")        
        
        # Cross Validation
        self.s[1] = QRadioButton('Cross validation', self.sBox)
        box = QHBox(self.sBox)
        QWidget(box).setFixedSize(19, 8)
        OWGUI.spin(box, self, 'CVFolds', 2, 100, step=1, label='Number of folds:  ', callback=self.changeCombo)

        # Leave-One-Out
        self.s[2] = QRadioButton('Leave-one-out', self.sBox)

        # Multiple Groups
        self.s[3] = QRadioButton('Multiple subsets', self.sBox)        
        gbox = QHBox(self.sBox)
        QWidget(gbox).setFixedSize(19, 8)
        OWGUI.lineEdit(gbox, self, 'GroupText', label='Subset sizes (e.g. "0.1, 0.2, 0.5"):', callback=self.changeCombo)

        # Output Group Box
        OWGUI.separator(self.controlArea)
        self.foldBox = QHGroupBox('Output Data for Fold / Group', self.controlArea)
        QLabel('Fold / group:', self.foldBox)
        self.foldcombo = QComboBox(self.foldBox)
        # fill the combo box with placeholder entries; changeCombo() refills
        # it with the real number of folds / groups
        self.foldcombo.clear()
        for x in range(100):
            self.foldcombo.insertItem(str(x+1))
        self.foldBox.setEnabled(False)
        
        # Select Data Button
        OWGUI.separator(self.controlArea)
        OWGUI.button(self.controlArea, self, 'Sample &Data', callback = self.process)
        self.s[self.SelectType].setChecked(True)    # set initial radio button on (default sample type)

        # CONNECTIONS        
        # set connections for RadioButton (SelectType); the handlers are kept
        # in self.dummy1, and each lambda binds its own button index as a
        # default argument so it reports the right sampling type
        self.dummy1 = [None]*len(self.s)
        for i, button in enumerate(self.s):
            self.dummy1[i] = lambda x, v=i: self.sChanged(x, v)
            self.connect(button, SIGNAL("toggled(bool)"), self.dummy1[i])

        # set connection for ComboBox (fold to output)
        self.connect(self.foldcombo, SIGNAL('activated(int)'), self.foldChanged)
        
        # final touch
        self.resize(200, 275)

    # CONNECTION TRIGGER AND GUI ROUTINES
    # enables RadioButton switching
    def sChanged(self, value, id):
        """React to a sampling-type radio button being toggled.

        value -- the bool carried by the button's toggled(bool) signal
        id    -- index of the toggled button in self.s (the sampling type)

        toggled(bool) is emitted for the button that is switched off as
        well as for the one that is switched on. Only the "on" notification
        is acted upon; otherwise each switch would run process() twice and
        could leave SelectType pointing at the button just deselected.
        """
        if not value:
            return
        self.SelectType = id
        self.process()

    # reflect user's actions that change combobox contents
    def changeCombo(self):
        """Recompute self.Folds for the current sampling type and refill
        the fold combo box with the labels "1" .. str(self.Folds)."""
        kind = self.SelectType
        if kind == 1:
            # cross validation: one entry per fold
            count = self.CVFolds
        elif kind == 2:
            # leave-one-out: one entry per instance, or a single entry
            # when there is no data
            if self.data:
                count = len(self.data)
            else:
                count = 1
        elif kind == 3:
            # multiple subsets: one entry per parsed subset size
            count = self.nGroups
        else:
            count = 1
        self.Folds = count

        # rebuild the combo box contents from scratch
        self.foldcombo.clear()
        for label in [str(number) for number in range(1, count + 1)]:
            self.foldcombo.insertItem(label)
     
    # triggered on change in output fold combobox
    def foldChanged(self, ix):
        """Remember the fold picked in the combo box and resend the data.

        ix -- zero-based index delivered by the combo box's activated(int)
              signal; it is stored one-based in self.outFold
        """
        chosen = ix + 1
        self.outFold = int(chosen)
        if not self.data:
            # nothing to output without input data
            return
        self.sdata()

    # switches between cases and percentage (random sampling)
    def uCases(self):
        """Enable the percentage controls only while the explicit
        "sample size (instances)" option is switched off."""
        percentageActive = self.useCases != 1
        for container in (self.h3Box, self.slidebox):
            container.setEnabled(percentageActive)

    # I/O STREAM ROUTINES
    # handles changes of input stream
    def cdata(self, dataset):
        """Input handler for the "Data" signal.

        dataset -- the incoming table; a false value (None or an object
                   that evaluates false) is treated as "no data"
        """
        if not dataset:
            # no input: reset the info labels, clear every output channel
            # and forget the stored data
            self.infoa.setText('No data on input.')
            for label in (self.infob, self.infoc):
                label.setText('')
            for channel in ("Examples", "Remaining Examples",
                            "Classified Examples", "Remaining Classified Examples"):
                self.send(channel, None)
            self.data = None
            return

        # new data: report its size, store it and resample right away
        self.infoa.setText('%d instances in input data set.' % len(dataset))
        self.data = dataset
        self.process()

    # feeds the output stream
    def sdata(self):
        """Build the sample and its remainder for the current settings and
        send them on all four output channels.

        Reads self.data, self.ind / self.indices (prepared by process()),
        self.outFold and the sampling options, and updates the info labels.
        """
        # select data
        if self.SelectType == 0:
            if self.useCases == 1 and self.Repeat == 1:
                # Sampling with repetition draws the instances one at a time.
                # Use a generator seeded with the user's seed when one is
                # requested, so that a fixed seed gives a reproducible sample.
                if self.UseSpecificSeed == 1:
                    rng = random.Random(self.RandomSeed)
                else:
                    rng = random
                lastIndex = len(self.data) - 1
                sample = orange.ExampleTable(self.data.domain)
                for x in range(self.nCases):
                    sample.append(self.data[rng.randint(0, lastIndex)])
                # with repetitions there is no complement to output
                remainder = None
                self.infob.setText('Random sampling with repetitions, %d instances.' % self.nCases)
            else:                
                sample = self.data.select(self.ind, 0)
                remainder = self.data.select(self.ind, 1)
            self.infoc.setText('Output: %d instances.' % len(sample))
        elif self.SelectType == 3:
            # multiple subsets: indices are regenerated for the size of the
            # currently selected subset
            self.ind = self.indices(self.data, p0 = self.pGroups[self.outFold-1])
            sample = self.data.select(self.ind, 0)
            remainder = self.data.select(self.ind, 1)
            self.infoc.setText('Output: subset %d of %d, %d instance(s).' % (self.outFold, self.Folds, len(sample)))
        else:
            # cross validation / leave-one-out: the selected fold is the
            # sample, everything else is the remainder
            sample = self.data.select(self.ind, self.outFold-1)
            remainder = self.data.select(self.ind, self.outFold-1, negate=1)
            self.infoc.setText('Output: fold %d of %d, %d instance(s).' % (self.outFold, self.Folds, len(sample)))
        # set name (by PJ)
        if sample:
            sample.name = self.data.name
        if remainder:
            remainder.name = self.data.name
        # send data
        self.send("Examples", sample)
        self.send("Remaining Examples", remainder)
        # send classified data if a class exists; otherwise explicitly clear
        # the classified channels so that tables sent for an earlier data
        # set do not linger on them
        if self.data.domain.classVar:
            classified, remainingClassified = sample, remainder
        else:
            classified = remainingClassified = None
        self.send("Classified Examples", classified)
        self.send("Remaining Classified Examples", remainingClassified)

    # MAIN SWITCH
    # processes data after the user requests it
    def process(self):
        """Rebuild the sampling indices from the current settings and
        output the first fold / subset.

        Clears any previous error, resets self.outFold to 1 and returns
        silently when there is no input data. On an invalid setting an
        error is shown via self.error() and nothing is sent.
        """
        # reset errors, fold selected
        self.error()
        self.outFold = 1
        
        # check for data
        if self.data is None:
            return
        self.infob.setText('')
        self.infoc.setText('')

        # Random Selection
        if self.SelectType == 0:
            # apply selected options
            if self.useCases == 1 and self.Repeat != 1:
                # an exact number of distinct instances cannot exceed the data size
                if self.nCases > len(self.data):
                    self.error("Sample size (w/o repetitions) larger than dataset.")
                    return                    
                self.indices = orange.MakeRandomIndices2(p0=int(self.nCases))
                self.infob.setText('Random sampling, using exactly %d instances.' % self.nCases)
            else:
                self.indices = orange.MakeRandomIndices2(p0=float(self.selPercentage/100.0))
                self.infob.setText('Random sampling, %d%% of input instances.' % self.selPercentage)
            if self.Stratified == 1: self.indices.stratified = self.indices.StratifiedIfPossible
            else:                    self.indices.stratified = self.indices.NotStratified
            if self.UseSpecificSeed == 1: self.indices.randseed = self.RandomSeed
            else:                         self.indices.randomGenerator = orange.RandomGenerator(random.randint(0,65536))    
            
            # compute the indices used by sdata()
            self.ind = self.indices(self.data)

        # Cross Validation / LOO            
        elif self.SelectType == 1 or self.SelectType == 2:
            # apply selected options
            if self.SelectType == 2:
                # leave-one-out is run as cross validation with one fold per instance
                self.CVFoldsInternal = len(self.data)
                self.infob.setText('Leave-one-out.')
            else:
                self.CVFoldsInternal = self.CVFolds
                self.infob.setText('%d-fold cross validation.' % self.CVFolds)
            self.indices = orange.MakeRandomIndicesCV(folds = self.CVFoldsInternal)
            if self.Stratified == 1:
                self.indices.stratified = self.indices.StratifiedIfPossible
            else:
                self.indices.stratified = self.indices.NotStratified
            if self.UseSpecificSeed == 1:
                self.indices.randseed = self.RandomSeed
            else:
                self.indices.randseed = random.randint(0,65536)

            # compute the indices used by sdata()
            self.ind = self.indices(self.data)

        # MultiGroup
        elif self.SelectType == 3:
            self.infob.setText('Multiple subsets.')
            # parse group specification string into a local list first, so a
            # malformed entry cannot leave self.pGroups half-filled and out
            # of step with self.nGroups
            try:
                groups = [float(x) for x in self.GroupText.split(',')]
            except (ValueError, TypeError, AttributeError):
                self.error("Invalid specification for sizes of subsets.")
                return
            self.pGroups = groups
            self.nGroups = len(groups)

            # prepare indices generator; sdata() calls it with the size of
            # the selected subset
            self.indices = orange.MakeRandomIndices2()
            if self.Stratified == 1: self.indices.stratified = self.indices.StratifiedIfPossible  
            else:                    self.indices.stratified = self.indices.NotStratified
            if self.UseSpecificSeed == 1: self.indices.randseed = self.RandomSeed
            else:                         self.indices.randomGenerator = orange.RandomGenerator(random.randint(0,65536))    

        # enable fold selection and fill combobox if applicable
        if self.SelectType == 0:
            self.foldBox.setEnabled(False)
        else:
            self.foldBox.setEnabled(True)
            self.changeCombo()

        # call data output routine        
        self.sdata()

##############################################################################
# Test the widget, run from prompt

if __name__=="__main__":
    # stand-alone test run: show the widget inside its own Qt application
    app = QApplication(sys.argv)
    widget = OWDataSampler()
    app.setMainWidget(widget)

    # load 'iris.tab' and push it straight into the input handler
    widget.cdata(orange.ExampleTable('iris.tab'))
    widget.show()
    app.exec_loop()

    # store the settings once the event loop has finished
    widget.saveSettings()

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -