⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 ghmm.py

📁 一个通用的隐性马尔可夫C代码库 开发环境:C语言 简要说明:这是一个通用的隐性马尔可夫C代码库
💻 PY
📖 第 1 页 / 共 5 页
字号:
    #-------------------------------------------------------------------------------#- Distribution and derived  ---------------------------------------------------class Distribution:    """ Abstract base class for distribution over EmissionDomains    """    # From Spass (S. Rahmann):    # add density, mass, cumuliative dist, quantils, sample, fit pars,    # momentsclass DiscreteDistribution(Distribution):    """ A DiscreteDistribution over an Alphabet: The discrete distribution        is parameterized by the vectors of probabilities.    """    def __init__(self, alphabet):        self.alphabet = alphabet        self.prob_vector = None            def set(self, prob_vector):        self.prob_vector = prob_vector    def get(self):        return self.prob_vector        class ContinousDistribution(Distribution):    passclass GaussianDistribution(ContinousDistribution):    # XXX attributes unused     def __init__(self, domain):        self.emissionDomain = domain        self.mu = None        self.sigma = None            def set(self, (mu, sigma)):        self.mu = mu        self.sigma = sigma    def get(self):        return (self.mu, self.sigma2)        class MixtureContinousDistribution(ContinousDistribution):    passclass GaussianMixtureDistribution(MixtureContinousDistribution):    def __init__(self, domain):        self.emissionDomain = domain        self.M = None   # number of mixture components        self.mu = []        self.sigma = []        self.weight = []            def set(self, index, (mu, sigma,w)):        pass        # assert M > index         # self.mu = mu        # self.sigma = sigma    def get(self):        pass#-------------------------------------------------------------------------------#Sequence, SequenceSet and derived  ------------------------------------------# XXX write to FASTA function# XXX check for seq_label existanceclass EmissionSequence:    """ An EmissionSequence contains the *internal* representation of        a sequence of emissions. It also contains a reference to the        domain where the emission orginated from.            """        def __init__(self, emissionDomain, sequenceInput):        self.emissionDomain = emissionDomain                 if self.emissionDomain.CDataType == "int": # underlying C data type is integer                        # necessary C functions for accessing the sequence_t struct            self.getPtr = ghmmwrapper.get_col_pointer_int # defines C function to be used to access a single sequence            self.getSymbol = ghmmwrapper.get_2d_arrayint            self.setSymbol = ghmmwrapper.set_2d_arrayint            self.cleanFunction = ghmmwrapper.sequence_clean                        if isinstance(sequenceInput, list):  # from list                internalInput = map( self.emissionDomain.internal, sequenceInput)                (seq,l) = ghmmhelper.list2matrixint([internalInput])                self.cseq = ghmmwrapper.sequence_calloc(1)                self.cseq.seq = seq                self.cseq.seq_number = 1                                # deactivating labels                self.cseq.state_labels = None                self.cseq.state_labels_len = None                                                ghmmwrapper.set_arrayint(self.cseq.seq_len,0,l[0])                             elif isinstance(sequenceInput, str): # from file                # reads in the first sequence struct in the input file                if  not path.exists(sequenceInput):                     raise IOError, 'File ' + str(sequenceInput) + ' not found.'                else:                    self.cseq  = ghmmwrapper.seq_read(sequenceInput)                            elif isinstance(sequenceInput, ghmmwrapper.sequence_t):# internal use                if sequenceInput.seq_number > 1:                    raise badCPointer, "Use SequenceSet for multiple sequences."                self.cseq = sequenceInput                            else:                    raise UnknownInputType, "inputType " + str(type(sequenceInput)) + " not recognized."                elif self.emissionDomain.CDataType == "double": # underlying C data type is double                        # necessary C functions for accessing the sequence_d_t struct            self.getPtr = ghmmwrapper.get_col_pointer_d # defines C function to be used to access a single sequence            self.getSymbol = ghmmwrapper.get_2d_arrayd            self.setSymbol = ghmmwrapper.set_2d_arrayd            self.cleanFunction = ghmmwrapper.sequence_d_clean                                    if isinstance(sequenceInput, list):                 (seq,l) = ghmmhelper. list2matrixd([sequenceInput])                self.cseq = ghmmwrapper.sequence_d_calloc(1)                self.cseq.seq = seq                self.cseq.seq_number = 1                ghmmwrapper.set_arrayint(self.cseq.seq_len,0,l[0])                            elif isinstance(sequenceInput, str): # from file                # reads in the first sequence struct in the input file                if  not path.exists(sequenceInput):                     raise IOError, 'File ' + str(sequenceInput) + ' not found.'                else:                    self.cseq  = ghmmwrapper.seq_d_read(sequenceSetInput)                                            elif isinstance(sequenceInput, ghmmwrapper.sequence_d_t): # internal use                if sequenceInput.seq_number > 1:                    raise badCPointer, "Use SequenceSet for multiple sequences."                self.cseq = sequenceInput            else:                    raise UnknownInputType, "inputType " + str(type(sequenceInput)) + " not recognized."                        else:            raise NoValidCDataType, "C data type " + str(self.emissionDomain.CDataType) + " invalid."    def __del__(self):        "Deallocation of C sequence struct."                #print "__del__ EmissionSequence " + str(self.cseq)                self.cleanFunction(self.cseq)        self.cseq = None    def __len__(self):        "Returns the length of the sequence."        return ghmmwrapper.get_arrayint(self.cseq.seq_len,0)	    def __setitem__(self, index, value):        internalValue = self.emissionDomain.internal(value)        self.setSymbol(self.cseq.seq,0,index,internalValue)		    def __getitem__(self, index):        """ Return the symbol at position 'index'. """        if index < len(self):            return self.getSymbol(self.cseq.seq, 0, index)                else:            raise IndexError            def getLabel(self):        if (self.emissionDomain.CDataType != "double"):            print "WARNING: Discrete sequences do not support sequence labels."        else:                 return ghmmwrapper.get_arrayl(self.cseq.seq_label,0)    def setLabel(self,value):        if (self.emissionDomain.CDataType != "double"):            print "WARNING: Discrete sequences do not support sequence labels."        else:                 ghmmwrapper.set_arrayl(self.cseq.seq_label,0,value)            def __str__(self):        "Defines string representation."        seq = self.cseq        strout = ""        strout += "\nEmissionSequence Instance:\nlength " + str(ghmmwrapper.get_arrayint(seq.seq_len,0))+ ", weight " + str(ghmmwrapper.get_arrayd(seq.seq_w,0))  + ":\n"        for j in range(ghmmwrapper.get_arrayint(seq.seq_len,0) ):            strout += str( self.emissionDomain.external(self[j]) )               if self.emissionDomain.CDataType == "double":                strout += " "                    # checking for labels         if self.emissionDomain.CDataType == "int" and self.cseq.state_labels != None:                        strout += "\nState labels:\n"            for j in range(ghmmwrapper.get_arrayint(seq.state_labels_len,0) ):                strout += str( ghmmwrapper.get_2d_arrayint(seq.state_labels,0,j)) + ", "                  	return strout		    def sequenceSet(self):        """ Return a one-element SequenceSet with this sequence."""        return SequenceSet(self.emissionDomain, self.cseq)            def write(self,fileName):        "Writes the EmissionSequence into file 'fileName'."                 # different function signatures require explicit check for C data type        if self.emissionDomain.CDataType == "int":            ghmmwrapper.call_sequence_print(fileName, self.cseq)        if self.emissionDomain.CDataType == "double":                ghmmwrapper.call_sequence_d_print(fileName, self.cseq,0)             def setWeight(self, value):        ghmmwrapper.set_arrayd(self.cseq.seq_w,0,value)        self.cseq.total_w = value            def getWeight(self):        return ghmmwrapper.get_arrayd(self.cseq.seq_w,0)class SequenceSet:    def __init__(self, emissionDomain, sequenceSetInput):        self.emissionDomain = emissionDomain         self.cseq = None                if self.emissionDomain.CDataType == "int": # underlying C data type is integer            # necessary C functions for accessing the sequence_t struct            self.sequenceAllocationFunction = ghmmwrapper.sequence_calloc            self.getPtr = ghmmwrapper.get_col_pointer_int # defines C function to be used to access a single sequence            self.castPtr = ghmmwrapper.cast_ptr_int # cast int* to int** pointer            self.emptySeq = ghmmwrapper.int_2d_array_nocols # allocate an empty int**            self.setSeq = ghmmwrapper.set_2d_arrayint_col # assign an int* to a position within an int**            self.cleanFunction = ghmmwrapper.sequence_clean            self.addSeqFunction = ghmmwrapper.sequence_add # add sequences to the underlying C struct            self.getSymbol = ghmmwrapper.get_2d_arrayint            self.setSymbolSingle = ghmmwrapper.set_arrayint                        if isinstance(sequenceSetInput, str): # from file                # reads in the first sequence struct in the input file                if  not path.exists(sequenceSetInput):                     raise IOError, 'File ' + str(sequenceSetInput) + ' not found.'                else:                     self.cseq  = ghmmwrapper.seq_read(sequenceSetInput)                           elif isinstance(sequenceSetInput, list):                 seq_nr = len(sequenceSetInput)                self.cseq = ghmmwrapper.sequence_calloc(seq_nr)                self.cseq.seq_number = seq_nr                                internalInput = []                for i in range(seq_nr):                    internalInput.append( map( self.emissionDomain.internal, sequenceSetInput[i]))                (seq,lenghts) = ghmmhelper.list2matrixint(internalInput)                self.cseq.seq = seq                for i in range(seq_nr):                    ghmmwrapper.set_arrayint(self.cseq.seq_len,i ,lenghts[i])            elif isinstance(sequenceSetInput, ghmmwrapper.sequence_t): # inputType == sequence_t*                self.cseq = sequenceSetInput                            else:                    raise UnknownInputType, "inputType " + str(type(sequenceSetInput)) + " not recognized."                self.__array = []                for index in range(len(self)):                oneseq = self.getPtr(self.cseq.seq,index)                self.__array.append(oneseq)                    elif self.emissionDomain.CDataType == "double": # underlying C data type is double            # necessary C functions for accessing the sequence_d_t struct            self.sequenceAllocationFunction =  ghmmwrapper.sequence_d_calloc            self.getPtr = ghmmwrapper.get_col_pointer_d # defines C function to be used to access a single sequence            self.castPtr = ghmmwrapper.cast_ptr_d # cast double* to double** pointer            self.emptySeq = ghmmwrapper.double_2d_array_nocols  # cast double* to int** pointer            self.setSeq = ghmmwrapper.set_2d_arrayd_col # assign a double* to a position within a double**            self.cleanFunction = ghmmwrapper.sequence_d_clean            self.addSeqFunction = ghmmwrapper.sequence_d_add # add sequences to the underlying C struct            self.getSymbol = ghmmwrapper.get_2d_arrayd            self.setSymbolSingle = ghmmwrapper.set_arrayd                                    if isinstance(sequenceSetInput, list):                 seq_nr = len(sequenceSetInput)                self.cseq = ghmmwrapper.sequence_d_calloc(seq_nr)                self.cseq.seq_number = seq_nr                (seq,lenghts) = ghmmhelper.list2matrixd(sequenceSetInput)                self.cseq.seq = seq                for i in range(seq_nr):                    ghmmwrapper.set_arrayint(self.cseq.seq_len, i, lenghts[i])            elif isinstance(sequenceSetInput, str): # from file                print "fromFile", sequenceSetInput                                                if  not path.exists(sequenceSetInput):                     raise IOError, 'File ' + str(sequenceSetInput) + ' not found.'                else:                    #self.cseq  = ghmmwrapper.seq_d_read(sequenceSetInput)                    i = ghmmwrapper.int_array(1)                    self.__s = ghmmwrapper.sequence_d_read(sequenceSetInput,i)                      self.cseq = ghmmwrapper.get_seq_d_ptr(self.__s,0)                                                                                                          elif isinstance(sequenceSetInput, ghmmwrapper.sequence_d_t): # i# inputType == sequence_d_t**, internal use                self.cseq = sequenceSetInput            else:                    raise UnknownInputType, "inputType " + str(type(sequenceSetInput)) + " not recognized."            self.__array = []            for index in range(len(self)):                oneset = self.getPtr(self.cseq.seq, index)                 self.__array.append(oneset)                                        else:            raise NoValidCDataType, "C data type " + str(self.emissionDomain.CDataType) + " invalid."    def __del__(self):        "Deallocation of C sequence struct."        

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -