📄 ghmm.py
字号:
#-------------------------------------------------------------------------------#- Distribution and derived ---------------------------------------------------class Distribution: """ Abstract base class for distribution over EmissionDomains """ # From Spass (S. Rahmann): # add density, mass, cumuliative dist, quantils, sample, fit pars, # momentsclass DiscreteDistribution(Distribution): """ A DiscreteDistribution over an Alphabet: The discrete distribution is parameterized by the vectors of probabilities. """ def __init__(self, alphabet): self.alphabet = alphabet self.prob_vector = None def set(self, prob_vector): self.prob_vector = prob_vector def get(self): return self.prob_vector class ContinousDistribution(Distribution): passclass GaussianDistribution(ContinousDistribution): # XXX attributes unused def __init__(self, domain): self.emissionDomain = domain self.mu = None self.sigma = None def set(self, (mu, sigma)): self.mu = mu self.sigma = sigma def get(self): return (self.mu, self.sigma2) class MixtureContinousDistribution(ContinousDistribution): passclass GaussianMixtureDistribution(MixtureContinousDistribution): def __init__(self, domain): self.emissionDomain = domain self.M = None # number of mixture components self.mu = [] self.sigma = [] self.weight = [] def set(self, index, (mu, sigma,w)): pass # assert M > index # self.mu = mu # self.sigma = sigma def get(self): pass#-------------------------------------------------------------------------------#Sequence, SequenceSet and derived ------------------------------------------# XXX write to FASTA function# XXX check for seq_label existanceclass EmissionSequence: """ An EmissionSequence contains the *internal* representation of a sequence of emissions. It also contains a reference to the domain where the emission orginated from. """ def __init__(self, emissionDomain, sequenceInput): self.emissionDomain = emissionDomain if self.emissionDomain.CDataType == "int": # underlying C data type is integer # necessary C functions for accessing the sequence_t struct self.getPtr = ghmmwrapper.get_col_pointer_int # defines C function to be used to access a single sequence self.getSymbol = ghmmwrapper.get_2d_arrayint self.setSymbol = ghmmwrapper.set_2d_arrayint self.cleanFunction = ghmmwrapper.sequence_clean if isinstance(sequenceInput, list): # from list internalInput = map( self.emissionDomain.internal, sequenceInput) (seq,l) = ghmmhelper.list2matrixint([internalInput]) self.cseq = ghmmwrapper.sequence_calloc(1) self.cseq.seq = seq self.cseq.seq_number = 1 # deactivating labels self.cseq.state_labels = None self.cseq.state_labels_len = None ghmmwrapper.set_arrayint(self.cseq.seq_len,0,l[0]) elif isinstance(sequenceInput, str): # from file # reads in the first sequence struct in the input file if not path.exists(sequenceInput): raise IOError, 'File ' + str(sequenceInput) + ' not found.' else: self.cseq = ghmmwrapper.seq_read(sequenceInput) elif isinstance(sequenceInput, ghmmwrapper.sequence_t):# internal use if sequenceInput.seq_number > 1: raise badCPointer, "Use SequenceSet for multiple sequences." self.cseq = sequenceInput else: raise UnknownInputType, "inputType " + str(type(sequenceInput)) + " not recognized." elif self.emissionDomain.CDataType == "double": # underlying C data type is double # necessary C functions for accessing the sequence_d_t struct self.getPtr = ghmmwrapper.get_col_pointer_d # defines C function to be used to access a single sequence self.getSymbol = ghmmwrapper.get_2d_arrayd self.setSymbol = ghmmwrapper.set_2d_arrayd self.cleanFunction = ghmmwrapper.sequence_d_clean if isinstance(sequenceInput, list): (seq,l) = ghmmhelper. list2matrixd([sequenceInput]) self.cseq = ghmmwrapper.sequence_d_calloc(1) self.cseq.seq = seq self.cseq.seq_number = 1 ghmmwrapper.set_arrayint(self.cseq.seq_len,0,l[0]) elif isinstance(sequenceInput, str): # from file # reads in the first sequence struct in the input file if not path.exists(sequenceInput): raise IOError, 'File ' + str(sequenceInput) + ' not found.' else: self.cseq = ghmmwrapper.seq_d_read(sequenceSetInput) elif isinstance(sequenceInput, ghmmwrapper.sequence_d_t): # internal use if sequenceInput.seq_number > 1: raise badCPointer, "Use SequenceSet for multiple sequences." self.cseq = sequenceInput else: raise UnknownInputType, "inputType " + str(type(sequenceInput)) + " not recognized." else: raise NoValidCDataType, "C data type " + str(self.emissionDomain.CDataType) + " invalid." def __del__(self): "Deallocation of C sequence struct." #print "__del__ EmissionSequence " + str(self.cseq) self.cleanFunction(self.cseq) self.cseq = None def __len__(self): "Returns the length of the sequence." return ghmmwrapper.get_arrayint(self.cseq.seq_len,0) def __setitem__(self, index, value): internalValue = self.emissionDomain.internal(value) self.setSymbol(self.cseq.seq,0,index,internalValue) def __getitem__(self, index): """ Return the symbol at position 'index'. """ if index < len(self): return self.getSymbol(self.cseq.seq, 0, index) else: raise IndexError def getLabel(self): if (self.emissionDomain.CDataType != "double"): print "WARNING: Discrete sequences do not support sequence labels." else: return ghmmwrapper.get_arrayl(self.cseq.seq_label,0) def setLabel(self,value): if (self.emissionDomain.CDataType != "double"): print "WARNING: Discrete sequences do not support sequence labels." else: ghmmwrapper.set_arrayl(self.cseq.seq_label,0,value) def __str__(self): "Defines string representation." seq = self.cseq strout = "" strout += "\nEmissionSequence Instance:\nlength " + str(ghmmwrapper.get_arrayint(seq.seq_len,0))+ ", weight " + str(ghmmwrapper.get_arrayd(seq.seq_w,0)) + ":\n" for j in range(ghmmwrapper.get_arrayint(seq.seq_len,0) ): strout += str( self.emissionDomain.external(self[j]) ) if self.emissionDomain.CDataType == "double": strout += " " # checking for labels if self.emissionDomain.CDataType == "int" and self.cseq.state_labels != None: strout += "\nState labels:\n" for j in range(ghmmwrapper.get_arrayint(seq.state_labels_len,0) ): strout += str( ghmmwrapper.get_2d_arrayint(seq.state_labels,0,j)) + ", " return strout def sequenceSet(self): """ Return a one-element SequenceSet with this sequence.""" return SequenceSet(self.emissionDomain, self.cseq) def write(self,fileName): "Writes the EmissionSequence into file 'fileName'." # different function signatures require explicit check for C data type if self.emissionDomain.CDataType == "int": ghmmwrapper.call_sequence_print(fileName, self.cseq) if self.emissionDomain.CDataType == "double": ghmmwrapper.call_sequence_d_print(fileName, self.cseq,0) def setWeight(self, value): ghmmwrapper.set_arrayd(self.cseq.seq_w,0,value) self.cseq.total_w = value def getWeight(self): return ghmmwrapper.get_arrayd(self.cseq.seq_w,0)class SequenceSet: def __init__(self, emissionDomain, sequenceSetInput): self.emissionDomain = emissionDomain self.cseq = None if self.emissionDomain.CDataType == "int": # underlying C data type is integer # necessary C functions for accessing the sequence_t struct self.sequenceAllocationFunction = ghmmwrapper.sequence_calloc self.getPtr = ghmmwrapper.get_col_pointer_int # defines C function to be used to access a single sequence self.castPtr = ghmmwrapper.cast_ptr_int # cast int* to int** pointer self.emptySeq = ghmmwrapper.int_2d_array_nocols # allocate an empty int** self.setSeq = ghmmwrapper.set_2d_arrayint_col # assign an int* to a position within an int** self.cleanFunction = ghmmwrapper.sequence_clean self.addSeqFunction = ghmmwrapper.sequence_add # add sequences to the underlying C struct self.getSymbol = ghmmwrapper.get_2d_arrayint self.setSymbolSingle = ghmmwrapper.set_arrayint if isinstance(sequenceSetInput, str): # from file # reads in the first sequence struct in the input file if not path.exists(sequenceSetInput): raise IOError, 'File ' + str(sequenceSetInput) + ' not found.' else: self.cseq = ghmmwrapper.seq_read(sequenceSetInput) elif isinstance(sequenceSetInput, list): seq_nr = len(sequenceSetInput) self.cseq = ghmmwrapper.sequence_calloc(seq_nr) self.cseq.seq_number = seq_nr internalInput = [] for i in range(seq_nr): internalInput.append( map( self.emissionDomain.internal, sequenceSetInput[i])) (seq,lenghts) = ghmmhelper.list2matrixint(internalInput) self.cseq.seq = seq for i in range(seq_nr): ghmmwrapper.set_arrayint(self.cseq.seq_len,i ,lenghts[i]) elif isinstance(sequenceSetInput, ghmmwrapper.sequence_t): # inputType == sequence_t* self.cseq = sequenceSetInput else: raise UnknownInputType, "inputType " + str(type(sequenceSetInput)) + " not recognized." self.__array = [] for index in range(len(self)): oneseq = self.getPtr(self.cseq.seq,index) self.__array.append(oneseq) elif self.emissionDomain.CDataType == "double": # underlying C data type is double # necessary C functions for accessing the sequence_d_t struct self.sequenceAllocationFunction = ghmmwrapper.sequence_d_calloc self.getPtr = ghmmwrapper.get_col_pointer_d # defines C function to be used to access a single sequence self.castPtr = ghmmwrapper.cast_ptr_d # cast double* to double** pointer self.emptySeq = ghmmwrapper.double_2d_array_nocols # cast double* to int** pointer self.setSeq = ghmmwrapper.set_2d_arrayd_col # assign a double* to a position within a double** self.cleanFunction = ghmmwrapper.sequence_d_clean self.addSeqFunction = ghmmwrapper.sequence_d_add # add sequences to the underlying C struct self.getSymbol = ghmmwrapper.get_2d_arrayd self.setSymbolSingle = ghmmwrapper.set_arrayd if isinstance(sequenceSetInput, list): seq_nr = len(sequenceSetInput) self.cseq = ghmmwrapper.sequence_d_calloc(seq_nr) self.cseq.seq_number = seq_nr (seq,lenghts) = ghmmhelper.list2matrixd(sequenceSetInput) self.cseq.seq = seq for i in range(seq_nr): ghmmwrapper.set_arrayint(self.cseq.seq_len, i, lenghts[i]) elif isinstance(sequenceSetInput, str): # from file print "fromFile", sequenceSetInput if not path.exists(sequenceSetInput): raise IOError, 'File ' + str(sequenceSetInput) + ' not found.' else: #self.cseq = ghmmwrapper.seq_d_read(sequenceSetInput) i = ghmmwrapper.int_array(1) self.__s = ghmmwrapper.sequence_d_read(sequenceSetInput,i) self.cseq = ghmmwrapper.get_seq_d_ptr(self.__s,0) elif isinstance(sequenceSetInput, ghmmwrapper.sequence_d_t): # i# inputType == sequence_d_t**, internal use self.cseq = sequenceSetInput else: raise UnknownInputType, "inputType " + str(type(sequenceSetInput)) + " not recognized." self.__array = [] for index in range(len(self)): oneset = self.getPtr(self.cseq.seq, index) self.__array.append(oneset) else: raise NoValidCDataType, "C data type " + str(self.emissionDomain.CDataType) + " invalid." def __del__(self): "Deallocation of C sequence struct."
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -