📄 ghmm.py
字号:
ids[s.index-1] = s.background # s.index ranges from [1, m.N] m.setBackground(bg, ids) log.debug( "model_type %x" % m.cmodel.model_type) log.debug("background_id" + str( ghmmhelper.int_array2list(m.cmodel.background_id, m.N))) else: m.cmodel.bp = None m.cmodel.background_id = None # check for tied states tied = hmm_dom.getTiedStates() if len(tied) > 0: m.setFlags(kTiedEmissions) m.cmodel.tied_to = ghmmhelper.list2int_array(tied) durations = hmm_dom.getStateDurations() if len(durations) == m.N: log.debug("durations: " + str(durations)) m.extendDurations(durations) return m def openSMO(self, fileName, modelIndex): # MO & SMO Files, format is deprecated # check if ghmm is build with smo support if not ghmmwrapper.SMO_FILE_SUPPORT: raise UnsupportedFeature ("smo files are deprecated. Please convert your files" "to the new xml-format or rebuild the GHMM with the" "conditional \"GHMM_OBSOLETE\".") (hmmClass, emission_domain, distribution) = self.determineHMMClass(fileName) log.debug("determineHMMClass = "+ str( (hmmClass, emission_domain, distribution))) nrModelPtr = ghmmwrapper.int_array_alloc(1) # XXX broken since silent states are not supported by .smo file format if hmmClass == DiscreteEmissionHMM: models = ghmmwrapper.ghmm_dmodel_read(fileName, nrModelPtr) getPtr = ghmmwrapper.dmodel_ptr_array_getitem base_model_type = ghmmwrapper.KDiscreteHMM else: models = ghmmwrapper.ghmm_cmodel_read(fileName, nrModelPtr) getPtr = ghmmwrapper.cmodel_ptr_array_getitem base_model_type = ghmmwrapper.kContinuousHMM nrModels = ghmmwrapper.int_array_getitem(nrModelPtr, 0) if modelIndex == None: result = [] for i in range(nrModels): cmodel = getPtr(models, i) cmodel.setModelTypeFlag(base_model_type) m = hmmClass(emission_domain, distribution(emission_domain), cmodel) result.append(m) else: if modelIndex < nrModels: cmodel = getPtr(models, modelIndex) cmodel.setModelTypeFlag(base_model_type) result = hmmClass(emission_domain, distribution(emission_domain), cmodel) else: result = None return result def openHMMER(self, fileName): # HMMER format models h = modhmmer.hmmer(fileName) if h.m == 4: # DNA model emission_domain = DNA elif h.m == 20: # Peptide model emission_domain = AminoAcids else: # some other model emission_domain = IntegerRange(0,h.m) distribution = DiscreteDistribution(emission_domain) # XXX TODO: Probably slow for large matrices (Rewrite for 0.9) [A,B,pi,modelName] = h.getGHMMmatrices() return HMMFromMatrices(emission_domain, distribution, A, B, pi, hmmName=modelName) def determineHMMClass(self, fileName): # # smo files. Obsolete # file = open(fileName,'r') hmmRe = re.compile("^HMM\s*=") shmmRe = re.compile("^SHMM\s*=") mvalueRe = re.compile("M\s*=\s*([0-9]+)") densityvalueRe = re.compile("density\s*=\s*([0-9]+)") cosvalueRe = re.compile("cos\s*=\s*([0-9]+)") emission_domain = None while 1: l = file.readline() if not l: break l = l.strip() if len(l) > 0 and l[0] != '#': # Not a comment line hmm = hmmRe.search(l) shmm = shmmRe.search(l) mvalue = mvalueRe.search(l) densityvalue = densityvalueRe.search(l) cosvalue = cosvalueRe.search(l) if hmm != None: if emission_domain != None and emission_domain != 'int': log.error( "HMMOpenFactory:determineHMMClass: both HMM and SHMM? " + str(emission_domain)) else: emission_domain = 'int' if shmm != None: if emission_domain != None and emission_domain != 'double': log.error( "HMMOpenFactory:determineHMMClass: both HMM and SHMM? " + str(emission_domain)) else: emission_domain = 'double' if mvalue != None: M = int(mvalue.group(1)) if densityvalue != None: density = int(densityvalue.group(1)) if cosvalue != None: cos = int(cosvalue.group(1)) file.close() if emission_domain == 'int': # only integer alphabet emission_domain = IntegerRange(0,M) distribution = DiscreteDistribution hmm_class = DiscreteEmissionHMM return (hmm_class, emission_domain, distribution) elif emission_domain == 'double': # M number of mixture components # density component type # cos number of state transition classes if M == 1 and density == 0: emission_domain = Float() distribution = GaussianDistribution hmm_class = GaussianEmissionHMM return (hmm_class, emission_domain, distribution) elif M > 1 and density == 0: emission_domain = Float() distribution = GaussianMixtureDistribution hmm_class = GaussianMixtureHMM return (hmm_class, emission_domain, distribution) else: raise TypeError, "Model type can not be determined." return (None, None, None)# the following three methods are deprecatedHMMOpenHMMER = HMMOpenFactory(GHMM_FILETYPE_HMMER) # read single HMMER model from fileHMMOpenSMO = HMMOpenFactory(GHMM_FILETYPE_SMO)HMMOpenXML = HMMOpenFactory(GHMM_FILETYPE_XML)# use only HMMOpen and specify the filetype if it can't guessed from the extensionHMMOpen = HMMOpenFactory()def readMultipleHMMERModels(fileName): """ Reads a file containing multiple HMMs in HMMER format, returns list of HMM objects. """ # XXX Integrate into HMMOpen, check for single hmm files if not os.path.exists(fileName): raise IOError, 'File ' + str(fileName) + ' not found.' modelList = [] string = "" f = open(fileName,"r") res = re.compile("^//") stat = re.compile("^ACC\s+(\w+)") for line in f.readlines(): string = string + line m = stat.match(line) if m: name = m.group(1) log.info( "Reading model " + str(name) + ".") match = res.match(line) if match: fileLike = StringIO.StringIO(string) modelList.append(HMMOpenHMMER(fileLike)) string = "" match = None return modelList class HMMFromMatricesFactory(HMMFactory): """ XXX Document matrix formats """ # XXX TODO: this should use the editing context def __call__(self, emissionDomain, distribution, A, B, pi, hmmName = None, labelDomain= None, labelList = None, densities = None): if isinstance(emissionDomain,Alphabet): if not emissionDomain == distribution.alphabet: raise TypeError, "emissionDomain and distribution must be compatible" # checking matrix dimensions and argument validation, only some obvious errors are checked if not len(A) == len(A[0]): raise InvalidModelParameters, "A is not quadratic." if not len(pi) == len(A): raise InvalidModelParameters, "Length of pi does not match length of A." if not len(A) == len(B): raise InvalidModelParameters, " Different number of entries in A and B." if (labelDomain is None and labelList is not None) or (labelList is None and labelList is not None): raise InvalidModelParameters, "Specify either both labelDomain and labelInput or neither." if isinstance(distribution,DiscreteDistribution): # HMM has discrete emissions over finite alphabet: DiscreteEmissionHMM cmodel = ghmmwrapper.ghmm_dmodel(len(A), len(emissionDomain)) # assign model identifier (if specified) if hmmName != None: cmodel.name = hmmName else: cmodel.name = '' states = ghmmwrapper.dstate_array_alloc(cmodel.N) silent_states = [] tmpOrder = [] #initialize states for i in range(cmodel.N): state = ghmmwrapper.dstate_array_getRef(states, i) # compute state order if cmodel.M > 1: order = math.log(len(B[i]), cmodel.M)-1 else: order = len(B[i]) - 1 log.debug( "order in state " + str(i) + " = " + str(order) ) # check or valid number of emission parameters order = int(order) if cmodel.M**(order+1) == len(B[i]): tmpOrder.append(order) else: raise InvalidModelParameters("The number of " + str(len(B[i])) + " emission parameters for state " + str(i) + " is invalid. State order can not be determined.") state.b = ghmmhelper.list2double_array(B[i]) state.pi = pi[i] if sum(B[i]) == 0.0: silent_states.append(1) else: silent_states.append(0) #set out probabilities state.out_states, state.out_id, state.out_a = ghmmhelper.extract_out(A[i]) #set "in" probabilities A_col_i = map(lambda x: x[i], A) # Numarray use A[,:i] state.in_states, state.in_id, state.in_a = ghmmhelper.extract_out(A_col_i) #fix probabilities in reestimation, else 0 state.fix = 0 cmodel.s = states if sum(silent_states) > 0: cmodel.model_type |= kSilentStates cmodel.silent = ghmmhelper.list2int_array(silent_states) cmodel.maxorder = max(tmpOrder) if cmodel.maxorder > 0: log.debug( "Set kHigherOrderEmissions.") cmodel.model_type |= kHigherOrderEmissions cmodel.order = ghmmhelper.list2int_array(tmpOrder) # initialize lookup table for powers of the alphabet size, # speeds up models with higher order states powLookUp = [1] * (cmodel.maxorder+2) for i in range(1,len(powLookUp)): powLookUp[i] = powLookUp[i-1] * cmodel.M cmodel.pow_lookup = ghmmhelper.list2int_array(powLookUp) # check for state labels if labelDomain is not None and labelList is not None: if not isinstance(labelDomain,LabelDomain): raise TypeError, "LabelDomain object required." cmodel.model_type |= kLabeledStates m = StateLabelHMM(emissionDomain, distribution, labelDomain, cmodel) m.setLabels(labelList) return m else: return DiscreteEmissionHMM(emissionDomain, distribution, cmodel) else: raise GHMMError(type(distribution), "Not a valid distribution for Alphabet") else: if isinstance(distribution,GaussianDistribution): # determining number of transition classes cos = ghmmhelper.classNumber(A) if cos == 1: A = [A] cmodel = ghmmwrapper.ghmm_cmodel(len(A[0]), 1, cos) log.debug("cmodel.cos = " + str(cmodel.cos)) states = ghmmwrapper.cstate_array_alloc(cmodel.N) # Switching functions and transition classes are handled # elswhere #initialize states for i in range(cmodel.N): state = ghmmwrapper.cstate_array_getRef(states, i) state.pi = pi[i] state.M = 1 # allocate arrays of emmission parameters state.c = ghmmhelper.list2double_array([1.0]) # Mixture weights. Unused (mu, sigma) = B[i] state.mue = ghmmhelper.list2double_array([mu]) #mu = mue in GHMM C-lib. state.u = ghmmhelper.list2double_array([sigma]) state.c = ghmmhelper.list2double_array([1.0]) state.a = ghmmhelper.list2double_array([0.0]) # mixture fixing deactivated by default state.mixture_fix = ghmmhelper.list2int_array([0]) # setting densities types (all normal by default) densities = ghmmwrapper.density_array_alloc(1) state.density = densities state.setDensity(0,0)
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -