modhmmer.py

来自「一个通用的隐性马尔可夫C代码库开发环境:C语言简要说明:这是一个通用的」· Python 代码 · 共 427 行 · 第 1/2 页

427 行

#module that reads HMMER file and converts it to XML import sys,re,string,StringIOfrom xml.dom import minidomdef gotoLine(f,res):    rematch = None    r = " "    while (rematch is None) and (r != ""):               r = f.readline()        rematch = res.search(r)    if ( rematch == None ):        # print "Fehler in gotoLine"        pass    else:        return rematchdef build_matrix(n,m):    "builds n x m matrix with lists"    matrix = range(n)    for i in range(n):        matrix[i] = range(m)        for j in range(m):            matrix[i][j] = 0    return matrixdef sum_mrows(mat):    "sums the rows of a matrix"    mout = range(len(mat))    for i in range(len(mat)):        s = 0        for j in range(len(mat[i])):            s=s+mat[i][j]        mout[i] = s    return moutdef norm_mat(mat):    #print mat    for i in range(len(mat)):        s = 0.0        #print "i: " + str(i)        for j in range(len(mat[i])):            s = s+mat[i][j]        for j in range(len(mat[i])):            #print "j: " + str(j) + ",  "+ str(mat[i][j])            mat[i][j] = mat[i][j]/sdef red_mat_end(mat,r):    "delete <r> rows and columns from the end of the matrix"    for i in range(len(mat)-r):        del mat[i][-1*r:]    del mat[-1*r:]def del_mat(mat,r):    "deletes the <r>th columns and row from the matrix"    for i in range(len(mat)):        del mat[i][r]    del mat[r]def toint(i):    "return integer if i is value or 0"    try:        iout = int(i)    except ValueError:        iout = 0    return ioutdef map_entries(dic,lis):    "translates the letters to the number of the columns"    dicout = {}    for k in dic.keys():        dicout[k] = []        for i in range(len(dic[k])):            dicout[k].append((lis.index(dic[k][i][0]),lis.index(dic[k][i][1])))    return dicoutdef xml_newdatanode(doc,nodename,attributename,attribute,text):    "returns node with attribute and text"    nod = doc.createElement(nodename)    nod.setAttribute(attributename,attribute)    nod.appendChild(doc.createTextNode(text))    return noddef write_file(strf,strcontent):    "writes <strcontent> in file <strf>"    try:        f = open(strf,"w")    except IOError,info:        sys.stderr.write(str(info) + "\n")        sys.exit(1)    try:        f.write(strcontent)    finally:        f.close()def remove_state(matrix, index):	for i in range(len(matrix)):		del(matrix[i][index])	del(matrix[index])		return matrixclass hmmer:    "reads hmmer file and converts it to xml"    #a few constants    intscale = 1000.0 #hmmer format specific (but it has to be a float!!!)        #positions of the entries    lisHead = ["N","B","E","J","C","T","M","I","D"]    lisMID = ["M","I","D"]    #map table for HMMER file format    dicTEntries = { "XT": [("N","B"),("N","N"),("E","C"),("E","J"),                           ("C","T"),("C","C"),("J","B"),("J","J")],                    "BD": [("B","D")],                    "HMM":[("M","M"),("M","I"),("M","D"),("I","M"),                            ("I","I"),("D","M"),("D","D"),("B","M"),("M","E")],                    }    #dictionary for the letters    dicLetters = { 4:  ["A","C","G","T"],                   20: ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q",                        "R","S","T","V","W","Y"]}        #map table translated with positions    dicTE = map_entries(dicTEntries,lisHead)    dicType = {"Amino":20,"Nucleotide":4}    #coordinate offset for graphical presentation    off_distx = 75    off_disty = 75    def __init__(self,strf):        #print "start __init__"                try:		    # f = StringIO.StringIO(strf)            if isinstance(strf,str):                 f = open(strf,"r")            else:                f = strf            except IOError,info:                 sys.stderr.write(str(info) + "\n")            #     sys.exit(1)                try:            r = f.readline()                        self.acc = gotoLine(f,re.compile("^NAME\s+([\w+\s*]+)" ) ).group(1)  # NAME as unique model identifier            # self.acc  = gotoLine(f,re.compile("^ACC\s+(\w+)" ) ).group(1)      # ACC as unique model identifier                        if self.acc[-1] == '\n':  # remove newline at end of string                self.acc = self.acc[:-1]                    	            #get number of match states            n = int(gotoLine(f,re.compile("^LENG\s*(\d+)")).group(1))            self.n = n            #get type of profile hmm: amino/nucleotide            m = self.dicType[gotoLine(f,re.compile("^ALPH\s*(\S+)")).group(1)];            self.m = m            #build matrix for transition: N B E J C T M1 I1 D1 M2 I2 D2 ... Mn In Dn            self.matrans = build_matrix(3*n+6,3*n+6)            			#emission matrix: match state, insert state, null_model            self.maems = [build_matrix(n,m),build_matrix(n,m),build_matrix(1,m)]			#get line "XT" transitions            trans = string.split(gotoLine(f,re.compile("^XT([\s\d\S]*)")).group(1))            self.set_matrix("XT",trans)            #null model            trans = string.split(gotoLine(f,re.compile("^NULT([\s\d\S]*)")).group(1))            self.manull = [self.H2P(trans[0],1.0),self.H2P(trans[1],1.0)] #[G->G,G->F]            trans = string.split(gotoLine(f,re.compile("^NULE([\s\d\S]*)")).group(1))            for k in range(len(trans)):                self.maems[2][0][k] = self.H2P(trans[k],1/float(m))                         # ***  Main section ***            gotoLine(f,re.compile("^HMM"))            f.readline()            #print "33333"            #get B->D transition probability            self.set_matrix("BD",[string.split(f.readline())[2]])            #recurse over number of match states            for i in range(n):                for j in range(3):                    lis = string.split(f.readline())                    del lis[0]                    if j==2:                        #state transition probs                        self.set_matrix("HMM",lis,i*3)                    else:                        #emmission probs: [match=0/insert=1][state][emission-letter]                        for k in range(self.m):                            #print "state: "+ str(i) +" symbol: " +str(k) +" - score = " + str(lis[k]) + ", NP = " + str(self.maems[2][0][k]) +" -> " + str(self.H2P(lis[k],self.maems[2][0][k]))							                            self.maems[j][i][k] = self.H2P(lis[k],self.maems[2][0][k])                            #print "null prob: " + str(self.maems[2][0][k])            #print "44444"            del self.maems[1][-1] #delete last (not existing) insertion state            self.matrans[self.lisHead.index("T")][self.lisHead.index("T")] = 1.0 #set end state as loop to prevent normalization issues            del_mat(self.matrans,-2)            self.matrans[-1][self.lisHead.index("E")] = 1.0 # set last D->E=1        finally:            #pass            f.close()		        #print "55555"	        #normalize matrices:        for i in range(len(self.maems)):

modhmmer.py - 源码说明

本页面展示了「一个通用的隐性马尔可夫C代码库开发环境:C语言简要说明:这是一个通用的隐性马尔可夫C代码库」中的 modhmmer.py 源码文件，采用 Python 编程语言编写，共 427 行代码。您可以在线阅读完整代码内容，也可以返回资源详情页下载完整源码包进行本地学习和开发。

虫虫下载站收录了大量与马尔可夫相关的技术资源，包括源代码、技术文档、电路图等，是电子工程师和嵌入式开发者的专业学习平台。

⌨️ 快捷键说明

复制代码Ctrl + C

搜索代码Ctrl + F

全屏模式F11

增大字号Ctrl + =

减小字号Ctrl + -

显示快捷键?