📄 modhmmer.py
字号:
#module that reads HMMER file and converts it to XML import sys,re,string,StringIOfrom xml.dom import minidomdef gotoLine(f,res): rematch = None r = " " while (rematch is None) and (r != ""): r = f.readline() rematch = res.search(r) if ( rematch == None ): # print "Fehler in gotoLine" pass else: return rematchdef build_matrix(n,m): "builds n x m matrix with lists" matrix = range(n) for i in range(n): matrix[i] = range(m) for j in range(m): matrix[i][j] = 0 return matrixdef sum_mrows(mat): "sums the rows of a matrix" mout = range(len(mat)) for i in range(len(mat)): s = 0 for j in range(len(mat[i])): s=s+mat[i][j] mout[i] = s return moutdef norm_mat(mat): #print mat for i in range(len(mat)): s = 0.0 #print "i: " + str(i) for j in range(len(mat[i])): s = s+mat[i][j] for j in range(len(mat[i])): #print "j: " + str(j) + ", "+ str(mat[i][j]) mat[i][j] = mat[i][j]/sdef red_mat_end(mat,r): "delete <r> rows and columns from the end of the matrix" for i in range(len(mat)-r): del mat[i][-1*r:] del mat[-1*r:]def del_mat(mat,r): "deletes the <r>th columns and row from the matrix" for i in range(len(mat)): del mat[i][r] del mat[r]def toint(i): "return integer if i is value or 0" try: iout = int(i) except ValueError: iout = 0 return ioutdef map_entries(dic,lis): "translates the letters to the number of the columns" dicout = {} for k in dic.keys(): dicout[k] = [] for i in range(len(dic[k])): dicout[k].append((lis.index(dic[k][i][0]),lis.index(dic[k][i][1]))) return dicoutdef xml_newdatanode(doc,nodename,attributename,attribute,text): "returns node with attribute and text" nod = doc.createElement(nodename) nod.setAttribute(attributename,attribute) nod.appendChild(doc.createTextNode(text)) return noddef write_file(strf,strcontent): "writes <strcontent> in file <strf>" try: f = open(strf,"w") except IOError,info: sys.stderr.write(str(info) + "\n") sys.exit(1) try: f.write(strcontent) finally: f.close()def remove_state(matrix, index): for i in range(len(matrix)): del(matrix[i][index]) del(matrix[index]) return matrixclass hmmer: "reads hmmer file and converts it to xml" #a few constants intscale = 1000.0 #hmmer format specific (but it has to be a float!!!) #positions of the entries lisHead = ["N","B","E","J","C","T","M","I","D"] lisMID = ["M","I","D"] #map table for HMMER file format dicTEntries = { "XT": [("N","B"),("N","N"),("E","C"),("E","J"), ("C","T"),("C","C"),("J","B"),("J","J")], "BD": [("B","D")], "HMM":[("M","M"),("M","I"),("M","D"),("I","M"), ("I","I"),("D","M"),("D","D"),("B","M"),("M","E")], } #dictionary for the letters dicLetters = { 4: ["A","C","G","T"], 20: ["A","C","D","E","F","G","H","I","K","L","M","N","P","Q", "R","S","T","V","W","Y"]} #map table translated with positions dicTE = map_entries(dicTEntries,lisHead) dicType = {"Amino":20,"Nucleotide":4} #coordinate offset for graphical presentation off_distx = 75 off_disty = 75 def __init__(self,strf): #print "start __init__" try: # f = StringIO.StringIO(strf) if isinstance(strf,str): f = open(strf,"r") else: f = strf except IOError,info: sys.stderr.write(str(info) + "\n") # sys.exit(1) try: r = f.readline() self.acc = gotoLine(f,re.compile("^NAME\s+([\w+\s*]+)" ) ).group(1) # NAME as unique model identifier # self.acc = gotoLine(f,re.compile("^ACC\s+(\w+)" ) ).group(1) # ACC as unique model identifier if self.acc[-1] == '\n': # remove newline at end of string self.acc = self.acc[:-1] #get number of match states n = int(gotoLine(f,re.compile("^LENG\s*(\d+)")).group(1)) self.n = n #get type of profile hmm: amino/nucleotide m = self.dicType[gotoLine(f,re.compile("^ALPH\s*(\S+)")).group(1)]; self.m = m #build matrix for transition: N B E J C T M1 I1 D1 M2 I2 D2 ... Mn In Dn self.matrans = build_matrix(3*n+6,3*n+6) #emission matrix: match state, insert state, null_model self.maems = [build_matrix(n,m),build_matrix(n,m),build_matrix(1,m)] #get line "XT" transitions trans = string.split(gotoLine(f,re.compile("^XT([\s\d\S]*)")).group(1)) self.set_matrix("XT",trans) #null model trans = string.split(gotoLine(f,re.compile("^NULT([\s\d\S]*)")).group(1)) self.manull = [self.H2P(trans[0],1.0),self.H2P(trans[1],1.0)] #[G->G,G->F] trans = string.split(gotoLine(f,re.compile("^NULE([\s\d\S]*)")).group(1)) for k in range(len(trans)): self.maems[2][0][k] = self.H2P(trans[k],1/float(m)) # *** Main section *** gotoLine(f,re.compile("^HMM")) f.readline() #print "33333" #get B->D transition probability self.set_matrix("BD",[string.split(f.readline())[2]]) #recurse over number of match states for i in range(n): for j in range(3): lis = string.split(f.readline()) del lis[0] if j==2: #state transition probs self.set_matrix("HMM",lis,i*3) else: #emmission probs: [match=0/insert=1][state][emission-letter] for k in range(self.m): #print "state: "+ str(i) +" symbol: " +str(k) +" - score = " + str(lis[k]) + ", NP = " + str(self.maems[2][0][k]) +" -> " + str(self.H2P(lis[k],self.maems[2][0][k])) self.maems[j][i][k] = self.H2P(lis[k],self.maems[2][0][k]) #print "null prob: " + str(self.maems[2][0][k]) #print "44444" del self.maems[1][-1] #delete last (not existing) insertion state self.matrans[self.lisHead.index("T")][self.lisHead.index("T")] = 1.0 #set end state as loop to prevent normalization issues del_mat(self.matrans,-2) self.matrans[-1][self.lisHead.index("E")] = 1.0 # set last D->E=1 finally: #pass f.close() #print "55555" #normalize matrices: for i in range(len(self.maems)):
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -