📄 hmmer2modhmm.java
字号:
import java.util.*;import java.io.*;import java.text.*;class hmmer2modhmm{ /* Reads an HMM in hmmer-format and saves it in modhmm format * In parameters are 'infile' and 'outfile' */ public static void main(String[] args) { HmmerConverter hmmerConverter = null; if(args.length < 1 || args.length > 2) { System.out.println("java hmmer2modhmm <infile> [outdir]"); System.exit(0); } else if(args[0].equals("-h")){ System.out.println("java hmmer2modhmm <infile> [outdir]"); System.exit(0); } else { hmmerConverter = new HmmerConverter(args[0]); hmmerConverter.get(); } if(args.length == 1) { hmmerConverter.save("./"); } else { hmmerConverter.save(args[1] + "/"); } } }class HmmerConverter{ BufferedReader br = null; String filename = ""; final String[] aminoAcids = {"A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "Y", "W"}; final String[] nucleotides = {"A", "C", "G", "T"}; final int INTSCALE = 1000; final double START_TO_N_TERM_PROB = 99.0; /* this is not really a probability, but a score which will be weighted * towards a 1.0 score, so 99.0 means a probability of 0.99 */ final double MODEL_TO_C_TERM_PROB = 0.99; final double MODEL_TO_EXIT_PROB = 0.01; /* header info */ String name; String[] alphabet; int a_size; int length; double[] nullEmissProbs; //double[] nullTransProbs = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; double[] nullTransProbs = {0.997151, 0.997151, 1.0, 0.997151, 0.997151, 0.997151, 1.0, 1.0, 1.0}; /* model info */ double[][] matchEmissionProbs; double[][] insertEmissionProbs; double[][] transitionProbs; double startToMatch; double startToDelete; double startToNterm; double modelToExit; double modelToCterm; double NtermToModel; double NtermLoop; double CtermToExit; double CtermLoop; ModelMaker modelmaker; public HmmerConverter(String infile) { try { br = new BufferedReader(new FileReader(infile)); modelmaker = new ModelMaker(); } catch (IOException e) { System.out.println("Could not open " + infile); } filename = infile; } public void get() { readHeader(); readModel(); createModhmmModel(); setModhmmModelProbs(); } public void save(String outdir) { modelmaker.saveHMM(outdir); } private void createModhmmModel() { modelmaker.createHMM(name); modelmaker.setNrOfAlphabets(1); modelmaker.setAlphabet(alphabet); int res = modelmaker.createModule("s", HMM.STARTNODE, HMM.ZERO, 1, "0"); res = modelmaker.createModule("p1", HMM.PROFILE7, HMM.EVEN, length, "d", false); res = modelmaker.createModule("e", HMM.ENDNODE, HMM.ZERO, 1, "0"); modelmaker.setTransition("s", "p1"); modelmaker.initializeTransitionProbabilities("s"); modelmaker.setTransition("p1", "e"); modelmaker.initializeTransitionProbabilities("p1"); Module m = modelmaker.getModule("p1"); m.setDistribType(HMM.EVEN); } private void setModhmmModelProbs() { /* trans from start state */ int vNr = 1; Vertex v = ((Vertex)modelmaker.getVertex(vNr)); v.setTransitionProbability(2, startToNterm); v.setTransitionProbability(3, startToDelete); v.setTransitionProbability(4, startToMatch); int arrayIndex = 1; for(int i = 7; i < modelmaker.getNrVertices() - 4; i += 3) { v.setTransitionProbability(i, transitionProbs[arrayIndex][7]); arrayIndex++; } /* trans from N-term loop */ vNr = 2; v = ((Vertex)modelmaker.getVertex(vNr)); v.setTransitionProbability(2, NtermLoop); v.setTransitionProbability(1, NtermToModel); /* emissions in N-term loop */ v.setInitialEmissionProbs(nullEmissProbs); /* set model probs */ arrayIndex = 0; for(vNr = 3; vNr < modelmaker.getNrVertices() - 6; vNr++) { v = ((Vertex)modelmaker.getVertex(vNr)); if(vNr % 3 == 0) { v.setTransitionProbability(vNr + 3, transitionProbs[arrayIndex][6]); v.setTransitionProbability(vNr + 4, transitionProbs[arrayIndex][5]); } if(vNr % 3 == 1) { v.setInitialEmissionProbs(matchEmissionProbs[arrayIndex]); v.setTransitionProbability(vNr + 2, transitionProbs[arrayIndex][2]); v.setTransitionProbability(vNr + 3, transitionProbs[arrayIndex][0]); v.setTransitionProbability(vNr + 1, transitionProbs[arrayIndex][1]); v.setTransitionProbability(modelmaker.getNrVertices() - 4, transitionProbs[arrayIndex][8]); } if(vNr % 3 == 2) { v.setInitialEmissionProbs(insertEmissionProbs[arrayIndex]); v.setTransitionProbability(vNr, transitionProbs[arrayIndex][4]); v.setTransitionProbability(vNr + 2, transitionProbs[arrayIndex][3]); arrayIndex++; } } /* set emiss probsof last match state */ vNr = modelmaker.getNrVertices() - 5; v = ((Vertex)modelmaker.getVertex(vNr)); v.setInitialEmissionProbs(matchEmissionProbs[arrayIndex]); /* trans from end of model state */ vNr = modelmaker.getNrVertices() - 4; v = ((Vertex)modelmaker.getVertex(vNr)); v.setTransitionProbability(vNr + 1, modelToCterm); v.setTransitionProbability(vNr + 2, modelToExit); /* trans from C-term loop */ vNr = modelmaker.getNrVertices() - 3;; v = ((Vertex)modelmaker.getVertex(vNr)); v.setTransitionProbability(vNr, CtermLoop); v.setTransitionProbability(vNr - 1, CtermToExit); /* emissions in N-term loop */ v.setInitialEmissionProbs(nullEmissProbs); for(int i = 0; i < modelmaker.getNrVertices(); i++) { v = ((Vertex)modelmaker.getVertex(i)); v.normalizeEmissProbs(1); v.normalizeTransProbs(); } } private void readHeader() { try { while(true) { String s = br.readLine(); if(s.startsWith("NAME")) { StringTokenizer st = new StringTokenizer(s, " \t"); if(st.countTokens() != 2) { System.out.println("strange name line in input file"); System.exit(0); } else { st.nextToken(); name = st.nextToken(); } } else if(s.startsWith("ALPH")) { StringTokenizer st = new StringTokenizer(s, " \t"); if(st.countTokens() != 2) { System.out.println("strange alph line in input file"); System.exit(0); } else { st.nextToken(); String alph = st.nextToken(); if(alph.startsWith("Amino")) { alphabet = aminoAcids; a_size = 20; } else if(alph.startsWith("Nucleic")) { alphabet = nucleotides; a_size = 4; } else { System.out.println("Unknown alphabet: " + alph); System.exit(0); } } } else if(s.startsWith("LENG")) { StringTokenizer st = new StringTokenizer(s, " \t"); if(st.countTokens() != 2) { System.out.println("strange length line in input file"); System.exit(0); } else { st.nextToken(); try { length = Integer.parseInt(st.nextToken()); } catch(NumberFormatException e) { System.out.println("length line in input file has incorrect format"); System.exit(0); } } } else if(s.startsWith("XT")) { StringTokenizer st = new StringTokenizer(s, " \t"); if(st.countTokens() != 9) { System.out.println("Special transition row has wrong number of symbols"); System.exit(0); } else { st.nextToken(); for(int i = 0; i < 8;) { String probNrS= st.nextToken(); int probNr = 0; try { probNr = Integer.parseInt(probNrS); switch(i) { case 0: NtermToModel = Math.pow(2, ((double)probNr)/((double)INTSCALE)); i +=1; break; case 1: NtermLoop = Math.pow(2, ((double)probNr)/((double)INTSCALE)); i +=1; break; case 2: case 3: i += 1; break; case 4: CtermToExit = Math.pow(2, ((double)probNr)/((double)INTSCALE)); i += 1; break; case 5: CtermLoop = Math.pow(2, ((double)probNr)/((double)INTSCALE)); i += 1; break; default: i += 1; } } catch(NumberFormatException e) { if(probNrS.equals("*")) { switch(i) { case 0: NtermToModel = 0.0; i +=1; break; case 1: NtermLoop = 0.0; i +=1; break; case 2: case 3: i += 1; break; case 4: CtermToExit = 0.0; i += 1; break; case 5: CtermLoop = 0.0; i += 1; break; default: i += 1; } } else { System.out.println("Special transition line in input file has incorrect format"); System.exit(0); } } } } } else if(s.startsWith("NULE")) { StringTokenizer st = new StringTokenizer(s, " \t"); if(st.countTokens() != a_size + 1) { System.out.println("Null emission row has wrong number of emission prob symbols"); System.exit(0); } else { st.nextToken(); nullEmissProbs = new double[a_size]; double nullEmissProbsSum = 0.0; for(int i = 0; i < a_size; i++) { String probNrS = st.nextToken(); try { int probNr = Integer.parseInt(probNrS); nullEmissProbs[i] = 1.0 / ((double)a_size) * Math.pow(2, ((double)probNr)/((double)INTSCALE)); nullEmissProbsSum += nullEmissProbs[i]; } catch(NumberFormatException e) { if(probNrS.equals("*")) { nullEmissProbs[i] = 0.0; } else { System.out.println("null emission line in input file has incorrect format"); System.exit(0); } } } } } else if(s.startsWith("HMM") && !s.startsWith("HMMER")) { System.out.println("Read header OK"); break; } } } catch(IOException e) { System.out.println("Could not read header"); System.exit(0); } } private void readModel() { try { br.readLine(); String s = br.readLine(); /* read into model transition probs */ StringTokenizer st = new StringTokenizer(s, " \t"); if(st.countTokens() != 3) { System.out.println("Could not read into model probs"); System.exit(0); } else { String probNrS = st.nextToken(); try { int probNr = Integer.parseInt(probNrS); startToMatch = Math.pow(2.0, ((double)probNr)/((double)INTSCALE)); } catch(NumberFormatException e) { System.out.println("null emission line in input file has incorrect format"); System.exit(0); } probNrS = st.nextToken(); probNrS = st.nextToken(); try { int probNr = Integer.parseInt(probNrS); startToDelete = Math.pow(2.0, ((double)probNr)/((double)INTSCALE)); } catch(NumberFormatException e) { System.out.println("null emission line in input file has incorrect format"); System.exit(0); } /* set prob of going to insert state directly */ startToNterm = START_TO_N_TERM_PROB; modelToCterm = MODEL_TO_C_TERM_PROB; modelToExit = MODEL_TO_EXIT_PROB; /* read model transition and emission probs */ matchEmissionProbs = new double[length][a_size]; insertEmissionProbs = new double[length][a_size]; transitionProbs = new double[length][9]; for(int i = 0; i < length; i++) { /* read match emissions */ s = br.readLine(); readProbs(s, matchEmissionProbs[i], a_size, nullEmissProbs); /* read insert emissions */ s= br.readLine(); readProbs(s, insertEmissionProbs[i], a_size, nullEmissProbs); /* read transitions */ s= br.readLine(); readProbs(s, transitionProbs[i], 9, nullTransProbs); } } } catch(IOException e) { System.out.println("Could not read model"); System.exit(0); } } private void readProbs(String s, double[] probArray, int nrProbsToRead, double[] nullProbs) { StringTokenizer st = new StringTokenizer(s, " \t"); st.nextToken(); for(int i = 0; i < nrProbsToRead; i++) { String probNrS = st.nextToken(); if(probNrS.equals("*")) { probArray[i] = 0.0; } else { try { int probNr = Integer.parseInt(probNrS); //probArray[i] = Math.pow(2.0,((double)probNr)/((double)INTSCALE)); //probArray[i] = Math.pow(2.0,((double)probNr)/((double)INTSCALE)); probArray[i] = nullProbs[i] * Math.pow(2.0,((double)probNr)/((double)INTSCALE)); probArray[i] = nullProbs[i] * Math.pow(2.0,((double)probNr)/((double)INTSCALE)); } catch(NumberFormatException e) { System.out.println("model transition or emission line in input file has incorrect format"); System.exit(0); } } } }}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -