make_logodds
来自「EM算法的改进」· 代码 · 共 231 行
TXT
231 行
#!/bin/csh -f### $Id: make_logodds.txt 1339 2006-09-21 19:46:28Z tbailey $## $Log$## Revision 1.3 2006/03/08 20:50:11 nadya## merge chamges from v3_5_2 branch#### Revision 1.2.4.1 2006/01/31 20:50:43 nadya## add '-f' flag to prevent sourcing init files## was causing problem on cygwin#### Revision 1.2 2005/10/04 17:46:58 nadya## use full path for "rm". Some systems set rm alias that ask for confirmation.#### Revision 1.1.1.1 2005/07/28 23:55:47 nadya## Importing from meme-3.0.14, and adding configure/make###set perl = /usr/bin/perl# tlb 3-28-00; add support of new GCG profile format; change count to 100000if ($#argv < 2) then cat << "USAGE" USAGE: make_logodds <mfile> <logodds> [-c count] [-D dir] <mfile> name of file containing motifs <logodds> name of file for output [-c count] output only the first -c motifs Read the log-odds matrices from a meme, blockmaker or GCG profile file. Profile files may be concatenated together. The file output has format: [<w> <alength> <evalue> [<pair>]\n <column 1 of motif log-odds matrix> <column 2 of motif log-odds matrix> ... <column w of motif log-odds matrix> ]+ Prints: <alphabet> [<sequence dataset>]"USAGE" exit 1endif# get inputset mfile = $1shiftset logodds = $1shiftecho -n "" >! $logoddsif ($logodds != "/dev/null") then chmod 744 $logoddsendifset count = 100000while ("$1" != "") switch ($1) case -c: shift set count = $1 breaksw endsw shiftend# make awk script to get stuff from mfileset awk = make_logodds.awk.$$.tmponintr cleanupcat << "END" > $awk NR == 1 { # don't use BEGIN due to bug in awk count=count + 0; # make numeric width=0; # default value alpha = ""; # default value evalue=0; # default value pair=0; # default value old_alpha = ""; dataset = ""; # default value nmatrices=0; # number of matrices read } $1 == "DATAFILE=" {dataset = $2;} $1 == "ALPHABET=" {alpha = $2;} $1 == "data:" {n = $3; N = $5;} $1 == "(best)" && $4 == "e_ll" { evalue = $9; } # meme format $1 == "log-odds" && $2 == "matrix:" { if (nmatrices++ >= count) next; # skip this matrix # parse the log-odds matrix line for (i=3; i<=NF; i++) { if ($i == "alength=") { i++; alength = $i + 0; # add 0 in case garbage } else if ($i == "w=") { i++; w = $i + 0; } else if ($i == "E=") { i++; evalue = $i + 0; } else if ($i == "pair") { pair = 1; } } # parse header line # print the header line for the matrix print w, alength, evalue, pair > logodds # copy the matrix to the logodds file; allow for rows to be contain newlines tot = 0; # total fields read while (tot < w * alength) { # read lines until alength fields read if (!(getline)) { # read next line; extra parens for some # versions of awk/gawk (eg, gnu) printf \ "End of file reached while reading score matrix %s.\n", nmatrices; status = 1; exit; } #print "tot = ", tot, "NF = ", NF; for (i=0; i<NF; i++) lo[tot++] = $(i+1); # store fields in matrix if (tot > w * alength) { printf \ "Number of entries not correct for matrix %s.\n", nmatrices; status = 1; exit; } } k = 0; for (i=0; i<w; i++) { # row of logodds matrix for (j=0; j<alength; j++) { # column of logodds matrix printf "%s ", lo[k] > logodds; k++; } printf "\n" > logodds; } } # new profile format $1 == "Cons" && $(NF-1) == "Gap" { if (nmatrices++ >= count) next; # skip this matrix alength = NF - 3; # length of alphabet # get the alphabet from the first line alpha = ""; for (i=2; i<NF-1; i++) alpha = alpha $(i); # check that alphabet hasn't changed if (old_alpha != "" && alpha != old_alpha) { print "All profiles must use exactly the same alphabet." print "The first " nmatrices-1 " motif(s) use " old_alpha; print " but motif " nmatrices " uses " alpha "."; status = 1; exit; } old_alpha = alpha; # save alphabet # read the matrix for (line = 0; getline && $1 != "*"; line++) { # skip comments and blank lines and "}" first = substr($1,1,1); if (first == "!" || first == "}" || NF == 0 ) {line--; continue;} if (NF != alength+3) { printf \ "Alphabet length does not match number of scores in matrix %s.\n", \ nmatrices; status = 1; exit; } matrix[line] = ""; # save line skipping consensus letter for (i=0; i<alength; i++) matrix[line] = matrix[line] " " $(i+2); } w = line; # width of motif # print the header line for the matrix print w, alength, evalue, pair > logodds # copy the matrix to the logodds file for (i=0; i<w; i++) { print matrix[i] > logodds } } # old profile format $1 == "Cons" && $NF == ".." { if (nmatrices++ >= count) next; # skip this matrix alength = NF - 4; # length of alphabet # get the alphabet from the first line alpha = ""; for (i=0; i<alength; i++) alpha = alpha $(i+2); # check that alphabet hasn't changed if (old_alpha != "" && alpha != old_alpha) { print "All profiles must use exactly the same alphabet." print "The first " nmatrices-1 " motif(s) use " old_alpha; print " but motif " nmatrices " uses " alpha "."; status = 1; exit; } old_alpha = alpha; # save alphabet # read the matrix for (line = 0; getline && $1 != "*"; line++) { # skip comments and blank lines if (substr($1,1,1) == "!" || NF == 0) {line--; continue;} if (NF != alength+3) { printf \ "Alphabet length does not match number of scores in matrix %s.\n", \ nmatrices; status = 1; exit; } matrix[line] = ""; # save line skipping consensus letter for (i=0; i<alength; i++) matrix[line] = matrix[line] " " $(i+2); } w = line; # width of motif # print the header line for the matrix print w, alength, evalue, pair > logodds # copy the matrix to the logodds file for (i=0; i<w; i++) { print matrix[i] > logodds } } END { if (status) exit status; if (alpha == "") { print "Could not find the motif alphabet."; print "The motif file must contain a line starting with \"ALPHABET= \""; print "followed by 'alphabet', a list containing the letters used in"; print "the motifs. The order of the letters in 'alphabet' must be the"; print "same as the order of the columns of scores in the motifs."; exit 1; } print alpha, dataset; exit 0; }"END"# get the logodds matrices and stuff$perl -ne 's/\r$//; print;' $mfile | awk -f $awk logodds=$logodds count=$countset sav_status = $statuscleanup:/bin/rm -f make_logodds.*.$$.tmpexit $sav_status
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?