📄 hmmio.c
字号:
/************************************************************ * HMMER - Biological sequence analysis with profile HMMs * Copyright (C) 1992-1999 Washington University School of Medicine * All Rights Reserved * * This source code is distributed under the terms of the * GNU General Public License. See the files COPYING and LICENSE * for details. ************************************************************//* hmmio.c * * Input/output of HMMs. * * As of HMMER 2.0, HMMs are saved by default in a tabular ASCII format * as log-odds or log probabilities scaled to an integer. A binary save * file format is also available which is faster to access (a * consideration which might be important for HMM library applications). * HMMs can be concatenated into HMM libraries. * * A comment on loss of accuracy. Storing a number as a scaled log * probability guarantees us an error of about 0.035% or * less in the retrieved probability. We are relatively invulnerable * to the truncation errors which HMMER 1.8 was vulnerable to. * * Magic numbers (both for the ASCII and binary save formats) are used * to label save files with a major version number. This simplifies the task of * backwards compatibility as new versions of the program are created. * Reverse but not forward compatibility is guaranteed. I.e. HMMER 2.0 * can read `1.7' save files, but not vice versa. Note that the major * version number in the save files is NOT the version of the software * that generated it; rather, the number of the last major version in which * save format changed. * ****************************************************************** * * The HMM input API: * * HMMFILE *hmmfp; * char *hmmfile; * struct plan7_s *hmm; * char env[] = "HMMERDB"; (a la BLASTDB) * * hmmfp = HMMFileOpen(hmmfile, env) NULL on failure * while (HMMFileRead(hmmfp, &hmm)) 0 if no more HMMs * if (hmm == NULL) Die(); NULL on file parse failure * whatever; * FreeHMM(hmm); * } * HMMFileClose(hmmfp); * ***************************************************************** * * The HMM output API: * * FILE *ofp; * struct plan7_s *hmm; * * WriteAscHMM(ofp, hmm); to write/append an HMM to open file * or WriteBinHMM(ofp, hmm); to write/append binary format HMM to open file * ***************************************************************** * * V1.0: original implementation * V1.1: regularizers removed from model structure * V1.7: ref and cs annotation lines added from alignment, one * char per match state 1..M * V1.9: null model and name added to HMM structure. ASCII format changed * to compact tabular one. * V2.0: Plan7. Essentially complete rewrite. */#include <stdio.h>#include <stdlib.h>#include <string.h>#include <ctype.h>#include <time.h>#include <unistd.h> /* to get SEEK_CUR definition on silly Suns */#include "squid.h"#include "config.h"#include "structs.h"#include "funcs.h"#include "version.h"#include "ssi.h"/* Magic numbers identifying binary formats. * Do not change the old magics! Necessary for backwards compatibility. */static unsigned int v10magic = 0xe8ededb1; /* v1.0 binary: "hmm1" + 0x80808080 */static unsigned int v10swap = 0xb1edede8; /* byteswapped v1.0 */static unsigned int v11magic = 0xe8ededb2; /* v1.1 binary: "hmm2" + 0x80808080 */static unsigned int v11swap = 0xb2edede8; /* byteswapped v1.1 */static unsigned int v17magic = 0xe8ededb3; /* v1.7 binary: "hmm3" + 0x80808080 */static unsigned int v17swap = 0xb3edede8; /* byteswapped v1.7 */static unsigned int v19magic = 0xe8ededb4; /* V1.9 binary: "hmm4" + 0x80808080 */static unsigned int v19swap = 0xb4edede8; /* V1.9 binary, byteswapped */ static unsigned int v20magic = 0xe8ededb5; /* V2.0 binary: "hmm5" + 0x80808080 */static unsigned int v20swap = 0xb5edede8; /* V2.0 binary, byteswapped *//* Old HMMER 1.x file formats. */#define HMMER1_0B 1 /* binary HMMER 1.0 */#define HMMER1_0F 2 /* flat ascii HMMER 1.0 */#define HMMER1_1B 3 /* binary HMMER 1.1 */#define HMMER1_1F 4 /* flat ascii HMMER 1.1 */#define HMMER1_7B 5 /* binary HMMER 1.7 */#define HMMER1_7F 6 /* flat ascii HMMER 1.7 */#define HMMER1_9B 7 /* HMMER 1.9 binary */#define HMMER1_9F 8 /* HMMER 1.9 flat ascii */static int read_asc20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_bin20hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_asc19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_bin19hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_asc17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_bin17hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_asc11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_bin11hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_asc10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static int read_bin10hmm(HMMFILE *hmmfp, struct plan7_s **ret_hmm);static void byteswap(char *swap, int nbytes);static char *prob2ascii(float p, float null);static float ascii2prob(char *s, float null);static void write_bin_string(FILE *fp, char *s);static int read_bin_string(FILE *fp, int doswap, char **ret_s);static void multiline(FILE *fp, char *pfx, char *s);static struct plan9_s *read_plan9_binhmm(FILE *fp, int version, int swapped);static struct plan9_s *read_plan9_aschmm(FILE *fp, int version);/***************************************************************** * HMM input API functions: * HMMFileOpen() * HMMFileRead() * HMMFileClose() * HMMFileRewind() *****************************************************************/ /* Function: HMMFileOpen() * * Purpose: Open an HMM file for reading. The file may be either * an index for a library of HMMs, or an HMM. * * Args: hmmfile - name of file * env - NULL, or environment variable for HMM database. * * Return: Valid HMMFILE *, or NULL on failure. */HMMFILE * HMMFileOpen(char *hmmfile, char *env){ HMMFILE *hmmfp; unsigned int magic; char buf[512]; char *ssifile; char *dir; /* dir name in which HMM file was found */ int status; hmmfp = (HMMFILE *) MallocOrDie (sizeof(HMMFILE)); hmmfp->f = NULL; hmmfp->parser = NULL; hmmfp->is_binary = FALSE; hmmfp->byteswap = FALSE; hmmfp->is_seekable= TRUE; /* always; right now, an HMM must always be in a file. */ /* Open the file. Look in current directory. * If that doesn't work, check environment var for * a second possible directory (usually the location * of a system-wide HMM library). * Using dir name if necessary, construct correct SSI file name. */ hmmfp->f = NULL; hmmfp->ssi = NULL; if ((hmmfp->f = fopen(hmmfile, "r")) != NULL) { ssifile = MallocOrDie(sizeof(char) * (strlen(hmmfile) + 5)); sprintf(ssifile, "%s.ssi", hmmfile); if ((hmmfp->mode = SSIRecommendMode(hmmfile)) == -1) Die("SSIRecommendMode() failed"); } else if ((hmmfp->f = EnvFileOpen(hmmfile, env, &dir)) != NULL) { char *full; full = FileConcat(dir, hmmfile); ssifile = MallocOrDie(sizeof(char) * (strlen(full) + strlen(hmmfile) + 5)); sprintf(ssifile, "%s.ssi", full); if ((hmmfp->mode = SSIRecommendMode(full)) == -1) Die("SSIRecommendMode() failed"); free(full); free(dir); } else return NULL; /* Open the SSI index file. If it doesn't exist, or it's corrupt, or * some error happens, hmmfp->ssi stays NULL. */ SQD_DPRINTF1(("Opening ssifile %s...\n", ssifile)); SSIOpen(ssifile, &(hmmfp->ssi)); free(ssifile); /* Initialize the disk offset stuff. */ status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset)); if (status != 0) Die("SSIGetFilePosition() failed"); /* Check for binary or byteswapped binary format * by peeking at first 4 bytes. */ if (! fread((char *) &magic, sizeof(unsigned int), 1, hmmfp->f)) { HMMFileClose(hmmfp); return NULL; } rewind(hmmfp->f); if (magic == v20magic) { hmmfp->parser = read_bin20hmm; hmmfp->is_binary = TRUE; return hmmfp; } else if (magic == v20swap) { SQD_DPRINTF1(("Opened a HMMER 2.0 binary file [byteswapped]\n")); hmmfp->parser = read_bin20hmm; hmmfp->is_binary = TRUE; hmmfp->byteswap = TRUE; return hmmfp; } else if (magic == v19magic) { hmmfp->parser = read_bin19hmm; hmmfp->is_binary = TRUE; return hmmfp; } else if (magic == v19swap) { hmmfp->parser = read_bin19hmm; hmmfp->is_binary = TRUE; hmmfp->byteswap = TRUE; return hmmfp; } else if (magic == v17magic) { hmmfp->parser = read_bin17hmm; hmmfp->is_binary = TRUE; return hmmfp; } else if (magic == v17swap) { hmmfp->parser = read_bin17hmm; hmmfp->is_binary = TRUE; hmmfp->byteswap = TRUE; return hmmfp; } else if (magic == v11magic) { hmmfp->parser = read_bin11hmm; hmmfp->is_binary = TRUE; return hmmfp; } else if (magic == v11swap) { hmmfp->parser = read_bin11hmm; hmmfp->is_binary = TRUE; hmmfp->byteswap = TRUE; return hmmfp; } else if (magic == v10magic) { hmmfp->parser = read_bin10hmm; hmmfp->is_binary = TRUE; return hmmfp; } else if (magic == v10swap) { hmmfp->parser = read_bin10hmm; hmmfp->is_binary = TRUE; hmmfp->byteswap = TRUE; return hmmfp; } /* else we fall thru; it may be an ASCII file. */ /* If magic looks binary but we don't recognize it, choke and die. */ if (magic & 0x80000000) { Warn("\%s appears to be a binary but format is not recognized\n\It may be from a HMMER version more recent than yours,\n\or may be a different kind of binary altogether.\n", hmmfile); HMMFileClose(hmmfp); return NULL; } /* Check for ASCII format by peeking at first word. */ if (fgets(buf, 512, hmmfp->f) == NULL) { HMMFileClose(hmmfp); return NULL; } rewind(hmmfp->f); if (strncmp("HMMER2.0", buf, 8) == 0) { hmmfp->parser = read_asc20hmm; return hmmfp; } else if (strncmp("HMMER v1.9", buf, 10) == 0) { hmmfp->parser = read_asc19hmm; return hmmfp; } else if (strncmp("# HMM v1.7", buf, 10) == 0) { hmmfp->parser = read_asc17hmm; return hmmfp; } else if (strncmp("# HMM v1.1", buf, 10) == 0) { hmmfp->parser = read_asc11hmm; return hmmfp; } else if (strncmp("# HMM v1.0", buf, 10) == 0) { hmmfp->parser = read_asc10hmm; return hmmfp; } /* If we haven't recognized it yet, it's bogus. */ HMMFileClose(hmmfp); return NULL;}intHMMFileRead(HMMFILE *hmmfp, struct plan7_s **ret_hmm){ int status; /* Set the disk position marker. */ if (hmmfp->is_seekable) { status = SSIGetFilePosition(hmmfp->f, hmmfp->mode, &(hmmfp->offset)); if (status != 0) Die("SSIGetFilePosition() failed"); } /* Parse the HMM and return it. */ return (*hmmfp->parser)(hmmfp, ret_hmm);}voidHMMFileClose(HMMFILE *hmmfp){ if (hmmfp->f != NULL) fclose(hmmfp->f); if (hmmfp->ssi != NULL) SSIClose(hmmfp->ssi); free(hmmfp);}void HMMFileRewind(HMMFILE *hmmfp){ rewind(hmmfp->f);}intHMMFilePositionByName(HMMFILE *hmmfp, char *name){ SSIOFFSET offset; /* offset in hmmfile, from SSI */ int fh; /* ignored. */ if (hmmfp->ssi == NULL) return 0; if (SSIGetOffsetByName(hmmfp->ssi, name, &fh, &offset) != 0) return 0; if (SSISetFilePosition(hmmfp->f, &offset) != 0) return 0; return 1;}int HMMFilePositionByIndex(HMMFILE *hmmfp, int idx){ /* idx runs from 0..nhmm-1 */ int fh; /* file handle is ignored; only one HMM file */ SSIOFFSET offset; /* file position of HMM */ if (hmmfp->ssi == NULL) return 0; if (SSIGetOffsetByNumber(hmmfp->ssi, idx, &fh, &offset) != 0) return 0; if (SSISetFilePosition(hmmfp->f, &offset) != 0) return 0; return 1;}/***************************************************************** * HMM output API: * WriteAscHMM() * WriteBinHMM() * *****************************************************************/ /* Function: WriteAscHMM() * * Purpose: Save an HMM in flat text ASCII format. * * Args: fp - open file for writing * hmm - HMM to save */voidWriteAscHMM(FILE *fp, struct plan7_s *hmm){ int k; /* counter for nodes */ int x; /* counter for symbols */ int ts; /* counter for state transitions */ fprintf(fp, "HMMER2.0 [%s]\n", RELEASE); /* magic header */ /* write header information */ fprintf(fp, "NAME %s\n", hmm->name); if (hmm->flags & PLAN7_ACC) fprintf(fp, "ACC %s\n", hmm->acc); if (hmm->flags & PLAN7_DESC) fprintf(fp, "DESC %s\n", hmm->desc); fprintf(fp, "LENG %d\n", hmm->M); fprintf(fp, "ALPH %s\n", (Alphabet_type == hmmAMINO) ? "Amino":"Nucleic"); fprintf(fp, "RF %s\n", (hmm->flags & PLAN7_RF) ? "yes" : "no"); fprintf(fp, "CS %s\n", (hmm->flags & PLAN7_CS) ? "yes" : "no"); fprintf(fp, "MAP %s\n", (hmm->flags & PLAN7_MAP) ? "yes" : "no"); multiline(fp, "COM ", hmm->comlog); fprintf(fp, "NSEQ %d\n", hmm->nseq); fprintf(fp, "DATE %s\n", hmm->ctime); fprintf(fp, "CKSUM %d\n", hmm->checksum); if (hmm->flags & PLAN7_GA) fprintf(fp, "GA %.1f %.1f\n", hmm->ga1, hmm->ga2); if (hmm->flags & PLAN7_TC) fprintf(fp, "TC %.1f %.1f\n", hmm->tc1, hmm->tc2); if (hmm->flags & PLAN7_NC) fprintf(fp, "NC %.1f %.1f\n", hmm->nc1, hmm->nc2); /* Specials */ fputs("XT ", fp); for (k = 0; k < 4; k++) for (x = 0; x < 2; x++) fprintf(fp, "%6s ", prob2ascii(hmm->xt[k][x], 1.0)); fputs("\n", fp); /* Save the null model first, so HMM readers can decode * log odds scores on the fly. Save as log odds probabilities * relative to 1/Alphabet_size (flat distribution) */ fprintf(fp, "NULT "); fprintf(fp, "%6s ", prob2ascii(hmm->p1, 1.0)); /* p1 */ fprintf(fp, "%6s\n", prob2ascii(1.0-hmm->p1, 1.0)); /* p2 */ fputs("NULE ", fp); for (x = 0; x < Alphabet_size; x++) fprintf(fp, "%6s ", prob2ascii(hmm->null[x], 1/(float)(Alphabet_size))); fputs("\n", fp); /* EVD statistics */ if (hmm->flags & PLAN7_STATS) fprintf(fp, "EVD %10f %10f\n", hmm->mu, hmm->lambda);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -