📄 sequence.c
字号:
/******************************************************************************* author : Bernd Wichern and extended by Andrea Weisse and Utz J. Pape filename : /zpr/bspk/src/hmm/ghmm/ghmm/sequence.c created : TIME: 11:29:02 DATE: Thu 12. February 1998 $Id: sequence.c,v 1.17 2004/03/01 14:59:08 wasinee Exp $Copyright (C) 1998-2001, ZAIK/ZPR, Universit鋞 zu K鰈nThis program is free software; you can redistribute it and/or modifyit under the terms of the GNU General Public License as published bythe Free Software Foundation; either version 2 of the License, or(at your option) any later version.This program is distributed in the hope that it will be useful,but WITHOUT ANY WARRANTY; without even the implied warranty ofMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See theGNU General Public License for more details.You should have received a copy of the GNU General Public Licensealong with this program; if not, write to the Free SoftwareFoundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*****************************************************************************/#include "mprintf.h"#include "mes.h"#include "sequence.h"#include <math.h>#include <float.h>#include "mprintf.h"#include "mes.h"#include "matrix.h"#include "vector.h"#include "const.h"#include "model.h"#include "foba.h"#include "sfoba.h"#include "vector.h"#include "rng.h"#include "string.h"/*============================================================================*/sequence_t** sequence_read(const char *filename, int *sq_number) {#define CUR_PROC "sequence_read" int i; sequence_t **sequence = NULL; scanner_t *s = NULL; *sq_number = 0; s = scanner_alloc(filename); if(!s) {mes_proc(); goto STOP;} while(!s->err && !s->eof && s->c - '}') { scanner_get_name(s); scanner_consume(s, '='); if(s->err) goto STOP; /* sequence file */ if (!strcmp(s->id, "SEQ")) { (*sq_number)++; /* more mem */ if (m_realloc(sequence, *sq_number)) { mes_proc(); goto STOP; } sequence[*sq_number-1] = sequence_read_alloc(s); if (!sequence[*sq_number-1]) { mes_proc(); goto STOP; } } else { scanner_error(s, "unknown identifier"); goto STOP; } scanner_consume( s, ';' ); if(s->err) goto STOP; } scanner_free(&s); return sequence;STOP: for (i = 0; i < *sq_number; i++) sequence_free(&(sequence[i])); m_free(sequence); *sq_number = 0; return NULL;#undef CUR_PROC}/*============================================================================*/sequence_t* sequence_read_alloc(scanner_t *s){#define CUR_PROC "sequence_read_alloc" int symbols = 0, lexWord = 0; sequence_t *sq = NULL; int seq_len_lex = 0; if( !m_calloc( sq, 1 ) ) {mes_proc();goto STOP;} scanner_consume( s, '{' ); if(s->err) goto STOP; while(!s->err && !s->eof && s->c - '}') { scanner_get_name(s); scanner_consume(s, '='); if(s->err) goto STOP; /* array of sequences to read */ if (!strcmp(s->id, "O")) { scanner_consume(s, '{'); if (s->err) goto STOP; sq->seq_number = 0; sq->total_w = 0.0; while( !s->eof && !s->err && s->c - '}') { /* another sequence --> realloc */ if (m_realloc(sq->seq, sq->seq_number + 1)) { mes_proc(); goto STOP; } if (m_realloc(sq->seq_len, sq->seq_number + 1)) { mes_proc(); goto STOP; } if (m_realloc(sq->seq_label, sq->seq_number + 1)) { mes_proc(); goto STOP; } if (m_realloc(sq->seq_id, sq->seq_number + 1)) { mes_proc(); goto STOP; } if (m_realloc(sq->seq_w, sq->seq_number + 1)) { mes_proc(); goto STOP; } /* Label and ID */ /* default */ sq->seq_label[sq->seq_number] = -1; sq->seq_id[sq->seq_number] = -1.0; sq->seq_w[sq->seq_number] = 1; while (s->c == '<' || s->c == '(' || s->c == '|') { if (s->c == '<') { scanner_consume(s, '<'); if (s->err) goto STOP; sq->seq_label[sq->seq_number] = scanner_get_int(s); if(s->err) goto STOP; scanner_consume(s, '>'); if (s->err) goto STOP; } if (s->c == '(') { scanner_consume(s, '('); if (s->err) goto STOP; sq->seq_id[sq->seq_number] = scanner_get_edouble(s); if(s->err) goto STOP; scanner_consume(s, ')'); if (s->err) goto STOP; } if (s->c == '|') { scanner_consume(s, '|'); if (s->err) goto STOP; sq->seq_w[sq->seq_number] = (double) scanner_get_int(s); if (sq->seq_w[sq->seq_number] <= 0) { scanner_error(s, "sequence weight not positiv\n"); goto STOP; } if(s->err) goto STOP; scanner_consume(s, '|'); if (s->err) goto STOP; } } sq->seq[sq->seq_number] = scanner_get_int_array(s, sq->seq_len+sq->seq_number); if (sq->seq_len[sq->seq_number] > MAX_SEQ_LEN) { scanner_error(s, "sequence too long"); goto STOP; } scanner_consume(s, ';'); if (s->err) goto STOP; sq->total_w += sq->seq_w[sq->seq_number]; sq->seq_number++; } /* while( !s->eof...) */ if ((sq->seq_number == 0) || (sq->seq_number > MAX_SEQ_NUMBER)) { char *str = mprintf(NULL, 0, "Number of sequences %ld exceeds possible range", sq->seq_number); mes_prot(str); m_free(str); goto STOP; } scanner_consume(s, '}'); if (s->err) goto STOP; } /* all possible seqs., sorted lexicographical */ else if (!strcmp(s->id, "L")) { lexWord = 1; scanner_consume( s, '{' ); if(s->err) goto STOP; while(!s->err && !s->eof && s->c - '}') { scanner_get_name(s); scanner_consume(s, '='); if(s->err) goto STOP; if (!strcmp(s->id, "seq_len")) { seq_len_lex = scanner_get_int(s); if(s->err) goto STOP; if (seq_len_lex <= 0) { mes_prot("Value for sequence length not allowed"); goto STOP; } } else if (!strcmp(s->id, "symb")) { if (symbols < 0) { mes_prot("Value for number of symbols not allowed"); goto STOP; } symbols = scanner_get_int(s); if(s->err) goto STOP; } else { scanner_error(s, "unknown identifier"); goto STOP; } scanner_consume(s, ';'); if(s->err) goto STOP; } scanner_consume(s, '}'); if ((seq_len_lex <= 0) || (symbols < 0)) { mes_prot("Values for seq. length or number of symbols not spezified"); goto STOP; } sq = sequence_lexWords(seq_len_lex, symbols); if (!sq) goto STOP; } /*if (!strcmp(s->id, "L"))*/ else { scanner_error(s, "unknown identifier"); goto STOP; } scanner_consume( s, ';' ); if(s->err) goto STOP; } /* while(!s->err && !s->eof && s->c - '}') */ scanner_consume( s, '}'); if(s->err) goto STOP; return(sq); STOP: sequence_free(&sq); return(NULL);#undef CUR_PROC} /* sequence_read_alloc */ /*============================================================================*/sequence_d_t** sequence_d_read(const char *filename, int *sqd_number) {#define CUR_PROC "sequence_d_read" int i; scanner_t *s = NULL; sequence_d_t **sequence = NULL; *sqd_number = 0; s = scanner_alloc(filename); if(!s) {mes_proc(); goto STOP;} while(!s->err && !s->eof && s->c - '}') { scanner_get_name(s); scanner_consume(s, '='); if(s->err) goto STOP; /* sequence file */ if (!strcmp(s->id, "SEQD")) { (*sqd_number)++; /* more mem */ if (m_realloc(sequence, *sqd_number)) { mes_proc(); goto STOP; } sequence[*sqd_number-1] = sequence_d_read_alloc(s); if (!sequence[*sqd_number-1]) { mes_proc(); goto STOP; } } else { scanner_error(s, "unknown identifier"); goto STOP; } scanner_consume( s, ';' ); if(s->err) goto STOP; } scanner_free(&s); return sequence;STOP: scanner_free(&s); for (i = 0; i < *sqd_number; i++) sequence_d_free(&(sequence[i])); m_free(sequence); *sqd_number = 0; return NULL;#undef CUR_PROC} /* sequence_d_read */ /*============================================================================*/sequence_d_t* sequence_d_read_alloc(scanner_t *s){#define CUR_PROC "sequence_d_read_alloc" sequence_d_t *sqd = NULL; if( !m_calloc( sqd, 1 ) ) {mes_proc();goto STOP;} scanner_consume( s, '{' ); if(s->err) goto STOP; while(!s->err && !s->eof && s->c - '}') { scanner_get_name(s); scanner_consume(s, '='); if(s->err) goto STOP; /* array of sequences to read */ if (!strcmp(s->id, "O")) { scanner_consume(s, '{'); if (s->err) goto STOP; sqd->seq_number = 0; sqd->total_w = 0.0; while( !s->eof && !s->err && s->c - '}') { /* another sequence --> realloc */ if (m_realloc(sqd->seq, sqd->seq_number+1)) { mes_proc(); goto STOP; } if (m_realloc(sqd->seq_len, sqd->seq_number+1)) { mes_proc(); goto STOP; } if (m_realloc(sqd->seq_label, sqd->seq_number + 1)) { mes_proc(); goto STOP; } if (m_realloc(sqd->seq_id, sqd->seq_number + 1)) { mes_proc(); goto STOP; } if (m_realloc(sqd->seq_w, sqd->seq_number + 1)) { mes_proc(); goto STOP; } /* Label and ID and weight */ /* default */ sqd->seq_label[sqd->seq_number] = -1; sqd->seq_id[sqd->seq_number] = -1.0; sqd->seq_w[sqd->seq_number] = 1; while (s->c == '<' || s->c == '(' || s->c == '|') { if (s->c == '<') { scanner_consume(s, '<'); if (s->err) goto STOP; sqd->seq_label[sqd->seq_number] = scanner_get_int(s); if(s->err) goto STOP; scanner_consume(s, '>'); if (s->err) goto STOP; } if (s->c == '(') { scanner_consume(s, '('); if (s->err) goto STOP; sqd->seq_id[sqd->seq_number] = scanner_get_edouble(s); if(s->err) goto STOP; scanner_consume(s, ')'); if (s->err) goto STOP; } if (s->c == '|') { scanner_consume(s, '|'); if (s->err) goto STOP; sqd->seq_w[sqd->seq_number] = (double) scanner_get_int(s); if (sqd->seq_w[sqd->seq_number] < 0) { scanner_error(s, "negativ sequence weight\n"); goto STOP; } if(s->err) goto STOP; scanner_consume(s, '|'); if (s->err) goto STOP; } } sqd->seq[sqd->seq_number] = scanner_get_double_earray(s, sqd->seq_len + sqd->seq_number); if (sqd->seq_len[sqd->seq_number] > MAX_SEQ_LEN) { scanner_error(s, "sequence too long"); goto STOP; } scanner_consume(s, ';'); if (s->err) goto STOP; sqd->total_w += sqd->seq_w[sqd->seq_number]; sqd->seq_number++; } /* while( !s->eof...) */ if ((sqd->seq_number == 0) || (sqd->seq_number > MAX_SEQ_NUMBER)) { char *str = mprintf(NULL, 0, "Number of sequences %ld exceeds possible range", sqd->seq_number); mes_prot(str); m_free(str); goto STOP; } scanner_consume(s, '}'); if (s->err) goto STOP; } else { scanner_error(s, "unknown identifier"); goto STOP; } scanner_consume( s, ';' ); if(s->err) goto STOP; } /* while(!s->err && !s->eof && s->c - '}') */ scanner_consume( s, '}'); if(s->err) goto STOP; return(sqd); STOP: sequence_d_free(&sqd); return(NULL);#undef CUR_PROC} /* sequence_d_read_alloc */ /*============================================================================*//* Truncate Sequences in a given sqd_field; useful for Testing; returns truncated sqd_field; trunc_ratio 0: no truncation trunc_ratio 1: truncation (mean truncation faktor = 0.5) trunc_ratio -1: 100 % truncation*/sequence_d_t **sequence_d_truncate(sequence_d_t **sqd_in, int sqd_fields, double trunc_ratio, int seed) { #define CUR_PROC "sequence_d_truncate" sequence_d_t **sq; int i, j, trunc_len; /* Hack, use -1 for complete truncation */ if ((0 > trunc_ratio || 1 < trunc_ratio) && trunc_ratio != -1) { mes_prot("Error: trunc_ratio not valid\n"); goto STOP; } if( !m_calloc(sq, sqd_fields) ) {mes_proc();goto STOP;} gsl_rng_init(); gsl_rng_set(RNG,seed); for (i = 0; i < sqd_fields; i++) { sq[i] = sequence_d_calloc(sqd_in[i]->seq_number); sq[i]->total_w = sqd_in[i]->total_w; for (j = 0; j < sqd_in[i]->seq_number; j++) { if(!m_calloc(sq[i]->seq[j], sqd_in[i]->seq_len[j])) { mes_proc(); goto STOP; } /* length of truncated seq. */ if (trunc_ratio == -1) trunc_len = 0; else trunc_len = (int) ceil(( sqd_in[i]->seq_len[j] * (1 - trunc_ratio * gsl_rng_uniform(RNG)) )); sequence_d_copy(sq[i]->seq[j], sqd_in[i]->seq[j], trunc_len); if(m_realloc(sq[i]->seq[j], trunc_len)) {mes_proc(); goto STOP;} sq[i]->seq_len[j] = trunc_len; sq[i]->seq_label[j] = sqd_in[i]->seq_label[j]; sq[i]->seq_id[j] = sqd_in[i]->seq_id[j]; sq[i]->seq_w[j] = sqd_in[i]->seq_w[j]; } } /* for all sqd_fields */ return sq;STOP: return NULL;#undef CUR_PROC}/*============================================================================*/sequence_d_t *sequence_d_calloc(long seq_number) {#define CUR_PROC "sequence_dcalloc" int i; //printf("*** sequence_d_t *sequence_d_calloc, nr: %d\n",seq_number); sequence_d_t *sqd = NULL; if (seq_number > MAX_SEQ_NUMBER) {
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -