sequence.h
来自「General Hidden Markov Model Library 一个通用」· C头文件 代码 · 共 470 行
H
470 行
/********************************************************************************* This file is part of the General Hidden Markov Model Library,* GHMM version 0.8_beta1, see http://ghmm.org** Filename: ghmm/ghmm/sequence.h* Authors: Bernd Wichern, Benjamin Georgi** Copyright (C) 1998-2004 Alexander Schliep * Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik, * Berlin* * Contact: schliep@ghmm.org ** This library is free software; you can redistribute it and/or* modify it under the terms of the GNU Library General Public* License as published by the Free Software Foundation; either* version 2 of the License, or (at your option) any later version.** This library is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU* Library General Public License for more details.** You should have received a copy of the GNU Library General Public* License along with this library; if not, write to the Free* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*** This file is version $Revision: 1931 $ * from $Date: 2007-10-30 15:54:41 +0100 (Tue, 30 Oct 2007) $* last change by $Author: grunau $.********************************************************************************/#ifndef GHMM_OBSOLETE#warning "GHMM_OBSOLETE is not defined!"#endif#ifndef GHMM_SEQUENCE_H#define GHMM_SEQUENCE_H#ifdef __cplusplusextern "C" {#endif/**@name sequences (double and int) *//*@{ (Doc++-Group: sequence) *//** @name struct ghmm_dseq Sequence structure for integer sequences. Contains an array of sequences and corresponding data like sequence label, sequence weight, etc. Sequences may have different length. */ typedef struct ghmm_dseq { /** sequence array. sequence[i] [j] = j-th symbol of i-th seq. */ int **seq; /** matrix of state ids, can be used to save the viterbi path during sequence generation. ATTENTION: is NOT allocated by ghmm_dseq_calloc */ int **states; /** array of sequence length */ int *seq_len; /** array of state path lengths */ int *states_len;#ifdef GHMM_OBSOLETE /** array of sequence labels */ long *seq_label;#endif /* GHMM_OBSOLETE */ /** array of sequence IDs*/ double *seq_id; /** positiv! sequence weights. default is 1 = no weight */ double *seq_w; /** total number of sequences */ long seq_number; /** reserved space for sequences is always >= seq_number */ long capacity; /** sum of sequence weights */ double total_w; /** matrix of state labels corresponding to seq */ int **state_labels; /** number of labels for each sequence */ int *state_labels_len; /** flags (internal) */ unsigned int flags; } ghmm_dseq;/** @name struct ghmm_cseq Sequence structure for double sequences. Contains an array of sequences and corresponding data like sequnce label, sequence weight, etc. Sequences may have different length. */ typedef struct ghmm_cseq { /** sequence array. sequence[i][j] = j-th symbol of i-th seq. */ double **seq; /** array of sequence length */ int *seq_len;#ifdef GHMM_OBSOLETE /** array of sequence labels */ long *seq_label;#endif /* GHMM_OBSOLETE */ /** array of sequence IDs*/ double *seq_id; /** positive! sequence weights. default is 1 = no weight */ double *seq_w; /** total number of sequences */ long seq_number; /** reserved space for sequences is always >= seq_number */ long capacity; /** sum of sequence weights */ double total_w; /** flags (internal) */ unsigned int flags; } ghmm_cseq;#ifdef __cplusplus}#endif/* don't include model.h at the beginning of this file. struct ghmm_dseq has to be known in model.h */#include "model.h"#include "smodel.h"#ifdef __cplusplusextern "C" {#endif/** Truncate double sequences in a given sequence array. Useful for Testing; @return truncated sqd_field; @param sqd\_in sequence arrays for truncation @param sqd\_arrays number of sequence arrays @param trunc\_ratio 0 means no truncation, 1 max. truncation @param seed rng seed*/ ghmm_cseq **ghmm_cseq_truncate (ghmm_cseq ** sqd_in, int sqd_arrays, double trunc_ratio, int seed);/** Extract a single sequence from a larger ghmm_dseq into a new struct. @return ghmm_dseq struct containing a single sequence @param sq source ghmm_dseq @param index index of sequence to extract*/ghmm_dseq *ghmm_dseq_get_singlesequence(ghmm_dseq *sq, int index);/** Extract a single sequence_d from a larger ghmm_cseq into a new struct. @return ghmm_cseq struct containing a single sequence @param sq source ghmm_cseq @param index index of sequence to extract*/ghmm_cseq *ghmm_cseq_get_singlesequence(ghmm_cseq *sq, int index);/*XXX TEST: frees everything but the seq field *//** Free a ghmm_dseq struct which holds as sequence a reference to a sequence in a different sequence_t. The function deallocates everything but the reference.*/int ghmm_dseq_subseq_free (ghmm_dseq *sq);/** Free a ghmm_cseq struct which holds as sequence a reference to a sequence in a different sequence_d_t. The function deallocates everything but the reference.*/int ghmm_cseq_subseq_free (ghmm_cseq *sqd);/** Reads a FastA file and returns a ghmm_dseq object @param filename filemane of the fasta file @param alfabet alfabet @return ghmm_dseq of the fasta file*/ghmm_dseq *ghmm_dseq_open_fasta(const char *filename, ghmm_alphabet *alphabet);/** Generates all possible integer sequence of lenght n from an alphabet with M letters. Use lexicographical ordering. Memory allocation here. @param n length of sequences @param M size of alphabet @return array of generated integer sequences*/ ghmm_dseq *ghmm_dseq_lexWords (int n, int M);/** Determine best model for a given integer sequence. Choose from the set of models the one with the highest likelihood for the given sequence. @param mo array of models @param model\_number number of models @param sequence sequence @param seq\_len length of sequence @param log\_p log likelihood of the sequence given the best model @return index of best\_model (between 0 and model\_number - 1)*/ int ghmm_dseq_best_model (ghmm_dmodel ** mo, int model_number, int *sequence, int seq_len, double *log_p);/** Make sure that the sequences only contain allowed symbols. (between 0 and max\_symbol - 1) @param sq sequences @param max_symb number of different symbols @return -1 for error, 0 for no errors*/ int ghmm_dseq_check (ghmm_dseq * sq, int max_symb);/** copy one integer sequence. Memory for target has to be allocated outside. @param target target sequence @param source source sequence @param len length of source sequence */ void ghmm_dseq_copy (int *target, int *source, int len);/** copy one double sequence. Memory for target has to be allocated outside. @param target target sequence @param source source sequence @param len length of source sequence */ void ghmm_cseq_copy (double *target, double *source, int len);/** Adds all integer sequences, sequence lengths etc from source to target. Memory allocation is done here. @param target target sequence structure @param source source sequence structure @return -1 for error, 0 for success */ int ghmm_dseq_add (ghmm_dseq * target, ghmm_dseq * source);/** Adds all double sequences, sequence lengths etc from source to target. Memory allocation is done here. @param target target sequence structure @param source source sequence structure @return -1 for error, 0 for success */ int ghmm_cseq_add (ghmm_cseq * target, ghmm_cseq * source);/** Prints one array of integer sequences in a file. @param file output file @param sequence array of sequences */ void ghmm_dseq_print (ghmm_dseq * sequence, FILE * file);/** Prints one array of integer sequences in a xml file @param file output file @param sequence array of sequences */ void ghmm_dseq_print_xml (ghmm_dseq * sequence, FILE * file);/** Prints one array of integer sequences in Mathematica format. (List of lists) @param file output file @param sq array of sequences @param name arbitrary sequence name for usage in Mathematica. */ void ghmm_dseq_mathematica_print (ghmm_dseq * sq, FILE * file, char *name);/** Prints one array of double sequences in a file. @param file output file @param sqd array of sequences @param discrete switch: 0 means double output for symbols, 1 means truncate symbols to integer */ void ghmm_cseq_print (ghmm_cseq * sqd, FILE * file, int discrete);/** Prints one array of double sequences in Mathematica format. (List of lists) @param file output file @param sqd array of sequences @param name arbitrary sequence name for usage in Mathematica. */ void ghmm_cseq_mathematica_print (ghmm_cseq * sqd, FILE * file, char *name);/** Output of double sequences suitable for gnuplot. One symbol per line, sequences seperated by double newline. @param file output file @param sqd array of double sequences*/ void ghmm_cseq_gnu_print (ghmm_cseq * sqd, FILE * file);/** Cleans integer sequence pointers in sequence struct. sets seq\_number to zero. Differs from sequence\_free since memory is not freed here. @param sq sequence structure */ void ghmm_dseq_clean (ghmm_dseq * sq);/** Cleans double sequence pointers in sequence struct. sets seq\_number to zero. Differs from sequence\_free since memory is not freed here. @param sqd sequence structure */ void ghmm_cseq_clean (ghmm_cseq * sqd);/** Frees all memory in a given array of integer sequences. @param sq sequence structure @return 0 for succes, -1 for error */ int ghmm_dseq_free (ghmm_dseq ** sq);/** Frees all memory in a given array of double sequences. @param sq sequence structure @return 0 for succes, -1 for error */ int ghmm_cseq_free (ghmm_cseq ** sq);/** Return biggest symbol in an interger sequence. @param sq sequence structure @return max value */ int ghmm_dseq_max_symbol (ghmm_dseq * sq);/** Memory allocation for an integer sequence struct. Allocates arrays of lenght seq\_number. NO allocation for the actual sequence, since its length is unknown. @param seq\_number: number of sequences @return: pointer of sequence struct*/ ghmm_dseq *ghmm_dseq_calloc (long seq_number);/** Completes Memory allocation for an integer sequence struct. NO allocation for the actual sequence, since its length is unknown.*/ int ghmm_dseq_calloc_state_labels (ghmm_dseq *sq);/** Memory allocation for a double sequence struct. Allocates arrays of lenght seq\_number. NO allocation for the actual sequence, since its length is unknown. @param seq\_number: number of sequences @return: pointer of sequence struct*/ ghmm_cseq *ghmm_cseq_calloc (long seq_number);/** Copies array of integer sequences to double sequences. @return double sequence struct (target) @param sq integer sequence struct (source) */ ghmm_cseq *ghmm_cseq_create_from_dseq (const ghmm_dseq * sq);/** Copies array of double sequences into an array of integer sequences. Truncates positions after decimal point. @return integer sequence struct (target) @param sq double sequence struct (source) */ ghmm_dseq *ghmm_dseq_create_from_cseq (const ghmm_cseq * sqd);/** Determines max sequence length in a given int sequence struct. @author Peter Pipenbacher @param sqd sequence struct @return max sequence length */ int ghmm_dseq_max_len (const ghmm_dseq * sqd);/** Determines max sequence length in a given double sequence struct. @param sqd sequence struct @return max sequence length */ int ghmm_cseq_max_len (const ghmm_cseq * sqd);/** Calculates a mean sequence of a given array of double sequences. Missing values of shorter sequences a assumed to be zero. @param sqd sequence struct @return pointer of sequence struct containing the mean sequence */ ghmm_cseq *ghmm_cseq_mean (const ghmm_cseq * sqd);/** Calculates the scatter matrix of an array of double sequences. Missing parts of short sequences are NOT taken into account. @return scatter matrix @param sqd sequence struct @param sqd (calculated) dimension of scatter matrix */ double **ghmm_cseq_scatter_matrix (const ghmm_cseq * sqd, int *dim);/** Calculates transition class for a given double sequence at a specified position. Very application specific!!! Currently implemented only dummy function: allways returns 0 which means no usage of multiple transition classes. @param O double sequence @param index position for class calculation @param osum sum of symbols upto index @return currently always 0 */ int ghmm_cseq_class (const double *O, int index, double *osum); /*int ghmm_cseq_class(const ghmm_cseq *sqd, const int seq_number, int index, double *osum, );*//** Divides randomly a given array of double sequences into two sets. Useful if a training and test set is needed. Memory allocation is done here. @param sqd input sequence array @param sqd\_train training sequences @param sqd\_test test sequences @param train\_ratio ratio of number of train vs number of test sequences @return 0 for success, -1 for error*/ int ghmm_cseq_partition (ghmm_cseq * sqd, ghmm_cseq * sqd_train, ghmm_cseq * sqd_test, double train_ratio);/** Copies all entries from one sequence in a source array to a target array. No memory allocation here. @param target double sequence target @param source double sequence source @param t_num position in target array @param s_num position in source array*/ void ghmm_cseq_copy_all (ghmm_cseq * target, long t_num, ghmm_cseq * source, long s_num);/** Log-Likelihood function in a mixture model: (mathe mode?) $\sum_k w^k \log( \sum_c (\alpha_c p(O^k | \lambda_c)))$ @param smo pointer to array of smodels @param smo\_number number of models @param sqd sequence struct @param like log likelihood*/ int ghmm_cseq_mix_like (ghmm_cmodel ** smo, int smo_number, ghmm_cseq * sqd, double *like);#ifdef __cplusplus}#endif#endif/*@} (Doc++-Group: sequence) */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?