📄 smixturehmm.c
字号:
/********************************************************************************* This file is part of the General Hidden Markov Model Library,* GHMM version 0.8_beta1, see http://ghmm.org** Filename: ghmm/ghmm/smixturehmm.c* Authors: Bernd Wichern** Copyright (C) 1998-2004 Alexander Schliep* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,* Berlin** Contact: schliep@ghmm.org** This library is free software; you can redistribute it and/or* modify it under the terms of the GNU Library General Public* License as published by the Free Software Foundation; either* version 2 of the License, or (at your option) any later version.** This library is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU* Library General Public License for more details.** You should have received a copy of the GNU Library General Public* License along with this library; if not, write to the Free* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*** This file is version $Revision: 1713 $* from $Date: 2006-10-16 16:06:28 +0200 (Mon, 16 Oct 2006) $* last change by $Author: grunau $.********************************************************************************/#ifdef WIN32# include "win_config.h"#endif#ifdef HAVE_CONFIG_H# include "../config.h"#endif#ifdef GHMM_OBSOLETE#include <stdio.h>#include <float.h>#include <math.h>#include <time.h>#include "ghmm.h"#include "mprintf.h"#include "mes.h"#include "matrix.h"#include "smixturehmm.h"#include "smodel.h"#include "sreestimate.h"#include "sfoba.h"#include "rng.h"#include "sequence.h"#include "smap_classify.h"#include "ghmm_internals.h"/* used in main and deactivated other functions */#if 0double total_train_w = 0.0;double total_test_w = 0.0;#endif#if 0 /* no main *//*============================================================================*/int main (int argc, char *argv[]){#define CUR_PROC "smixturehmm_main" int exitcode = -1; ghmm_cseq *sqd = NULL, *sqd_train = NULL, *sqd_test = NULL; ghmm_cseq **sqd_dummy = NULL; ghmm_cmodel **smo_initial = NULL, **smo = NULL; int i, k, iter, iterations, field_number, min_smo, max_smo, smo_number, total_smo_number, print_models, mode, free_parameter; long errors_train = 0, errors_test = 0; char *str; double train_likelihood, test_likelihood, train_ratio, bic; double *avg_comp_like = NULL; double **cp = NULL; /* component probabilities for all train-seqs. */ FILE *outfile = NULL, *likefile = NULL, *smofile = NULL; char filename[256], likefilename[256]; ghmm_rng_init (); if (argc == 10 || argc == 11) { sprintf (filename, "%s.smo", argv[3]); if (!(smofile = ighmm_mes_fopen (filename, "wt"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } sprintf (likefilename, "%s.like", argv[3]); if (!(likefile = ighmm_mes_fopen (likefilename, "at"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } sprintf (filename, "%s", argv[3]); if (!(outfile = ighmm_mes_fopen (filename, "at"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } iterations = atoi (argv[4]); min_smo = atoi (argv[5]); max_smo = atoi (argv[6]); printf ("Iter %d, min model no %d, max model no %d\n", iterations, min_smo, max_smo); if (max_smo < min_smo) { printf (" Max. Model No. < Min. Model No.! \n"); goto STOP; } train_ratio = atof (argv[7]); if (train_ratio > 1 || train_ratio < 0) { printf ("Fehler Train_Ratio (argv[7]): %f\n", train_ratio); goto STOP; } print_models = atoi (argv[8]); mode = atoi (argv[9]); if (mode < 1 || mode > 5) { printf ("Error: initial mode out of range\n"); goto STOP; } printf ("TRAIN RATIO %.2f\n", train_ratio); if (argc == 11) GHMM_RNG_SET (RNG, atoi (argv[10])); else { ghmm_rng_timeseed (RNG); } } else { printf ("Insufficient arguments. Usage: \n"); printf ("smixturehmm [Seq.File] [Model File] [Out File] [No. of runs] \n"); printf ("[Min Model No.] [Max Model No.] [train ratio] [print models] [mode] <seed>\n"); goto STOP; } ghmm_smixturehmm_print_header (likefile, argv, 1); ghmm_smixturehmm_print_header (outfile, argv, 0); /* Read Seqs. and Initial Models only once */ sqd_dummy = ghmm_cseq_read (argv[1], &field_number); printf ("Length first Seq: %d\n", sqd_dummy[0]->seq_len[0]); if (!sqd_dummy) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } sqd = sqd_dummy[0]; if (field_number > 1) GHMM_LOG(LCONVERTED, "Warning: Seq. File contains multiple Seq. Fields; use only the first one\n"); smo_initial = ghmm_cmodel_read (argv[2], &total_smo_number); if (!smo_initial) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } ARRAY_CALLOC (smo, total_smo_number); if (total_smo_number < max_smo) { str = ighmm_mprintf (NULL, 0, "Need %d initial models, read only %d from model file\n", max_smo, total_smo_number); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } if (smo_initial[0]->density == 0) { printf ("normal\n"); fprintf (outfile, "normal\n"); } else { printf ("normal_pos\n"); fprintf (outfile, "normal_pos\n"); } /*-------------- Main loop over different no. of models-------------------------*/ for (smo_number = min_smo; smo_number <= max_smo; smo_number++) { /* different partitions of test und train seqs. */ for (iter = 0; iter < iterations; iter++) { if (!outfile) if (!(outfile = ighmm_mes_fopen (filename, "at"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } if (!likefile) if (!(likefile = ighmm_mes_fopen (likefilename, "at"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } fprintf (outfile, "*********** Model No. %d, CV Iter %d ****************\n", smo_number, iter + 1); /*==================INIT========================================*/ /* divide into train and test seqs. */ ARRAY_CALLOC (sqd_train, 1); ARRAY_CALLOC (sqd_test, 1); if (ghmm_cseq_partition (sqd, sqd_train, sqd_test, train_ratio) == -1) { str = ighmm_mprintf (NULL, 0, "Error partitioning seqs, (model %d, iter %d)\n", smo_number, iter); GHMM_LOG(LCONVERTED, str); m_free (str); if (sqd_train->seq == NULL) { str = ighmm_mprintf (NULL, 0, "Error: empty train seqs, (model %d, iter %d)\n", smo_number, iter); GHMM_LOG(LCONVERTED, str); m_free (str); } goto STOP; } printf ("%ld seqs divided into %ld train seqs and %ld test seqs\n", sqd->seq_number, sqd_train->seq_number, sqd_test->seq_number); /* copy appropriate no. of initial models */ for (k = 0; k < smo_number; k++) { smo[k] = ghmm_cmodel_copy (smo_initial[k]); if (!smo[k]) { str = ighmm_mprintf (NULL, 0, "Error copying models, (model %d, iter %d)\n", smo_number, iter); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } } /* find out no. of free parameters; smo_number - 1 = component weights */ free_parameter = ghmm_cmodel_count_free_parameter (smo, smo_number) + smo_number - 1; cp = ighmm_cmatrix_alloc (sqd_train->seq_number, smo_number); if (!cp) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } /* Initial values for component probs for all seqs. and model priors before the actual clustering starts */ if (ghmm_smixturehmm_init (cp, sqd_train, smo, smo_number, mode) == -1) { str = ighmm_mprintf (NULL, 0, "Error in initialization comp. prob (model %d, iter %d)\n", smo_number, iter); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } /*==================END INIT=====================================*/ /* clustering */ if (ghmm_smixturehmm_cluster (outfile, cp, sqd_train, smo, smo_number) == -1) { str = ighmm_mprintf (NULL, 0, "Error in clustering, (model %d, iter %d)\n", smo_number, iter); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } /* TEST */ /* if (ghmm_smixturehmm_calc_cp(cp, sqd_train, smo, smo_number) == -1) { str = ighmm_mprintf(NULL, 0, "Error after clustering, (model %d, CV Iter %d)\n", smo_number, iter + 1); GHMM_LOG(LCONVERTED, str); m_free(str); goto STOP; } fprintf(outfile, "Component Probs. after Clustering:\n"); ighmm_cmatrix_print(outfile, cp, sqd_train->seq_number, smo_number, " ", ",", ";"); */ /* End TEST */ /*================ OUTPUT=====================================*/ /* print trained models */ if (print_models) { fprintf (smofile, "# ****************** Models: %d, CV-Iter %d *****************\n", smo_number, iter); for (k = 0; k < smo_number; k++) ghmm_cmodel_print (smofile, smo[k]); } avg_comp_like = ghmm_smixturehmm_avg_like (cp, sqd_train, smo, smo_number); if (avg_comp_like == NULL) { GHMM_LOG(LCONVERTED, "Error calculating avg_like \n"); goto STOP; } fprintf (outfile, "\nTrain Set:\nModel Priors \t Likel. per Seq\n"); for (k = 0; k < smo_number; k++) fprintf (outfile, "%.4f\t%.4f\t%.4f\n", smo[k]->prior, avg_comp_like[k]); errors_train = ghmm_cseq_mix_like (smo, smo_number, sqd_train, &train_likelihood); fprintf (outfile, "Likelihood Train Set %.4f (%ld seqs); (per seq %.4f)\n", train_likelihood, sqd_train->seq_number, train_likelihood / total_train_w); /* likelihood on test set */ if (sqd_test != NULL) { errors_test = ghmm_cseq_mix_like (smo, smo_number, sqd_test, &test_likelihood); fprintf (outfile, "Likelihood Test Set %.4f (%ld seqs); (per normseq ", test_likelihood, sqd_test->seq_number); if (total_test_w > 0) fprintf (outfile, " %.4f)\n", test_likelihood / total_test_w); else fprintf (outfile, " ---)\n"); } /* Bayes Information Criterion, see: Kass, Raftery: Bayes Faktors. */ bic = 2 * train_likelihood - free_parameter * log (sqd_train->seq_number); fprintf (likefile, "%d\t%d\t%ld\t%ld\t%.4f\t%.4f\t%.4f\t", smo_number, iter, sqd_train->seq_number, sqd_test->seq_number, train_likelihood, test_likelihood, train_likelihood / total_train_w); if (total_test_w > 0) fprintf (likefile, " %.4f\t", test_likelihood / total_test_w); else fprintf (likefile, " ---\t"); fprintf (likefile, "%ld\t%ld\t%.4f\n", errors_train, errors_test, bic); /*================ FREE=====================================*/ for (k = 0; k < smo_number; k++) ghmm_cmodel_free (&(smo[k])); ighmm_cmatrix_free (&cp, sqd_train->seq_number); ghmm_cseq_free (&sqd_train); ghmm_cseq_free (&sqd_test); m_free (avg_comp_like); if (outfile) fclose (outfile); /* save results, open to append at the beginning of the loop */ if (likefile) fclose (likefile); likefile = NULL; outfile = NULL; } /* for (iterations) */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -