📄 cluster.c
字号:
/********************************************************************************* This file is part of the General Hidden Markov Model Library,* GHMM version 0.8_beta1, see http://ghmm.org** Filename: ghmm/ghmm/cluster.c* Authors: Bernd Wichern** Copyright (C) 1998-2004 Alexander Schliep* Copyright (C) 1998-2001 ZAIK/ZPR, Universitaet zu Koeln* Copyright (C) 2002-2004 Max-Planck-Institut fuer Molekulare Genetik,* Berlin** Contact: schliep@ghmm.org** This library is free software; you can redistribute it and/or* modify it under the terms of the GNU Library General Public* License as published by the Free Software Foundation; either* version 2 of the License, or (at your option) any later version.** This library is distributed in the hope that it will be useful,* but WITHOUT ANY WARRANTY; without even the implied warranty of* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU* Library General Public License for more details.** You should have received a copy of the GNU Library General Public* License along with this library; if not, write to the Free* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA*** This file is version $Revision: 1713 $* from $Date: 2006-10-16 16:06:28 +0200 (Mon, 16 Oct 2006) $* last change by $Author: grunau $.********************************************************************************/#ifdef WIN32# include "win_config.h"#endif#ifdef HAVE_CONFIG_H# include "../config.h"#endif#ifdef GHMM_OBSOLETE #include <stdio.h>#include "ghmm.h"#include "mprintf.h"#include "mes.h"#include "cluster.h"#include "rng.h"#include "model.h"#include "sequence.h"#include "reestimate.h"#include "foba.h"#include "matrix.h"#include "ghmm_internals.h"#include "obsolete.h"/*============================================================================*/int ghmm_cluster_hmm (char *seq_file, char *mo_file, char *out_filename){# define CUR_PROC "ghmm_cluster_hmm" int res = -1, i, iter = 0, sq_number; ghmm_dseq *sq = NULL, **sq_vec = NULL; long j, changes = 1; long *oldlabel; double log_p; char * str; FILE *outfile = NULL; cluster_t cl; cl.mo = NULL; cl.mo_seq = NULL; if (!(outfile = ighmm_mes_fopen (out_filename, "wt"))) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; }/*----------------------------------------------------------------------------*/ /* Allocate memory and read in data: Sequences and initial model */ sq_vec = ghmm_dseq_read (seq_file, &sq_number); if (!sq_vec[0]) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } if (sq_number > 1) GHMM_LOG(LCONVERTED, "Warning: seq. file contains multiple seq. arrays. \ Only first array is used for clustering\n"); sq = sq_vec[0]; fprintf (outfile, "Cluster Sequences\n"); ghmm_dseq_print (outfile, sq); cl.mo = ghmm_dmodel_read (mo_file, &cl.mo_number); if (!cl.mo) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } ARRAY_CALLOC (oldlabel, sq->seq_number); for (i = 0; i < sq->seq_number; i++) oldlabel[i] = (-1); ARRAY_CALLOC (cl.mo_seq, cl.mo_number); for (i = 0; i < cl.mo_number; i++) cl.mo_seq[i] = NULL; if (ghmm_dmodel_check_compatibility (cl.mo, cl.mo_number)) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; }/*----------------------------------------------------------------------------*/ fprintf (outfile, "\nInitial Models:\n"); for (i = 0; i < cl.mo_number; i++) ghmm_dmodel_print (outfile, cl.mo[i]);/*----------------------------------------------------------------------------*/ while (changes > 0) { iter++; /* Associate the sequences */ fprintf (outfile, "\nSequence, Best Model, logP of generating Seq.:\n"); for (j = 0; j < sq->seq_number; j++) { sq->seq_label[j] = ghmm_dseq_best_model (cl.mo, cl.mo_number, sq->seq[j], sq->seq_len[j], &log_p); fprintf (outfile, "seq %ld, mo %ld, log p %.4f\n", j, sq->seq_label[j], log_p); if (sq->seq_label[j] == -1 || sq->seq_label[j] >= cl.mo_number) { /* No model fits! */ str = ighmm_mprintf (NULL, 0, "Seq. %ld: ghmm_dseq_best_model gives %d\n", j, sq->seq_label[j]); GHMM_LOG(LCONVERTED, str); m_free (str); goto STOP; } } if (ghmm_cluster_avoid_empty_model (sq->seq_label, sq->seq_number, cl.mo_number)) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } changes = ghmm_cluster_update_label (oldlabel, sq->seq_label, sq->seq_number); fprintf (outfile, "%ld changes\n", changes); fprintf (stdout, "\n*** %ld changes in iteration %d ***\n\n", changes, iter); /* Reestimate models with the associated sequences */ if (changes > 0) { if (ghmm_cluster_update (&cl, sq)) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } fprintf (outfile, "\nGes. WS VOR %d.Reestimate:\n", iter); ghmm_cluster_print_likelihood (outfile, &cl); for (i = 0; i < cl.mo_number; i++) { if (ghmm_dmodel_baum_welch (cl.mo[i], cl.mo_seq[i])) { str = ighmm_mprintf (NULL, 0, "%d.reestimate false, mo[%d]\n", iter, i); GHMM_LOG(LCONVERTED, str); m_free (str); /* ghmm_dmodel_print(stdout, cl.mo[i]); */ goto STOP; } } fprintf (outfile, "\nGes. WS NACH %d.Reestimate:\n", iter); ghmm_cluster_print_likelihood (outfile, &cl); } /* if changes */ } /* while *//*----------------------------------------------------------------------------*/ if (!ghmm_cluster_out (&cl, sq, outfile, out_filename)) { GHMM_LOG_QUEUED(LCONVERTED); goto STOP; } res = 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ /* ...noch div. free! */ if (outfile) fclose (outfile); return (res);# undef CUR_PROC} /* ghmm_cluster_hmm *//*============================================================================*/int ghmm_cluster_update (cluster_t * cl, ghmm_dseq * sq){#define CUR_PROC "ghmm_cluster_update" int i, res = -1; long *seq_counter; ghmm_dseq *seq_t; ARRAY_CALLOC (seq_counter, cl->mo_number); /* Fix the number of associated sequences */ for (i = 0; i < sq->seq_number; i++) seq_counter[sq->seq_label[i]]++; /* allocate memory block for block */ for (i = 0; i < cl->mo_number; i++) { if (cl->mo_seq[i]) { /* Important: No ghmm_dseq_free here, otherwise are the original sequences gone! */ ghmm_dseq_clean (cl->mo_seq[i]); m_free (cl->mo_seq[i]); } cl->mo_seq[i] = ghmm_dseq_calloc (seq_counter[i]); cl->mo_seq[i]->seq_number = 0; /* Counted later */ } /* Set entries */ for (i = 0; i < sq->seq_number; i++) { seq_t = cl->mo_seq[sq->seq_label[i]]; seq_t->seq_len[seq_t->seq_number] = sq->seq_len[i]; seq_t->seq[seq_t->seq_number] = sq->seq[i]; /* Pointer!!! */ seq_t->seq_label[seq_t->seq_number] = sq->seq_label[i]; seq_t->seq_number++; } res = 0;STOP: /* Label STOP from ARRAY_[CM]ALLOC */ m_free (seq_counter); return (res);# undef CUR_PROC} /* ghmm_cluster_update *//*============================================================================*/void ghmm_cluster_print_likelihood (FILE * outfile, cluster_t * cl){ double ges_prob = 0.0, mo_prob; int i; for (i = 0; i < cl->mo_number; i++) { mo_prob = ghmm_dmodel_likelihood (cl->mo[i], cl->mo_seq[i]); ges_prob += mo_prob; fprintf (outfile, "mo %d (#Seq. %ld): %.4f\n", i, cl->mo_seq[i]->seq_number, mo_prob); } fprintf (outfile, "Summe: %.4f\n\n", ges_prob);} /* ghmm_cluster_print_likelihood *//*============================================================================*//* Prevents that empty models are sent out (no associated seqences) by associating a random sequence. Since it's possible to produce an empty model in this way, we swap the sequences until a nonempty model is produced. (This could be a never-ending process and therefore it's only done 100 times). */int ghmm_cluster_avoid_empty_model (long *seq_label, long seq_number, int mo_number){#define CUR_PROC "ghmm_cluster_avoid_empty_model" int i; long j = 0; long *counter; char error = 1, change = 0; int iter = 0; /* Initialization */ ARRAY_CALLOC (counter, mo_number); for (i = 0; i < mo_number; i++) counter[i] = 0; for (i = 0; i < seq_number; i++) counter[seq_label[i]]++; while (error && iter < 100) { iter++; error = 0; /* Do we have empty models ? */ for (i = 0; i < mo_number; i++) { if (counter[i] == 0) { change = 1; /* Choose a random sequence for an empty model */ j = m_int (GHMM_RNG_UNIFORM (RNG) * (seq_number - 1)); /* The orginal model loses one sequence */ printf ("!!\"avoid_empty_model\":Verschiebe Seq. %ld: %ld --> %d !!\n", j, seq_label[j], i); counter[seq_label[j]]--; counter[i] = 1; seq_label[j] = i; } } /* Is now everything OK ? */ if (change) { for (i = 0; i < mo_number; i++) { if (counter[i] <= 0) { error = 1; break; } } } } /* while */STOP: /* Label STOP from ARRAY_[CM]ALLOC */ if (error) return (-1); else return 0;#undef CUR_PROC} /* ghmm_cluster_avoid_empty_model *//*============================================================================*/int ghmm_cluster_out (cluster_t * cl, ghmm_dseq * sq, FILE * outfile, char *out_filename){#define CUR_PROC "ghmm_cluster_out" int res = -1; int i; ghmm_cseq *sqd = NULL; fprintf (outfile, "\nFinal Models:\n"); for (i = 0; i < cl->mo_number; i++) ghmm_dmodel_print (outfile, cl->mo[i]); res = 0; ghmm_cseq_free (&sqd); return (res);#undef CUR_PROC} /* ghmm_cluster_out *//*============================================================================*/long ghmm_cluster_update_label (long *oldlabel, long *seq_label, long seq_number){ long i, changes = 0; for (i = 0; i < seq_number; i++) if (oldlabel[i] != seq_label[i]) { changes++; oldlabel[i] = seq_label[i]; } return changes;} /* ghmm_cluster_update_label */#endif /* GHMM_OBSOLETE */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -