📄 scluster.c
字号:
/*--------------------------------------------------------------------------*/
res = 0;
STOP:
#if POUT == 0
pthread_attr_destroy(&Attribute);
#endif /* POUT */
/* XXX ...still div. free! XXX*/
if (outfile) fclose(outfile);
return(res);
# undef CUR_PROC
}/* scluster_hmm */
/*============================================================================*/
int scluster_t_free(scluster_t *scl){
int i;
#define CUR_PROC "scluster_t_free"
mes_check_ptr(scl, return(-1));
if( !scl ) return(0);
for (i=0;i<=scl->smo_number;i++){
smodel_free(&(scl->smo[i]));
sequence_d_free(&(scl->smo_seq[i]));
}
printf("hier1\n");
m_free(scl->smo);
m_free(scl->smo_seq);
m_free(scl->seq_counter);
m_free(scl->smo_Z_MD);
m_free(scl->smo_Z_MAW);
return 0;
# undef CUR_PROC
}
/*============================================================================*/
int scluster_out(scluster_t *cl, sequence_d_t *sqd, FILE *outfile,
char *argv[]) {
#define CUR_PROC "scluster_out"
int res = -1, i;
char filename[128];
char *out_filename = argv[3];
FILE *out_model = NULL;
/*fprintf(outfile, "\nFinal Models:\n");
for (i = 0; i < cl->smo_number; i++) {
fprintf(outfile, "smodel[%d]:\n", i);
smodel_print(outfile, cl->smo[i]);
fprintf(outfile, "Sequences of smodel[%d] (# %ld, total weight %.0f)\n",
i, cl->smo_seq[i]->seq_number,
cl->smo_seq[i]->total_w);
//for (k = 0; k < sqd->seq_number; k++) {
//if (sqd->seq_label[k] == i) {
// fuer Wetterdaten: Numerierung von 1 - ..
//fprintf(outfile, "\t%4d\t|%.0f|\t", k+1, sqd->seq_w[k]);
//vector_d_print(outfile, sqd->seq[k], sqd->seq_len[k]," "," ","");
//}
//}
fprintf(outfile, "(%ld sequences)\n\n", cl->seq_counter[i]);
}
*/
sprintf(filename, "%s.smo", out_filename);
if(!(out_model = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}
scluster_print_header(out_model, argv);
for (i = 0; i < cl->smo_number; i++) {
fprintf(out_model, "#trained smodel[%d]:\n", i);
smodel_print(out_model, cl->smo[i]);
}
fclose(out_model);
/* Output from all sequences in HMM-format; different clusters
make seperate lists */
fclose(out_model);
sprintf(filename, "%s.sqd", out_filename);
if(!(out_model = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}
scluster_print_header(out_model, argv);
for (i = 0; i < cl->smo_number; i++) {
if (cl->smo_seq[i] != NULL)
sequence_d_print(out_model, cl->smo_seq[i], 0);
}
/* Output from all sequences in one row with clusterlabel */
/* sequence_d_print(out_model, sqd, 0); */
/* Output: The number of sequences and sequence weights pro cluster,
respektively (for the generator) */
fclose(out_model);
sprintf(filename, "%s.numbers", out_filename);
if(!(out_model = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}
scluster_print_header(out_model, argv);
fprintf(out_model, "numbers = {\n");
fprintf(out_model,
"# Clusterung mit Gewichten --> in BS/10, sonst Anzahl Seqs.\n");
/* Weights */
if (cl->smo_seq[0]->total_w > cl->smo_seq[0]->seq_number) {
for (i = 0; i < cl->smo_number-1; i++)
fprintf(out_model, "%.0f,\n", 0.1 * cl->smo_seq[i]->total_w);
fprintf(out_model, "%.0f;\n};", 0.1 * cl->smo_seq[cl->smo_number-1]->total_w);
}
/* Number */
else {
for (i = 0; i < cl->smo_number-1; i++)
fprintf(out_model, "%ld,\n", cl->seq_counter[i]);
fprintf(out_model, "%ld;\n};", cl->seq_counter[cl->smo_number-1]);
}
res = 0;
STOP:
if (out_model) fclose(out_model);
return(res);
#undef CUR_PROC
} /* scluster_out */
/*============================================================================*/
/* Memory for sequences for each model is allocated only once and not done with
realloc for each sequence as before. */
int scluster_update(scluster_t *cl, sequence_d_t *sqd) {
#define CUR_PROC "scluster_update"
int i;
sequence_d_t *seq_ptr;
/* Allocate memoery block by block */
for (i = 0; i < cl->smo_number; i++) {
if (cl->smo_seq[i]) {
/* Important: No sequence_free here, because then the originals will be lost. */
sequence_d_clean(cl->smo_seq[i]);
m_free(cl->smo_seq[i]);
}
if (cl->seq_counter[i] != 0) {
cl->smo_seq[i] = sequence_d_calloc(cl->seq_counter[i]);
cl->smo_seq[i]->seq_number = 0; /* counted below */
}
else
cl->smo_seq[i] = NULL;
}
/* Set entries */
for (i = 0; i < sqd->seq_number; i++) {
seq_ptr = cl->smo_seq[sqd->seq_label[i]];
seq_ptr->seq_len[seq_ptr->seq_number] = sqd->seq_len[i];
seq_ptr->seq_id[seq_ptr->seq_number] = sqd->seq_id[i];
seq_ptr->seq[seq_ptr->seq_number] = sqd->seq[i]; /* Pointer!!! */
seq_ptr->seq_label[seq_ptr->seq_number] = sqd->seq_label[i];
seq_ptr->seq_w[seq_ptr->seq_number] = sqd->seq_w[i];
seq_ptr->seq_number++;
seq_ptr->total_w += sqd->seq_w[i];
}
return(0);
# undef CUR_PROC
} /* scluster_update */
/*============================================================================*/
void scluster_print_likelihood(FILE *outfile, scluster_t *cl) {
double total_Z_MD = 0.0, total_Z_MAW = 0.0;
int i;
for (i = 0; i < cl->smo_number; i++) {
total_Z_MD += cl->smo_Z_MD[i];
total_Z_MAW += cl->smo_Z_MAW[i];
fprintf(outfile, "smo %d\t(#Seq. %4ld):\t", i, cl->seq_counter[i]);
if (CLASSIFY == 0)
fprintf(outfile, "ZMD:%8.2f", cl->smo_Z_MD[i]);
else
fprintf(outfile, "ZMAW:%8.2f", cl->smo_Z_MAW[i]);
fprintf(outfile, "\n");
}
fprintf(outfile, "sum\t");
if (CLASSIFY == 0)
fprintf(outfile, "ZMD: %12.5f", total_Z_MD);
else
fprintf(outfile, "ZMAW: %12.5f", total_Z_MAW);
fprintf(outfile, "\n\n");
if (CLASSIFY == 0)
printf("total error function (ZMD): %15.4f\n", total_Z_MD);
else
printf("total error function (ZMAW): %15.4f\n", total_Z_MAW);
} /* scluster_print_likelihood */
/*============================================================================*/
/* Avoids empty models going out as outputs by assigning them a random
sequence. This may lead to a produce of a new empty model - therefore
change out sequences until a non empty model is found. (Quit after 100
iterations to avoid a infinite loop). */
int scluster_avoid_empty_smodel(sequence_d_t *sqd, scluster_t *cl){
#define CUR_PROC "scluster_avoid_empty_smodel"
int i, best_smo;
long i_old, j = 0;
char error = 1, change = 0;
int iter = 0;
double log_p_minus, log_p_plus;
double *result = NULL;
/* If only one Model or one sequencet: Nothing to be done */
if (sqd->seq_number < 2 || cl->smo_number < 2)
return(0);
if (CLASSIFY == 1)
if(!m_calloc(result, cl->smo_number)) {mes_proc(); goto STOP;}
while (error && iter < 100) {
iter++;
error = 0;
for (i = 0; i < cl->smo_number; i++) {
if (cl->seq_counter[i] <= 1) {
change = 1;
/* Choose a random sequence for an empty model */
j = m_int(gsl_rng_uniform(RNG) * (sqd->seq_number - 1));
/* If it's not possible to produce a sequence for an empty model: Quit */
if (sfoba_logp(cl->smo[i], sqd->seq[j], sqd->seq_len[j],
&log_p_plus) == -1)
continue;
if (CLASSIFY == 1) {
best_smo = smap_bayes(cl->smo, result, cl->smo_number, sqd->seq[j],
sqd->seq_len[j]);
if (best_smo == -1) continue;
}
i_old = sqd->seq_label[j];
/* updates */
printf("!!\"avoid_empty_model\": move seq. %ld: %ld --> %d !!\n",
j, i_old, i);
cl->seq_counter[i_old]--;
cl->seq_counter[i]++;
sqd->seq_label[j] = i;
/* smo_Z_MD update */
if (sfoba_logp(cl->smo[i_old], sqd->seq[j], sqd->seq_len[j],
&log_p_minus) == -1)
{ mes_prot("sfoba_logp returns -1!\n"); goto STOP; };
cl->smo_Z_MD[i_old] -= sqd->seq_w[j] * log_p_minus;
cl->smo_Z_MD[i] += sqd->seq_w[j] * log_p_plus;
/* smo_Z_MAW update */
if (CLASSIFY == 1) {
cl->smo_Z_MAW[i_old] -= sqd->seq_w[j] * result[i_old];
cl->smo_Z_MAW[i] += sqd->seq_w[j] * result[i];
}
}
}
/* Is now everything OK ? */
if (change) {
for (i = 0; i < cl->smo_number; i++) {
if (cl->seq_counter[i] <= 1) {
error = 1;
break;
}
}
}
} /* while */
if (!error) return (0);
STOP:
if (result) m_free(result);
return(-1);
#undef CUR_PROC
} /* scluster_avoid_empty_smodel */
/*============================================================================*/
long scluster_update_label(long *oldlabel, long *seq_label, long seq_number,
long *smo_changed) {
long i, changes = 0;
for (i = 0; i < seq_number; i++)
if (oldlabel[i] != seq_label[i]) {
changes++;
smo_changed[oldlabel[i]] = smo_changed[seq_label[i]] = 1;
oldlabel[i] = seq_label[i];
}
return changes;
} /* scluster_update_label */
/*============================================================================*/
/* Determines from an already calculated probability matrix, which model
fits best to the sequence with the ID seq_id. */
int scluster_best_model(scluster_t *cl, long seq_id, double **all_log_p, double *log_p) {
#define CUR_PROC "scluster_best_model"
int i, smo_id;
double save = -DBL_MAX;
*log_p = -DBL_MAX;
smo_id = -1;
/* MD-Classify: argmax_i (log(p(O | \lambda_i ))) */
if (CLASSIFY == 0) {
for (i = 0; i < cl->smo_number; i++) {
if (all_log_p[i][seq_id] == PENALTY_LOGP) continue; /* model and seq. don't fit */
if (*log_p < all_log_p[i][seq_id]) {
*log_p = all_log_p[i][seq_id];
smo_id = i;
}
}
}
/* MAW-Classify: argmax_i (log(p(O | \lambda_i )) + log(p( \lambda_i))) */
else {
for (i = 0; i < cl->smo_number; i++) {
if (cl->smo[i]->prior == -1) {
mes_prot("Error! Need model prior for MAW-classification\n");
}
if (cl->smo[i]->prior != 0) {
if (all_log_p[i][seq_id] == PENALTY_LOGP) continue; /* model and seq. don't fit */
if (save < all_log_p[i][seq_id] + log(cl->smo[i]->prior)) {
save = all_log_p[i][seq_id] + log(cl->smo[i]->prior);
*log_p = all_log_p[i][seq_id];
smo_id = i;
}
}
}
}
return (smo_id);
#undef CUR_PROC
} /* scluster_best_model */
/*============================================================================*/
void scluster_prob(smosqd_t *cs) {
int i;
//printf("seq_num = %d\n", cs->sqd->seq_number);
for (i = 0; i < cs->sqd->seq_number; i++)
if (sfoba_logp(cs->smo, cs->sqd->seq[i], cs->sqd->seq_len[i], &(cs->logp[i])) == -1)
cs->logp[i] = (double) PENALTY_LOGP; /* Penalty costs */
} /* scluster_prob */
/*============================================================================*/
int scluster_random_labels(sequence_d_t *sqd, int smo_number) {
#define CUR_PROC "scluster_random_labels"
int j, label;
for (j = 0; j < sqd->seq_number; j++) {
label = m_int(gsl_rng_uniform(RNG) * (smo_number - 1));
sqd->seq_label[j] = label;
}
return(0);
# undef CUR_PROC
} /* scluster_random_labels */
/*============================================================================*/
int scluster_log_aposteriori(scluster_t *cl, sequence_d_t *sqd, int seq_id,
double *log_apo) {
#define CUR_PROC "scluster_log_aposteriori"
double *result = NULL;
int best_smo = -1;
if(!m_calloc(result, cl->smo_number)) {mes_proc(); goto STOP;}
best_smo = smap_bayes(cl->smo, result, cl->smo_number, sqd->seq[seq_id],
sqd->seq_len[seq_id]);
if (best_smo == -1) {
mes_prot("best_smo == -1 !\n");
goto STOP;
}
else
*log_apo = result[best_smo];
/* TEST */
/* printf("result: ");
for (i = 0; i< cl->smo_number; i++)
printf("%.2f ", result[i]); printf("\n");
*/
STOP:
m_free(result);
return best_smo;
# undef CUR_PROC
} /* scluster_log_aposteriori */
/*============================================================================*/
void scluster_print_header(FILE *file, char* argv[]) {
time_t zeit;
time(&zeit);
fprintf(file, "# Date: %s# scluster:\n", ctime((const time_t*)&zeit));
fprintf(file, "# Sequence File: %s\n", argv[1]);
fprintf(file, "# Model File: %s\n", argv[2]);
fprintf(file, "# Start Partion Label: ");
switch(atoi(argv[4])) {
case 0: fprintf(file, "SP_BEST (best model)\n\n"); break;
case 1: fprintf(file, "NO_SP (no start partition)\n\n"); break;
case 2: fprintf(file, "SP_KM (partition from k-means)\n\n"); break;
case 3: fprintf(file, "SP_ZUF (random start partition)\n\n"); break;
default: fprintf(file, "???\n\n");
}
}
#if POUT == 0
#undef THREADS
#endif /* HAVE_LIBPTHREAD */
#undef POUT
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -