📄 scluster.c

📁 一个通用的隐性马尔可夫C代码库开发环境:C语言简要说明:这是一个通用的隐性马尔可夫C代码库
💻 C
📖 第 1 页 / 共 2 页
字号:
上一页 12

/*--------------------------------------------------------------------------*/
  res = 0;
STOP:
#if POUT == 0
  pthread_attr_destroy(&Attribute);
#endif /* POUT */
  /* XXX ...still div. free! XXX*/
  if (outfile) fclose(outfile);
  return(res);
# undef CUR_PROC
}/* scluster_hmm */

/*============================================================================*/

int scluster_t_free(scluster_t *scl){
	int i;
	#define CUR_PROC "scluster_t_free"
	mes_check_ptr(scl, return(-1));
    if( !scl ) return(0);
	
	for (i=0;i<=scl->smo_number;i++){
		smodel_free(&(scl->smo[i]));
		sequence_d_free(&(scl->smo_seq[i]));
	}
	printf("hier1\n");
	m_free(scl->smo);
	m_free(scl->smo_seq);
	
	m_free(scl->seq_counter);
	m_free(scl->smo_Z_MD);
	m_free(scl->smo_Z_MAW);
	return 0;
	# undef CUR_PROC
}

/*============================================================================*/
int scluster_out(scluster_t *cl, sequence_d_t *sqd, FILE *outfile,
		 char *argv[]) {
#define CUR_PROC "scluster_out"
  int res = -1, i;
  char filename[128];
  char *out_filename = argv[3];
  FILE *out_model = NULL;
  /*fprintf(outfile, "\nFinal Models:\n");
    for (i = 0; i < cl->smo_number; i++) {
    fprintf(outfile, "smodel[%d]:\n", i);
    smodel_print(outfile, cl->smo[i]);
    fprintf(outfile, "Sequences of smodel[%d] (# %ld, total weight %.0f)\n",
    i, cl->smo_seq[i]->seq_number, 
    cl->smo_seq[i]->total_w);
    
    //for (k = 0; k < sqd->seq_number; k++) {
    //if (sqd->seq_label[k] == i) {
    // fuer Wetterdaten: Numerierung von 1 - .. 
    //fprintf(outfile, "\t%4d\t|%.0f|\t", k+1, sqd->seq_w[k]);
    //vector_d_print(outfile, sqd->seq[k], sqd->seq_len[k]," "," ","");
    //}     
    //}
  
    fprintf(outfile, "(%ld sequences)\n\n", cl->seq_counter[i]);
    }
  */

  sprintf(filename, "%s.smo", out_filename);
  if(!(out_model = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}
  scluster_print_header(out_model, argv);
  for (i = 0; i < cl->smo_number; i++) {
    fprintf(out_model, "#trained smodel[%d]:\n", i);
    smodel_print(out_model, cl->smo[i]);
  }  
  fclose(out_model);

  /* Output from all sequences in HMM-format; different clusters
     make seperate lists */
  fclose(out_model); 
  sprintf(filename, "%s.sqd", out_filename);
  if(!(out_model = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}
  scluster_print_header(out_model, argv);
  for (i = 0; i < cl->smo_number; i++) {
    if (cl->smo_seq[i] != NULL) 
      sequence_d_print(out_model, cl->smo_seq[i], 0);  
  }
  /* Output from all sequences in one row with clusterlabel */
  /* sequence_d_print(out_model, sqd, 0); */

  /* Output: The number of sequences and sequence weights pro cluster,
     respektively (for the generator) */
  fclose(out_model); 
  sprintf(filename, "%s.numbers", out_filename);
  if(!(out_model = mes_fopen(filename, "wt"))) {mes_proc(); goto STOP;}
  scluster_print_header(out_model, argv);
  fprintf(out_model, "numbers = {\n");
  fprintf(out_model, 
	  "# Clusterung mit Gewichten --> in BS/10, sonst Anzahl Seqs.\n");
  /* Weights */
  if (cl->smo_seq[0]->total_w > cl->smo_seq[0]->seq_number) {
    for (i = 0; i < cl->smo_number-1; i++)
      fprintf(out_model, "%.0f,\n", 0.1 * cl->smo_seq[i]->total_w);
    fprintf(out_model, "%.0f;\n};", 0.1 * cl->smo_seq[cl->smo_number-1]->total_w);
  }
  /* Number */
  else {        
    for (i = 0; i < cl->smo_number-1; i++)
      fprintf(out_model, "%ld,\n", cl->seq_counter[i]);
    fprintf(out_model, "%ld;\n};", cl->seq_counter[cl->smo_number-1]);
  }
    

  res = 0;
STOP:
  if (out_model) fclose(out_model);    
  return(res);
#undef CUR_PROC
} /* scluster_out */


/*============================================================================*/
/* Memory for sequences for each model is allocated only once and not done with
   realloc for each sequence as before. */
int scluster_update(scluster_t *cl, sequence_d_t *sqd) {
#define CUR_PROC "scluster_update"
  int i;
  sequence_d_t *seq_ptr;
  /* Allocate memoery block by block */
  for (i = 0; i < cl->smo_number; i++) {
    if (cl->smo_seq[i]) {
      /* Important: No sequence_free here, because then the originals  will be lost. */
      sequence_d_clean(cl->smo_seq[i]);
      m_free(cl->smo_seq[i]);
    }
    if (cl->seq_counter[i] != 0) {
      cl->smo_seq[i] = sequence_d_calloc(cl->seq_counter[i]);
      cl->smo_seq[i]->seq_number = 0; /* counted below */
    }
    else
      cl->smo_seq[i] = NULL;
  }
  /* Set entries */
  for (i = 0; i < sqd->seq_number; i++) {
    seq_ptr = cl->smo_seq[sqd->seq_label[i]];
    seq_ptr->seq_len[seq_ptr->seq_number] = sqd->seq_len[i];
    seq_ptr->seq_id[seq_ptr->seq_number] = sqd->seq_id[i];
    seq_ptr->seq[seq_ptr->seq_number] = sqd->seq[i]; /* Pointer!!! */
    seq_ptr->seq_label[seq_ptr->seq_number] = sqd->seq_label[i];
    seq_ptr->seq_w[seq_ptr->seq_number] = sqd->seq_w[i];
    seq_ptr->seq_number++;
    seq_ptr->total_w += sqd->seq_w[i];
  }
  return(0);
# undef CUR_PROC
} /* scluster_update */

/*============================================================================*/
void scluster_print_likelihood(FILE *outfile, scluster_t *cl) {
  double  total_Z_MD = 0.0, total_Z_MAW = 0.0;
  int i;
  for (i = 0; i < cl->smo_number; i++) {
    total_Z_MD += cl->smo_Z_MD[i];
    total_Z_MAW += cl->smo_Z_MAW[i];
    fprintf(outfile, "smo %d\t(#Seq. %4ld):\t", i, cl->seq_counter[i]);
    if (CLASSIFY == 0)
      fprintf(outfile, "ZMD:%8.2f", cl->smo_Z_MD[i]);
    else
      fprintf(outfile, "ZMAW:%8.2f", cl->smo_Z_MAW[i]);
    fprintf(outfile, "\n");      
  }
  fprintf(outfile, "sum\t");
  if  (CLASSIFY == 0)
    fprintf(outfile, "ZMD: %12.5f", total_Z_MD);
  else
    fprintf(outfile, "ZMAW: %12.5f", total_Z_MAW);
  fprintf(outfile, "\n\n");      
  if (CLASSIFY == 0)
    printf("total error function (ZMD): %15.4f\n", total_Z_MD);
  else 
    printf("total error function (ZMAW): %15.4f\n", total_Z_MAW);
} /* scluster_print_likelihood */

/*============================================================================*/
/* Avoids empty models going out as outputs by assigning them a random 
   sequence. This may lead to a produce of a new empty model - therefore
   change out sequences until a non empty model is found. (Quit after 100 
   iterations to avoid a infinite loop). */
int scluster_avoid_empty_smodel(sequence_d_t *sqd, scluster_t *cl){
#define CUR_PROC "scluster_avoid_empty_smodel"
  int i, best_smo;
  long i_old, j = 0;
  char error = 1, change = 0;
  int iter = 0;
  double log_p_minus, log_p_plus;
  double *result = NULL;

  /* If only one Model or one sequencet: Nothing to be done */
  if (sqd->seq_number < 2 || cl->smo_number < 2)
    return(0);
  if (CLASSIFY == 1)
    if(!m_calloc(result, cl->smo_number)) {mes_proc(); goto STOP;}

  while (error && iter < 100) {
    iter++;
    error = 0;
    for (i = 0; i < cl->smo_number; i++) {
      if (cl->seq_counter[i] <= 1) {
	change = 1;
	/* Choose a random sequence for an empty model */
	j = m_int(gsl_rng_uniform(RNG) * (sqd->seq_number - 1));
	/* If it's not possible to produce a sequence for an empty model: Quit */
	if (sfoba_logp(cl->smo[i], sqd->seq[j], sqd->seq_len[j],
		       &log_p_plus) == -1) 
	  continue;
	if (CLASSIFY == 1) {
	  best_smo = smap_bayes(cl->smo, result, cl->smo_number, sqd->seq[j],
				sqd->seq_len[j]);
	  if (best_smo == -1) continue;
	}
	i_old = sqd->seq_label[j];
	/* updates  */
	printf("!!\"avoid_empty_model\": move seq. %ld: %ld --> %d !!\n", 
	       j, i_old, i);
	cl->seq_counter[i_old]--;
	cl->seq_counter[i]++;
	sqd->seq_label[j] = i;
	/* smo_Z_MD update */
	if (sfoba_logp(cl->smo[i_old], sqd->seq[j], sqd->seq_len[j],
		       &log_p_minus) == -1) 
	  { mes_prot("sfoba_logp returns -1!\n"); goto STOP; };
	cl->smo_Z_MD[i_old] -= sqd->seq_w[j] * log_p_minus;
	cl->smo_Z_MD[i] += sqd->seq_w[j] * log_p_plus;
	/* smo_Z_MAW update */
	if (CLASSIFY == 1) {
	  cl->smo_Z_MAW[i_old] -= sqd->seq_w[j] * result[i_old];
	  cl->smo_Z_MAW[i] += sqd->seq_w[j] * result[i];
	}
      }
    }
    /* Is now everything OK ? */
    if (change) {
      for (i = 0; i < cl->smo_number; i++) {
	if (cl->seq_counter[i] <= 1) {
	  error = 1;
	  break;
	}
      }
    }
  } /* while */
  if (!error) return (0);

STOP: 
  if (result) m_free(result);
  return(-1);
#undef CUR_PROC
} /* scluster_avoid_empty_smodel */

/*============================================================================*/
long scluster_update_label(long *oldlabel, long *seq_label, long seq_number,
			   long *smo_changed) {
  long i, changes = 0; 
  for (i = 0; i < seq_number; i++) 
    if (oldlabel[i] != seq_label[i]) {
      changes++;
      smo_changed[oldlabel[i]] = smo_changed[seq_label[i]] = 1;
      oldlabel[i] = seq_label[i];      
    }
  return changes;
} /* scluster_update_label */

/*============================================================================*/

/* Determines from an already calculated probability matrix, which model 
   fits best to the sequence with the ID seq_id. */
int scluster_best_model(scluster_t *cl, long seq_id, double **all_log_p, double *log_p) {
#define CUR_PROC "scluster_best_model"
  int i, smo_id;
  double save = -DBL_MAX;
  
  *log_p = -DBL_MAX;
  smo_id = -1;
  /* MD-Classify: argmax_i (log(p(O | \lambda_i ))) */
  if (CLASSIFY == 0) {
    for (i = 0; i < cl->smo_number; i++) {
      if (all_log_p[i][seq_id] == PENALTY_LOGP) continue; /* model and seq. don't fit */
      if (*log_p < all_log_p[i][seq_id]) {
	*log_p = all_log_p[i][seq_id];
	smo_id = i;
      }  
    }
  }
  /* MAW-Classify: argmax_i (log(p(O | \lambda_i )) + log(p( \lambda_i))) */
  else {
    for (i = 0; i < cl->smo_number; i++) {
      if (cl->smo[i]->prior == -1) {
	mes_prot("Error! Need model prior for MAW-classification\n");
      }
      if (cl->smo[i]->prior != 0) {
	if (all_log_p[i][seq_id] == PENALTY_LOGP) continue; /* model and seq. don't fit */
	if (save < all_log_p[i][seq_id] + log(cl->smo[i]->prior)) {
	  save = all_log_p[i][seq_id] + log(cl->smo[i]->prior);
	  *log_p = all_log_p[i][seq_id];
	  smo_id = i;
	} 
      }
    } 
  }


  return (smo_id);
#undef CUR_PROC
} /* scluster_best_model */

/*============================================================================*/

void scluster_prob(smosqd_t *cs) {
  int i;
  
  //printf("seq_num = %d\n", cs->sqd->seq_number);
  
  for (i = 0; i < cs->sqd->seq_number; i++)
    if (sfoba_logp(cs->smo, cs->sqd->seq[i], cs->sqd->seq_len[i], &(cs->logp[i])) == -1)
      cs->logp[i] = (double) PENALTY_LOGP;  /*  Penalty costs */
} /* scluster_prob */

/*============================================================================*/
int scluster_random_labels(sequence_d_t *sqd, int smo_number) {
#define CUR_PROC "scluster_random_labels"
  int j, label;
  for (j = 0; j < sqd->seq_number; j++) {
    label = m_int(gsl_rng_uniform(RNG) * (smo_number - 1));
    sqd->seq_label[j] = label;
  }
  return(0);
# undef CUR_PROC
} /* scluster_random_labels */

/*============================================================================*/

int  scluster_log_aposteriori(scluster_t *cl, sequence_d_t *sqd, int seq_id, 
			      double *log_apo) {
#define CUR_PROC "scluster_log_aposteriori"
  double *result = NULL;
  int best_smo = -1;
  if(!m_calloc(result, cl->smo_number)) {mes_proc(); goto STOP;}
  best_smo = smap_bayes(cl->smo, result, cl->smo_number, sqd->seq[seq_id],
			sqd->seq_len[seq_id]);
  if (best_smo == -1) {
    mes_prot("best_smo == -1 !\n");
    goto STOP;
  } 
  else 
    *log_apo = result[best_smo]; 
  /* TEST */
  /*  printf("result: ");
      for (i = 0; i< cl->smo_number; i++)
      printf("%.2f ", result[i]); printf("\n");
  */
STOP:
  m_free(result);
  return best_smo;
# undef CUR_PROC
} /* scluster_log_aposteriori */

/*============================================================================*/

void scluster_print_header(FILE *file, char* argv[]) {
  time_t zeit;
  time(&zeit);
  fprintf(file, "# Date: %s# scluster:\n", ctime((const time_t*)&zeit));
  fprintf(file, "# Sequence File: %s\n", argv[1]);
  fprintf(file, "# Model File: %s\n", argv[2]);
  fprintf(file, "# Start Partion Label: ");
  switch(atoi(argv[4])) {
  case 0: fprintf(file, "SP_BEST (best model)\n\n"); break;
  case 1: fprintf(file, "NO_SP (no start partition)\n\n"); break;
  case 2: fprintf(file, "SP_KM (partition from k-means)\n\n"); break;
  case 3: fprintf(file, "SP_ZUF (random start partition)\n\n"); break;
  default: fprintf(file, "???\n\n");
  }
}

#if POUT == 0
#undef THREADS
#endif /* HAVE_LIBPTHREAD */
#undef POUT
上一页 12
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -