meme.c
来自「EM算法的改进」· C语言 代码 · 共 513 行 · 第 1/2 页
C
513 行
/* Do the reduction. */ reduce_across_models(model, dataset->alength); /*fprintf(stderr, "%d: past reduction\n", mpMyID()); fflush(stderr);*/#endif /* PARALLEL */ /* quit if model has too few sites */ if (model->nsites_dis < MINSITES) break; /*fprintf(stderr, "%d: past few sites\n", mpMyID()); fflush(stderr);*/ /* quit if model fails E-value test */ if (model->logev > log(dataset->evt)) break; /*fprintf(stderr, "%d: past E-value\n", mpMyID()); fflush(stderr);*/ /* Store the total number of EM's in the model. */ model->iter = iter; /* ERASE the site and starts */ /*fprintf(stderr, "%d: at erase\n", mpMyID()); fflush(stderr);*/ erase(dataset, model); /*fprintf(stderr, "%d: past erase\n", mpMyID()); fflush(stderr);*/ /* calculate negative model by doing one EM iteration on negative ex's */ if (neg_model) { /* copy motif model to negative model */ copy_model(model, neg_model, dataset->alength); /* get negative model */ neg_dataset->maxiter = 1; /* 1 iteration of em */ em(neg_model, neg_dataset); /* ERASE the site and starts */ erase(neg_dataset, neg_model); } /* print results */ /*fprintf(stderr, "%d: at print results\n", mpMyID()); fflush(stderr);*/ if (!no_print) print_results(dataset, neg_dataset, model, neg_model, candidates); /*fprintf(stderr, "%d: past print results\n", mpMyID()); fflush(stderr);*/ /* stop if out of time */ if (dataset->max_time && imotif<nmotifs) { /* considering another motif */ stop_time = myclock()/1E6; /* current time */#ifdef PARALLEL /* use stop_time from process 0 to avoid it stopping while others continue*/ /*fprintf(stderr, "%d: time broadcast size %d\n", mpMyID(),sizeof(stop_time)); fflush(stderr); */ mpBroadcast((void *)&stop_time, sizeof(stop_time), 0); /*fprintf(stderr, "%d: past time broadcast\n", mpMyID()); fflush(stderr); */#endif /* record time if this is motif 1 */ if (imotif == 1) m1time = stop_time; if ((dataset->max_time - stop_time) < m1time) { ++imotif; /* this motif OK */ break; } } /* check time */ /*fprintf(stderr, "%d: past check time\n", mpMyID()); fflush(stderr); */ } /* nmotifs loop */ --imotif; /* number of motifs found */ /* print the motif block diagrams using all the motifs */ if (!no_print && imotif) print_summary(model, dataset, los, imotif, pv, stdout); /* print reason for stopping */ printf("\n"); PSTARS; if (n_starts == 0) { printf("Stopped because couldn't find any more starting points for EM.\n"); } else if (imotif == nmotifs) { printf("Stopped because nmotifs = %d reached.\n", nmotifs); } else if (dataset->max_time && (dataset->max_time - stop_time) < m1time) { printf("Stopped because would probably run out of time (%.2f secs).\n", dataset->max_time); } else if (model->nsites_dis < MINSITES) { printf("Stopped because next motif has fewer than %d sites.\n", MINSITES); } else { printf("Stopped because motif E-value > %8.2e.\n", dataset->evt); } PSTARS; fflush(stdout);#ifdef UNIX if (!no_print) system("echo ''; echo CPU: `hostname`; echo '' ");#else if (!no_print) printf("\nCPU: unknown\n\n");#endif /* UNIX */ PSTARS; if (!NO_STATUS) fprintf(stderr, "\n");#ifdef PARALLEL mpFinalize();#endif return(0);} /* main *//**********************************************************************//* erase For all models: Reset the weights of the letters to probabilisticaly "erase" the ones which occur in sites already found.*//**********************************************************************/static void erase( DATASET *dataset, /* the dataset */ MODEL *model /* the model */){ int i, j, k; int n_samples = dataset->n_samples; /* number of sequences */ SAMPLE **samples = dataset->samples; /* the sequences */ int w = model->w; /* width of motif */ /* Set z from the maxima stored in the learned model. */ /*fprintf(stderr, "%d: at set_z\n", mpMyID()); fflush(stderr); */ set_z(model, dataset); /*fprintf(stderr, "%d: past set_z\n", mpMyID()); fflush(stderr); */ /* z_ij is taken as the probability of a site occurring at i,j. The probability of a position being in a site is taken as the maximum of the z_ij for sites containing (overlapping) it. w_ij is set to 1-max(z_ij) times its previous value which reflects the independence assumption among motifs. */ for (i=0; i<n_samples; i++) { /* sequence */ double *weights = samples[i]->weights; /* w_ij */ int lseq = samples[i]->length; /* seq length */ double *zi = samples[i]->z; /* z_ij */ /*if (mpMyID()==0)fprintf(stderr, "0: i loop %d lseq %d\n", i, lseq); fflush(stderr); */ if (lseq < w) continue; /* sample too short for motif */ for (j=0; j<lseq; j++) { /* position */ double max_z = 0.0; /*if (mpMyID()==0)fprintf(stderr, "0: j loop %d\n", j); fflush(stderr); */ /* find largest probability that site overlaps this position */ for (k=MAX(0,j-w+1); k<=j && k<lseq-w+1; k++) { /*if (mpMyID()==0)fprintf(stderr, "0: k loop %d\n", k); fflush(stderr); */ max_z = MAX(max_z, zi[k]); } max_z = MIN(1.0, max_z); /* fix roundoff errors */ /* update the probability that position not in a site */ weights[j] *= 1.0 - max_z; } } if (PRINT_W) print_wij(dataset);} /* erase *//**********************************************************************//* init_model Initialize a model from a starting point. Returns false if starting point was not valid.*//**********************************************************************/static BOOLEAN init_model( S_POINT *s_point, /* the starting point */ MODEL *model, /* the model to intialize */ DATASET *dataset, /* the dataset */ int imotif /* motif number */){ int w0; /* skip if no good starting points found for w0, nsites0 */ if (s_point->score == LITTLE) { return FALSE; } /* initialize the new motif */ strcpy(model->cons0, s_point->cons0); w0 = model->w = model->pw = s_point->w0; init_theta(model->theta, s_point->e_cons0, w0, dataset->map,dataset->alength); /* initialize lambda */ model->lambda = MIN(s_point->nsites0/wps(dataset, w0), 1); model->pal = dataset->pal; /* initialize prior estimate of number of sites */ model->psites = s_point->nsites0; if (PRINTALL) { printf("component %2d: lambda= %8.6f ps= %8.0f\n", 1, model->lambda, wps(dataset, w0)); print_theta(0, 2, model->nsites_dis, model->theta, model->w, 0, "", dataset, stdout); } /* init motif number */ model->imotif = imotif; model->iseq = s_point->iseq; model->ioff = s_point->ioff; return TRUE;} /* init_model *//**********************************************************************//* save_candidate Save the starting point and part of model if it is most significant model of its width so far: model->sig is smallest Returns true if the model is the best so far among all widths.*/ /**********************************************************************/static BOOLEAN save_candidate( MODEL *model, /* final model */ DATASET *dataset, /* the dataset */ S_POINT *s_point, /* starting point */ CANDIDATE *candidates, /* candidate for best model of width */ double best_sig /* best motif significance */){ int w = model->w; /* final motif w */ /* objective function value */ double sig = dataset->objfun==Pv ? model->logpv : model->logev; /* print the results for this w0, nsites0 and THETA */ if (PRINT_STARTS) { printf("\n(start) %3d %6.1f %.*s --> %s ", s_point->w0, s_point->nsites0, s_point->w0, s_point->cons0, model->cons); printf("w %3d nsites %4d sig %20.10g\n\n", w, model->nsites_dis, exp(sig)); fflush(stdout); } /* save the results if best so far for this width */ if (sig < candidates[w].sig) { candidates[w].s_point = s_point; candidates[w].w = w; candidates[w].pal = model->pal; candidates[w].invcomp = model->invcomp; candidates[w].lambda = model->lambda; strcpy(candidates[w].cons, model->cons); candidates[w].rel = model->rel; candidates[w].ll = model->ll; candidates[w].sig = sig; } return (sig < best_sig);} /* save candidates */
⌨️ 快捷键说明
复制代码Ctrl + C
搜索代码Ctrl + F
全屏模式F11
增大字号Ctrl + =
减小字号Ctrl + -
显示快捷键?