📄 malign.c
字号:
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#include "clustalw.h"
/*
* Prototypes
*/
/*
* Global Variables
*/
extern double **tmat;
extern Boolean no_weights;
extern sint debug;
extern sint max_aa;
extern sint nseqs;
extern sint profile1_nseqs;
extern sint nsets;
extern sint **sets;
extern sint divergence_cutoff;
extern sint *seq_weight;
extern sint output_order, *output_index;
extern Boolean distance_tree;
extern char seqname[];
extern sint *seqlen_array;
extern char **seq_array;
sint malign(sint istart,char *phylip_name) /* full progressive alignment*/
{
static sint *aligned;
static sint *group;
static sint ix;
sint *maxid, max, sum;
sint *tree_weight;
sint i,j,set,iseq=0;
sint status,entries;
lint score = 0;
info("Start of Multiple Alignment");
/* get the phylogenetic tree from *.ph */
if (nseqs >= 2)
{
status = read_tree(phylip_name, (sint)0, nseqs);
if (status == 0) return((sint)0);
}
/* calculate sequence weights according to branch lengths of the tree -
weights in global variable seq_weight normalised to sum to 100 */
calc_seq_weights((sint)0, nseqs, seq_weight);
/* recalculate tmat matrix as percent similarity matrix */
status = calc_similarities(nseqs);
if (status == 0) return((sint)0);
/* for each sequence, find the most closely related sequence */
maxid = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
for (i=1;i<=nseqs;i++)
{
maxid[i] = -1;
for (j=1;j<=nseqs;j++)
if (j!=i && maxid[i] < tmat[i][j]) maxid[i] = tmat[i][j];
}
/* group the sequences according to their relative divergence */
if (istart == 0)
{
sets = (sint **) ckalloc( (nseqs+1) * sizeof (sint *) );
for(i=0;i<=nseqs;i++)
sets[i] = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
create_sets((sint)0,nseqs);
info("There are %d groups",(pint)nsets);
/* clear the memory used for the phylogenetic tree */
if (nseqs >= 2)
clear_tree(NULL);
/* start the multiple alignments......... */
info("Aligning...");
/* first pass, align closely related sequences first.... */
ix = 0;
aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
for (i=0;i<=nseqs;i++) aligned[i] = 0;
for(set=1;set<=nsets;++set)
{
entries=0;
for (i=1;i<=nseqs;i++)
{
if ((sets[set][i] != 0) && (maxid[i] > divergence_cutoff))
{
entries++;
if (aligned[i] == 0)
{
if (output_order==INPUT)
{
++ix;
output_index[i] = i;
}
else output_index[++ix] = i;
aligned[i] = 1;
}
}
}
if(entries > 0) score = prfalign(sets[set], aligned);
else score=0.0;
/* negative score means fatal error... exit now! */
if (score < 0)
{
return(-1);
}
if ((entries > 0) && (score > 0))
info("Group %d: Sequences:%4d Score:%d",
(pint)set,(pint)entries,(pint)score);
else
info("Group %d: Delayed",
(pint)set);
}
for (i=0;i<=nseqs;i++)
sets[i]=ckfree((void *)sets[i]);
sets=ckfree(sets);
}
else
{
/* clear the memory used for the phylogenetic tree */
if (nseqs >= 2)
clear_tree(NULL);
aligned = (sint *)ckalloc( (nseqs+1) * sizeof (sint) );
ix = 0;
for (i=1;i<=istart+1;i++)
{
aligned[i] = 1;
++ix;
output_index[i] = i;
}
for (i=istart+2;i<=nseqs;i++) aligned[i] = 0;
}
/* second pass - align remaining, more divergent sequences..... */
/* if not all sequences were aligned, for each unaligned sequence,
find it's closest pair amongst the aligned sequences. */
group = (sint *)ckalloc( (nseqs+1) * sizeof (sint));
tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
for (i=0;i<nseqs;i++)
tree_weight[i] = seq_weight[i];
/* if we haven't aligned any sequences, in the first pass - align the
two most closely related sequences now */
if(ix==0)
{
max = -1;
iseq = 0;
for (i=1;i<=nseqs;i++)
{
for (j=i+1;j<=nseqs;j++)
{
if (max < tmat[i][j])
{
max = tmat[i][j];
iseq = i;
}
}
}
aligned[iseq]=1;
if (output_order == INPUT)
{
++ix;
output_index[iseq] = iseq;
}
else
output_index[++ix] = iseq;
}
while (ix < nseqs)
{
for (i=1;i<=nseqs;i++) {
if (aligned[i] == 0)
{
maxid[i] = -1;
for (j=1;j<=nseqs;j++)
if ((maxid[i] < tmat[i][j]) && (aligned[j] != 0))
maxid[i] = tmat[i][j];
}
}
/* find the most closely related sequence to those already aligned */
max = -1;
iseq = 0;
for (i=1;i<=nseqs;i++)
{
if ((aligned[i] == 0) && (maxid[i] > max))
{
max = maxid[i];
iseq = i;
}
}
/* align this sequence to the existing alignment */
/* weight sequences with percent identity with profile*/
/* OR...., multiply sequence weights from tree by percent identity with new sequence */
if(no_weights==FALSE) {
for (j=0;j<nseqs;j++)
if (aligned[j+1] != 0)
seq_weight[j] = tree_weight[j] * tmat[j+1][iseq];
/*
Normalise the weights, such that the sum of the weights = INT_SCALE_FACTOR
*/
sum = 0;
for (j=0;j<nseqs;j++)
if (aligned[j+1] != 0)
sum += seq_weight[j];
if (sum == 0)
{
for (j=0;j<nseqs;j++)
seq_weight[j] = 1;
sum = j;
}
for (j=0;j<nseqs;j++)
if (aligned[j+1] != 0)
{
seq_weight[j] = (seq_weight[j] * INT_SCALE_FACTOR) / sum;
if (seq_weight[j] < 1) seq_weight[j] = 1;
}
}
entries = 0;
for (j=1;j<=nseqs;j++)
if (aligned[j] != 0)
{
group[j] = 1;
entries++;
}
else if (iseq==j)
{
group[j] = 2;
entries++;
}
aligned[iseq] = 1;
score = prfalign(group, aligned);
info("Sequence:%d Score:%d",(pint)iseq,(pint)score);
if (output_order == INPUT)
{
++ix;
output_index[iseq] = iseq;
}
else
output_index[++ix] = iseq;
}
group=ckfree((void *)group);
aligned=ckfree((void *)aligned);
maxid=ckfree((void *)maxid);
tree_weight=ckfree((void *)tree_weight);
aln_score();
/* make the rest (output stuff) into routine clustal_out in file amenu.c */
return(nseqs);
}
sint seqalign(sint istart,char *phylip_name) /* sequence alignment to existing profile */
{
static sint *aligned, *tree_weight;
static sint *group;
static sint ix;
sint *maxid, max;
sint i,j,status,iseq;
sint sum,entries;
lint score = 0;
info("Start of Multiple Alignment");
/* get the phylogenetic tree from *.ph */
if (nseqs >= 2)
{
status = read_tree(phylip_name, (sint)0, nseqs);
if (status == 0) return(0);
}
/* calculate sequence weights according to branch lengths of the tree -
weights in global variable seq_weight normalised to sum to 100 */
calc_seq_weights((sint)0, nseqs, seq_weight);
tree_weight = (sint *) ckalloc( (nseqs) * sizeof(sint) );
for (i=0;i<nseqs;i++)
tree_weight[i] = seq_weight[i];
/* recalculate tmat matrix as percent similarity matrix */
status = calc_similarities(nseqs);
if (status == 0) return((sint)0);
/* for each sequence, find the most closely related sequence */
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -