📄 alignio.c
字号:
* aseq1, aseq2 - alignments to write (not necessarily * flushed right with gaps) * name1, name2 - names of sequences * spos1, spos2 - starting position in each (raw) sequence * pam - PAM matrix; positive values define * conservative changes * indent - how many extra spaces to print on left * * Return: 1 on success, 0 on failure */intWritePairwiseAlignment(FILE *ofp, char *aseq1, char *name1, int spos1, char *aseq2, char *name2, int spos2, int **pam, int indent){ char sname1[11]; /* shortened name */ char sname2[11]; int still_going; /* True if writing another block */ char buf1[61]; /* buffer for writing seq1; CPL+1*/ char bufmid[61]; /* buffer for writing consensus */ char buf2[61]; char *s1, *s2; /* ptrs into each sequence */ int count1, count2; /* number of symbols we're writing */ int rpos1, rpos2; /* position in raw seqs */ int rawcount1, rawcount2; /* number of nongap symbols written */ int apos; strncpy(sname1, name1, 10); sname1[10] = '\0'; strtok(sname1, WHITESPACE); strncpy(sname2, name2, 10); sname2[10] = '\0'; strtok(sname2, WHITESPACE); s1 = aseq1; s2 = aseq2; rpos1 = spos1; rpos2 = spos2; still_going = TRUE; while (still_going) { still_going = FALSE; /* get next line's worth from both */ strncpy(buf1, s1, 60); buf1[60] = '\0'; strncpy(buf2, s2, 60); buf2[60] = '\0'; count1 = strlen(buf1); count2 = strlen(buf2); /* is there still more to go? */ if ((count1 == 60 && s1[60] != '\0') || (count2 == 60 && s2[60] != '\0')) still_going = TRUE; /* shift seq ptrs by a line */ s1 += count1; s2 += count2; /* assemble the consensus line */ for (apos = 0; apos < count1 && apos < count2; apos++) { if (!isgap(buf1[apos]) && !isgap(buf2[apos])) { if (buf1[apos] == buf2[apos]) bufmid[apos] = buf1[apos]; else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0) bufmid[apos] = '+'; else bufmid[apos] = ' '; } else bufmid[apos] = ' '; } bufmid[apos] = '\0'; rawcount1 = 0; for (apos = 0; apos < count1; apos++) if (!isgap(buf1[apos])) rawcount1++; rawcount2 = 0; for (apos = 0; apos < count2; apos++) if (!isgap(buf2[apos])) rawcount2++; (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", sname1, rpos1, buf1, rpos1 + rawcount1 -1); (void) fprintf(ofp, "%*s %s\n", indent, "", bufmid); (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "", sname2, rpos2, buf2, rpos2 + rawcount2 -1); (void) fprintf(ofp, "\n"); rpos1 += rawcount1; rpos2 += rawcount2; } return 1;}/* Function: MingapAlignment() * * Purpose: Remove all-gap columns from a multiple sequence alignment * and its associated data. The alignment is assumed to be * flushed (all aseqs the same length). */intMingapAlignment(char **aseqs, AINFO *ainfo){ int apos; /* position in original alignment */ int mpos; /* position in new alignment */ int idx; /* We overwrite aseqs, using its allocated memory. */ for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++) { /* check for all-gap in column */ for (idx = 0; idx < ainfo->nseq; idx++) if (! isgap(aseqs[idx][apos])) break; if (idx == ainfo->nseq) continue; /* shift alignment and ainfo */ if (mpos != apos) { for (idx = 0; idx < ainfo->nseq; idx++) aseqs[idx][mpos] = aseqs[idx][apos]; if (ainfo->cs != NULL) ainfo->cs[mpos] = ainfo->cs[apos]; if (ainfo->rf != NULL) ainfo->rf[mpos] = ainfo->rf[apos]; } mpos++; } /* null terminate everything */ for (idx = 0; idx < ainfo->nseq; idx++) aseqs[idx][mpos] = '\0'; ainfo->alen = mpos; /* set new length */ if (ainfo->cs != NULL) ainfo->cs[mpos] = '\0'; if (ainfo->rf != NULL) ainfo->rf[mpos] = '\0'; return 1;}/* Function: RandomAlignment() * * Purpose: Create a random alignment from raw sequences. * * Ideally, we would like to sample an alignment from the * space of possible alignments according to its probability, * given a prior probability distribution for alignments. * I don't see how to describe such a distribution, let alone * sample it. * * This is a rough approximation that tries to capture some * desired properties. We assume the alignment is generated * by a simple HMM composed of match and insert states. * Given parameters (pop, pex) for the probability of opening * and extending an insertion, we can find the expected number * of match states, M, in the underlying model for each sequence. * We use an average M taken over all the sequences (this is * an approximation. The expectation of M given all the sequence * lengths is a nasty-looking summation.) * * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) ) * * Then, we assign positions in each raw sequence onto the M match * states and M+1 insert states of this "HMM", by rolling random * numbers and inserting the (rlen-M) inserted positions randomly * into the insert slots, taking into account the relative probability * of open vs. extend. * * The resulting alignment has two desired properties: insertions * tend to follow the HMM-like exponential distribution, and * the "sparseness" of the alignment is controllable through * pop and pex. * * Args: rseqs - raw sequences to "align", 0..nseq-1 * sqinfo - array of 0..nseq-1 info structures for the sequences * nseq - number of sequences * pop - probability to open insertion (0<pop<1) * pex - probability to extend insertion (0<pex<1) * ret_aseqs - RETURN: alignment (flushed) * ainfo - fill in: alignment info * * Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause * of failure. */ intRandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex, char ***ret_aseqs, AINFO *ainfo){ char **aseqs; /* RETURN: alignment */ int alen; /* length of alignment */ int *rlen; /* lengths of each raw sequence */ int M; /* length of "model" */ int **ins; /* insertion counts, 0..nseq-1 by 0..M */ int *master_ins; /* max insertion counts, 0..M */ int apos, rpos, idx; int statepos; int count; int minlen; /* calculate expected length of model, M */ rlen = (int *) MallocOrDie (sizeof(int) * nseq); M = 0; minlen = 9999999; for (idx = 0; idx < nseq; idx++) { rlen[idx] = strlen(rseqs[idx]); M += rlen[idx]; minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen; } M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex)))); M /= nseq; if (M > minlen) M = minlen; /* make arrays that count insertions in M+1 possible insert states */ ins = (int **) MallocOrDie (sizeof(int *) * nseq); master_ins = (int *) MallocOrDie (sizeof(int) * (M+1)); for (idx = 0; idx < nseq; idx++) { ins[idx] = (int *) MallocOrDie (sizeof(int) * (M+1)); for (rpos = 0; rpos <= M; rpos++) ins[idx][rpos] = 0; } /* normalize */ pop = pop / (pop+pex); pex = 1.0 - pop; /* make insertions for individual sequences */ for (idx = 0; idx < nseq; idx++) { apos = -1; for (rpos = 0; rpos < rlen[idx]-M; rpos++) { if (sre_random() < pop || apos == -1) /* open insertion */ apos = CHOOSE(M+1); /* choose 0..M */ ins[idx][apos]++; } } /* calculate master_ins, max inserts */ alen = M; for (apos = 0; apos <= M; apos++) { master_ins[apos] = 0; for (idx = 0; idx < nseq; idx++) if (ins[idx][apos] > master_ins[apos]) master_ins[apos] = ins[idx][apos]; alen += master_ins[apos]; } /* Now, construct alignment */ aseqs = (char **) MallocOrDie (sizeof (char *) * nseq); for (idx = 0; idx < nseq; idx++) aseqs[idx] = (char *) MallocOrDie (sizeof(char) * (alen+1)); for (idx = 0; idx < nseq; idx++) { apos = rpos = 0; for (statepos = 0; statepos <= M; statepos++) { for (count = 0; count < ins[idx][statepos]; count++) aseqs[idx][apos++] = rseqs[idx][rpos++]; for (; count < master_ins[statepos]; count++) aseqs[idx][apos++] = ' '; if (statepos != M) aseqs[idx][apos++] = rseqs[idx][rpos++]; } aseqs[idx][alen] = '\0'; } ainfo->flags = 0; ainfo->alen = alen; ainfo->nseq = nseq; ainfo->sqinfo = (SQINFO *) MallocOrDie (sizeof(SQINFO) * nseq); for (idx = 0; idx < nseq; idx++) SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx])); free(rlen); free(master_ins); Free2DArray((void **) ins, nseq); *ret_aseqs = aseqs; return 1;}/* Function: AlignmentHomogenousGapsym() * Date: SRE, Sun Mar 19 19:37:12 2000 [wren, St. Louis] * * Purpose: Sometimes we've got to convert alignments to * a lowest common denominator, and we need * a single specific gap character -- for example, * PSI-BLAST blastpgp -B takes a very simplistic * alignment input format which appears to only * allow '-' as a gap symbol. * * Anything matching the isgap() macro is * converted. * * Args: aseq - aligned character strings, [0..nseq-1][0..alen-1] * nseq - number of aligned strings * alen - length of alignment * gapsym - character to use for gaps. * * Returns: void ("never fails") */voidAlignmentHomogenousGapsym(char **aseq, int nseq, int alen, char gapsym){ int i, apos; for (i = 0; i < nseq; i++) for (apos = 0; apos < alen; apos++) if (isgap(aseq[i][apos])) aseq[i][apos] = gapsym;}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -