📄 parse.cpp
字号:
} if (leminus[ph].back().Score() > p->Score()) { p = &leminus[ph].back(); } if (feplus[ph].back().Score() > p->Score()) { p = &feplus[ph].back(); } } path = p;}template<class T> void Out(T t, int w, CNcbiOstream& to = cout){ to.width(w); to.setf(IOS_BASE::right,IOS_BASE::adjustfield); to << t;}void Out(double t, int w, CNcbiOstream& to = cout, int prec = 1){ to.width(w); to.setf(IOS_BASE::right,IOS_BASE::adjustfield); to.setf(IOS_BASE::fixed,IOS_BASE::floatfield); to.precision(prec); if (t > 1000000000) { to << "+Inf"; } else if (t < -1000000000) { to << "-Inf"; } else { to << t; }}inline char toACGT(int c){ switch (c) { case nA: return 'A'; case nC: return 'C'; case nG: return 'G'; case nT: return 'T'; case nN: return 'N'; }}int Parse::PrintGenes(CNcbiOstream& to, CNcbiOstream& toprot, bool complete) const{ enum {DNA_Align = 1, Prot_Align = 2 }; list<Gene> genes = GetGenes(); int right = seqscr.SeqMap(seqscr.SeqLen() - 1,true); if (complete && !genes.empty() && !genes.back().RightComplete()) { int partial_start = genes.back().front().Start(); genes.pop_back(); if ( !genes.empty() ) // end of the last complete gene { right = genes.back().back().Stop(); } else { if (partial_start > seqscr.SeqMap(0,true) + 1000) { right = partial_start - 100; } else { return -1; // calling program MUST be aware of this!!!!!!! } } } to << "\n" << seqscr.Contig() << '_' << seqscr.SeqMap(0,true) << '_' << right << "\n\n"; to << genes.size() << " genes found\n"; set<int> chain_id, prot_id; for (list<Gene>::iterator it = genes.begin(); it != genes.end(); ++it) { const Gene& gene = *it; for (int i = 0; i < gene.size(); ++i) { ITERATE(set<int>, iter, gene[i].ChainID()) { chain_id.insert(*iter); } ITERATE(set<int>, iter, gene[i].ProtID()) { prot_id.insert(*iter); } //chain_id.insert(gene[i].ChainID().begin(), gene[i].ChainID().end()); //prot_id.insert(gene[i].ProtID().begin(), gene[i].ProtID().end()); } } to << (chain_id.size() + prot_id.size()) << " alignments used\n"; int num = 0; for (list<Gene>::iterator it = genes.begin(); it != genes.end(); ++it) { ++num; to << "\n\n"; Out(" ",11,to); Out("Start",10,to); Out("Stop",10,to); Out("Length",7,to); Out("Align",10,to); Out("Prot",12,to); Out("FShift",7,to); to << endl; const Gene& gene = *it; int gene_start = -1; int gene_stop; int align_type = 0; for (int i = 0; i < gene.size(); ++i) { const ExonData& exon = gene[i]; const TFrameShifts& fshifts = exon.ExonFrameShifts(); int estart = exon.Start(); if (gene_start < 0 && exon.Type() == ExonData::Cds) { gene_start = estart; } if ( !exon.ChainID().empty() ) { align_type = align_type|DNA_Align; } if ( !exon.ProtID().empty() ) { align_type = align_type|Prot_Align; } for (int k = 0; k < fshifts.size(); ++k) { int estop = fshifts[k].Loc() - 1; Out(num,4,to); Out((exon.Type() == ExonData::Cds) ? "CDS" : "UTR",5,to); Out((gene.Strand() == Plus) ? '+' : '-',2,to); Out(estart,10,to); Out(estop,10,to); Out(estop - estart + 1,7,to); if (exon.ChainID().empty()) { Out("-",10,to); } else { Out(*exon.ChainID().begin(),10,to); } if (exon.ProtID().empty()) { Out("-",12,to); } else { Out(*exon.ProtID().begin(),12,to); } if (fshifts[k].IsInsertion()) { Out("+",7,to); estart = estop; } else { Out("-",7,to); estart = estop + 2; } to << endl; } int estop = exon.Stop(); Out(num,4,to); Out((exon.Type() == ExonData::Cds) ? "CDS" : "UTR",5,to); Out((gene.Strand() == Plus) ? '+' : '-',2,to); Out(estart,10,to); Out(estop,10,to); Out(estop - estart + 1,7,to); if (exon.ChainID().empty()) { Out("-",10,to); } else { Out(*exon.ChainID().begin(),10,to); } if (exon.ProtID().empty()) { Out("-",12,to); } else { Out(*exon.ProtID().begin(),12,to); } Out("*",7,to); to << endl; if (exon.Type() == ExonData::Cds) { gene_stop = estop; } } toprot << '>' << seqscr.Contig() << "_" << gene_start << "_" << gene_stop << "_" << ((gene.Strand() == Plus) ? "plus" : "minus") << "_" << align_type << endl; const IVec& cds = gene.CDS(); int i; for (i = 0; i < cds.size() - 2; i +=3) { toprot << aa_table[cds[i]*25 + cds[i + 1]*5 + cds[i + 2]]; if ((i + 3)%150 == 0) { toprot << endl; } } if (i%150 != 0) { toprot << endl; } // for (i = 0; i < cds.size(); ++i) // { // toprot << toACGT(cds[i]); // if ((i + 3)%50 == 0) toprot << endl; // } // if (i%50 != 0) toprot << endl; } return right;}/*int Parse::PrintGenes(CNcbiOstream& to, CNcbiOstream& toprot, bool complete) const{ const char* aa_table = "KNKNXTTTTTRSRSXIIMIXXXXXXQHQHXPPPPPRRRRRLLLLLXXXXXEDEDXAAAAAGGGGGVVVVVXXXXX * Y*YXSSSSS * CWCXLFLFXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; const HMM_State* plast = Path(); if (complete && dynamic_cast<const Intron*>(plast)) { while (!dynamic_cast<const Intergenic*>(plast)) { plast = plast->LeftState(); } if ( !plast->LeftState() ) { return plast->Start() + seqscr.SeqMap(0,true) - 1; } else { plast = plast->LeftState(); } } const CClusterSet& algns = seqscr.Alignments(); int right = seqscr.SeqMap(plast->Stop(),false); // either end of chunk or and of last complete gene and its alignments for (CClusterSet::ConstIt it = algns.begin(); it != algns.end(); ++it) { if (it->Limits().Include(right)) { right = it->Limits().second; } } to << "\n" << seqscr.Contig() << '_' << seqscr.SeqMap(0,true) << '_' << right << "\n\n"; vector<const Exon*> states; const Exon* pe; int gennum = 0; for (const HMM_State* p = plast; p != 0; p = p->LeftState()) { if (pe = dynamic_cast<const Exon*>(p)) { states.push_back(pe); } if ( !states.empty() ) { if ( !p->LeftState() || dynamic_cast<const Intergenic*>(p) ) { ++gennum; } } } reverse(states.begin(),states.end()); int algnum = 0, algused = 0; IPair chunk(seqscr.SeqMap(0,true),seqscr.SeqMap(seqscr.SeqLen() - 1,false)); for (CClusterSet::ConstIt it = algns.begin(); it != algns.end(); ++it) { IPair lim = it->Limits(); if (lim.Intersect(chunk)) { algnum += it->size(); } for (int i = 0; i < states.size(); ++i) { int a = seqscr.SeqMap(states[i]->Start(),true); int b = seqscr.SeqMap(states[i]->Stop(),false); IPair exon(a,b); if (exon.Intersect(lim)) { algused += it->size(); break; } } } IVec ids(states.size(),0), prot_ids(states.size(),0); for (int i = 0; i < states.size(); ++i) { int a = seqscr.SeqMap(states[i]->Start(),true); int b = seqscr.SeqMap(states[i]->Stop(),false); IPair exon(a,b); for (CClusterSet::ConstIt cls_it = algns.begin(); cls_it != algns.end(); ++cls_it) { for (CCluster::ConstIt it = cls_it->begin(); it != cls_it->end(); ++it) { const AlignVec& al = *it; for (int j = 0; j < al.size(); ++j) { if (exon.Intersect(al[j])) { if (al.Type() == AlignVec::Prot) { prot_ids[i] = al.ID(); } else { ids[i] = al.ID(); if (exon.first != al[j].first || exon.second != al[j].second) { ids[i] = -ids[i]; } } } } } } } to << gennum << " genes found\n"; to << algused << " alignments used out of " << algnum << endl; if (gennum == 0) { return seqscr.SeqMap(plast->Stop(),false); } IVec cds; int num = 0; double gene_fscore, left_score; int gene_start, gene_stop, align_type; enum {DNA_Align = 1, Prot_Align = 2 }; for (int i = 0; i < states.size(); ++i) { pe = states[i]; int shift = 0; if (i == 0 || dynamic_cast<const SingleExon*>(pe) || (pe->isPlus() && dynamic_cast<const FirstExon*>(pe)) || (pe->isMinus() && dynamic_cast<const LastExon*>(pe))) { gene_start = seqscr.SeqMap(pe->Start(),true); ++num; to << "\n\n"; Out(" ",19,to); Out("Start",10,to); Out("Stop",10,to); Out("Length",7,to); Out("Align",10,to); Out("Prot",12,to); Out("FShift",7,to); to << endl; left_score = pe->LeftState()->Score(); cds.clear(); const Intron* pi = dynamic_cast<const Intron*>(pe->LeftState()); if (pi != 0) { int ph = pi->Phase(); if (pe->isPlus()) { shift = (3 - ph)%3; } else { shift = ph; } } align_type = 0; } //If first / last base is insertion, this information is lost vector<IPair> subexons; int a = seqscr.SeqMap(pe->Start(),true); int b = a; for (int k = pe->Start() + 1; k <= pe->Stop(); ++k) { int c = seqscr.SeqMap(k,false); if (c == b) { subexons.push_back(IPair(a,b)); a = b; } if (c == b + 2) { subexons.push_back(IPair(a,b)); a = c; } b = c; } subexons.push_back(IPair(a,b)); for (int k = 0; k < subexons.size(); ++k) { int a = subexons[k].first; int b = subexons[k].second; Out(num,4,to); Out(pe->GetStateName(),13,to); Out(pe->isPlus() ? '+' : '-',2,to); Out(a,10,to); Out(b,10,to); Out(b - a+1,7,to); if (ids[i]) { align_type = align_type|DNA_Align; Out(abs(ids[i]),9,to); to << ((ids[i] > 0) ? ' ' : '*'); } else { Out("-",9,to); to << ' '; } if (prot_ids[i]) { align_type = align_type|Prot_Align; Out(prot_ids[i],12,to); } else { Out("-",12,to); } if (k != 0) { if (a == subexons[k - 1].second) { Out("+",7,to); } else if (a == subexons[k - 1].second + 2) { Out("-",7,to); } else { Out("*",7,to); } } else { Out("*",7,to); } to << endl; } bool last = false; if (i == states.size() - 1 || dynamic_cast<const SingleExon*>(pe) || (pe->isPlus() && dynamic_cast<const LastExon*>(pe)) || (pe->isMinus() && dynamic_cast<const FirstExon*>(pe))) { last = true; gene_stop = seqscr.SeqMap(pe->Stop(),false); } copy(seqscr.SeqPtr(pe->Start() + shift,Plus), seqscr.SeqPtr(pe->Stop() + 1,Plus),back_inserter(cds)); if (last) { to << endl; cds.resize(cds.size() - cds.size()%3); if (pe->isMinus()) { for (int i = 0; i < cds.size(); ++i) cds[i] = toMinus[cds[i]]; reverse(cds.begin(),cds.end()); } toprot << '>' << seqscr.Contig() << "_" << gene_start << "_" << gene_stop << "_" << (pe->isPlus() ? "plus" : "minus") << "_" << align_type << endl; int i; for (i = 0; i < cds.size() - 2; i +=3) { toprot << aa_table[cds[i]*25 + cds[i + 1]*5 + cds[i + 2]]; if ((i + 3)%150 == 0) { toprot << endl; } } if (i%150 != 0) { toprot << endl; } } } return right;}*/list<Gene> Parse::GetGenes() const{ list<Gene> genes; const CClusterSet& cls = seqscr.Alignments(); if (dynamic_cast<const Intron*>(Path())) { genes.push_front(Gene(Path()->Strand(),false,false)); } for (const HMM_State* p = Path(); p != 0; p = p->LeftState()) { if (const Exon* pe = dynamic_cast<const Exon*>(p)) { bool stopcodon = false; bool startcodon = false;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -