📄 denovosolutions.cpp
字号:
Tests each tag to see how many de novo paths it is part of.
Sets the variablesi: multi_path_rank (first rank),percent5, percent20, percent_all
in each tag.
****************************************************************************/
void determine_tag_containment_ratios(vector<SeqPath>& tags,
const vector<SeqPath>& paths,
const vector<score_pair>& denovo_scores,
int start_idx)
{
int i;
for (i=start_idx; i<tags.size(); i++)
{
int num_5=0, num_20=0, num_all=0, first=POS_INF;
const vector<PathPos>& tag_positions = tags[i].positions;
const int first_node_idx = tag_positions[0].node_idx;
/* cout << "Tag: ";
int q;
for (q=0; q<tag_positions.size(); q++)
cout << tag_positions[q].node_idx << " ";
cout << endl;*/
int j;
for (j=0; j<denovo_scores.size(); j++)
{
const SeqPath& denovo_path = paths[denovo_scores[j].idx];
const vector<PathPos>& path_positions = denovo_path.positions;
const int last_pos = path_positions.size()-1;
/* cout << "Path " << j << " : ";
int q;
for (q=0; q<path_positions.size(); q++)
cout << path_positions[q].node_idx << " ";
cout << endl;*/
int k=0;
while (k<last_pos && path_positions[k].node_idx<first_node_idx)
k++;
if (path_positions[k].node_idx != first_node_idx)
continue;
int a;
for (a=1; a<tag_positions.size() && a+k<path_positions.size(); a++)
if (tag_positions[a].node_idx != path_positions[a+k].node_idx)
break;
if (a<tag_positions.size())
continue;
if (j<first)
first=j;
num_all++;
if (j<20)
num_20++;
if (j<5)
num_5++;
}
tags[i].multi_path_rank=first;
if (paths.size()>0)
{
tags[i].tag_percent_top_5 = num_5*0.2;
tags[i].tag_percent_top_20 = num_20*0.05;
tags[i].tag_percent_all = num_all/(float)paths.size();
}
// cout << i << "\t" << tags[i].seq_str << "\t" << tags[i].multi_path_rank <<
// "\t" << tags[i].tag_percent_top_5 << "\t" <<
// tags[i].tag_percent_top_20 << "\t" << setprecision(4) << tags[i].tag_percent_all << endl;
}
// exit(0);
}
/*************************************************************************
Generates tags by making a mixture of local/de novo tags
checks which tags appear in the longer denovo sequences sets the boolean
indicators in the tag seq paths
**************************************************************************/
void generate_tags(vector<PrmGraph *>& prm_ptrs,
AdvancedScoreModel *model,
BasicSpectrum& bs,
Spectrum *spec,
const vector<int>& final_num_tags,
int main_tag_length, // the length for which we parse de novo sequences
const vector<mass_t>& pms_with_19,
const vector<int>& charges,
vector<SeqPath>& final_tags,
bool use_original_num_tags,
int prm_ptr_start_idx)
{
const int max_tag_length=10;
const double search_time_limit = 7.0;
const int denovo_seq_heap_size = 50;
static SeqPathHeap denovo_seq_heap;
static vector<SeqPathHeap> tag_heaps;
Config *config = model->get_config();
const mass_t tolerance = config->get_tolerance();
const int charge = charges[0];
const int size_idx = config->calc_size_idx(charge,pms_with_19[0]);
int i;
int min_denovo_length=6;
for (i=0; i<final_num_tags.size(); i++)
if (final_num_tags[i]>0)
{
min_denovo_length=i;
break;
}
if (min_denovo_length<6)
min_denovo_length=6;
final_tags.clear();
// init models, num tags, etc.
// don't use de novo where it is time consuming and not accurate
int num_denovo_solutions = denovo_seq_heap_size;
bool perform_denovo_rerank=false;
bool perform_denovo=true;
if (charge==2 && pms_with_19[0]>=2400 ||
charge>2 && pms_with_19[0]>2450)
{
perform_denovo=false;
num_denovo_solutions=0;
}
DeNovoRankScorer * denovo_rank_model = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
/* if (0 && perform_denovo && denovo_rank_model && denovo_rank_model->get_ind_part_model_was_initialized(charge,size_idx))
{
perform_denovo_rerank=true;
num_denovo_solutions = denovo_seq_heap_size * 3;
}*/
vector<bool> perform_tag_reranks;
vector<int> num_tags;
vector<DeNovoRankScorer *> tag_rank_models;
perform_tag_reranks.resize(max_tag_length,false);
num_tags.resize(max_tag_length,0);
tag_rank_models.resize(max_tag_length,NULL);
int tag_round=0;
int tag_length;
for (tag_length=0; tag_length<final_num_tags.size() && tag_length<max_tag_length; tag_length++)
{
if (final_num_tags[tag_length]<=0)
continue;
num_tags[tag_length]=final_num_tags[tag_length];
tag_rank_models[tag_length] = (DeNovoRankScorer *)model->get_rank_tag_model_ptr(tag_length);
if (tag_rank_models[tag_length] && tag_rank_models[tag_length]->get_ind_part_model_was_initialized(charge,size_idx))
{
perform_tag_reranks[tag_length]=true;
if (! use_original_num_tags)
{
num_tags[tag_length] *= (4+2*tag_round); // add more to larger lengths since they are covered by shorter tags
num_tags[tag_length] += 10;
}
}
tag_round++;
}
denovo_seq_heap.init(denovo_seq_heap_size, tolerance);
tag_heaps.resize(max_tag_length);
for (tag_length=0; tag_length<max_tag_length; tag_length++)
if (num_tags[tag_length]>0)
tag_heaps[tag_length].init(num_tags[tag_length],tolerance);
const clock_t start_t = clock();
// collect de novo sequences and generate local tag
for (i=0; i<pms_with_19.size(); i++)
{
// ignore differnet charges, should not be here
if (charges[i] != charges[0])
continue;
const clock_t end_t = clock();
const double run_time = (end_t - start_t)/(double)CLOCKS_PER_SEC;
// limit the time spent here if searches run too long
if (i>0 && run_time>search_time_limit)
{
break;
}
// First generate de novo solutions
const score_t min_seq_score_needed = ( (i==0 || denovo_seq_heap.min_score_heap.size()<num_denovo_solutions) ?
NEG_INF : denovo_seq_heap.min_value );
int max_length = 13;
if (pms_with_19[i]>1800)
max_length = 10;
if (charges[i]>2 || pms_with_19[i]>2200)
max_length = 9;
if (perform_denovo)
{
vector<SeqPath> denovo_sols;
generate_denovo_solutions(prm_ptrs[i+prm_ptr_start_idx], model, spec, true,
pms_with_19[i], charges[i], num_denovo_solutions, min_denovo_length, max_length,
min_seq_score_needed, denovo_sols, false);
if (i==0 && bs.ssf->peptide.get_num_aas()>2)
{
// cout << endl;
// SeqPath best = prm_ptrs[0+prm_ptr_start_idx]->get_longest_subpath(bs.ssf->peptide,0);
// spec->print_expected_by();
// best.print_full(config);
// prm_ptrs[0]->print();
// exit(0);
}
// just remove duplicates
int j;
for (j=0; j<denovo_sols.size(); j++)
denovo_seq_heap.add_path(denovo_sols[j]);
}
// generate local tags
int tag_length;
for (tag_length=0; tag_length<max_tag_length; tag_length++)
{
if (num_tags[tag_length]>0)
{
const score_t min_tag_score_needed = ( (i==0 || tag_heaps[tag_length].min_score_heap.size()<num_tags[tag_length]) ?
NEG_INF : tag_heaps[tag_length].min_value );
vector<SeqPath> tag_sols;
generate_denovo_solutions(prm_ptrs[i+prm_ptr_start_idx], model, spec, false,
pms_with_19[i], charges[i], num_tags[tag_length], tag_length, tag_length,
min_tag_score_needed, tag_sols, false, false, (!perform_denovo));
int j;
for (j=0; j<tag_sols.size(); j++)
{
tag_heaps[tag_length].add_path(tag_sols[j]);
if (tag_sols[j].get_num_aa() != tag_length)
{
cout << "problem1 " << j << " " << tag_sols[j].get_num_aa() << " : " << tag_length <<endl;
}
}
}
}
}
// sort de novo seqs
vector<score_pair> denovo_scores;
denovo_scores.clear();
if (perform_denovo)
{
vector<SeqPath>& denovo_seqs = denovo_seq_heap.paths;
if (0) // don't rerank sequences
{
denovo_rank_model->score_denovo_sequences(denovo_seqs,bs.ssf,bs.peaks,bs.num_peaks,denovo_scores,size_idx);
sort(denovo_scores.begin(),denovo_scores.end());
}
else
{
int i;
denovo_scores.resize(denovo_seqs.size());
for (i=0; i<denovo_scores.size(); i++)
{
denovo_scores[i].idx=i;
denovo_scores[i].score=denovo_seqs[i].path_score;
}
sort(denovo_scores.begin(),denovo_scores.end());
}
// parse de novo paths into tags and add them to the tag heap
if (num_tags[main_tag_length]>0)
{
int max_num_to_force = num_tags[main_tag_length]/4;
if (max_num_to_force>15)
max_num_to_force=15;
int num_forced=0;
int i;
for (i=0; i<denovo_scores.size(); i++)
{
SeqPath& big_path = denovo_seqs[denovo_scores[i].idx];
vector<SeqPath> parsed_tags;
PrmGraph *prm_ptr = big_path.prm_ptr;
big_path.multi_path_rank = i;
prm_ptr->parse_seq_path_to_smaller_ones(big_path, main_tag_length, main_tag_length, parsed_tags);
int j;
for (j=0; j<parsed_tags.size(); j++)
{
int add_return_value = tag_heaps[main_tag_length].add_path(parsed_tags[j]);
// if the tag came from one of the top 5 sequences and it was rejected
// because of a low score, we'll force it in by adding a high score +10000
// that will later be removed
if (add_return_value == 0 &&
num_forced<max_num_to_force &&
parsed_tags[j].multi_path_rank<5)
{
parsed_tags[j].sort_key += 10000.0;
num_forced++;
// int ret_val = tag_heaps[main_tag_idx].add_path(parsed_tags[j]);
// cout << ret_val << " " << num_forced << " Added " << parsed_tags[j].multi_path_rank << "\t" <<
// parsed_tags[j].seq_str << "\t" << parsed_tags[j].path_score << "\t"
// << parsed_tags[j].sort_key <<"\t" << tag_heaps[main_tag_idx].min_value << endl;
}
if (parsed_tags[j].get_num_aa() != main_tag_length)
{
cout << "Problem2 " << j << " " << main_tag_length << " : " << parsed_tags[j].get_num_aa() << endl;
exit(1);
}
}
}
// since the heap will not be used as a heap anymore (it is now just a container)
// we can go back and remove the bonus scores that were added
for (i=0; i<tag_heaps[main_tag_length].paths.size(); i++)
{
if (tag_heaps[main_tag_length].paths[i].sort_key>8000.0)
{
tag_heaps[main_tag_length].paths[i].sort_key-=10000.0;
}
}
}
}
// sort tag heaps
tag_round=0;
for (tag_length=0; tag_length<max_tag_length; tag_length++)
{
if (num_tags[tag_length]<=0)
continue;
vector<SeqPath>& bigger_tags = tag_heaps[tag_length].paths;
sort(bigger_tags.begin(),bigger_tags.end(),comp_SeqPath_path_score);
const float top_score = (bigger_tags.size()>0 ? bigger_tags[0].path_score : NEG_INF);
int i;
for (i=0; i<bigger_tags.size(); i++)
{
bigger_tags[i].org_rank=i;
bigger_tags[i].delta_score = top_score - bigger_tags[i].path_score;
}
// check that tags are not covered by previous shorter ones
if (tag_round>0)
{
const vector<SeqPath>& smaller_tags = final_tags;
int i;
for (i=0; i<bigger_tags.size(); i++)
{
const vector<PathPos>& big_positions = bigger_tags[i].positions;
const int bigger_length = big_positions.size()-1;
int j;
for (j=0; j<smaller_tags.size(); j++)
{
const vector<PathPos>& small_positions = smaller_tags[j].positions;
const int smaller_length = small_positions.size()-1;
const int length_diff = bigger_length - smaller_length;
int k;
for (k=0; k<length_diff; k++)
{
if (small_positions[0].aa == big_positions[k].aa)
{
int a;
for (a=1; a<smaller_length; a++)
if (small_positions[a].aa != big_positions[a+k].aa)
break;
if (a==smaller_length)
break;
}
}
if (k<length_diff)
break;
}
// this tag is covered remove it
if (j<smaller_tags.size())
{
bigger_tags[i]=bigger_tags[bigger_tags.size()-1];
bigger_tags.pop_back();
i--;
}
}
}
if (perform_denovo)
determine_tag_containment_ratios(bigger_tags,denovo_seq_heap.paths, denovo_scores,0);
// sort tags
vector<score_pair> tag_scores;
if (perform_tag_reranks[tag_length])
{
tag_rank_models[tag_length]->score_tag_sequences(bigger_tags,bs.ssf,bs.peaks,bs.num_peaks,tag_scores,size_idx);
sort(tag_scores.begin(),tag_scores.end());
int i;
for (i=0; i<tag_scores.size(); i++)
bigger_tags[tag_scores[i].idx].rerank_score = tag_scores[i].score;
}
else
{
int i;
tag_scores.resize(bigger_tags.size());
for (i=0; i<tag_scores.size(); i++)
{
tag_scores[i].idx=i;
tag_scores[i].score=bigger_tags[i].path_score;
}
}
while (tag_scores.size()>final_num_tags[tag_length])
tag_scores.pop_back();
for (i=0; i<tag_scores.size(); i++)
{
final_tags.push_back(bigger_tags[tag_scores[i].idx]);
}
tag_round++;
}
}
void output_denovo_solutions(SingleSpectrumFile *ssf, Config *config, ostream& out_stream,
const vector<SeqPath>& solutions, int max_num_sols)
{
if (max_num_sols<0)
max_num_sols=solutions.size();
ssf->print_ssf_stats(config,out_stream);
if (solutions.size() == 0)
{
out_stream << "No solutions found." << endl;
}
else
{
out_stream << "#Index\tScore\tN-Gap\tC-Gap\t[M+H]\tCharge\tSequence" << endl;
int i;
for (i=0; i<solutions.size() && i<max_num_sols; i++)
{
mass_t c_gap=solutions[i].pm_with_19 - solutions[i].c_term_mass;
if (c_gap<24.0)
c_gap = 0;
out_stream << setprecision(3) << fixed << i << "\t";
out_stream << solutions[i].path_score << "\t";
out_stream << solutions[i].n_term_mass << "\t";
out_stream << c_gap << "\t";
out_stream << solutions[i].pm_with_19 << "\t";
out_stream << solutions[i].charge << "\t";
out_stream << solutions[i].seq_str;
out_stream << endl;
}
}
out_stream << endl;
}
void output_tag_solutions(SingleSpectrumFile *ssf,
Config *config, ostream& out_stream,
const vector<SeqPath>& solutions)
{
ssf->print_ssf_stats(config,out_stream);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -