📄 denovosolutions.cpp
字号:
DeNovoRankScorer *rank_model = (DeNovoRankScorer *)model.get_rank_model_ptr(1);
static PrmGraph *prm_ptr = NULL;
static vector<PrmGraph *> prm_ptrs;
time_t start_time,last_time;
start_time = time(NULL);
last_time = start_time;
int f;
for (f=0; f<list_vector.size(); f++)
{
const char *spectra_file = list_vector[f].c_str();
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
///////////////////////////////////////////////
// Quick read, get all pointers to begining of spectra
if (get_file_extension_type(list_vector[f]) != MZXML)
{
fm.init_from_file(config,spectra_file);
}
else // reads peaks
fm.init_and_read_single_mzXML(config,spectra_file,f);
fs.select_all_files(fm);
// fs.select_files_in_mz_range(fm,555,645);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
static vector<QCPeak> peaks;
SingleSpectrumFile *ssf = all_ssf[sc];
if (peaks.size()<ssf->num_peaks)
{
int new_size = ssf->num_peaks*2;
if (new_size<2500)
new_size=2500;
peaks.resize(new_size);
}
// if (sc>500)
// break;
// if (ssf->get_scan() != 164)
// continue;
time_t curr_time = time(NULL);
double elapsed_time = (curr_time - last_time);
if (report_progress && elapsed_time>5)
{
last_time = curr_time;
cout << "Processing file " << f+1 << "/" << list_vector.size();
if (all_ssf.size() == 1)
{
cout << " spectrum 1/1 of current file." << endl;
}
else
cout << " spectrum " << sc+1 << "/" << all_ssf.size() <<
" of current file." << endl;
}
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
ssf->file_idx = f+file_start_idx;
// convert peak list ot a spectrum with charge (if original charge ==0)
// the spectrum gets charge 2, but the true charge is computed from the data
if (num_peaks<5)
{
ssf->print_ssf_stats(config,out_stream);
out_stream << "# too few peaks..." << endl;
continue;
}
spec_counter++;
Spectrum s;
s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
vector<SeqPath> solutions;
solutions.clear();
if ( ssf->charge > model.get_max_score_model_charge())
{
ssf->print_ssf_stats(config,out_stream);
out_stream << "# Charge " << s.get_charge() << " not supported yet..." << endl << endl;
continue;
}
bool perform_rerank=false;
int rerank_size_idx = NEG_INF;
int num_sols_to_generate_before_ranking=num_solutions;
if (1)
{
vector<mass_t> pms_with_19;
vector<int> charges;
pms_with_19.clear();
charges.clear();
BasicSpectrum bs;
bs.ssf = ssf;
bs.peaks = &peaks[0];
bs.num_peaks = num_peaks;
// output m/z and prob values for the different charge states
model.select_pms_and_charges(config,bs,pms_with_19,charges,true);
if (prm_ptrs.size()<pms_with_19.size())
prm_ptrs.resize(pms_with_19.size(),NULL);
if (rank_model && num_sols_to_generate_before_ranking<num_rerank_sols)
num_sols_to_generate_before_ranking=num_rerank_sols;
generate_denovo_solutions_from_several_pms(
prm_ptrs,
&model,
&s,
true,
num_sols_to_generate_before_ranking,
min_length,
max_length,
pms_with_19,
charges,
solutions,
false);
// use charge of top scoring solution
if (solutions.size()>1)
{
const int sol_charge = solutions[0].charge;
int j;
for (j=1; j<solutions.size(); j++)
{
if (solutions[j].charge != sol_charge)
{
if (j==solutions.size()-1)
{
solutions.pop_back();
}
else
{
solutions[j]=solutions[solutions.size()-1];
solutions.pop_back();
j--;
}
}
}
}
}
if (rank_model && solutions.size()>0)
{
rerank_size_idx = config->calc_size_idx(solutions[0].charge,solutions[0].pm_with_19);
if (rank_model->get_ind_part_model_was_initialized(solutions[0].charge,rerank_size_idx))
{
perform_rerank=true;
}
}
vector<score_pair> scores;
if (perform_rerank)
{
rank_model->score_denovo_sequences(solutions,ssf,&peaks[0],num_peaks,scores,rerank_size_idx);
sort(scores.begin(),scores.end());
}
else
{
scores.resize(solutions.size());
int i;
for (i=0; i<solutions.size(); i++)
scores[i].idx=i;
}
////////////////////////////////////////////////////////////
// if we are here it is only for denovo/tags
// print results
////////////////////////////////////////////////////////////
bool had_pep = false;
bool had_correct = false;
ssf->print_ssf_stats(config,out_stream);
if (solutions.size() == 0)
{
out_stream << "No solutions found." << endl;
}
else
{
out_stream << "#Index\t";
if (perform_rerank)
out_stream << "RnkScr\t";
out_stream << "PnvScr\tN-Gap\tC-Gap\t[M+H]\tCharge\tSequence" << endl;
int i;
for (i=0; i<solutions.size() && i<num_solutions; i++)
{
const int idx = (perform_rerank ? scores[i].idx : i);
mass_t c_gap=solutions[idx].pm_with_19 - solutions[idx].c_term_mass;
if (c_gap<24.0)
c_gap = 0;
out_stream << setprecision(3) << fixed << i << "\t";
if (perform_rerank)
out_stream << scores[i].score << "\t";
out_stream << solutions[idx].path_score << "\t";
out_stream << solutions[idx].n_term_mass << "\t";
out_stream << c_gap << "\t";
out_stream << solutions[idx].pm_with_19 << "\t";
out_stream << solutions[idx].charge << "\t";
out_stream << solutions[idx].seq_str;
// out_stream << solutions[i].num_forbidden_nodes ;
if (ssf->peptide.get_num_aas()>2)
{
if (solutions[idx].check_if_correct(ssf->peptide.as_string(config),config))
{
out_stream << " *";
if (! had_correct)
{
correct_benchmark++;
had_correct=true;
}
}
had_pep=true;
}
out_stream << endl;
// out_stream << endl;
// solutions[i].print_full(config);
// cout << endl;
}
}
if (had_pep) // for annotated spectra (benchmark)
total_benchmark++;
if (0)
{
s.print_expected_by();
// s.print_spectrum();
// prm_ptrs[0]->print_with_combo_tables();
prm_ptrs[0]->print();
exit(0);
}
out_stream << endl;
}
}
/////////////////////////////////////////////////////////////////
// this part works only if the spectra are annotated (benchmark)
/////////////////////////////////////////////////////////////////
if (total_benchmark>0)
{
cout << "Correct spectra " << correct_benchmark << "/" << total_benchmark << " (" <<
fixed << setprecision(3) << (double)correct_benchmark/(double)total_benchmark << ")" << endl;
}
if (report_progress)
{
time_t curr_time = time(NULL);
double elapsed_time = (last_time - start_time);
cout << "Total running time: " << elapsed_time << endl;
cout << "Processed " << list_vector.size() << " files and " << spec_counter << " spectra." << endl;
}
}
void benchmark_shew(Model& model, char *mgf_file)
{
Config *config = model.get_config();
time_t start_time,last_time;
start_time = time(NULL);
last_time = start_time;
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
///////////////////////////////////////////////
// Quick read, get all pointers to begining of spectra
if (get_file_extension_type(mgf_file) != MZXML)
{
fm.init_from_file(config,mgf_file);
}
else // reads peaks
fm.init_and_read_single_mzXML(config,mgf_file,0);
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
vector<int> max_ml_ranks;
int good_seqs=0;
int total_good_aa =0;
int total_aa_pred =0;
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
static vector<QCPeak> peaks;
SingleSpectrumFile *ssf = all_ssf[sc];
if (peaks.size()<ssf->num_peaks)
{
int new_size = ssf->num_peaks*2;
if (new_size<2500)
new_size=2500;
peaks.resize(new_size);
}
time_t curr_time = time(NULL);
double elapsed_time = (curr_time - last_time);
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
ssf->file_idx = 0;
// convert peak list ot a spectrum with charge (if original charge ==0)
// the spectrum gets charge 2, but the true charge is computed from the data
if (num_peaks<5)
{
ssf->print_ssf_stats(config);
cout << "# too few peaks..." << endl;
continue;
}
Spectrum s;
s.init_from_QCPeaks(config,&peaks[0],num_peaks,ssf);
vector<SeqPath> solutions;
solutions.clear();
if ( ssf->charge > model.get_max_score_model_charge())
{
ssf->print_ssf_stats(config);
cout << "# Charge " << s.get_charge() << " not supported yet..." << endl << endl;
continue;
}
vector<mass_t> pms_with_19;
vector<int> charges;
pms_with_19.clear();
charges.clear();
BasicSpectrum bs;
bs.ssf = ssf;
bs.peaks = &peaks[0];
bs.num_peaks = num_peaks;
// output m/z and prob values for the different charge states
PrmGraph tprm;
tprm.create_graph_from_spectrum(&model,&s,s.get_true_mass_with_19(),s.get_charge());
PrmGraph *prm_ptr = &tprm;
generate_denovo_solutions(prm_ptr, &model,&s,true,s.get_true_mass_with_19(),s.get_charge(),
1, 6, 12, NEG_INF, solutions, true);
vector<int> true_aas = ssf->peptide.get_amino_acids();
vector<int> sol_aas;
solutions[0].get_amino_acids(sol_aas);
int j;
for (j=0; j<true_aas.size(); j++)
{
if (true_aas[j]==Gln)
true_aas[j]=Lys;
if (true_aas[j]==Ile)
true_aas[j]=Leu;
}
for (j=0; j<sol_aas.size(); j++)
{
if (sol_aas[j]==Gln)
sol_aas[j]=Lys;
if (sol_aas[j]==Ile)
sol_aas[j]=Leu;
solutions[0].positions[j].aa = sol_aas[j];
}
Peptide corr_pep;
corr_pep.set_peptide_aas(true_aas);
corr_pep.calc_mass(config);
int num_corr_aa = solutions[0].get_num_correct_aas(corr_pep,config);
bool is_good=false;
if (num_corr_aa == true_aas.size())
{
is_good=true;
good_seqs++;
}
total_good_aa += num_corr_aa;
total_aa_pred += sol_aas.size();
cout << sc << "\t" << corr_pep.as_string(config) << "\t" << solutions[0].seq_str;
if (is_good)
cout << " *";
cout << endl;
}
cout << "Good de novo : " << good_seqs << " " << fixed << setprecision(3) << (float)good_seqs/sc << endl;
cout << "AA prediction: " << total_good_aa/(float) total_aa_pred << endl;
time_t curr_time = time(NULL);
double elapsed_time = (curr_time - last_time);
cout << "Total running time: " << elapsed_time << endl;
}
void perform_prm_on_list_of_files(Model& model,
const vector<string>& list_vector,
float sqs_filter_prob,
int file_start_idx)
{
Config *config = model.get_config();
int f;
for (f=0; f<list_vector.size(); f++)
{
const char *spectra_file = list_vector[f].c_str();
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
///////////////////////////////////////////////
// Quick read, get all pointers to begining of spectra
if (get_file_extension_type(list_vector[f]) != MZXML)
{
fm.init_from_file(config,spectra_file);
}
else // reads peaks
fm.init_and_read_single_mzXML(config,spectra_file,f);
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
int sc;
for (sc=0; sc<all_ssf.size(); sc++)
{
static vector<QCPeak> peaks;
SingleSpectrumFile *ssf = all_ssf[sc];
if (peaks.size()<ssf->num_peaks)
{
int new_size = ssf->num_peaks*2;
if (new_size<2500)
new_size=2500;
peaks.resize(new_size);
}
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,&peaks[0]);
ssf->file_idx = f+file_start_idx;
if (num_peaks<0)
continue;
// convert peak list ot a spectrum with charge (if original charge ==0)
// the spectrum gets charge 2, but the true charge is computed from the data
Spectrum s;
s.init
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -