📄 denovorankscore.cpp
字号:
vector<int> ppp_frag_type_idxs;
ppp_frag_type_idxs.clear();
ppp_frag_type_idxs.push_back(0);
ppp_frag_type_idxs.push_back(1);
ppp_frag_type_idxs.push_back(2);
ppp_frag_type_idxs.push_back(3);
DeNovoPartitionModel *part_model = dnv_part_models[charge][size_idx];
part_model->init_features(model_type,charge,size_idx,ppp_frag_type_idxs,config);
// read sample peptide sequences
create_complete_denovo_set_map(config,mgf_list,db_dir,correct_dir,
denovo_dir,charge,size_idx,psm, file_indicators);
mass_t min_mz=0;
mass_t max_mz = 10000;
if (size_idx>0)
min_mz = (size_thresholds[charge][size_idx-1]+charge-1)/(mass_t)charge;
if (size_idx< size_thresholds[charge].size())
max_mz = (size_thresholds[charge][size_idx]+charge-1)/(mass_t)charge;
// max_mz = min_mz + 30;
cout << "Charge " << charge << " size " << size_idx << endl;
cout << "Min m/z " << setprecision(2) << fixed << min_mz << " Max m/z " << max_mz << endl;
// read spectra
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
QCPeak peaks[4000];
fm.init_from_list_file(model->get_config(),mgf_list.c_str());
fs.select_files_in_mz_range(fm,min_mz,max_mz,charge);
static vector<PrmGraph *> prm_ptrs;
static vector<SeqPath> solutions;
const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
int num_spectra_read=0;
int num_sams=0;
// Generate various types of samples from spectra
int i;
for (i=0; i<all_ssfs.size(); i++)
{
PeptideSetMap::const_iterator it;
MGF_single *ssf = (MGF_single *)all_ssfs[i];
scan_pair key(ssf->file_idx,ssf->idx_in_file);
if (ssf->peptide.get_num_aas() != 12)
continue;
it = psm.find(key);
if (it == psm.end())
continue;
const PeptideSet& set = (*it).second;
BasicSpectrum bs;
AnnotatedSpectrum as;
vector<PmcSqsChargeRes> pmc_sqs_res;
int charge1=0,charge2=0;
mass_t mz1=0,mz2=0;
float prob1=0,prob2=0;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
bs.peaks = peaks;
bs.num_peaks = num_peaks;
bs.ssf = ssf;
model->get_best_mz_charge(config,bs, &mz1, &charge1, &prob1,
&mz2, &charge2, &prob2, &pmc_sqs_res);
const mass_t mass_with_19 = mz1*charge1 - (charge1-1.0);
as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
as.set_corrected_pm_with_19(mass_with_19);
vector<int> idxs;
idxs.push_back(part_model->ann_peak_start_idx+6);
idxs.push_back(part_model->ann_peak_start_idx+7);
RankBoostSample corr_rbs;
fill_complete_peptide_rbs(set.correct_sol, peaks, num_peaks, as, pmc_sqs_res, corr_rbs, size_idx);
cout << ++num_sams <<
"\t" << part_model->feature_names[idxs[0]] <<
"\t" << part_model->feature_names[idxs[1]] << endl;
cout << set.correct_sol.type;
int k;
for (k=0; k<idxs.size(); k++)
{
float val=NEG_INF;
corr_rbs.get_feature_val(idxs[k],&val);
cout << "\t" << val;
}
cout << "\t" << set.correct_sol.pep.as_string(config) << endl;
// add db samples
int num_db=0;
int num_dnv=0;
int j;
for (j=0; j<set.incorrect_sols.size(); j++)
{
if (set.incorrect_sols[j].type == 1)
{
num_db++;
if (num_db>5)
continue;
}
if (set.incorrect_sols[j].type != 1)
{
num_dnv++;
if (num_dnv==1)
cout << endl;
}
if (num_dnv>5)
break;
RankBoostSample bad_rbs;
fill_complete_peptide_rbs(set.incorrect_sols[j], peaks, num_peaks, as, pmc_sqs_res,
bad_rbs, size_idx);
cout << set.incorrect_sols[j].type;
int k;
for (k=0; k<idxs.size(); k++)
{
float val=NEG_INF;
bad_rbs.get_feature_val(idxs[k],&val);
cout << "\t" << val + 0.05 * (set.incorrect_sols[j].type == 1 ? num_db : num_dnv);
}
cout << "\t" << set.incorrect_sols[j].pep.as_string(config) << endl;
}
if (num_sams==50)
break;
}
}
void create_bench_mgf(Config *config,
char *file_list,
char *out_name,
int num_spectra,
bool use_exact_pm)
{
mass_t min_mz=0;
mass_t max_mz = 1300;
int charge =2;
ofstream ofs(out_name);
// read spectra
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
QCPeak peaks[4000];
fm.init_from_list_file(config,file_list);
fs.select_files_in_mz_range(fm,min_mz,max_mz,charge);
fs.randomly_reduce_ssfs(int(num_spectra*1.07));
const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
int spec_idx;
int num_written=0;
vector<int> length_counts;
length_counts.resize(100,0);
for (spec_idx=0; spec_idx<all_ssfs.size() && num_written<num_spectra; spec_idx++)
{
MGF_single *ssf = (MGF_single *)all_ssfs[spec_idx];
BasicSpectrum bs;
AnnotatedSpectrum as;
PrmGraph prm;
const vector<int>& aas = ssf->peptide.get_amino_acids();
int a;
for (a=0; a<aas.size(); a++)
if (aas[a]>Val)
break;
if (a<aas.size())
continue;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks,false,true);
bs.peaks = peaks;
bs.num_peaks = num_peaks;
bs.ssf = ssf;
as.init_from_QCPeaks(config,peaks,num_peaks,ssf,false);
ostringstream oss;
oss << "spec_" << num_written;
as.set_file_name(oss.str());
if (use_exact_pm)
{
as.set_corrected_pm_with_19(as.get_true_mass_with_19());
as.set_org_pm_with_19(as.get_true_mass_with_19());
as.set_m_over_z((as.get_true_mass_with_19()+MASS_PROTON*(as.get_charge()-1))/as.get_charge());
}
as.output_as_MGF(ofs);
// cout << as.get_org_pm_with_19() << "\t" << as.get_peptide().get_mass_with_19() << "\t" <<
// as.get_peptide().as_string(config) << endl;
num_written++;
length_counts[as.get_peptide().get_num_aas()]++;
}
ofs.close();
cout << "Wrote " << num_written << " to " << out_name << endl;
cout << "Histogram: " << endl;
int i;
for (i=0; i<length_counts.size(); i++)
{
if (length_counts[i]>0)
{
cout << i << "\t" << length_counts[i] << "\t" << setprecision(1) << fixed << 100.0*length_counts[i]/(float)num_written << endl;
}
}
}
void run_peak_benchmark(AdvancedScoreModel *model, char *benchmark_file)
{
const int max_rank = 7;
const int num_ranks_to_consider = 50;
Config *config = model->get_config();
DeNovoRankScorer *dnv_rank = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
PeakRankModel *prm = dnv_rank->get_peak_prediction_model(3);
FileManager fm;
FileSet fs;
int num_exact_first=0;
vector<int> correct_rank_counts;
correct_rank_counts.resize(10,0);
fm.init_from_file(config,benchmark_file);
fs.select_all_files(fm);
fs.randomly_reduce_ssfs(2000);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
vector<QCPeak> peaks;
peaks.resize(5000);
BasicSpecReader bsr;
int i;
for (i=0; i<all_ssf.size(); i++)
{
vector< vector<intensity_t> > ann_intens;
vector< vector<mass_t> > ann_masses;
AnnotatedSpectrum as;
Peptide pep = all_ssf[i]->peptide;
PeptideSolution sol;
sol.pep = pep;
sol.reaches_n_terminal=true;
sol.reaches_c_terminal=true;
sol.charge = all_ssf[i]->charge;
sol.pm_with_19 = pep.get_mass_with_19();
const int num_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],&peaks[0]);
as.init_from_QCPeaks(config,&peaks[0],num_peaks,all_ssf[i]);
as.set_peptide(sol.pep);
as.annotate_spectrum(sol.pm_with_19, true);
as.extract_annotated_intens_and_masses(ann_intens,ann_masses);
PeptidePeakPrediction ppp;
prm->calc_peptide_predicted_scores(sol, ppp);
// reduce intensities to the same dimensionality
const int num_frags = ppp.frag_idxs.size();
vector< vector< float> > observed_intens;
observed_intens.resize(num_frags);
int f;
for (f=0; f<num_frags; f++)
{
const int frag_idx = ppp.frag_idxs[f];
observed_intens[f]=ann_intens[frag_idx];
}
// calculate the ranks and mapping between predicted and observed
vector< vector<int> > observed_ranks, predicted_ranks;
calc_combined_peak_ranks(observed_intens, observed_ranks);
calc_combined_peak_ranks(ppp.rank_scores, predicted_ranks);
vector<int> pred2obs, obs2pred;
vector<int> num_obs_for_frag, num_pred_for_frag;
vector<float> ordered_scores, // scores sorted according to their value
obs_ordered_scores; // scores sorted according to the observed intensity rank
pred2obs.resize(num_ranks_to_consider,999); // look at top 50 peaks
obs2pred.resize(num_ranks_to_consider,999);
ordered_scores.resize(num_ranks_to_consider,NEG_INF);
obs_ordered_scores.resize(num_ranks_to_consider,NEG_INF);
num_obs_for_frag.resize(num_frags,0);
num_pred_for_frag.resize(num_frags,0);
for (f=0; f<num_frags; f++)
{
if (observed_ranks[f].size() != predicted_ranks[f].size())
{
cout << "#obs frags: " << observed_ranks.size() << endl;
cout << "#pred frags: " << predicted_ranks.size() << endl;
cout << "Error: mismatch in rank dimensionalities!" << endl;
cout << f << "\tobs : " << observed_ranks[f].size() << " pred " << predicted_ranks[f].size() << endl;
exit(1);
}
const int num_ranks = predicted_ranks[f].size();
const vector<float>& frag_rank_scores = ppp.rank_scores[f];
const vector<float>& frag_intens = observed_intens[f];
int j;
for (j=0; j<num_ranks; j++)
{
const int obs_rank = observed_ranks[f][j];
const int pred_rank = predicted_ranks[f][j];
const float pred_score = frag_rank_scores[j];
if (pred_rank<num_ranks_to_consider)
{
pred2obs[pred_rank]=obs_rank;
ordered_scores[pred_rank]=pred_score;
}
if (obs_rank<num_ranks_to_consider)
{
obs2pred[obs_rank]=pred_rank;
obs_ordered_scores[obs_rank]=pred_score;
}
if (frag_intens[j]>0)
num_obs_for_frag[f]++;
if (frag_rank_scores[j]>NEG_INF)
num_pred_for_frag[f]++;
}
}
int j;
/* cout << i << ":\t";
for (j=0; j<7; j++)
cout << obs2pred[j] << "\t";
cout << endl;*/
if (obs2pred[0] == 0)
{
num_exact_first++;
}
for (j=0; j<max_rank; j++)
{
int min=j-3;
int max=j+3;
// if (min<0)
// max-=min;
if (pred2obs[j]>=min && pred2obs[j]<=max)
correct_rank_counts[j]++;
}
}
cout << i;
cout << setprecision(3) << fixed;
// cout << num_exact_first/(float)all_ssf.size() << "\t";
int j;
for (j=0; j<max_rank; j++)
{
cout << " & " << (correct_rank_counts[j]/(float)all_ssf.size());
}
cout << "\\\\" << endl;
}
void make_peak_hist_for_obs_rank(AdvancedScoreModel *model,
char *benchmark_file,
int obs_rank)
{
const int num_ranks_to_consider = 50;
const int max_hist_rank = 20;
Config *config = model->get_config();
DeNovoRankScorer *dnv_rank = (DeNovoRankScorer *)model->get_rank_model_ptr(1);
PeakRankModel *prm = dnv_rank->get_peak_prediction_model(3);
FileManager fm;
FileSet fs;
int num_exact_first=0;
vector<int> rank_hist;
rank_hist.resize(max_hist_rank+1,0);
fm.init_from_file(config,benchmark_file);
fs.select_all_files(fm);
fs.randomly_reduce_ssfs(10000);
const vector<SingleSpectrumFile *>& all_ssf = fs.get_ssf_pointers();
vector<QCPeak> peaks;
peaks.resize(5000);
BasicSpecReader bsr;
int count =0;
int i;
for (i=0; i<all_ssf.size(); i++)
{
vector< vector<intensity_t> > ann_intens;
vector< vector<mass_t> > ann_masses;
AnnotatedSpectrum as;
Peptide pep = all_ssf[i]->peptide;
PeptideSolution sol;
sol.pep = pep;
sol.reaches_n_terminal=true;
sol.reaches_c_terminal=true;
sol.charge = all_ssf[i]->charge;
sol.pm_with_19 = pep.get_mass_with_19();
const int num_peaks = bsr.read_basic_spec(config,fm,all_ssf[i],&peaks[0]);
as.init_from_QCPeaks(config,&peaks[0],num_peaks,all_ssf[i]);
as.set_peptide(sol.pep);
as.annotate_spectrum(sol.pm_with_19, true);
as.extract_annotated_intens_and_masses(ann_intens,ann_masses);
PeptidePeakPrediction ppp;
prm->calc_peptide_predicted_scores(sol, p
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -