📄 denovoranktrain.cpp
字号:
exit(1);
}
if (peak_model->get_feature_set_type() <= 2)
{
cout << "Error: training function intended for combined peak model!" << endl;
exit(1);
}
vector<bool> file_indicators;
PeptideSetMap psm;
if (peptide_strings)
peptide_strings->clear();
double start_time = time(NULL);
// read sample peptide sequences
create_complete_dbh_set_map(config,mgf_list,db_dir,correct_dir,
charge,size_idx,psm, file_indicators);
// Read previous rerank path
DeNovoRankScorer previous_rerank_model;
if (rerank_path)
{
cout << "Read previous rerank model, type " << previous_rerank_model.get_model_type() << endl;
cout << "Option not supported anymore!" << endl;
exit(1);
}
const float ratio_db = 1.0 - ratio_denovo;
const int num_db_pairs = (int)(max_num_pairs * ratio_db);
const int num_denovo_pairs = (int)(max_num_pairs * ratio_denovo);
// read spectra
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
QCPeak peaks[2000];
fm.init_from_list_file(model->get_config(),mgf_list.c_str(),file_indicators);
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
int i;
int num_spectra_read=0;
cout << endl << "Read " << all_ssfs.size() << " headers" << endl << endl;
cout << "Creating RankBoostDatasets... proportion for training " << train_ratio << endl;
cout << "Using following proportions for training data:" << endl;
cout << ratio_denovo << " pairs of correct, incorrect full dnv (same spectrum)" << endl;
cout << ratio_db << " pairs of correct, incorrect db hits (same spectrum)" << endl;
// first select most abundant fragments, use first 1000 ssfs for statistics
vector<int> frag_counts;
frag_counts.resize(config->get_all_fragments().size(),0);
for (i=0; i<all_ssfs.size() && i<1000; i++)
{
MGF_single *ssf = (MGF_single *)all_ssfs[i];
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
BasicSpectrum bs;
bs.peaks = peaks;
bs.num_peaks = num_peaks;
bs.ssf = ssf;
vector< vector<float> > ann_intens;
vector< vector<mass_t> > ann_masses;
AnnotatedSpectrum as;
as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
as.set_peptide(ssf->peptide);
as.annotate_spectrum(ssf->peptide.get_mass_with_19(),true);
as.extract_annotated_intens_and_masses(ann_intens,ann_masses);
int f;
for (f=0; f<ann_intens.size(); f++)
{
int j;
for (j=0; j<ann_intens[f].size(); j++)
if (ann_intens[f][j]>0)
frag_counts[f]++;
}
}
vector<int> ppp_frag_type_idxs;
ppp_frag_type_idxs.clear();
cout << "Using a combined peak model" << endl;
cout << "Selecting peaks of the paritally mobile model" << endl;
int mobility;
for (mobility=MOBILE+1; mobility<NONMOBILE; mobility++)
{
const vector<int>& frag_idxs = peak_model->get_model_ptr(charge,size_idx,mobility)->get_fragment_type_idxs();
int f;
for (f=0; f<frag_idxs.size(); f++)
{
if (ppp_frag_type_idxs.size() == max_num_ppp_frags)
break;
int j;
for (j=0; j<ppp_frag_type_idxs.size(); j++)
if (ppp_frag_type_idxs[j] == frag_idxs[f])
break;
if (j==ppp_frag_type_idxs.size())
ppp_frag_type_idxs.push_back(frag_idxs[f]);
}
}
int f;
for (f=0; f<ppp_frag_type_idxs.size(); f++)
cout << f+1 << "\t" << config->get_fragment(ppp_frag_type_idxs[f]).label << endl;
cout << endl;
sort(ppp_frag_type_idxs.begin(),ppp_frag_type_idxs.end());
DeNovoPartitionModel *&part_model = dnv_part_models[charge][size_idx];
part_model->init_features(model_type,charge,size_idx,ppp_frag_type_idxs,config);
int num_groups_in_train=0;
int num_groups_in_test=0;
int num_train_pairs=0;
ofstream test_scan_stream;
if (test_scan_file)
{
test_scan_stream.open(test_scan_file);
if (! test_scan_stream.is_open())
{
cout << "Error: couldn't open test stream!" << endl;
exit(1);
}
}
int set_counter=0;
for (i=0; i<all_ssfs.size(); i++)
{
PeptideSetMap::const_iterator it;
MGF_single *ssf = (MGF_single *)all_ssfs[i];
scan_pair key(ssf->file_idx,ssf->idx_in_file);
it = psm.find(key);
if (it == psm.end())
continue;
set_counter++;
}
cout << set_counter << " sets available." << endl;
int num_db_peptides_per_set = (int)(0.5 + (float)num_db_pairs/set_counter);
if (num_db_peptides_per_set>4)
num_db_peptides_per_set=4;
if (num_db_peptides_per_set<1)
num_db_peptides_per_set=1;
int db_pairs = num_db_peptides_per_set * set_counter;
int num_denovo_peptides_per_set = (int)(0.5 + (float)(max_num_pairs-db_pairs)/set_counter);
if (num_denovo_peptides_per_set<1)
num_denovo_peptides_per_set=1;
if (num_denovo_peptides_per_set>25)
num_denovo_peptides_per_set=25;
cout << "NUM DB PER SET : " << num_db_peptides_per_set << endl;
cout << "NUM DE NOVO PER SET: " << num_denovo_peptides_per_set << endl;
static PrmGraph *prm_ptr=NULL;
static vector<PrmGraph *> prm_ptrs;
static vector<SeqPath> seqpath_solutions;
int num_rand=0;
int num_first=0;
// Generate various types of samples from spectra
int ssf_idx;
for (ssf_idx=0; ssf_idx<all_ssfs.size(); ssf_idx++)
{
MGF_single *ssf = (MGF_single *)all_ssfs[ssf_idx];
const mass_t true_mass_with_19 = ssf->peptide.get_mass_with_19();
const Peptide& correct_peptide = ssf->peptide;
PeptideSetMap::const_iterator it;
scan_pair key(ssf->file_idx,ssf->idx_in_file);
// if (num_groups_in_train>800)
// break;
it = psm.find(key);
if (it == psm.end())
continue;
const vector<int>& aas = ssf->peptide.get_amino_acids();
int a;
for (a=0; a<aas.size(); a++)
if (aas[a]>Val)
break;
if (a<aas.size())
continue;
const PeptideSet& set = (*it).second;
BasicSpectrum bs;
AnnotatedSpectrum as;
vector<PmcSqsChargeRes> pmc_sqs_res;
if (fabs(set.correct_sol.pep.get_mass_with_19() - ssf->peptide.get_mass_with_19())>0.001)
{
cout << "Error: mismatch between set peptide and peptide in file:" << endl;
cout << "Set : " << set.correct_sol.pep.as_string(config) << "\t" << set.correct_sol.pep.get_mass_with_19() <<endl;
cout << "Spec: " << ssf->peptide.as_string(config) << "\t" << ssf->peptide.get_mass_with_19() << endl;
}
int charge1=0,charge2=0;
mass_t mz1=0,mz2=0;
float prob1=0,prob2=0;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
bs.peaks = peaks;
bs.num_peaks = num_peaks;
bs.ssf = ssf;
// calc corrected pm_with_19, if it is good use it, otherwise, use a value with +- U[0,toleance/2]
model->get_best_mz_charge(config,bs,&mz1,&charge1,&prob1,&mz2,&charge2,&prob2,&pmc_sqs_res);
mass_t pm_with_19=NEG_INF;
bool good_first = true;
const mass_t corr1_pm_with_19 = mz1*charge1 - MASS_PROTON*(charge1-1);
const mass_t corr2_pm_with_19 = mz2*charge2 - MASS_PROTON*(charge2-1);
if (fabs(corr2_pm_with_19-true_mass_with_19)<tolerance_diff)
{
pm_with_19 = corr2_pm_with_19;
good_first=false;
}
if (fabs(corr1_pm_with_19-true_mass_with_19)<tolerance_diff)
{
pm_with_19 = corr1_pm_with_19;
}
else
good_first=false;
bool bad_pm=false;
if (pm_with_19<0) // use a random value
{
double r=my_random();
mass_t offset = r*r*tolerance_diff;
if (my_random()<0.5)
offset *= -1;
pm_with_19 = true_mass_with_19 + offset;
bad_pm=true;
}
as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
as.set_corrected_pm_with_19(pm_with_19);
const bool add_to_train = (my_random() <= train_ratio);
RankBoostDataset& rds = (add_to_train ? train_ds : test_ds);
if (test_scan_file && ! add_to_train)
test_scan_stream << ssf->file_idx << " " << ssf->idx_in_file << " " <<
ssf->peptide.get_num_aas() << endl;
bool first=true;
int corr_sam_idx=-1;
int group_idx=-1;
if (add_to_train)
{
group_idx = num_groups_in_train++;
}
else
{
group_idx = num_groups_in_test++;
}
RankBoostSample corr_rbs;
fill_complete_peptide_rbs(set.correct_sol, peaks, num_peaks, as, pmc_sqs_res, corr_rbs, size_idx);
corr_sam_idx = rds.get_num_samples();
corr_rbs.group_idx=group_idx;
corr_rbs.tag1=NEG_INF;
corr_rbs.float_tag1 = set.correct_sol.pm_with_19;
if (peptide_strings && add_to_train)
{
corr_rbs.tag1 = peptide_strings->size();
peptide_strings->push_back(set.correct_sol.pep.as_string(config));
}
corr_rbs.rank_in_group=0;
rds.add_sample(corr_rbs);
if (print_sams)
{
float score;
corr_rbs.get_feature_val(117,&score);
cout << endl << ssf_idx << "\t"<< set.correct_sol.pep.as_string(config) << "\t" << score << endl;
}
// add db samples
int j;
for (j=0; j<set.incorrect_sols.size() && j<num_db_peptides_per_set; j++)
{
const int type = set.incorrect_sols[j].type;
if (type != 1)
break;
if (are_same_upto_NDILQK(set.correct_sol.pep,set.incorrect_sols[j].pep))
continue;
RankBoostSample bad_rbs;
fill_complete_peptide_rbs(set.incorrect_sols[j], peaks, num_peaks, as, pmc_sqs_res,
bad_rbs, size_idx);
int bad_sam_idx=rds.get_num_samples();
bad_rbs.group_idx = group_idx;
bad_rbs.rank_in_group=(j == 0 ? 1 : 2);
if (peptide_strings && add_to_train)
{
bad_rbs.tag1=peptide_strings->size();
peptide_strings->push_back(set.incorrect_sols[j].pep.as_string(config));
}
bad_rbs.tag2=set.incorrect_sols[j].type;
bad_rbs.tag3=set.incorrect_sols[j].num_correct_aas;
bad_rbs.float_tag1 = set.correct_sol.pm_with_19;
rds.add_sample(bad_rbs);
rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, set.incorrect_sols[j].type);
if (add_to_train)
num_train_pairs++;
if (print_sams)
{
float score;
bad_rbs.get_feature_val(117,&score);
cout << j << "\t" <<
set.incorrect_sols[j].pep.as_string(config) << "\t" << score << endl;
}
}
// add denovo samples (if the rerank model exists, rerank them)
if (num_denovo_peptides_per_set>0)
{
vector<mass_t> pms_with_19;
vector<int> charges;
pms_with_19.push_back(pm_with_19);
charges.push_back(charge);
/* if (my_random()<0.75)
{
pms_with_19.push_back(pm_with_19-MASS_PROTON);
charges.push_back(charge);
}
if (my_random()<0.5)
{
pms_with_19.push_back(pm_with_19-2*MASS_PROTON);
charges.push_back(charge);
}
if (my_random()<0.15)
{
pms_with_19.push_back(pm_with_19+MASS_PROTON);
charges.push_back(charge);
}
if (my_random()<0.1)
{
pms_with_19.push_back(pm_with_19+2*MASS_PROTON);
charges.push_back(charge);
} */
if (prm_ptrs.size()<pms_with_19.size())
prm_ptrs.resize(pms_with_19.size(),NULL);
generate_denovo_solutions_from_several_pms_with_good_start_end_idxs(prm_ptrs,
model,&as,true,300,6,14,pms_with_19,charges,seqpath_solutions);
// generate_denovo_solutions_with_good_start_end_idxs(prm_ptr,model,&as,true,
// pm_with_19,charge,300,6,14,seqpath_solutions);
int j;
for (j=0; j<seqpath_solutions.size(); j++)
{
const int num_correct_aas = seqpath_solutions[j].get_num_correct_aas(correct_peptide,config);
if (num_correct_aas==seqpath_solutions[j].get_num_aa())
{
seqpath_solutions[j]=seqpath_solutions[seqpath_solutions.size()-1];
seqpath_solutions.pop_back();
}
}
vector<PeptideSolution> pep_solutions;
if (seqpath_solutions.size()>0)
{
// create peptide solutions
pep_solutions.resize(seqpath_solutions.size());
int s_idx;
for (s_idx=0; s_idx<seqpath_solutions.size(); s_idx++)
{
convert_seq_path_to_peptide_soluition_and_fill_in_aas(config,
correct_peptide, seqpath_solutions[s_idx],pep_solutions[s_idx]);
}
}
else
continue;
vector<int> pep_sol_idxs;
pep_sol_idxs.clear();
vector<score_pair> rerank_scores;
// re-order the de novo solutions according to previous model
if (0 || rerank_path)
{
previous_rerank_model.score_complete_sequences(pep_solutions,ssf,peaks,
bs.num_peaks, rerank_scores, size_idx);
sort(rerank_scores.begin(),rerank_scores.end());
pep_sol_idxs.push_back(0); // include top-scoring de novo in any case
pep_sol_idxs.push_back(1);
const int num_top = num_denovo_peptides_per_set/2+1;
int j;
for (j=0; j<num_top; j++)
pep_sol_idxs.push_back(rerank_scores[j].idx);
const int num_left = num_denovo_peptides_per_set - pep_sol_idxs.size();
for (j=0; j<num_left; j++)
{
int p = int(my_random()*(rerank_scores.size()-pep_sol_idxs.size()))+pep_sol_idxs.size();
pep_sol_idxs.push_back(rerank_scores[p].idx);
}
sort(pep_sol_idxs.begin(),pep_sol_idxs.end());
}
else // use original order
{
const int num_top = int(num_denovo_peptides_per_set*0.6)+1;
int j;
for (j=0; j<num_top; j++)
pep_sol_idxs.push_back(j);
const int num_left = num_denovo_peptides_per_set - pep_sol_idxs.size();
for (j=0; j<num_left; j++)
{
int p = int(my_random()*(pep_solutions.size()-pep_sol_idxs.size()))+pep_sol_idxs.size();
pep_sol_idxs.push_back(p);
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -