📄 denovoranktrain.cpp
字号:
sort(pep_sol_idxs.begin(),pep_sol_idxs.end());
// add samples
for (j=0; j<pep_sol_idxs.size(); j++)
{
const int pep_sol_idx = pep_sol_idxs[j];
if (are_same_upto_NDILQK(set.correct_sol.pep,pep_solutions[pep_sol_idx].pep))
continue;
RankBoostSample bad_rbs;
fill_complete_peptide_rbs(pep_solutions[pep_sol_idx], peaks, num_peaks, as,
pmc_sqs_res, bad_rbs, size_idx);
const int bad_sam_idx=rds.get_num_samples();
bad_rbs.group_idx = group_idx;
bad_rbs.rank_in_group=2;
if (peptide_strings && add_to_train)
{
bad_rbs.tag1=peptide_strings->size();
peptide_strings->push_back(pep_solutions[pep_sol_idx].pep.as_string(config));
}
// give higher ranked db mismatches a stronger weight;
bad_rbs.tag2= 2; // full de novo
bad_rbs.float_tag1 = set.correct_sol.pm_with_19;
rds.add_sample(bad_rbs);
rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, 2);
if (add_to_train)
num_train_pairs++;
if (print_sams)
{
float score;
bad_rbs.get_feature_val(117,&score);
cout << j << "\t" << pep_sol_idx << "\t" <<
pep_solutions[pep_sol_idx].pep.as_string(config) << "\t" << score << endl;
}
}
}
if (bad_pm)
num_rand++;
if (good_first)
num_first++;
int num_denovo_to_add=0;
num_spectra_read++;
if ((model_type == 2 && num_spectra_read % 200 == 0) ||
(model_type == 0 && num_spectra_read % 1000 == 0) )
{
cout << "processed " << num_spectra_read << "/" << set_counter << " spectra (";
double curr_time = time(NULL);
cout << curr_time - start_time << " secs.)\t" << setprecision(3) << fixed << " pm acc: " <<
1.0 - num_rand/(float)num_spectra_read <<"\tfirst: " << num_first/(float)num_spectra_read<< endl;
}
}
if (test_scan_file)
test_scan_stream.close();
train_ds.set_num_groups(num_groups_in_train);
test_ds.set_num_groups(num_groups_in_test);
cout << "computing phi weights..." << endl;
train_ds.compute_total_phi_weight();
cout << "initializing potential lists..." << endl;
train_ds.initialize_potenital_lists();
cout << "initializing real feature table..." << endl;
train_ds.initialzie_real_feature_table(part_model->get_feature_names().size());
cout << "Processed " << num_spectra_read << " spectra" << endl;
cout << num_groups_in_train << " sets in training set (" << train_ds.get_phi_support().size() << " pairs)" << endl;
cout << num_groups_in_test << " sets in test set (" << test_ds.get_phi_support().size() << " pairs)" << endl << endl;
}
/*************************************************************************************
Creates the training data for models of complete predictions, i.e., it is assumed that
all peptides being scores start at mass 0 and end at the c-terminal. Therefore, we can
use the peptides mass as the spectrum's pm_with_19.
**************************************************************************************/
void DeNovoRankScorer::create_training_data_for_complete_sequence_ranking(
const string& db_dir,
const string& correct_dir,
const string& denovo_dir,
const string& mgf_list,
const double train_ratio,
const int max_num_pairs,
const int charge,
const int size_idx,
RankBoostDataset& train_ds,
RankBoostDataset& test_ds,
vector<string>* peptide_strings,
char *test_scan_file,
float ratio_db,
float ratio_denovo,
float ratio_db_cross)
{
const int max_num_ppp_frags = 4;
if (model_type !=0 && model_type != 2)
{
cout << "Error: this training function is only intended for full sequence samples!" << endl;
cout << "Need to set model type to 0 or 2, not " << model_type << endl;
exit(1);
}
if (ratio_db<=0)
ratio_db=0;
if (ratio_denovo<0)
ratio_denovo=0;
if (ratio_db_cross<0)
ratio_db_cross=0;
vector<bool> file_indicators;
PeptideSetMap psm;
Config *config = model->get_config();
if (peptide_strings)
peptide_strings->clear();
double start_time = time(NULL);
// read sample peptide sequences
create_complete_denovo_set_map(config,mgf_list,db_dir,correct_dir,
denovo_dir,charge,size_idx,psm, file_indicators);
// Read previous rerank path
if (model_type == 2)
ratio_db_cross = 0;
const int total_num_pairs=assign_denovo_weights_to_sets(psm, ratio_db, ratio_denovo);
const float total_ratio = ratio_db + ratio_denovo+ratio_db_cross;
const int num_db_pairs = (int)(max_num_pairs * (ratio_db/total_ratio));
const int num_denovo_pairs = (int)(max_num_pairs * (ratio_denovo/total_ratio));
// read spectra
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
QCPeak peaks[2000];
fm.init_from_list_file(model->get_config(),mgf_list.c_str(),file_indicators);
fs.select_all_files(fm);
const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
int i;
int num_spectra_read=0;
cout << endl << "Read " << all_ssfs.size() << " headers" << endl << endl;
cout << "Creating RankBoostDatasets... proportion for training " << train_ratio << endl;
cout << "Using following proportions for training data:" << endl;
cout << ratio_db << " pairs of correct, incorrect db hits (same spectrum)" << endl;
cout << ratio_denovo << " pairs of correct, incorrect full dnv (same spectrum)" << endl;
cout << ratio_db_cross << " pairs of correct, incorrect db hits (different spectra)" << endl;
// first select most abundant fragments, use first 1000 ssfs for statistics
vector<int> frag_counts;
frag_counts.resize(config->get_all_fragments().size(),0);
for (i=0; i<all_ssfs.size() && i<1000; i++)
{
MGF_single *ssf = (MGF_single *)all_ssfs[i];
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
BasicSpectrum bs;
bs.peaks = peaks;
bs.num_peaks = num_peaks;
bs.ssf = ssf;
vector< vector<float> > ann_intens;
vector< vector<mass_t> > ann_masses;
AnnotatedSpectrum as;
as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
as.set_peptide(ssf->peptide);
as.annotate_spectrum(ssf->peptide.get_mass_with_19(),true);
as.extract_annotated_intens_and_masses(ann_intens,ann_masses);
int f;
for (f=0; f<ann_intens.size(); f++)
{
int j;
for (j=0; j<ann_intens[f].size(); j++)
if (ann_intens[f][j]>0)
frag_counts[f]++;
}
}
vector<int> ppp_frag_type_idxs;
ppp_frag_type_idxs.clear();
// The peak prediction fragments are used elsewhere too
// select them here unless they are provided by the PeakRankModel
// (if it is a combined model)
PeakRankModel *&peak_model = peak_prediction_models[model_type];
if (peak_model->get_feature_set_type() <= 2)
{
cout << "Selecting frag models:" << endl;
for (i=0; i<max_num_ppp_frags; i++)
{
int max_frag=-1;
int max_count=-1;
int f;
for (f=0; f<frag_counts.size(); f++)
if (frag_counts[f]>max_count &&
peak_model->has_intialized_models(charge,size_idx,f))
{
max_count = frag_counts[f];
max_frag=f;
}
if (max_count<0)
{
cout << "Warning: not enough frag models initialized for charge " << charge <<
" size " << size_idx << endl;
break;
}
ppp_frag_type_idxs.push_back(max_frag);
cout << max_frag << "\t" << config->get_fragment(max_frag).label << "\t" << frag_counts[max_frag] << endl;
frag_counts[max_frag]=-1;
}
}
else
{
cout << "Using a combined peak model" << endl;
cout << "Selecting peaks of the paritally mobile model" << endl;
int mobility;
for (mobility=MOBILE+1; mobility<NONMOBILE; mobility++)
{
const vector<int>& frag_idxs = peak_model->get_model_ptr(charge,size_idx,mobility)->get_fragment_type_idxs();
int f;
for (f=0; f<frag_idxs.size(); f++)
{
if (ppp_frag_type_idxs.size() == max_num_ppp_frags)
break;
int j;
for (j=0; j<ppp_frag_type_idxs.size(); j++)
if (ppp_frag_type_idxs[j] == frag_idxs[f])
break;
if (j==ppp_frag_type_idxs.size())
ppp_frag_type_idxs.push_back(frag_idxs[f]);
}
}
int f;
for (f=0; f<ppp_frag_type_idxs.size(); f++)
cout << f+1 << "\t" << config->get_fragment(ppp_frag_type_idxs[f]).label << endl;
cout << endl;
}
sort(ppp_frag_type_idxs.begin(),ppp_frag_type_idxs.end());
DeNovoPartitionModel *&part_model = dnv_part_models[charge][size_idx];
part_model->init_features(model_type,charge,size_idx,ppp_frag_type_idxs,config);
int num_groups_in_train=0;
int num_groups_in_test=0;
int num_train_pairs=0;
ofstream test_scan_stream;
if (test_scan_file)
{
test_scan_stream.open(test_scan_file);
if (! test_scan_stream.is_open())
{
cout << "Error: couldn't open test stream!" << endl;
exit(1);
}
}
int set_counter=0;
for (i=0; i<all_ssfs.size(); i++)
{
PeptideSetMap::const_iterator it;
MGF_single *ssf = (MGF_single *)all_ssfs[i];
scan_pair key(ssf->file_idx,ssf->idx_in_file);
it = psm.find(key);
if (it == psm.end())
continue;
set_counter++;
}
cout << set_counter << " sets available." << endl;
int num_db_peptides_per_set = (int)(0.5 + (float)num_db_pairs/set_counter);
if (num_db_peptides_per_set>7)
num_db_peptides_per_set=7;
if (num_db_peptides_per_set<1)
num_db_peptides_per_set=1;
if (ratio_db_cross>0 && num_db_peptides_per_set<4)
num_db_peptides_per_set=4;
int num_denovo_peptides_per_set = (int)(0.75 + (float)num_denovo_pairs/set_counter);
if (num_denovo_peptides_per_set<1)
num_denovo_peptides_per_set=1;
if (num_denovo_peptides_per_set>10)
num_denovo_peptides_per_set=10;
const int org_num_denovo_peptides_per_set = num_denovo_peptides_per_set;
cout << "NUM DB PER SET : " << num_db_peptides_per_set << endl;
cout << "NUM DE NOVO PER SET: " << num_denovo_peptides_per_set << endl;
static vector<PrmGraph *> prm_ptrs;
static vector<SeqPath> seqpath_solutions;
// Generate various types of samples from spectra
for (i=0; i<all_ssfs.size(); i++)
{
PeptideSetMap::const_iterator it;
MGF_single *ssf = (MGF_single *)all_ssfs[i];
// remove samples with PTMs
const vector<int>& aas = ssf->peptide.get_amino_acids();
int j;
for (j=0; j<aas.size(); j++)
if (aas[j]>Val)
break;
if (j<aas.size())
continue;
scan_pair key(ssf->file_idx,ssf->idx_in_file);
it = psm.find(key);
if (it == psm.end())
continue;
const PeptideSet& set = (*it).second;
BasicSpectrum bs;
AnnotatedSpectrum as;
vector<PmcSqsChargeRes> pmc_sqs_res;
int charge1=0,charge2=0;
mass_t mz1=0,mz2=0;
float prob1=0,prob2=0;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
bs.peaks = peaks;
bs.num_peaks = num_peaks;
bs.ssf = ssf;
model->get_best_mz_charge(config,bs, &mz1, &charge1, &prob1,
&mz2, &charge2, &prob2, &pmc_sqs_res);
const mass_t mass_with_19 = mz1*charge1 - (charge1-1.0);
as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
as.set_corrected_pm_with_19(mass_with_19);
const bool add_to_train = (my_random() <= train_ratio);
RankBoostDataset& rds = (add_to_train ? train_ds : test_ds);
if (test_scan_file && ! add_to_train)
test_scan_stream << ssf->file_idx << " " << ssf->idx_in_file << " " <<
ssf->peptide.get_num_aas() << endl;
bool first=true;
int corr_sam_idx=-1;
int group_idx=-1;
if (add_to_train)
{
group_idx = num_groups_in_train++;
}
else
{
group_idx = num_groups_in_test++;
}
RankBoostSample corr_rbs;
fill_complete_peptide_rbs(set.correct_sol, peaks, num_peaks, as, pmc_sqs_res, corr_rbs, size_idx);
corr_sam_idx = rds.get_num_samples();
corr_rbs.group_idx=group_idx;
corr_rbs.tag1=NEG_INF;
corr_rbs.float_tag1 = set.correct_sol.pm_with_19;
if (peptide_strings && add_to_train)
{
corr_rbs.tag1 = peptide_strings->size();
peptide_strings->push_back(set.correct_sol.pep.as_string(config));
}
corr_rbs.rank_in_group=0;
rds.add_sample(corr_rbs);
// add db samples
for (j=0; j<set.incorrect_sols.size() && j<num_db_peptides_per_set; j++)
{
const int type = set.incorrect_sols[j].type;
if (type != 1)
break;
RankBoostSample bad_rbs;
fill_complete_peptide_rbs(set.incorrect_sols[j], peaks, num_peaks, as, pmc_sqs_res,
bad_rbs, size_idx);
int bad_sam_idx=rds.get_num_samples();
bad_rbs.group_idx = group_idx;
bad_rbs.rank_in_group=(j == 0 ? 1 : 2);
if (peptide_strings && add_to_train)
{
bad_rbs.tag1=peptide_strings->size();
peptide_strings->push_back(set.incorrect_sols[j].pep.as_string(config));
}
// give higher ranked db mismatches a stronger weight;
bad_rbs.tag2=set.incorrect_sols[j].type;
bad_rbs.tag3=set.incorrect_sols[j].num_correct_aas;
bad_rbs.float_tag1 = set.correct_sol.pm_with_19;
rds.add_sample(bad_rbs);
rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, set.incorrect_sols[j].type);
if (add_to_train)
num_train_pairs++;
}
int num_db_added=j;
while (j<set.incorrect_sols.size() && set.incorrect_sols[j].type==1)
j++;
const int den_start_idx = j;
// add the rerank samples
int num_denovo_to_add = org_num_denovo_peptides_per_set;
// add precomputed denovo samples
int num_den_available = set.incorrect_sols.size() - den_start_idx;
vector<int> den_idxs;
den_idxs.clear();
if (num_den_available>num_denovo_to_add)
{
choose_k_from_n(num_denovo_to_add,num_den_available,den_idxs);
int k;
for (k=0; k<num_denovo_to_add; k++)
den_idxs[k]+=den_start_idx;
}
else
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -