📄 denovoranktrain.cpp
字号:
{
int k;
for (k=0; k<num_den_available; k++)
den_idxs.push_back(den_start_idx+k);
}
for (j=0; j<den_idxs.size(); j++)
{
const int sam_idx = den_idxs[j];
const int type = set.incorrect_sols[sam_idx].type;
if (type != 2)
{
cout << "Error: adding non denovo type: " << set.incorrect_sols[sam_idx].type << endl;
cout << "sam idx: " << sam_idx << " out of " << set.incorrect_sols.size() << endl;
int k;
for (k=0; k<set.incorrect_sols.size(); k++)
{
cout << k << "\t" << set.incorrect_sols[k].type << endl;
}
exit(1);
}
RankBoostSample bad_rbs;
fill_complete_peptide_rbs(set.incorrect_sols[sam_idx], peaks, num_peaks, as,
pmc_sqs_res, bad_rbs, size_idx);
const int bad_sam_idx=rds.get_num_samples();
bad_rbs.group_idx = group_idx;
bad_rbs.rank_in_group=2;
if (peptide_strings && add_to_train)
{
bad_rbs.tag1=peptide_strings->size();
peptide_strings->push_back(set.incorrect_sols[sam_idx].pep.as_string(config));
}
// give higher ranked db mismatches a stronger weight;
bad_rbs.tag2=set.incorrect_sols[sam_idx].type;
bad_rbs.tag3=set.incorrect_sols[sam_idx].num_correct_aas;
rds.add_sample(bad_rbs);
rds.add_to_phi_vector(bad_sam_idx, corr_sam_idx, set.incorrect_sols[sam_idx].type);
if (add_to_train)
num_train_pairs++;
}
num_spectra_read++;
if ((model_type == 2 && num_spectra_read % 200 == 0) ||
(model_type == 0 && num_spectra_read % 1000 == 0) )
{
cout << "processed " << num_spectra_read << "/" << set_counter << " spectra (";
double curr_time = time(NULL);
cout << curr_time - start_time << " secs.)" << endl;
}
}
if (test_scan_file)
test_scan_stream.close();
const int total_num_cross = max_num_pairs - train_ds.get_phi_support().size() - test_ds.get_phi_support().size();
const int num_train_db_cross = (int)(total_num_cross * train_ratio);
const int num_test_db_cross = total_num_cross - num_train_db_cross;
if (ratio_db_cross>0)
{
cout << "Adding " << num_train_db_cross << " db cross samples to train" <<endl;
// add cross db samples to train
if (num_train_db_cross>0)
{
const vector<RankBoostSample>& samples = train_ds.get_samples();
vector<SamplePairWeight>& phi_support = train_ds.get_non_const_phi_support();
int i;
for (i=0; i<num_train_db_cross; i++)
{
const int corr_idx = (int)(num_train_pairs * my_random());
int bad_idx=-1;
while (bad_idx<0 || phi_support[bad_idx].tag != SOL_INCORRECT_DB)
// samples[phi_support[bad_idx].idx1].rank_in_group != 1) // cross db with strongest bad db hit
bad_idx = (int)(num_train_pairs * my_random());
SamplePairWeight new_pair = phi_support[bad_idx];
new_pair.idx2 = phi_support[corr_idx].idx2;
new_pair.tag = SOL_INCORRECT_DB_CROSS;
new_pair.weight=1.0;
phi_support.push_back(new_pair);
}
re_weight_complete_denovo_phi_support(phi_support, ratio_db, ratio_denovo, ratio_db_cross);
}
cout << "Adding " << num_test_db_cross << " db cross samples to test" <<endl;
// add cross db samples to test
if (num_test_db_cross>0)
{
const vector<RankBoostSample>& samples = test_ds.get_samples();
vector<SamplePairWeight>& phi_support = test_ds.get_non_const_phi_support();
const int org_support_size = phi_support.size();
for (i=0; i<num_test_db_cross; i++)
{
const int corr_idx = (int)(org_support_size * my_random());
int bad_idx=-1;
while (bad_idx<0 || phi_support[bad_idx].tag != SOL_INCORRECT_DB )
// samples[phi_support[bad_idx].idx1].rank_in_group != 1) // cross db with strongest bad db hit
bad_idx = (int)(org_support_size * my_random());
SamplePairWeight new_pair = phi_support[bad_idx];
new_pair.idx2 = phi_support[corr_idx].idx2;
new_pair.tag = SOL_INCORRECT_DB_CROSS;
new_pair.weight=1.0;
phi_support.push_back(new_pair);
}
re_weight_complete_denovo_phi_support(phi_support,ratio_db,ratio_denovo,ratio_db_cross);
}
}
train_ds.set_num_groups(num_groups_in_train);
test_ds.set_num_groups(num_groups_in_test);
cout << "computing phi weights..." << endl;
train_ds.compute_total_phi_weight();
cout << "initializing potential lists..." << endl;
train_ds.initialize_potenital_lists();
cout << "initializing real feature table..." << endl;
train_ds.initialzie_real_feature_table(part_model->get_feature_names().size());
cout << "Processed " << num_spectra_read << " spectra" << endl;
cout << num_groups_in_train << " sets in training set (" << train_ds.get_phi_support().size() << " pairs)" << endl;
cout << num_groups_in_test << " sets in test set (" << test_ds.get_phi_support().size() << " pairs)" << endl << endl;
}
bool cmp_rerank_score(const SeqPath& a, const SeqPath& b)
{
return (a.rerank_score>b.rerank_score);
}
void DeNovoRankScorer::select_sample_pairs(const vector<SeqPath>& solutions,
const vector<int>& corr_idxs,
const vector<int>& bad_idxs,
vector<weight_pair>& sample_pairs,
int num_pairs) const
{
const int num_iters=3*num_pairs;
const double log_bad_size = log((double)bad_idxs.size());
sample_pairs.clear();
float total_w = 0;
int i;
for (i=0; i<num_iters && sample_pairs.size()<num_pairs; i++)
{
int corr_pos = 0;
int bad_pos = i;
if (i>2 && corr_idxs.size()>0)
{
double r = my_random();
if ((r<0.4 && solutions[0].org_rank>0) || r<0.15)
{
r=0;
}
else
{
r = my_random();
}
corr_pos = int(r*(double)corr_idxs.size());
}
const int corr_idx = corr_idxs[corr_pos];
const int rank_corr = solutions[corr_idx].org_rank;
if (! (i<3 && rank_corr < 25))
{
double log_corr_rank = log(1.0 + rank_corr);
double min_log = 0;
if (rank_corr>150)
min_log = log_corr_rank - 3.5;
double max_u = exp(min_log+(log_bad_size+1-min_log) * my_random());
if (max_u>(double)bad_idxs.size())
max_u=(double)bad_idxs.size();
double min_u = exp(min_log)-1;
bad_pos = (int)(min_u + my_random()*(max_u-min_u));
}
/* double min_log,max_log;
if (rank_corr<=5)
{
min_log=0;
max_log=2.75;
}
else if (rank_corr<=25)
{
min_log=0;
max_log=3.25;
}
else if (rank_corr<500)
{
min_log=log_corr_rank-3.0;
max_log=log_corr_rank;
}
else
{
min_log=log_corr_rank-3.0;
max_log=log_corr_rank-1.0;
}
if (max_log>log_bad_size)
max_log = log_bad_size;
if (min_log<0)
min_log=0;
int bad_pos=i;
if (rank_corr<20 && i<4)
{
bad_pos = i;
if (i==3)
bad_pos+=(int)(my_random()*3);
}
else
{
if (my_random()<0.8)
{
bad_pos = (int)(exp(min_log + (my_random()*(max_log-min_log))))-1;
}
else
{
double m = (log_corr_rank < log_bad_size-1.5 ? log_corr_rank : log_bad_size -1.5);
bad_pos = (int)(exp(m+ (my_random()*(log_bad_size-m))))-1;
}
} */
if (bad_pos<0)
bad_pos=0;
if (bad_pos>=bad_idxs.size())
continue;
const int bad_idx = bad_idxs[bad_pos];
int j;
for (j=0; j<sample_pairs.size(); j++)
if (sample_pairs[j].idx_bad == bad_idx && sample_pairs[j].idx_corr == corr_idx)
break;
if (j<sample_pairs.size())
continue;
int rank_bad = solutions[bad_idx].org_rank;
// float w = (rank_bad<rank_corr ? 1.0 : 1.0 + 0.333*(log((double)(1+rank_bad)) - log((double)(1+rank_corr))));
float w=1.0;
sample_pairs.push_back(weight_pair(corr_idx,bad_idx,w));
total_w += w;
// cout << "Added " << sample_pairs.size() << " :\t" << corr_idx << "\t" << bad_idx << "\t" << w << endl;
}
// cout << endl;
if (total_w>0)
{
float mult_val = (float)(sample_pairs.size())/total_w;
for (i=0; i<sample_pairs.size(); i++)
sample_pairs[i].weight*=mult_val;
}
}
/*************************************************************************************
Creates the training data for models of de novo predictions (6-14).
These sequences do not have to reach the N- or C-terminals
**************************************************************************************/
void DeNovoRankScorer::create_training_data_for_partial_denovo_ranking(
const string& mgf_list,
const double train_ratio,
const int max_num_pairs,
const int charge,
const int size_idx,
const float penalty_for_bad_aa,
RankBoostDataset& train_ds,
RankBoostDataset& test_ds,
char *test_scan_file,
int length_limit)
{
PeakRankModel *&peak_model = peak_prediction_models[model_type];
const int max_num_ppp_frags = 4;
const int max_num_pairs_per_spec = 80;
const mass_t tolerance_diff = 0.275;
if (model_type !=1 && model_type !=3)
{
cout << "Error: this training function is only intended for partial de novo samples!" << endl;
cout << "Need to set model type to 1 or 3, not " << model_type << endl;
exit(1);
}
train_ds.clear();
test_ds.clear();
Config *config = model->get_config();
config->set_use_spectrum_charge(1);
const vector< vector<mass_t> >& size_thresholds = peak_model->get_size_thresholds();
mass_t min_mz=0;
mass_t max_mz = 10000;
if (size_idx>0)
min_mz = (size_thresholds[charge][size_idx-1]+charge-1)/(mass_t)charge;
if (size_idx< size_thresholds[charge].size())
max_mz = (size_thresholds[charge][size_idx]+charge-1)/(mass_t)charge;
int num_solutions= (length_limit<=8 ? 1000 : 2000);
if (model_type == 3 && model_length<10)
{
if (model_length<= 4)
num_solutions= 100;
if (model_length == 5)
num_solutions = 250;
if (model_length > 5)
num_solutions = 750;
}
if (length_limit>15)
length_limit=15;
if (charge >2 && length_limit>10)
length_limit=10;
cout << endl << "Model type " << model_type << ", model length " << model_length << endl;
cout << "Length limit : " << length_limit << endl;
cout << "Num solutions : " << num_solutions << endl;
// max_mz = min_mz + 30;
cout << "Charge " << charge << " size " << size_idx << endl;
cout << "Min m/z " << setprecision(2) << fixed << min_mz << " Max m/z " << max_mz << endl;
double start_time = time(NULL);
// read spectra
FileManager fm;
FileSet fs;
BasicSpecReader bsr;
QCPeak peaks[4000];
fm.init_from_list_file(model->get_config(),mgf_list.c_str());
fs.select_files_in_mz_range(fm,min_mz,max_mz,charge);
fs.randomly_reduce_ssfs(36000);
int num_pairs_per_spec = (int)(max_num_pairs/(float)fs.get_total_spectra() + 0.5);
if (num_pairs_per_spec>max_num_pairs_per_spec)
num_pairs_per_spec=max_num_pairs_per_spec;
cout << "Read " << fs.get_total_spectra() << " spectra headers" << endl;
if (fs.get_total_spectra()<10)
{
cout << "Error: not enough spectra to train!" << endl;
exit(1);
}
cout << "Training with " << num_pairs_per_spec << " pairs per specturm.." << endl;
cout << "Creating RankBoostDatasets... proportion for training " << train_ratio << endl;
vector<int> ppp_frag_type_idxs;
ppp_frag_type_idxs.clear();
cout << "Using a combined peak model" << endl;
cout << "Selecting peaks of the paritally mobile model" << endl;
ppp_frag_type_idxs = peak_model->get_model_ptr(charge,size_idx,PARTIALLYMOBILE)->get_fragment_type_idxs();
int f;
for (f=0; f<ppp_frag_type_idxs.size(); f++)
cout << f+1 << "\t" << config->get_fragment(ppp_frag_type_idxs[f]).label << endl;
cout << endl;
sort(ppp_frag_type_idxs.begin(),ppp_frag_type_idxs.end());
DeNovoPartitionModel *&part_model = dnv_part_models[charge][size_idx];
part_model->init_features(model_type, charge,size_idx,ppp_frag_type_idxs,config);
int num_groups_in_train=0;
int num_groups_in_test=0;
int num_train_pairs=0;
ofstream test_scan_stream;
if (test_scan_file)
{
test_scan_stream.open(test_scan_file);
if (! test_scan_stream.is_open())
{
cout << "Error: couldn't open test stream!" << endl;
exit(1);
}
}
static vector<PrmGraph *> prm_ptrs;
static vector<SeqPath> solutions;
const vector<SingleSpectrumFile *>& all_ssfs = fs.get_ssf_pointers();
int spec_idx;
int num_spectra_read=0;
int num_without_sol_in_graph=0;
int num_without_correct_sol =0;
int num_possible=0;
config->set_use_spectrum_charge(1);
for (spec_idx=0; spec_idx<all_ssfs.size(); spec_idx++)
{
MGF_single *ssf = (MGF_single *)all_ssfs[spec_idx];
const Peptide& full_pep = ssf->peptide;
const mass_t true_mass_with_19 = full_pep.get_mass_with_19();
BasicSpectrum bs;
AnnotatedSpectrum as;
PrmGraph prm;
const vector<int>& aas = ssf->peptide.get_amino_acids();
int a;
for (a=0; a<aas.size(); a++)
if (aas[a]>Val)
break;
if (a<aas.size())
continue;
num_possible++;
//
// if (spec_idx>2000)
// break;
const int num_peaks = bsr.read_basic_spec(config,fm,ssf,peaks);
bs.peaks = peaks;
bs.num_peaks = num_peaks;
bs.ssf = ssf;
as.init_from_QCPeaks(config,peaks,num_peaks,ssf);
prm.create_graph_from_spectrum(model,
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -