📄 partitionmodel.cpp
字号:
if (test_peptide_length == 0)
{
vector<int> test_length_counts;
test_length_counts.resize(200,0);
const vector<RankBoostSample>& samples = test_ds.get_samples();
vector<int> sizes;
sizes.resize(test_ds.get_num_groups(),0);
int i;
for (i=0; i<samples.size(); i++)
sizes[samples[i].group_idx]=samples[i].tag3;
for (i=1; i<sizes.size(); i++)
test_length_counts[sizes[i]]++;
int max=0;
for (i=0; i<200; i++)
{
if (test_length_counts[i]>=500)
break;
if (test_length_counts[i]>test_length_counts[max])
max=i;
}
if (i<200)
{
test_peptide_length = i;
}
else
test_peptide_length = max;
}
cout << "test length " << test_peptide_length << endl;
}
cout << "read training tps..." << endl;
prank->read_training_peptides_into_rank_boost_dataset(frag_idx, charge,
sample_tps, rank_ds, peak_intens, peak_starts, max_annotated_intens);
RankBoostModel& boost = frag_models[f];
boost.init_rankboost_model_feature_names(prank->get_binary_names(),prank->get_real_names());
cout << "create training phi list..." << endl;
create_phi_list_from_samples(peak_intens,peak_starts, max_annotated_intens,
rank_ds.get_non_const_phi_support());
cout << "initializing boost..." << endl;
rank_ds.compute_total_phi_weight();
rank_ds.initialize_potenital_lists();
rank_ds.initialize_binary_one_lists(prank->get_binary_names().size());
rank_ds.initialize_binary_ordered_phi_lists(boost.get_ptr_to_binary_feature_names());
rank_ds.initialzie_real_feature_table(prank->get_real_names().size());
rank_ds.set_max_ratio_for_regular_update(max_weight_ratio);
boost.init_rankboost_model_for_training(rank_ds,40,100);
rank_ds.initialize_real_vote_lists(boost);
// boost.summarize_features(rank_ds.get_samples());
char report_prefix[512];
if (report_dir)
sprintf(report_prefix,"%s/%s_%d",report_dir, partition_name.c_str(),frag_idx);
vector<idx_weight_pair> miss_pairs;
boost.train_rankboost_model(rank_ds,
max_num_rounds,
&miss_pairs,
test_set_ptr,
test_peptide_length,
report_prefix,
stop_signal_file);
// final report
if (report_dir)
{
char name_buff[512];
sprintf(name_buff,"%s_train_miss_pairs.txt",report_prefix);
ofstream report_stream(name_buff);
if (! report_stream.is_open() || ! report_stream.good())
{
cout << "Error: couldn't open pairs report file for writing:" << name_buff << endl;
exit(1);
}
simple_print_peak_pairs(miss_pairs, sample_tps, rank_ds, prank, frag_idx, 250, report_stream);
report_stream.close();
sprintf(name_buff,"%s_feature_list.txt",report_prefix);
ofstream feature_stream(name_buff);
if (! feature_stream.is_open() || ! feature_stream.good())
{
cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
exit(1);
}
cout << "[...";
boost.ouput_importance_ranked_feature_list(rank_ds,feature_stream);
cout << " ...]" << endl;
feature_stream.close();
// write model (also compresses features and deletes the default values)
sprintf(name_buff,"%s_model.txt",report_prefix);
ofstream model_stream(name_buff);
boost.write_rankboost_model(model_stream,true);
model_stream.close();
}
else // send to cout
simple_print_peak_pairs(miss_pairs, sample_tps, rank_ds, prank, frag_idx, 100);
}
}
void PartitionModel::set_partition_name(const string& peak_rank_model_name,
int charge, int size_idx, int mobility)
{
char buff[256];
sprintf(buff,"%s_%d_%d_%d",peak_rank_model_name.c_str(),charge,size_idx,mobility);
partition_name = string(buff);
}
/*************************************************************************
**************************************************************************/
void PartitionModel::simple_print_peak_pairs(
const vector<idx_weight_pair>& pair_idxs,
const vector<TrainingPeptide>& tps,
const RankBoostDataset& ds,
PeakRankModel *prank,
int frag,
int max_examples,
ostream& os) const
{
const int max_examples_for_tp = 2;
const vector<SamplePairWeight>& phi_support= ds.get_phi_support();
vector<int> tp_idx_counts;
tp_idx_counts.clear();
if (this->feature_set_type > 2)
{
cout << "Error: this funcion is not designed for feature set " << feature_set_type << endl;
exit(1);
}
int f;
for (f=0; f<this->fragment_type_idxs.size(); f++)
if (fragment_type_idxs[f]==frag)
break;
if (f== fragment_type_idxs.size())
{
cout << "Error: partition does not have an initialized model for frag " << frag << endl;
exit(1);
}
const int frag_pos=f;
int p_idx;
int counter = 0;
for (p_idx=0; p_idx<pair_idxs.size(); p_idx++)
{
const int pair_idx = pair_idxs[p_idx].idx;
const int x0_idx = phi_support[pair_idx].idx1;
const int x1_idx = phi_support[pair_idx].idx2;
const RankBoostSample sam_x0 = ds.get_sample(x0_idx);
const RankBoostSample sam_x1 = ds.get_sample(x1_idx);
const int tp_idx = sam_x0.group_idx;
const int x0_cut_idx = sam_x0.tag2;
const int x1_cut_idx = sam_x1.tag2;
const TrainingPeptide& tp = tps[tp_idx];
const RankBoostModel& boost_model = frag_models[frag_pos];
const int num_binary_features = boost_model.get_num_binary_features();
const int num_real_features = boost_model.get_num_real_features();
const vector<string>& binary_feature_names = boost_model.get_binary_feature_names();
const vector<string>& real_feature_names = boost_model.get_real_feature_names();
const vector<string>& model_aa_labels = prank->get_model_aa_labels();
const vector<int>& bin_updates = boost_model.get_binary_update_counts();
const vector<int>& real_updates = boost_model.get_real_update_counts();
int j;
for (j=0; j<tp_idx_counts.size(); j++)
if (tp_idx_counts[j]==tp_idx)
break;
if (j==tp_idx_counts.size())
{
tp_idx_counts.push_back(tp_idx);
}
else
{
if (tp_idx_counts[j]>=max_examples_for_tp)
continue;
tp_idx_counts[j]++;
}
counter++;
vector<int> amino_acids;
prank->convert_aas_to_model_aas(tp.amino_acids,amino_acids);
if (sam_x0.group_idx <0 || sam_x0.tag2<0 || sam_x1.group_idx <0 || sam_x1.tag2<0)
{
cout << "Error: bad tags attached to samples:" << endl;
cout << sam_x0.group_idx << " " << sam_x0.tag1 << " " << sam_x0.tag2 << endl;
cout << sam_x1.group_idx << " " << sam_x1.tag1 << " " << sam_x1.tag2 << endl;
exit(1);
}
if (sam_x0.group_idx != sam_x1.group_idx)
{
cout << "Error: group_idx mismathces!" << endl;
exit(1);
}
const int pos = tp.get_frag_idx_pos(frag);
if (pos<0)
{
cout << "Error: frag not in tp intens!" << endl;
exit(1);
}
const vector<float>& intens = tp.intens[pos];
os << counter << "\t" << phi_support[pair_idx].weight << "\t" << pair_idxs[p_idx].weight << "\t";
int a;
for (a=0; a<amino_acids.size(); a++)
{
if (a == x0_cut_idx)
os << " 0 ";
if (a == x1_cut_idx)
os << " 1 ";
os << model_aa_labels[amino_acids[a]];
}
os << endl;
os << "x0: cut " << x0_cut_idx << " inten " << intens[x0_cut_idx] << "\t score " <<
boost_model.calc_rank_score(sam_x0) << endl;
os << "x1: cut " << x1_cut_idx << " inten " << intens[x1_cut_idx] << "\t score " <<
boost_model.calc_rank_score(sam_x1) << endl;
os << endl << endl;
if (max_examples>0 && counter>=max_examples)
break;
}
}
void PartitionModel::print_model_stats() const
{
if (this->feature_set_type <=2)
{
cout << this->partition_name << " \t";
int i;
for (i=0; i<10; i++)
{
int j;
for (j=0; j<this->fragment_type_idxs.size(); j++)
if (fragment_type_idxs[j]==i && frag_models[j].get_ind_was_initialized())
{
cout << " *";
break;
}
if (j==fragment_type_idxs.size())
cout << " ";
}
cout << endl;
}
else
{
cout << this->partition_name << " \t";
int i;
for (i=0; i<10; i++)
{
int j;
for (j=0; j<this->fragment_type_idxs.size(); j++)
if (fragment_type_idxs[j]==i && this->combined_frag_boost_model.get_ind_was_initialized())
{
cout << " *";
break;
}
if (j==fragment_type_idxs.size())
cout << " ";
}
cout << endl;
}
}
/***********************************************************************
makes tables listing features and final scores
Only makes table if the predictions match
************************************************************************/
bool PeakRankModel::make_peak_prediction_table(
const PeptideSolution& sol,
const vector< vector<intensity_t> >& intens,
int num_peaks) const
{
PeptidePeakPrediction ppp;
calc_peptide_predicted_scores(sol, ppp);
// the ppp includes a table of rank scores (rows are actual frag idxs, not relative
// position in the frag_type_idxs).
// reduce intensities to the same dimensionality
const int num_frags = ppp.frag_idxs.size();
vector< vector< float> > observed_intens;
observed_intens.resize(num_frags);
int i,f;
for (f=0; f<num_frags; f++)
{
const int frag_idx = ppp.frag_idxs[f];
observed_intens[f]=intens[frag_idx];
}
// calculate the ranks and mapping between predicted and observed
vector< vector<int> > observed_ranks, predicted_ranks;
calc_combined_peak_ranks(observed_intens, observed_ranks);
calc_combined_peak_ranks(ppp.rank_scores, predicted_ranks);
vector<int> sel_frags, sel_idxs;
vector< float > intensities;
int rank;
for (rank=0; rank<num_peaks; rank++)
{
bool good_pred=false;
for (f=0; f<num_frags; f++)
{
int i;
for (i=0; i<predicted_ranks[f].size(); i++)
{
if (predicted_ranks[f][i] == rank &&
observed_ranks[f][i] == rank)
{
good_pred=true;
sel_frags.push_back(f);
sel_idxs.push_back(i);
intensities.push_back(intens[f][i]);
break;
}
}
}
if (! good_pred)
return false;
}
// cout << "#sel_frags: " << sel_frags.size() << endl;
// calc specific peak vectors and collect data
vector< vector< string> > feature_names;
vector< vector< float > > feature_values;
vector< vector< float > > feature_scores;
vector< float > total_scores;
feature_names.resize(num_peaks);
feature_values.resize(num_peaks);
feature_scores.resize(num_peaks);
total_scores.resize(num_peaks,0);
const Peptide& pep = sol.pep;
const mass_t pm_with_19 = sol.pm_with_19;
const int spec_charge = sol.charge;
const int mobility = get_proton_mobility(pep,spec_charge);
const int size_idx = get_size_group(spec_charge,pm_with_19);
if (! partition_models[spec_charge][size_idx][mobility])
{
cout << "Error: no rank partition model for " <<
spec_charge << " " << size_idx << " " << mobility << endl;
exit(1);
}
if (size_idx != 1 || mobility != 1)
return false;
const mass_t min_detected_mass = calc_min_detected_mass(pm_with_19, spec_charge);
const mass_t max_detected_mass = get_max_detected_mass();
const vector<int>& amino_acids = pep.get_amino_acids();
vector<mass_t> exp_cuts;
pep.calc_expected_breakage_masses(config,exp_cuts);
const mass_t n_mass = pep.get_n_gap();
// calculate a single set of ranks across the combined set of fragments
const int start_cut_idx = (sol.reaches_n_terminal ? 1 : 0);
const int last_cut_idx = (sol.reaches_c_terminal ? exp_cuts.size()-1 : exp_cuts.size());
const mass_t c_mass = exp_cuts[exp_cuts.size()-1];
int max_l=0;
for (i=0; i<sel_frags.size(); i++)
{
const int frag_idx=sel_frags[i];
const int cut_idx = sel_idxs[i];
const FragmentType& fragment = config->get_fragment(frag_idx);
const mass_t cut_mass = exp_cuts[cut_idx];
const mass_t peak_mass = fragment.calc_expected_mass(cut_mass,pm_with_19);
RankBoostSample rbs;
for (f=0; f<num_frags; f++)
if (ppp.frag_idxs[f] == frag_idx)
break;
// cout << "Frag: " << fragment.label << " fi:" << frag_idx << " f:" << f << endl;
if (f==num_frags)
{
cout << "Error: bad frag!!!!" << endl;
exit(1);
}
partition_models[spec_charge][size_idx][mobility]->fill_combined_simple_peak_features(
this, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
// partition_models[spec_charge][size_idx][mobility]->fill_combined_peak_features(
// this, amino_acids, cut_idx, cut_mass, sol, fragment, f, rbs);
total_scores[i] = partition_models[spec_charge][size_idx][mobility]->combined_frag_boost_model.calc_rank_score_with_details(
rbs,feature_names[i],feature_values[i],feature_scores[i]);
if (feature_names[i].size()>max_l)
max_l = feature_names[i].size();
}
cout << "Size: " << size_idx << " Mobility: " << mobility << endl;
// print results
for (i=0; i<num_peaks; i++)
{
cout << config->get_fragment(sel_frags[i]).label << " " <<
sel_idxs[i];
if (i<num_peaks-1)
{
cout << " & ";
}
else
cout << "\\\\" << endl;
}
cout << setprecision(2) << fixed;
for (i=0; i<num_peaks; i++)
{
cout << total_scores[i];
if (i<num_peaks-1)
{
cout << " & ";
}
else
cout << "\\\\" << endl;
}
for (i=0; i<num_peaks; i++)
{
cout << intensities[i];
if (i<num_peaks-1)
{
cout << " & ";
}
else
cout << "\\\\" << endl;
}
for (i=0; i<max_l; i++)
{
int j;
for (j=0; j<num_peaks; j++)
{
if (feature_names[j].size()<=i)
{
cout << " & ";
}
else
{
cout << feature_names[j][i] << " " << feature_values[j][i] << " & ";
if (feature_scores[j][i]>0)
{
cout << "+";
}
cout << feature_scores[j][i];
}
if (j<num_peaks-1)
{
cout << " & ";
}
else
cout << "\\\\" << endl;
}
}
return true;
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -