📄 peakrank_combined.cpp
字号:
set_combined_simple_feature_names_in_rankboost_model(prank);
}
else
{
cout << "Error: no feature names supported for type " << prank->get_feature_set_type() << endl;
exit(1);
}
// fill RankBoostSamples and create rank ds
RankBoostDataset rank_ds,test_ds;
cout << "TRAINING..." << endl;
cout << "Max weight ratio: " << max_weight_ratio << endl;
// initialize and read the test set if it exists
RankBoostDataset *test_set_ptr=NULL;
if (test_set_file)
{
vector<float> test_peak_intens;
vector<PeakStart> test_peak_starts;
vector<int> test_peak_frag_types;
cout << endl << "Reading test tps..." << endl;
prank->read_training_peptides_into_combined_rank_boost_dataset(charge, size_idx, mobility,
test_tps, test_ds, test_peak_intens, test_peak_starts, test_peak_frag_types);
cout << "Test peak starts: " << test_peak_starts.size() << endl;
cout << "Test peak intens: " << test_peak_intens.size() << endl;
cout << "Test peak frags : " << test_peak_frag_types.size() << endl;
cout << "Creating test phi list..." << endl;
create_phi_list_from_combined_peak_samples(test_peak_intens, test_peak_frag_types,
test_peak_starts, max_frag_idx, test_ds.get_non_const_phi_support());
test_ds.compute_total_phi_weight();
test_set_ptr = &test_ds;
// choose length (try shorte peptide if not eonough samples, go for the max)
if (test_peptide_length == 0)
{
vector<int> test_length_counts;
test_length_counts.resize(200,0);
const vector<RankBoostSample>& samples = test_ds.get_samples();
vector<int> sizes;
sizes.resize(test_ds.get_num_groups(),0);
int i;
for (i=0; i<samples.size(); i++)
sizes[samples[i].group_idx]=samples[i].tag3;
for (i=1; i<sizes.size(); i++)
test_length_counts[sizes[i]]++;
int max=0;
for (i=0; i<200; i++)
{
if (test_length_counts[i]>=500)
break;
if (test_length_counts[i]>test_length_counts[max])
max=i;
}
if (i<200)
{
test_peptide_length = i;
}
else
test_peptide_length = max;
}
cout << "test length " << test_peptide_length << endl << endl;
}
vector<float> train_peak_intens;
vector<PeakStart> train_peak_starts;
vector<int> train_peak_frag_types;
cout << endl << "reading training tps..." << endl;
prank->read_training_peptides_into_combined_rank_boost_dataset(charge, size_idx, mobility,
sample_tps, rank_ds, train_peak_intens, train_peak_starts, train_peak_frag_types);
cout << "Train peak starts: " << train_peak_starts.size() << endl;
cout << "Train peak intens: " << train_peak_intens.size() << endl;
cout << "Train peak frags : " << train_peak_frag_types.size() << endl;
cout << "create training phi list..." << endl;
create_phi_list_from_combined_peak_samples(train_peak_intens, train_peak_frag_types,
train_peak_starts, max_frag_idx, rank_ds.get_non_const_phi_support());
cout << "initializing boost..." << endl;
rank_ds.compute_total_phi_weight();
rank_ds.initialize_potenital_lists();
rank_ds.initialize_binary_one_lists(combined_frag_boost_model.get_num_binary_features());
rank_ds.initialize_binary_ordered_phi_lists(combined_frag_boost_model.get_ptr_to_binary_feature_names());
rank_ds.initialzie_real_feature_table(combined_frag_boost_model.get_num_real_features());
rank_ds.set_max_ratio_for_regular_update(max_weight_ratio);
combined_frag_boost_model.init_rankboost_model_for_training(rank_ds,40,100);
rank_ds.initialize_real_vote_lists(combined_frag_boost_model);
char report_prefix[512];
if (report_dir)
sprintf(report_prefix,"%s/%s",report_dir, partition_name.c_str());
if (report_prefix)
{
char name_buff[512];
sprintf(name_buff,"%s_feature_summary.txt",report_prefix);
ofstream sum_stream(name_buff);
combined_frag_boost_model.summarize_features(rank_ds.get_samples(),sum_stream);
sum_stream.close();
}
// write same info as the write_combined_model function performs
vector<string> model_header_strings;
ostringstream oss;
oss << feature_set_type << " " << this->fragment_type_idxs.size();
for (f=0; f<fragment_type_idxs.size(); f++)
oss << " " << fragment_type_idxs[f];
model_header_strings.push_back(oss.str());
vector<idx_weight_pair> miss_pairs;
combined_frag_boost_model.train_rankboost_model(rank_ds,
max_num_rounds,
&miss_pairs,
test_set_ptr,
test_peptide_length,
report_prefix,
stop_signal_file,
&model_header_strings);
// final report
if (report_dir)
{
char name_buff[512];
sprintf(name_buff,"%s_feature_list.txt",report_prefix);
ofstream feature_stream(name_buff);
if (! feature_stream.is_open() || ! feature_stream.good())
{
cout << "Error: couldn't feature_stream file for writing:" << name_buff << endl;
exit(1);
}
cout << "[...";
combined_frag_boost_model.ouput_importance_ranked_feature_list(rank_ds,feature_stream);
cout << " ...]" << endl;
feature_stream.close();
// write model (also compresses features and deletes the default values)
sprintf(name_buff,"%s_model.txt",report_prefix);
ofstream model_stream(name_buff);
vector<peak_rank_stat> dummy_stats;
cout << "Before reduction error: " << setprecision(7) <<
combined_frag_boost_model.calc_prediction_error(test_ds, dummy_stats, test_peptide_length) << endl;
write_combined_partition_model(name_buff);
cout << "After reduction error: " << setprecision(7) <<
combined_frag_boost_model.calc_prediction_error(test_ds, dummy_stats, test_peptide_length) << endl;
}
}
void PartitionModel::fill_combined_peak_features(
const PeakRankModel *prank,
const vector<int>& org_amino_acids,
const int cut_idx,
const mass_t cut_mass,
const PeptideSolution& sol,
const FragmentType& frag,
const int position_idx_in_model_fragment_type_idxs,
RankBoostSample& sample) const
{
const vector<string>& model_aa_labels = prank->get_model_aa_labels();
const vector<int>& session_aas_to_model_aas = prank->get_session_aas_to_model_aas();
const int length = org_amino_acids.size();
const int num_aas = model_aa_labels.size();
const mass_t pm_with_19 = sol.pm_with_19;
const int spec_charge = sol.charge;
int r_idx=0;
int i;
vector<int> amino_acids;
prank->convert_aas_to_model_aas(org_amino_acids, amino_acids);
if (amino_acids.size() != org_amino_acids.size())
{
cout << "Error: aa size mismatch!" << endl;
exit(1);
}
if (cut_idx<=0 || cut_idx>=amino_acids.size())
{
cout << "Error: cut_idx is bad!" << endl;
exit(1);
}
// need to use the special Idx variables and not the regular enumerations
const int HisIdx = session_aas_to_model_aas[His];
const int LysIdx = session_aas_to_model_aas[Lys];
const int ArgIdx = session_aas_to_model_aas[Arg];
const int SerIdx = session_aas_to_model_aas[Ser];
const int ThrIdx = session_aas_to_model_aas[Thr];
const int ProIdx = session_aas_to_model_aas[Pro];
const int GlyIdx = session_aas_to_model_aas[Gly];
const int AlaIdx = session_aas_to_model_aas[Ala];
const int LeuIdx = session_aas_to_model_aas[Leu];
const int AsnIdx = session_aas_to_model_aas[Asn];
const int AspIdx = session_aas_to_model_aas[Asp];
const int GluIdx = session_aas_to_model_aas[Glu];
// special N C side aa indicators
int num_nH=0, num_cH=0;
int num_nK=0, num_cK=0;
int num_nR=0, num_cR=0;
for (i=0; i<cut_idx; i++)
{
if (amino_acids[i] == HisIdx)
num_nH++;
if (amino_acids[i] == LysIdx)
num_nK++;
if (amino_acids[i] == ArgIdx)
num_nR++;
}
for (i=cut_idx; i<length; i++)
{
if (amino_acids[i] == HisIdx)
num_cH++;
if (amino_acids[i] == LysIdx)
num_cK++;
if (amino_acids[i] == ArgIdx)
num_cR++;
}
// MASS / LOCATION FEATURES (REAL + BINARY)
const mass_t max_detected_mass = prank->get_max_detected_mass();
const mass_t exp_peak_mass = frag.calc_expected_mass(cut_mass,pm_with_19);
const mass_t min_obs_mass = prank->calc_min_detected_mass(pm_with_19,spec_charge);
const mass_t max_obs_mass = (pm_with_19>max_detected_mass ? max_detected_mass : pm_with_19);
const float peak_mass_prop = ((exp_peak_mass - min_obs_mass)/(max_obs_mass - min_obs_mass));
const float rounded_peak_prop = 0.02*floor(peak_mass_prop * 50.0);
const mass_t dis_from_min = 20.0*floor((exp_peak_mass - min_obs_mass)*0.05);
const mass_t dis_from_max = 20.0*floor((max_obs_mass - exp_peak_mass)*0.05);
const mass_t dis_from_minmax = (dis_from_min<dis_from_max ? dis_from_min : prank->get_max_detected_mass() - dis_from_max);
const int RKH_n_combo_idx = calc_RKH_combo_idx(num_nR,num_nK,num_nH);
const int RKH_c_combo_idx = calc_RKH_combo_idx(num_cR,num_cK,num_cH);
const int RKH_pair_idx = (RKH_n_combo_idx * num_RKH_combos) + RKH_c_combo_idx;
const float RKH_liniar_pair_idx = RKH_pair_matrix[RKH_n_combo_idx][RKH_c_combo_idx];
const float cut_prop = 0.02 * floor((cut_mass / pm_with_19)*50);
const float cut_n_mass = 20 * floor(cut_mass*0.05);
const float cut_c_mass = 20 * floor((pm_with_19-cut_mass)*0.05);
const int cut_dis_from_n = cut_idx;
const int cut_dis_from_c = length-cut_idx;
const int third=length/3;
const int two_thirds = length - third;
float cut_pos=NEG_INF;
if (cut_idx<=4 || (cut_idx<=third && cut_idx<8))
{
cut_pos = (float)cut_idx;
}
else if (cut_idx>=length-4 || (cut_idx>=two_thirds && cut_idx>length-8))
{
cut_pos = (float)(20-length+cut_idx);
}
else
cut_pos = 10.0 + cut_prop;
const int n_aa = amino_acids[cut_idx-1];
const int c_aa = amino_acids[cut_idx];
const int n_term_aa = amino_acids[0];
const int c_term_aa = amino_acids[length-1];
sample.clear();
// first feature is always 0, tells what fragment this is
sample.add_real_feature(0,(float)position_idx_in_model_fragment_type_idxs);
// this is the new starting index for features in this sample. The offset
// is computaed according to the fragment type
int f_idx =1 + position_idx_in_model_fragment_type_idxs * num_features_per_frag; // num_features_per_frag_type
if (num_features_per_frag==0)
{
cout << "Error: num_features_per_frag is 0!" << endl;
exit(1);
}
// add general position features
sample.add_real_feature(f_idx++,dis_from_minmax);
sample.add_real_feature(f_idx++,rounded_peak_prop);
sample.add_real_feature(f_idx++,RKH_pair_idx);
sample.add_real_feature(f_idx++,RKH_liniar_pair_idx);
sample.add_real_feature(f_idx++,cut_prop);
sample.add_real_feature(f_idx++,cut_pos);
sample.add_real_feature(f_idx++,cut_n_mass);
sample.add_real_feature(f_idx++,cut_c_mass);
sample.add_real_feature(f_idx++,(float)cut_dis_from_n);
sample.add_real_feature(f_idx++,(float)cut_dis_from_c);
const float position_var = rounded_peak_prop; // this will be used for all amino acid features
// fill aa flanking features N side
if (cut_idx>0)
sample.add_real_feature(f_idx+amino_acids[cut_idx-1],position_var);
f_idx+=num_aas;
if (cut_idx>1)
sample.add_real_feature(f_idx+amino_acids[cut_idx-2],position_var);
f_idx+=num_aas;
if (cut_idx>2)
sample.add_real_feature(f_idx+amino_acids[cut_idx-3],position_var);
f_idx+=num_aas;
// fill aa flanking features C side
if (cut_idx<length)
sample.add_real_feature(f_idx+amino_acids[cut_idx],position_var);
f_idx+=num_aas;
if (cut_idx<length-1)
sample.add_real_feature(f_idx+amino_acids[cut_idx+1],position_var);
f_idx+=num_aas;
if (cut_idx<length-2)
sample.add_real_feature(f_idx+amino_acids[cut_idx+2],position_var);
f_idx+=num_aas;
// fill cut pair features X-Y
sample.add_real_feature(f_idx+(n_aa*num_aas+c_aa),position_var);
f_idx+=(num_aas*num_aas);
// fill aa count features (up top 3 aa's away from cut)
vector<int> n_aa_counts, c_aa_counts;
n_aa_counts.resize(num_aas+1,0);
c_aa_counts.resize(num_aas+1,0);
for (i=0; i<cut_idx-3; i++)
n_aa_counts[amino_acids[i]]++;
for (i=cut_idx+3; i<length; i++)
c_aa_counts[amino_acids[i]]++;
int a;
for (a=0; a<num_aas; a++)
if (n_aa_counts[a]>0)
sample.add_real_feature(f_idx+a,n_aa_counts[a]);
f_idx+=num_aas;
for (a=0; a<num_aas; a++)
if (c_aa_counts[a]>0)
sample.add_real_feature(f_idx+a,c_aa_counts[a]);
f_idx+=num_aas;
// add Cut\terminal features
sample.add_real_feature(f_idx+n_term_aa,cut_pos);
f_idx+=num_aas;
sample.add_real_feature(f_idx+c_term_aa,cut_pos);
f_idx+=num_aas;
// X|
if (cut_idx>0)
{
sample.add_real_feature(f_idx+amino_acids[cut_idx-1],cut_pos);
f_idx+=num_aas;
if (c_term_aa == LysIdx)
sample.add_real_feature(f_idx+amino_acids[cut_idx-1],cut_pos);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -