📄 peakrank_combined.cpp
字号:
frag_feature_names.push_back("Cut is _|_" + model_aa_labels[i] );
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is _|__" + model_aa_labels[i] );
for (i=0; i<num_aas; i++)
{
int j;
for (j=0; j<num_aas; j++)
frag_feature_names.push_back("Cut is " + model_aa_labels[i] + "|" + model_aa_labels[j]);
}
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("# N-side " + model_aa_labels[i] );
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("# C-side " + model_aa_labels[i] );
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("N-term aa is " + model_aa_labels[i] + ", cut pos" );
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("C-term aa is " + model_aa_labels[i] + ", cut pos" );
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is " + model_aa_labels[i] + "|, cut pos");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is " + model_aa_labels[i] + "|, cut pos, C-term is K");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is " + model_aa_labels[i] + "|, cut pos, C-term is R");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is " + model_aa_labels[i] + "_|, cut pos");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is " + model_aa_labels[i] + "_|, cut pos, C-term is K");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is " + model_aa_labels[i] + "_|, cut pos, C-term is R");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is |" + model_aa_labels[i] + ", cut pos");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is |" + model_aa_labels[i] + ", cut pos, C-term is K");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is |" + model_aa_labels[i] + ", cut pos, C-term is R");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is |_" + model_aa_labels[i] + ", cut pos");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is |_" + model_aa_labels[i] + ", cut pos, C-term is K");
for (i=0; i<num_aas; i++)
frag_feature_names.push_back("Cut is |_" + model_aa_labels[i] + ", cut pos, C-term is R");
// create all feature names
all_feature_names.clear();
all_feature_names.push_back("FRAG TYPE");
int f;
for (f=0; f<this->fragment_type_idxs.size(); f++)
{
const string frag_label = config->get_fragment(fragment_type_idxs[f]).label;
int i;
for (i=0; i<frag_feature_names.size(); i++)
{
string feature_name = frag_label + ": " + frag_feature_names[i];
all_feature_names.push_back(feature_name);
}
}
num_features_per_frag = frag_feature_names.size();
vector<string> empty;
empty.clear();
combined_frag_boost_model.init_rankboost_model_feature_names(empty, all_feature_names);
}
struct inten_trip {
inten_trip() : frag_type_idx(int(NEG_INF)), idx(int(NEG_INF)), inten(int(NEG_INF)) {};
inten_trip(int _f, int _i, float _n) : frag_type_idx(_f), idx(_i), inten(_n) {};
bool operator< (const inten_trip& other) const
{
return inten>other.inten;
}
int frag_type_idx;
int idx;
float inten;
};
/***********************************************************************
calculates the relative rank of the cuts. If intensity is 0, the rank
999 is given (rank[0] also is 999). The calculation is done across the
different fragment types given (not only one kind).
************************************************************************/
void calc_combined_peak_ranks(const vector< vector<float> >& intens,
vector< vector<int> >& peak_ranks)
{
vector<inten_trip> trips;
int i;
for (i=0; i<intens.size(); i++)
{
int j;
for (j=1; j<intens[i].size(); j++)
if (intens[i][j]>0)
trips.push_back(inten_trip(i,j,intens[i][j]));
}
sort(trips.begin(),trips.end());
peak_ranks.clear();
peak_ranks.resize(intens.size());
for (i=0; i<intens.size(); i++)
peak_ranks[i].resize(intens[i].size(),999);
for (i=0; i<trips.size(); i++)
peak_ranks[trips[i].frag_type_idx][trips[i].idx]=i;
}
void PeakRankModel::read_training_peptides_into_combined_rank_boost_dataset(
int spec_charge,
int size_idx,
int mobility,
const vector<TrainingPeptide>& sample_tps,
RankBoostDataset& rank_ds,
vector<float>& peak_intens,
vector<PeakStart>& peak_starts,
vector<int>& peak_frag_types) const
{
const vector<mass_t>& aa2mass = config->get_aa2mass();
const PartitionModel* part_model = partition_models[spec_charge][size_idx][mobility];
const vector<int>& frag_type_idxs = part_model->fragment_type_idxs;
double random_thresh = 0.8;
if (sample_tps.size()<12000)
random_thresh=1.0;
if (sample_tps.size()>15000)
random_thresh=0.7;
if (spec_charge>=3 && size_idx >=2 && mobility > MOBILE && sample_tps.size()>12000 )
random_thresh = 0.7;
if (sample_tps.size()>30000)
random_thresh = 0.6;
rank_ds.clear();
peak_intens.clear();
peak_frag_types.clear();
peak_starts.clear();
if (this->feature_set_type <=2)
{
cout << "Error: read_training_peptides_into_combined_rank_boost_dataset works only on frag set 3 and up!" << endl;
exit(1);
}
int i;
for (i=0; i<sample_tps.size(); i++)
{
const TrainingPeptide& org_tp = sample_tps[i];
if (random_thresh<1.0 && my_random() > random_thresh)
continue;
// create a new tp only with the frags we are interested in, and in the order
// of frag_type_idxs
// the first intensity and last intensity are -999 (so the intens size is length+1)
vector< vector<float> > all_intens;
all_intens.resize(frag_type_idxs.size());
int f;
for (f=0; f<frag_type_idxs.size(); f++)
{
const int frag_type_idx = frag_type_idxs[f];
const int frag_pos = org_tp.get_frag_idx_pos(frag_type_idx);
if (frag_pos<0)
{
all_intens[f].clear();
continue;
}
all_intens[f]=org_tp.intens[frag_pos];
all_intens[f][0]=-999.0;
if (all_intens[f].size() == org_tp.length)
all_intens[f].push_back(-999.0);
}
// scan all cuts for max inten
float max_ann_inten=0;
for (f=0; f<all_intens.size(); f++)
{
if (all_intens[f].size()==0)
continue;
int c;
for (c=0; c<all_intens[f].size()-1; c++)
if (all_intens[f][c]>max_ann_inten)
max_ann_inten=all_intens[f][c];
}
if (max_ann_inten<=0)
continue;
// gives integer values to cut idxs/frags according to intensity
vector< vector<int> > peak_ranks;
calc_combined_peak_ranks(all_intens, peak_ranks);
mass_t c_term_mass = org_tp.n_mass;
int aa_idx;
for (aa_idx=0; aa_idx<org_tp.amino_acids.size(); aa_idx++)
c_term_mass+=aa2mass[org_tp.amino_acids[aa_idx]];
PeakStart ps;
ps.peptide_sample_idx = i;
ps.peak_start_idx=rank_ds.get_num_samples();
ps.num_peaks=0;
PeptideSolution sol;
sol.pep.set_peptide_aas(org_tp.amino_acids);
sol.pep.calc_mass(config);
sol.charge = org_tp.charge;
sol.most_basic_aa_removed_from_n = org_tp.best_n_removed;
sol.most_basic_aa_removed_from_c = org_tp.best_c_removed;
sol.reaches_n_terminal = (org_tp.n_mass<1.0);
sol.reaches_c_terminal = (c_term_mass + 25 > org_tp.pm_with_19);
sol.pm_with_19 = org_tp.pm_with_19;
for (f=0; f<frag_type_idxs.size(); f++)
{
if (all_intens[f].size()==0)
continue;
const int frag_type_idx = frag_type_idxs[f];
const vector<float> & frag_intens = all_intens[f];
const FragmentType& frag = config->get_fragment(frag_type_idx);
// fill samples
mass_t cut_mass=org_tp.n_mass;
int cut_idx;
for (cut_idx=0; cut_idx<frag_intens.size(); cut_idx++)
{
if (cut_idx>0)
cut_mass+=aa2mass[org_tp.amino_acids[cut_idx-1]];
if (frag_intens[cut_idx]>=0)
{
RankBoostSample rbs;
if (feature_set_type == 3)
{
part_model->fill_combined_peak_features(this,
org_tp.amino_acids, cut_idx, cut_mass, sol, frag, f, rbs);
}
else if (feature_set_type == 4)
{
part_model->fill_combined_dnv_peak_features(this, org_tp.n_mass, c_term_mass,
org_tp.amino_acids, cut_idx, cut_mass, sol, frag, f, rbs);
}
else if (feature_set_type == 5)
{
part_model->fill_combined_simple_peak_features(this,
org_tp.amino_acids, cut_idx, cut_mass, sol, frag, f, rbs);
}
else
{
cout << "Error: feature set type not supported: " << feature_set_type << endl;
exit(1);
}
rbs.group_idx = i;
rbs.rank_in_group = peak_ranks[f][cut_idx];
rbs.tag1 = rank_ds.get_num_samples(); // sample idx
rbs.tag2 = cut_idx; // cut idx
rbs.tag3 = org_tp.amino_acids.size(); // peptide length, this tag is used to filter
// the error estimates for a specific length
rank_ds.add_sample(rbs);
peak_intens.push_back(frag_intens[cut_idx]);
peak_frag_types.push_back(frag_type_idx);
}
}
}
ps.num_peaks = rank_ds.get_num_samples() - ps.peak_start_idx;
peak_starts.push_back(ps);
}
rank_ds.set_num_groups(sample_tps.size());
}
void PeakRankModel::train_all_combined_partition_models(
int frag_fill_type,
char *prefix_path,
int sel_charge,
int sel_size_idx,
int sel_mobility,
int num_frags,
char *report_dir,
int num_rounds,
char *test_set,
int test_peptide_length,
char *stop_signal_file,
weight_t max_weight_ratio)
{
const int max_charge = size_thresholds.size() -1;
this->feature_set_type = frag_fill_type;
if (partition_models.size()<max_charge+1)
partition_models.resize(max_charge+1);
int charge;
for (charge=1; charge<=max_charge; charge++)
{
if (sel_charge>0 && sel_charge != charge)
continue;
const int num_size_idxs = size_thresholds[charge].size()+1;
if (partition_models[charge].size()<num_size_idxs)
partition_models[charge].resize(num_size_idxs);
cout << "FRAG FILL TYPE: " << frag_fill_type << endl;
cout << "CHARGE " << charge << " , # size idxs: " << num_size_idxs << endl;
cout << "Test peptide length: " << test_peptide_length << endl;
int size_idx;
for (size_idx=0; size_idx<num_size_idxs; size_idx++)
{
if (partition_models[charge][size_idx].size()<=NONMOBILE)
partition_models[charge][size_idx].resize(NONMOBILE+1,NULL);
int mobility;
for (mobility=MOBILE; mobility<=NONMOBILE; mobility++)
{
if ((sel_charge>=0 && sel_charge != charge) ||
(sel_size_idx>=0 && sel_size_idx != size_idx) ||
(sel_mobility>=0 && sel_mobility != mobility) )
continue;
char tr_file[256];
sprintf(tr_file,"%s_%d_%d_%d.txt",prefix_path,charge,size_idx,mobility);
if (! partition_models[charge][size_idx][mobility])
partition_models[charge][size_idx][mobility] = new PartitionModel;
cout << "Training combined models for charge " << charge << " size " << size_idx<< " mobility " <<
mobility << endl;
cout << "Max weight ratio " << max_weight_ratio << endl;
partition_models[charge][size_idx][mobility]->set_partition_name(get_peak_rank_model_name(),
charge, size_idx, mobility);
partition_models[charge][size_idx][mobility]->set_feature_set_type(feature_set_type);
partition_models[charge][size_idx][mobility]->train_combined_partition_model(this,
tr_file,charge,size_idx,mobility, num_frags, report_dir,
num_rounds,test_set, test_peptide_length, stop_signal_file,
max_weight_ratio);
cout << endl;
}
}
}
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -