📄 pmc_rank.cpp
字号:
if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z)
true_mz_bin_idx--;
int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge);
static vector<RankBoostSample> spec_samples;
fill_RankBoost_smaples_with_PMC(bs, charge, spec_samples);
// select samples and add them to pmc_ds
int good_idx;
vector<int> bad_idxs;
select_training_sample_idxs(charge,spec_samples,bs,good_idx,bad_idxs);
const bool ind_add_to_train = (my_random()<prop_train);
int group_idx;
if (ind_add_to_train)
{
group_idx= num_groups_in_train++;
}
else
{
group_idx= num_groups_in_test++;
test_ssfs.push_back(ssf);
}
RankBoostDataset& ds = (ind_add_to_train ? train_ds : test_ds);
const int pos_index = ds.get_num_samples();
spec_samples[good_idx].group_idx = group_idx;
spec_samples[good_idx].rank_in_group=0;
ds.add_sample(spec_samples[good_idx]);
if (sample_diagnostic)
pos_ds.add_sample(spec_samples[good_idx]);
int j;
for (j=0; j<bad_idxs.size(); j++)
{
const int bad_idx = bad_idxs[j];
if (bad_idx < 0 || bad_idx>= spec_samples.size())
continue;
spec_samples[bad_idx].group_idx=group_idx;
spec_samples[bad_idx].rank_in_group=1;
ds.add_to_phi_vector(ds.get_num_samples(),pos_index);
ds.add_sample(spec_samples[bad_idx]);
if (sample_diagnostic)
neg_ds.add_sample(spec_samples[bad_idx]);
}
}
train_ds.set_num_groups(num_groups_in_train);
test_ds.set_num_groups(num_groups_in_test);
train_ds.compute_total_phi_weight();
train_ds.initialize_potenital_lists();
train_ds.initialzie_real_feature_table(real_names.size());
test_ds.compute_total_phi_weight();
if (pmc_rank_models[charge][size_idx])
delete pmc_rank_models[charge][size_idx];
pmc_rank_models[charge][size_idx] = new RankBoostModel;
RankBoostModel* boost = pmc_rank_models[charge][size_idx];
vector<string> empty;
empty.clear();
boost->init_rankboost_model_feature_names(empty,real_names);
boost->init_rankboost_model_for_training(train_ds,100,25);
train_ds.initialize_real_vote_lists(*boost);
if (sample_diagnostic)
{
boost->summarize_features_pos_neg(pos_ds.get_samples(),neg_ds.get_samples());
}
else
boost->summarize_features(train_ds.get_samples());
boost->train_rankboost_model(train_ds,4000,NULL,&test_ds);
boost->ouput_ranked_feature_list();
// output_pmc_rank_results(fm,charge,test_ssfs);
// exit(0);
ind_initialized_pmcr = true;
// string path;
// path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCRtt.txt";
// this->write_pmc_rank_models(path.c_str());
}
}
string path;
path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt";
this->write_pmc_rank_models(path.c_str());
ind_initialized_pmcr = true;
}
// A candidate peak pair, as built by the pair-finding loops in this file:
// `offset` holds the absolute distance of the pair's mass sum from the
// expected sum, and `inten_sum` holds the summed intensity of the two peaks.
struct offset_pair {
	mass_t offset;     // default-constructed to POS_INF (presumably a "no pair" sentinel - confirm)
	float  inten_sum;  // default-constructed to 0

	// Default pair: offset = POS_INF, zero intensity.
	offset_pair()
		: offset(POS_INF), inten_sum(0)
	{}

	// Pair with an explicit offset and summed intensity.
	offset_pair(mass_t off, float inten)
		: offset(off), inten_sum(inten)
	{}
};
bool cmp_offset_pair_offset (const offset_pair& a, const offset_pair& b)
{
return (a.offset<b.offset);
}
bool cmp_offset_pair_inten (const offset_pair& a, const offset_pair& b)
{
return (a.inten_sum>b.inten_sum);
}
// Returns the mean absolute offset over a fixed window of `num_offsets` (3)
// leading entries of `offsets_by_inten`.
//
// - Only the first `num_offsets` entries are read; any further entries are
//   ignored. Callers visible in this file pass offsets ordered by decreasing
//   pair intensity, so the window covers the most intense pairs.
// - If fewer than `num_offsets` entries exist, every missing slot is charged
//   `missing_pair_offset` (0.5) so that short lists are penalised.
// - An empty list returns the fixed value 1000.
//
// The divisor is always `num_offsets`, never the number of entries present.
float calc_mean_abs_offset(const std::vector<float>& offsets_by_inten)
{
	const float missing_pair_offset = 0.5f;
	const int   num_offsets = 3;

	if (offsets_by_inten.empty())
		return 1000;

	// Sum |offset| over the entries that actually exist inside the window.
	float abs_off = 0;
	int   num_used = 0;
	while (num_used < num_offsets &&
	       static_cast<std::size_t>(num_used) < offsets_by_inten.size())
	{
		abs_off += fabs(offsets_by_inten[num_used]);
		++num_used;
	}

	// Pad the window: the slot count is derived from num_offsets (not a
	// repeated literal) so the constant can be changed in one place.
	abs_off += (num_offsets - num_used) * missing_pair_offset;

	return abs_off / num_offsets;
}
void calc_pmc_rank_stats_for_mass(const QCPeak *peaks,
int num_peaks,
mass_t single_charge_pair_sum,
mass_t tolerance,
const vector<float>& iso_levels,
const vector<bool>& strong_inds,
const vector<bool>& strict_iso_inds,
PMCRankStats& stats)
{
const mass_t min_single_sum = single_charge_pair_sum - tolerance;
const mass_t max_single_sum = single_charge_pair_sum + tolerance;
const mass_t min_double_sum = min_single_sum + 1.0;
const mass_t max_double_sum = max_single_sum + 1.0;
const mass_t double_charge_pair_sum = single_charge_pair_sum +1.0;
const mass_t min_single_h2o_sum = min_single_sum - MASS_H2O;
const mass_t max_single_h2o_sum = max_single_sum - MASS_H2O;
const mass_t single_charge_pair_h2o_sum = single_charge_pair_sum - MASS_H2O;
const mass_t min_double_h2o_sum = min_double_sum - MASS_H2O;
const mass_t max_double_h2o_sum = max_double_sum - MASS_H2O;
const mass_t double_charge_pair_h2o_sum = double_charge_pair_sum - MASS_H2O;
static vector<offset_pair> by_pairs, strong_pairs;
static vector<offset_pair> c2_pairs, strong_c2_pairs;
static vector<offset_pair> h2o_pairs, c2_h2o_pairs;
by_pairs.clear();
strong_pairs.clear();
c2_pairs.clear();
strong_c2_pairs.clear();
h2o_pairs.clear();
c2_h2o_pairs.clear();
stats.clear();
int forward_idx = -1;
int back_idx = num_peaks-1;
// find pairs of b/y
while (forward_idx<back_idx)
{
forward_idx++;
if (iso_levels[forward_idx]>0)
{
continue;
}
while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max_single_sum)
back_idx--;
if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min_single_sum)
{
if (iso_levels[back_idx]>0)
continue;
const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_sum);
const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
by_pairs.push_back(offset_pair(offset,inten_sum));
stats.inten_frag_pairs += inten_sum;
if (strong_inds[forward_idx] || strong_inds[back_idx])
{
strong_pairs.push_back(offset_pair(offset,inten_sum));
stats.inten_strong_pairs += inten_sum;
}
}
}
// find pairs b/y2
forward_idx = -1;
back_idx = num_peaks-1;
const int last_idx =num_peaks-1;
while (forward_idx<last_idx)
{
forward_idx++;
if (iso_levels[forward_idx]>0)
continue;
mass_t sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
while (back_idx>=0 && sum>max_double_sum)
{
back_idx--;
if (back_idx<0)
break;
sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
}
if (back_idx>=0 && sum > min_double_sum)
{
if (iso_levels[back_idx]>0)
continue;
const mass_t offset = fabs(sum - double_charge_pair_sum);
const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
c2_pairs.push_back(offset_pair(offset,inten_sum));
stats.inten_c2_pairs += inten_sum;
if (strong_inds[forward_idx] || strong_inds[back_idx])
{
strong_c2_pairs.push_back(offset_pair(offset,inten_sum));
stats.inten_c2_strong_pairs = inten_sum;
}
}
}
// find pairs of b/y-H2O
forward_idx = -1;
back_idx = num_peaks-1;
while (forward_idx<back_idx)
{
forward_idx++;
if (iso_levels[forward_idx]>0)
{
continue;
}
while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max_single_h2o_sum)
back_idx--;
if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min_single_h2o_sum)
{
if (iso_levels[back_idx]>0)
continue;
const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_h2o_sum);
const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
h2o_pairs.push_back(offset_pair(offset,inten_sum));
stats.inten_h2o_loss_frag_pairs += inten_sum;
}
}
// find pairs b/y2 - H2O
forward_idx = -1;
back_idx = num_peaks-1;
while (forward_idx<last_idx)
{
forward_idx++;
if (iso_levels[forward_idx]>0)
continue;
mass_t sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
while (back_idx>=0 && sum>max_double_h2o_sum)
{
back_idx--;
if (back_idx<0)
break;
sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
}
if (back_idx>=0 && sum > min_double_h2o_sum)
{
if (iso_levels[back_idx]>0)
continue;
const mass_t offset = fabs(sum - double_charge_pair_h2o_sum);
const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
c2_h2o_pairs.push_back(offset_pair(offset,inten_sum));
stats.itnen_h2o_loss_c2_frag_pairs += inten_sum;
}
}
stats.num_frag_pairs = by_pairs.size();
stats.num_strong_frag_pairs = strong_pairs.size();
stats.num_c2_frag_pairs = c2_pairs.size();
stats.num_strong_c2_frag_pairs = strong_c2_pairs.size();
stats.num_h2o_loss_frag_pairs = h2o_pairs.size();
stats.num_h2o_loss_c2_frag_pairs = c2_h2o_pairs.size();
int i;
vector<float>& offset_pairs_ordered_by_inten = stats.offset_pairs_ordered_by_inten;
sort(by_pairs.begin(),by_pairs.end(),cmp_offset_pair_inten);
offset_pairs_ordered_by_inten.resize(by_pairs.size());
for (i=0; i<by_pairs.size(); i++)
offset_pairs_ordered_by_inten[i]=by_pairs[i].offset;
stats.mean_offset_pairs=calc_mean_abs_offset(offset_pairs_ordered_by_inten);
vector<float>& strong_offset_pairs_ordered_by_inten = stats.strong_offset_pairs_ordered_by_inten;
sort(strong_pairs.begin(),strong_pairs.end(),cmp_offset_pair_inten);
strong_offset_pairs_ordered_by_inten.resize(strong_pairs.size());
for (i=0; i<strong_pairs.size(); i++)
strong_offset_pairs_ordered_by_inten[i]=strong_pairs[i].offset;
stats.mean_offset_strong_pairs=calc_mean_abs_offset(strong_offset_pairs_ordered_by_inten);
vector<float>& c2_offset_pairs_ordered_by_inten = stats.c2_offset_pairs_ordered_by_inten;
sort(c2_pairs.begin(),c2_pairs.end(),cmp_offset_pair_inten);
c2_offset_pairs_ordered_by_inten.resize(c2_pairs.size());
for (i=0; i<c2_pairs.size(); i++)
c2_offset_pairs_ordered_by_inten[i]=c2_pairs[i].offset;
stats.mean_offset_c2_pairs=calc_mean_abs_offset(c2_offset_pairs_ordered_by_inten);
// fill in additional iso sum features (look at pairs that sum to expected, expected+1 expected+2)
// find pairs of b/y
static vector<offset_pair> pairs0, pairs1, pairs2;
static vector<offset_pair> c2_pairs0, c2_pairs1, c2_pairs2;
pairs0.clear();
forward_idx = -1;
back_idx = num_peaks-1;
while (forward_idx<back_idx)
{
forward_idx++;
if (strict_iso_inds[forward_idx])
continue;
while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max_single_sum)
back_idx--;
if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min_single_sum)
{
if (strict_iso_inds[back_idx])
continue;
const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_sum);
const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
pairs0.push_back(offset_pair(offset,inten_sum));
}
}
pairs1.clear();
forward_idx = -1;
back_idx = num_peaks-1;
const mass_t max1 = max_single_sum+1.0;
const mass_t min1 = min_single_sum+1.0;
while (forward_idx<back_idx)
{
forward_idx++;
while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max1)
back_idx--;
if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min1)
{
if (! (strict_iso_inds[back_idx] || strict_iso_inds[forward_idx]))
continue;
const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_sum);
const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
pairs1.push_back(offset_pair(offset,inten_sum));
}
}
pairs2.clear();
forward_idx = -1;
back_idx = num_peaks-1;
const mass_t max2 = max_single_sum+2.0;
const mass_t min2 = min_single_sum+2.0;
while (forward_idx<back_idx)
{
forward_idx++;
while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max2)
back_idx--;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -