⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pmc_rank.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectra…
💻 CPP
📖 第 1 页 / 共 4 页
字号:

				if (true_mz_bin_idx>0 && pmc_stats[true_mz_bin_idx].m_over_z-true_mz>true_mz-pmc_stats[true_mz_bin_idx-1].m_over_z)
					true_mz_bin_idx--;

				int opt_bin_idx = get_optimal_bin(true_mz_bin_idx, charge);

				
				static vector<RankBoostSample> spec_samples;
				fill_RankBoost_smaples_with_PMC(bs, charge, spec_samples);

				// select samples and add them to pmc_ds
				int good_idx;
				vector<int> bad_idxs;
				select_training_sample_idxs(charge,spec_samples,bs,good_idx,bad_idxs);

				const bool ind_add_to_train = (my_random()<prop_train);
				int group_idx;
				
				if (ind_add_to_train)
				{
					group_idx= num_groups_in_train++;	
				}
				else
				{
					group_idx= num_groups_in_test++;
					test_ssfs.push_back(ssf);
				}
				
				
				RankBoostDataset& ds = (ind_add_to_train ? train_ds : test_ds);

				const int pos_index  = ds.get_num_samples();
				spec_samples[good_idx].group_idx = group_idx;
				spec_samples[good_idx].rank_in_group=0;

				ds.add_sample(spec_samples[good_idx]);
				if (sample_diagnostic)
					pos_ds.add_sample(spec_samples[good_idx]);

				int j;
				for (j=0; j<bad_idxs.size(); j++)
				{
					const int bad_idx = bad_idxs[j];
					if (bad_idx < 0 || bad_idx>= spec_samples.size())
						continue;
		
					spec_samples[bad_idx].group_idx=group_idx;
					spec_samples[bad_idx].rank_in_group=1;

					ds.add_to_phi_vector(ds.get_num_samples(),pos_index);
					ds.add_sample(spec_samples[bad_idx]);

					if (sample_diagnostic)
						neg_ds.add_sample(spec_samples[bad_idx]);
				}						   
			}

			train_ds.set_num_groups(num_groups_in_train);
			test_ds.set_num_groups(num_groups_in_test);
			
			train_ds.compute_total_phi_weight();
			train_ds.initialize_potenital_lists();
			train_ds.initialzie_real_feature_table(real_names.size());

			test_ds.compute_total_phi_weight();

			if (pmc_rank_models[charge][size_idx])
				delete pmc_rank_models[charge][size_idx];
			
			pmc_rank_models[charge][size_idx] = new RankBoostModel;
		

			RankBoostModel* boost = pmc_rank_models[charge][size_idx];

			vector<string> empty;
			empty.clear();
			boost->init_rankboost_model_feature_names(empty,real_names);
			boost->init_rankboost_model_for_training(train_ds,100,25);

			train_ds.initialize_real_vote_lists(*boost);

			if (sample_diagnostic)
			{
				boost->summarize_features_pos_neg(pos_ds.get_samples(),neg_ds.get_samples());
			}
			else
				boost->summarize_features(train_ds.get_samples());

			boost->train_rankboost_model(train_ds,4000,NULL,&test_ds);
			
			boost->ouput_ranked_feature_list();

		//	output_pmc_rank_results(fm,charge,test_ssfs);

		//	exit(0);

			ind_initialized_pmcr = true;
		//	string path;
		//	path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCRtt.txt";
		//	this->write_pmc_rank_models(path.c_str());
			
		}
	}

	string path;
	path = config->get_resource_dir() + "/" + config->get_model_name() + "_PMCR.txt";
	this->write_pmc_rank_models(path.c_str());
	ind_initialized_pmcr = true;
}


// One candidate peak pair: how far the pair's mass sum lies from the
// expected sum, together with the combined intensity of the two peaks
// (this is how calc_pmc_rank_stats_for_mass fills these records).
struct offset_pair {
	// Default record: offset starts at POS_INF, intensity sum at zero.
	offset_pair()
		: offset(POS_INF), inten_sum(0)
	{
	}

	// Record with an explicit offset and intensity sum.
	offset_pair(mass_t off, float inten)
		: offset(off), inten_sum(inten)
	{
	}

	mass_t offset;     // distance of the pair's mass sum from the expected sum
	float  inten_sum;  // summed intensity of the two peaks in the pair
};

// Ordering predicate on offset: returns true when a's offset is strictly
// smaller than b's, i.e. it orders pairs by ascending offset.
bool cmp_offset_pair_offset (const offset_pair& a, const offset_pair& b)
{
	const bool a_has_smaller_offset = (a.offset < b.offset);
	return a_has_smaller_offset;
}

// Ordering predicate on intensity: returns true when a's intensity sum is
// strictly greater than b's, i.e. it orders pairs by descending intensity.
bool cmp_offset_pair_inten (const offset_pair& a, const offset_pair& b)
{
	// Strict comparison: two pairs with equal intensity sums compare false
	// in both directions.
	return (b.inten_sum < a.inten_sum);
}


// Computes a mean absolute offset over a fixed number of slots.
//
// The first num_offsets entries of offsets_by_inten are used (callers in
// this file pass offsets ordered by descending pair intensity). If fewer
// entries are available, every missing slot is charged missing_pair_offset
// instead, so short lists are penalized rather than averaged over fewer
// values.
//
// @param offsets_by_inten  offsets to average; entries beyond the first
//                          num_offsets are ignored.
// @return 1000 when the list is empty; otherwise the sum of the absolute
//         values of the used entries plus the missing-slot penalties,
//         divided by num_offsets.
float calc_mean_abs_offset(const vector<float>& offsets_by_inten)
{
	const float missing_pair_offset = 0.5;   // penalty charged per missing slot
	const int   num_offsets         = 3;     // number of slots that are averaged
	const float no_pairs_value      = 1000;  // sentinel returned for an empty list

	if (offsets_by_inten.empty())
		return no_pairs_value;

	float abs_off = 0;
	int   num_used = 0;
	while (num_used < num_offsets &&
		   static_cast<size_t>(num_used) < offsets_by_inten.size())
	{
		abs_off += fabs(offsets_by_inten[num_used]);
		num_used++;
	}

	// Pad the unused slots; tied to num_offsets (not a literal) so the padding
	// and the divisor below can never disagree.
	abs_off += (num_offsets - num_used) * missing_pair_offset;

	return (abs_off / num_offsets);
}


void calc_pmc_rank_stats_for_mass(const QCPeak *peaks, 
										  int num_peaks, 
										  mass_t single_charge_pair_sum,
										  mass_t tolerance, 
										  const vector<float>& iso_levels,
										  const vector<bool>& strong_inds,
										  const vector<bool>& strict_iso_inds,
										  PMCRankStats& stats)
{
	const mass_t min_single_sum = single_charge_pair_sum - tolerance;
	const mass_t max_single_sum = single_charge_pair_sum + tolerance;

	const mass_t min_double_sum = min_single_sum + 1.0;
	const mass_t max_double_sum = max_single_sum + 1.0;
	const mass_t double_charge_pair_sum = single_charge_pair_sum +1.0;

	const mass_t min_single_h2o_sum = min_single_sum - MASS_H2O;
	const mass_t max_single_h2o_sum = max_single_sum - MASS_H2O;
	const mass_t single_charge_pair_h2o_sum = single_charge_pair_sum - MASS_H2O;

	const mass_t min_double_h2o_sum = min_double_sum - MASS_H2O;
	const mass_t max_double_h2o_sum = max_double_sum - MASS_H2O;
	const mass_t double_charge_pair_h2o_sum = double_charge_pair_sum - MASS_H2O;

	static vector<offset_pair> by_pairs,  strong_pairs;
	static vector<offset_pair> c2_pairs,  strong_c2_pairs;
	static vector<offset_pair> h2o_pairs, c2_h2o_pairs;

	by_pairs.clear();
	strong_pairs.clear();
	c2_pairs.clear();
	strong_c2_pairs.clear();
	h2o_pairs.clear();
	c2_h2o_pairs.clear();

	stats.clear();

	int forward_idx = -1;
	int back_idx = num_peaks-1;

	// find pairs of b/y
	while (forward_idx<back_idx)
	{
		forward_idx++;
		if (iso_levels[forward_idx]>0)
		{
			continue;
		}

		while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max_single_sum)
			back_idx--;

		if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min_single_sum)
		{
			if (iso_levels[back_idx]>0)
				continue;

			const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_sum);
			const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
					
			by_pairs.push_back(offset_pair(offset,inten_sum));
			stats.inten_frag_pairs += inten_sum;

			if (strong_inds[forward_idx] || strong_inds[back_idx])
			{
				strong_pairs.push_back(offset_pair(offset,inten_sum));
				stats.inten_strong_pairs += inten_sum;
			}
		}
	}

	// find pairs b/y2
	forward_idx = -1;
	back_idx = num_peaks-1;

	const int last_idx =num_peaks-1;
	while (forward_idx<last_idx)
	{
		forward_idx++;
		if (iso_levels[forward_idx]>0)
			continue;
			
		mass_t sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
		while (back_idx>=0 && sum>max_double_sum)
		{
			back_idx--;
			if (back_idx<0)
				break;
			sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
		}

		if (back_idx>=0 && sum > min_double_sum)
		{
			if (iso_levels[back_idx]>0)
				continue;

			const mass_t offset = fabs(sum - double_charge_pair_sum);
			const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
			
			c2_pairs.push_back(offset_pair(offset,inten_sum));
			stats.inten_c2_pairs += inten_sum;

			if (strong_inds[forward_idx] || strong_inds[back_idx])
			{
				strong_c2_pairs.push_back(offset_pair(offset,inten_sum));
				stats.inten_c2_strong_pairs = inten_sum;
			}
		}
	}

	// find pairs of b/y-H2O
	forward_idx = -1;
	back_idx = num_peaks-1;

	while (forward_idx<back_idx)
	{
		forward_idx++;
		if (iso_levels[forward_idx]>0)
		{
			continue;
		}

		while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max_single_h2o_sum)
			back_idx--;

		if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min_single_h2o_sum)
		{
			if (iso_levels[back_idx]>0)
				continue;

			const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_h2o_sum);
			const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
					
			h2o_pairs.push_back(offset_pair(offset,inten_sum));
			stats.inten_h2o_loss_frag_pairs += inten_sum;
		}
	}

	// find pairs b/y2 - H2O
	forward_idx = -1;
	back_idx = num_peaks-1;

	while (forward_idx<last_idx)
	{
		forward_idx++;
		if (iso_levels[forward_idx]>0)
			continue;
			
		mass_t sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
		while (back_idx>=0 && sum>max_double_h2o_sum)
		{
			back_idx--;
			if (back_idx<0)
				break;
			sum = 2*peaks[forward_idx].mass + peaks[back_idx].mass;
		}

		if (back_idx>=0 && sum > min_double_h2o_sum)
		{
			if (iso_levels[back_idx]>0)
				continue;

			const mass_t offset = fabs(sum - double_charge_pair_h2o_sum);
			const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
			
			c2_h2o_pairs.push_back(offset_pair(offset,inten_sum));
			stats.itnen_h2o_loss_c2_frag_pairs += inten_sum;
		}
	}

	stats.num_frag_pairs = by_pairs.size();
	stats.num_strong_frag_pairs = strong_pairs.size();
	stats.num_c2_frag_pairs = c2_pairs.size();
	stats.num_strong_c2_frag_pairs = strong_c2_pairs.size();
	stats.num_h2o_loss_frag_pairs = h2o_pairs.size();
	stats.num_h2o_loss_c2_frag_pairs = c2_h2o_pairs.size();

	int i;

	vector<float>& offset_pairs_ordered_by_inten = stats.offset_pairs_ordered_by_inten;
	sort(by_pairs.begin(),by_pairs.end(),cmp_offset_pair_inten);
	offset_pairs_ordered_by_inten.resize(by_pairs.size());
	for (i=0; i<by_pairs.size(); i++)
		offset_pairs_ordered_by_inten[i]=by_pairs[i].offset;
	stats.mean_offset_pairs=calc_mean_abs_offset(offset_pairs_ordered_by_inten);

	vector<float>& strong_offset_pairs_ordered_by_inten = stats.strong_offset_pairs_ordered_by_inten;
	sort(strong_pairs.begin(),strong_pairs.end(),cmp_offset_pair_inten);
	strong_offset_pairs_ordered_by_inten.resize(strong_pairs.size());
	for (i=0; i<strong_pairs.size(); i++)
		strong_offset_pairs_ordered_by_inten[i]=strong_pairs[i].offset;
	stats.mean_offset_strong_pairs=calc_mean_abs_offset(strong_offset_pairs_ordered_by_inten);

	vector<float>& c2_offset_pairs_ordered_by_inten = stats.c2_offset_pairs_ordered_by_inten;
	sort(c2_pairs.begin(),c2_pairs.end(),cmp_offset_pair_inten);
	c2_offset_pairs_ordered_by_inten.resize(c2_pairs.size());
	for (i=0; i<c2_pairs.size(); i++)
		c2_offset_pairs_ordered_by_inten[i]=c2_pairs[i].offset;
	stats.mean_offset_c2_pairs=calc_mean_abs_offset(c2_offset_pairs_ordered_by_inten);



	// fill in additional iso sum features (look at pairs that sum to expected, expected+1 expected+2)
	
	// find pairs of b/y

	static vector<offset_pair> pairs0,  pairs1, pairs2;
	static vector<offset_pair> c2_pairs0,  c2_pairs1, c2_pairs2;
	
	pairs0.clear();
	forward_idx = -1;
	back_idx = num_peaks-1;
	while (forward_idx<back_idx)
	{
		forward_idx++;
		if (strict_iso_inds[forward_idx])
			continue;

		while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max_single_sum)
			back_idx--;

		if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min_single_sum)
		{
			if (strict_iso_inds[back_idx])
				continue;

			const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_sum);
			const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
					
			pairs0.push_back(offset_pair(offset,inten_sum));
		}
	}

	pairs1.clear();
	forward_idx = -1;
	back_idx = num_peaks-1;
	const mass_t max1 = max_single_sum+1.0;
	const mass_t min1 = min_single_sum+1.0;
	while (forward_idx<back_idx)
	{
		forward_idx++;
	
		while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max1)
			back_idx--;

		if (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass > min1)
		{
			if (! (strict_iso_inds[back_idx] || strict_iso_inds[forward_idx]))
				continue;

			const mass_t offset = fabs(peaks[forward_idx].mass + peaks[back_idx].mass - single_charge_pair_sum);
			const float inten_sum = peaks[forward_idx].intensity + peaks[back_idx].intensity;
					
			pairs1.push_back(offset_pair(offset,inten_sum));
		}
	}

	pairs2.clear();
	forward_idx = -1;
	back_idx = num_peaks-1;
	const mass_t max2 = max_single_sum+2.0;
	const mass_t min2 = min_single_sum+2.0;
	while (forward_idx<back_idx)
	{
		forward_idx++;
	
		while (back_idx>=0 && peaks[forward_idx].mass + peaks[back_idx].mass>max2)
			back_idx--;

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -