⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 pmcsqs.h

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 H
字号:
#ifndef __PMSQS_H__
#define __PMSQS_H__

#include "ME_REG.h"
#include "RankBoost.h"
#include "QuickClustering.h"
#include "includes.h"

#define MIN_SPECTRA_FOR_PMCSQS_MODEL 100

// The 3 SQS model makes binary decisions: good charge c spectrum or not good for charges 
// c=1,2,3.
// The PMC model assumes that we have a good spectrum and in the correct charge.
// 




typedef enum SQS_Fields 
{ 
	//0
	SQS_CONST, SQS_PEAK_DENSITY,

	SQS_PROP_UPTO2G,  SQS_PROP_UPTO5G, 
	SQS_PROP_UPTO10G, SQS_PROP_MORE10G,

	//14
	SQS_PROP_INTEN_UPTO2G, SQS_PROP_INTEN_UPTO5G, SQS_PROP_INTEN_MORE5G,

	SQS_PROP_ISO_PEAKS, SQS_PROP_STRONG_WITH_ISO_PEAKS,

	//19
	SQS_PROP_ALL_WITH_H2O_LOSS,    SQS_PROP_ALL_WITH_NH3_LOSS,    SQS_PROP_ALL_WITH_CO_LOSS,
	SQS_PROP_STRONG_WITH_H2O_LOSS, SQS_PROP_STRONG_WITH_NH3_LOSS, SQS_PROP_STRONG_WITH_CO_LOSS,

	SQS_C2_PROP_ALL_WITH_H2O_LOSS,    SQS_C2_PROP_ALL_WITH_NH3_LOSS,    SQS_C2_PROP_ALL_WITH_CO_LOSS,
	SQS_C2_PROP_STRONG_WITH_H2O_LOSS, SQS_C2_PROP_STRONG_WITH_NH3_LOSS, SQS_C2_PROP_STRONG_WITH_CO_LOSS,

	SQS_DIFF_ALL_WITH_H2O_LOSS,    SQS_DIFF_ALL_WITH_NH3_LOSS,    SQS_DIFF_ALL_WITH_CO_LOSS,
	SQS_DIFF_STRONG_WITH_H2O_LOSS, SQS_DIFF_STRONG_WITH_NH3_LOSS, SQS_DIFF_STRONG_WITH_CO_LOSS,
	
	SQS_PROP_PEAKS_WITH_C1C2, SQS_PROP_STRONG_PEAKS_WITH_C1C2, SQS_PROP_INTEN_WITH_C1C2,

	SQS_IND_MAX_TAG_LENGTH_ABOVE_4, SQS_IND_MAX_TAG_LENGTH_BELOW_4,
	SQS_MAX_TAG_LENGTH_ABOVE_4,     SQS_MAX_TAG_LENGTH_BELOW_4,

	SQS_PROP_INTEN_IN_TAGS,
	SQS_PROP_TAGS1, SQS_PROP_STRONG_PEAKS_IN_TAG1, SQS_PROP_INTEN_TAG1, 
	SQS_IND_PROP_STRONG_BELOW30_TAG1,
	SQS_PROP_TAGS2, SQS_PROP_STRONG_PEAKS_IN_TAG2, SQS_PROP_INTEN_TAG2,	
	SQS_IND_PROP_STRONG_BELOW20_TAG2,
	SQS_PROP_TAGS3, SQS_PROP_STRONG_PEAKS_IN_TAG3, SQS_PROP_INTEN_TAG3, 
	SQS_IND_PROP_STRONG_BELOW10_TAG3,

	SQS_C2_IND_MAX_TAG_LENGTH_ABOVE_4, SQS_C2IND_MAX_TAG_LENGTH_BELOW_4,
	SQS_C2MAX_TAG_LENGTH_ABOVE_4,      SQS_C2MAX_TAG_LENGTH_BELOW_4,

	
	SQS_C2PROP_INTEN_IN_TAGS,
	SQS_C2PROP_TAGS1, SQS_C2PROP_STRONG_PEAKS_IN_TAG1, SQS_C2PROP_INTEN_TAG1, 
	SQS_IND_C2PROP_STRONG_BELOW30_TAG1,
	SQS_C2PROP_TAGS2, SQS_C2PROP_STRONG_PEAKS_IN_TAG2, SQS_C2PROP_INTEN_TAG2,	
	SQS_IND_C2PROP_STRONG_BELOW20_TAG2,
	SQS_C2PROP_TAGS3, SQS_C2PROP_STRONG_PEAKS_IN_TAG3, SQS_C2PROP_INTEN_TAG3, 
	SQS_IND_C2PROP_STRONG_BELOW10_TAG3,

	SQS_DIFF_MAX_TAG_LENGTH, SQS_DIFF_PROP_INTEN_IN_TAGS,
	SQS_DIFF_PROP_TAGS1, SQS_DIFF_PROP_STRONG_PEAKS_IN_TAG1, SQS_DIFF_PROP_INTEN_TAG1,
	SQS_DIFF_PROP_TAGS2, SQS_DIFF_PROP_STRONG_PEAKS_IN_TAG2, SQS_DIFF_PROP_INTEN_TAG2,
	SQS_DIFF_PROP_TAGS3, SQS_DIFF_PROP_STRONG_PEAKS_IN_TAG3, SQS_DIFF_PROP_INTEN_TAG3,
	
	SQS_PEAK_DENSE_T1,  SQS_PEAK_DENSE_T2,  SQS_PEAK_DENSE_T3,
	SQS_INTEN_DENSE_T1, SQS_INTEN_DENSE_T2, SQS_INTEN_DENSE_T3,

	SQS_PEAK_DENSE_H1,  SQS_PEAK_DENSE_H2,
	SQS_INTEN_DENSE_H1, SQS_INTEN_DENSE_H2,

	SQS_PROP_MZ_RANGE_WITH_33_INTEN,
	SQS_PROP_MZ_RANGE_WITH_50_INTEN,
	SQS_PROP_MZ_RANGE_WITH_75_INTEN,
	SQS_PROP_MZ_RANGE_WITH_90_INTEN,

	// get these from the PM features (max values found)
	// all values are after subtracting the maximum background values
	// obtained by using erroneous parent masses

	SQS_NUM_FRAG_PAIRS_1,			SQS_NUM_STRONG_FRAG_PAIRS_1,
	SQS_NUM_C2_FRAG_PAIRS_1,		SQS_NUM_STRONG_C2_FRAG_PAIRS_1,

	SQS_NUM_FRAG_PAIRS_2,			SQS_NUM_STRONG_FRAG_PAIRS_2,
	SQS_NUM_C2_FRAG_PAIRS_2,		SQS_NUM_STRONG_C2_FRAG_PAIRS_2,

	SQS_NUM_FRAG_PAIRS_3,			SQS_NUM_STRONG_FRAG_PAIRS_3,
	SQS_NUM_C2_FRAG_PAIRS_3,		SQS_NUM_STRONG_C2_FRAG_PAIRS_3,

	SQS_PROP_OF_MAX_FRAG_PAIRS_1,			SQS_PROP_OF_MAX_STRONG_FRAG_PAIRS_1,
	SQS_PROP_OF_MAX_C2_FRAG_PAIRS_1,		SQS_PROP_OF_MAX_STRONG_C2_FRAG_PAIRS_1,

	SQS_PROP_OF_MAX_FRAG_PAIRS_2,			SQS_PROP_OF_MAX_STRONG_FRAG_PAIRS_2,
	SQS_PROP_OF_MAX_C2_FRAG_PAIRS_2,		SQS_PROP_OF_MAX_STRONG_C2_FRAG_PAIRS_2,

	SQS_PROP_OF_MAX_FRAG_PAIRS_3,			SQS_PROP_OF_MAX_STRONG_FRAG_PAIRS_3,
	SQS_PROP_OF_MAX_C2_FRAG_PAIRS_3,		SQS_PROP_OF_MAX_STRONG_C2_FRAG_PAIRS_3,

	SQS_PROP_FRAG_PAIRS_1,			SQS_PROP_STRONG_FRAG_PAIRS_1,
	SQS_PROP_C2_FRAG_PAIRS_1,		SQS_PROP_STRONG_C2_FRAG_PAIRS_1,

	SQS_PROP_FRAG_PAIRS_2,			SQS_PROP_STRONG_FRAG_PAIRS_2,
	SQS_PROP_C2_FRAG_PAIRS_2,		SQS_PROP_STRONG_C2_FRAG_PAIRS_2,

	SQS_PROP_FRAG_PAIRS_3,			SQS_PROP_STRONG_FRAG_PAIRS_3,
	SQS_PROP_C2_FRAG_PAIRS_3,		SQS_PROP_STRONG_C2_FRAG_PAIRS_3,

	SQS_DIFF_NUM_FRAG_PAIRS_23,					SQS_DIFF_NUM_STRONG_FRAG_PAIRS_23,
	SQS_DIFF_NUM_C2_FRAG_PAIRS_23,				SQS_DIFF_NUM_STRONG_C2_FRAG_PAIRS_23,
	SQS_DIFF_PROP_OF_MAX_FRAG_PAIRS_23,			SQS_DIFF_PROP_OF_MAX_STRONG_FRAG_PAIRS_23,
	SQS_DIFF_PROP_OF_MAX_C2_FRAG_PAIRS_23,		SQS_DIFF_PROP_OF_MAX_STRONG_C2_FRAG_PAIRS_23,
	SQS_DIFF_PROP_FRAG_PAIRS_23,				SQS_DIFF_PROP_STRONG_FRAG_PAIRS_23,
	SQS_DIFF_PROP_C2_FRAG_PAIRS_23,				SQS_DIFF_PROP_STRONG_C2_FRAG_PAIRS_23,


	SQS_NUM_FIELDS 
} SQS_Fields;







struct PMCRankStats {

	void clear();

	mass_t m_over_z;

	score_t rank_score;

	float num_frag_pairs;
	float num_strong_frag_pairs;
	float num_c2_frag_pairs;
	float num_strong_c2_frag_pairs;
	float num_h2o_loss_frag_pairs;
	float num_h2o_loss_c2_frag_pairs;

	float inten_frag_pairs;
	float inten_strong_pairs;
	float inten_c2_pairs;
	float inten_c2_strong_pairs;
	float inten_h2o_loss_frag_pairs;
	float itnen_h2o_loss_c2_frag_pairs;

	float mean_offset_pairs;
	float mean_offset_strong_pairs;
	float mean_offset_c2_pairs;
	float mean_offset_c2_strong_pairs;
	float mean_offset_h2o_pairs;
	float mean_offset_c2_h2o_pairs;

	bool ind_pairs_with_min_tol;
	bool ind_strong_pairs_with_min_tol;
	bool ind_c2_pairs_with_min_tol;
	bool ind_c2_strong_pairs_with_min_tol;

	float log_dis_from_pairs_min_tol;
	float log_dis_from_strong_pairs_min_tol;
	float log_dis_from_c2_pairs_min_tol;
	float log_dis_from_c2_strong_pairs_min_tol;

	vector<float> offset_pairs_ordered_by_inten;
	vector<float> strong_offset_pairs_ordered_by_inten;
	vector<float> c2_offset_pairs_ordered_by_inten;

	float num_strict_pairs0, inten_strict_pairs0;
	float num_strict_pairs1, inten_strict_pairs1;
	float num_strict_pairs2, inten_strict_pairs2;

	float c2_num_strict_pairs0, c2_inten_strict_pairs0;
	float c2_num_strict_pairs1, c2_inten_strict_pairs1;
	float c2_num_strict_pairs2, c2_inten_strict_pairs2;
};


// per charge
struct PmcSqsChargeRes {
	
	PmcSqsChargeRes() : mz1(-1), score1(NEG_INF), mz2(-1), score2(NEG_INF), min_comp_prob(-1), sqs_prob(0) {};

	float mz1;
	float score1;
	float mz2;
	float score2;

	float min_comp_prob; // the minimal probaiblity when comparing to other charges
	float sqs_prob;
};

struct  DPColumn {
	int pointers[Val+1];
};


class PMCSQS_Scorer {
public:

	PMCSQS_Scorer() : max_model_charge(0),
				frag_pair_sum_offset(NEG_INF), bin_increment(NEG_INF), 
				ind_initialized_pmcr(false), ind_initialized_sqs(false), config(NULL), 
				curr_spec_total_intensity(0), curr_spec_strong_intensity(0),
				curr_spec_num_strong(0) {};

	bool init_for_current_spec(Config *config, const BasicSpectrum& bs);

	void   set_frag_pair_sum_offset(mass_t offset) { frag_pair_sum_offset = offset; }
	mass_t get_frag_pair_sum_offset() const { return frag_pair_sum_offset; }

	void   set_bin_increment(mass_t inc) { bin_increment = inc; }
	mass_t get_bin_increment() const { return bin_increment; }


	const vector< vector<PMCRankStats> >& get_curr_spec_rank_pmc_tables() const { return curr_spec_rank_pmc_tables; }

	void find_best_mz_values_from_rank_model(const BasicSpectrum& bs, int charge, PmcSqsChargeRes& res);


	float get_pmcsqs_results_for_spectrum(Config *config, const BasicSpectrum& bs,
						vector<PmcSqsChargeRes>& res);

	float get_best_mz_charge(Config *config, const BasicSpectrum& bs, 
						   mass_t* mz1, int* charge1, float *prob1,
						   mass_t* mz2, int* charge2, float *prob2,
						   vector<PmcSqsChargeRes>* all_res = NULL);

	void select_pms_and_charges(Config *config, 
								const BasicSpectrum& bs,
								vector<mass_t>& pms_with_19,
								vector<int>&    charges,
								bool	add_one_Da_offsets = false,
								vector<PmcSqsChargeRes>* all_res = NULL);

	void benchmark_pm_selection(Config *config, FileManager& fm, mass_t pm_val_tol);


	float get_sqs_for_spectrum(Config *config, const BasicSpectrum& bs, 
							   int *charge = NULL, bool verbose = false);

	mass_t get_charge_mz_bias(int charge) const { return charge_mz_biases[charge]; }

	void print_spec(const BasicSpectrum& bs) const;

	void test_pmc(Config *config, char *specs_file, int charge, mass_t min_mass=0,
		mass_t max_mass = POS_INF);

	void output_pmc_rank_results(const FileManager& fm, int charge,
				const vector<SingleSpectrumFile *>& test_ssfs); 


	void train_sqs_models(Config *config, const FileManager& pos_fm, char *neg_list, 
		int specific_charge=-1, vector<vector<float> > *inp_weights = NULL);

	void train_pmc_rank_models(Config *config, const FileManager& fm, int sel_charge = 0, bool overwrite=true);

	bool read_pmc_rank_models(Config *config, char *file);

	void write_pmc_rank_models(const char *path) const;

	bool read_sqs_models(Config *config, char *file);

	void write_sqs_models(const char *path) const;

	void compute_sqs_cum_stats_for_ided(Config *config, char *list);
	void compute_sqs_cum_stats_for_crap(Config *config, char *list);

	void benchmark_sqs(Config *config, char *list, char *anns);

	bool get_ind_initialized_pmcr() const { return ind_initialized_pmcr; }
	bool get_ind_initialized_sqs() const { return ind_initialized_sqs; }

	int get_max_model_charge() const { return max_model_charge; }

	void output_filtered_spectra_to_mgfs(Config *config,
									 const vector<string>& files,
									 char *out_dir,
									 float filter_prob, 
									 int& num_written, 
									 int& num_read);

private:

	vector< vector<RankBoostModel * > >  pmc_rank_models; // charge / size thresholds
	vector< vector<mass_t> >             pmc_rank_mass_thresholds;
	vector< vector<mass_t> >			 pmc_charge_mz_biases;


	vector<mass_t> sqs_mass_thresholds;
	vector< vector<float> > sqs_correction_factors; // sqs prob += this[charge][size_idx]
	vector< vector<float> > sqs_mult_factors;      // sqs prob *= this[charge][size_idx]

	vector< vector< vector<ME_Regression_Model *> > > sqs_models; // charge1, charge2 (charge2=0 is pos 
																  // vs negative classes) / size_idx



	vector<mass_t> charge_mz_biases; // add this to the predicted m/z

	int max_model_charge; // the maximal charge for which we have a model

	mass_t frag_pair_sum_offset; // b+y or c+z - (PM+19)

	mass_t bin_increment;

	bool ind_initialized_pmcr;

	bool ind_initialized_sqs;


	Config *config;

	float curr_spec_total_intensity;
	float curr_spec_strong_intensity;
	int   curr_spec_num_strong;

	vector<bool>  curr_spec_strong_inds;
	vector<float> curr_spec_iso_levels;
	vector<bool>  curr_spec_strict_iso_ind; // holds for each spectrum ind if there is any peak at -1

//	vector< vector<PmcStats> > curr_spec_pmc_tables;
//	vector<PmcStats>		   curr_spec_background_stats;
//	vector<PmcStats>		   curr_spec_maximal_values;

	vector< vector<PMCRankStats> > curr_spec_rank_pmc_tables;
	vector<PMCRankStats>		curr_spec_rank_background_stats;
	vector<PMCRankStats>		curr_spec_rank_maximal_values;
	
	void fill_SQS_DP(const BasicSpectrum& bs, vector<DPColumn>& dp, int farge_charge ) const;


	void fill_fval_vector_with_SQS(const BasicSpectrum& bs,
								   ME_Regression_Sample& sam) const;

	
	void fill_RankBoost_smaples_with_PMC(const BasicSpectrum& bs,
										 int charge,
										 vector<RankBoostSample>& samples) const;

	
	int get_optimal_bin(int true_mz_bin, int charge) const;

	void select_training_sample_idxs(int charge,
		const vector<ME_Regression_Sample>& spec_samples,
		const BasicSpectrum& bs,
		int& correct_idx,
		vector<int>& bad_pmc_idxs) const;

	void select_training_sample_idxs(int charge,
		const vector<RankBoostSample>& spec_samples,
		const BasicSpectrum& bs,
		int& correct_idx,
		vector<int>& bad_pmc_idxs) const;

	int get_rank_model_size_idx(int charge, mass_t pm_with_19) const
	{
		if (pmc_rank_mass_thresholds.size()<=charge)
		{
			cout << "Error: PMC does not support charge " << charge << endl;
			exit(1);
		}

		int i;
		for (i=0; i<pmc_rank_mass_thresholds[charge].size(); i++)
			if (pm_with_19<=pmc_rank_mass_thresholds[charge][i])
				break;
		return i;
	}



	void calculate_curr_spec_pmc_values( const BasicSpectrum& bs, mass_t bin_increment);

	void get_sqs_features_from_pmc_tables(const BasicSpectrum& bs,
										  vector< vector<float> >& sqs_featrues) const;



	void set_pmc_mass_thresholds(int option = 0); // option 1 , fixed for it data

	void set_sqs_mass_thresholds();

	void set_default_sqs_correct_factors();

	void init_sqs_correct_factors(int max_charge, int num_sizes);

	int  get_sqs_size_idx(mass_t mass) const
	{
		int i;
		for (i=0; i<sqs_mass_thresholds.size(); i++)
			if (mass<=sqs_mass_thresholds[i])
				break;
		return i;
	}

	void create_filtered_peak_list_for_sqs(
									  QCPeak *org_peaks, int num_org_peaks,
									  QCPeak *new_peaks, int& num_new_peaks) const;

};







void fill_rank_PMC_stats(int charge,
						  const mass_t single_charge_pair_offset, // the sum of b+y or c+z
						  mass_t minus_range, 
						  mass_t plus_range,
						  mass_t increment,
						  Config *config,
						  const BasicSpectrum& bs,
						  const vector<bool>& strong_inds,
						  const vector<float>& iso_levels,
						  const vector<bool>& strict_iso_inds,
						  vector<PMCRankStats>& pmc_stats_vec);

void calc_pmc_rank_stats_for_mass(const QCPeak *peaks, 
										  int num_peaks, 
										  mass_t single_charge_pair_sum,
										  mass_t tolerance, 
										  const vector<float>& iso_levels,
										  const vector<bool>& strong_inds,
										  const vector<bool>& strict_iso_inds,
										  PMCRankStats& stats);



void create_training_files(Config *config);

void init_PMC_feature_names(vector<string>& names);




#endif

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -