⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 quickclustering.h

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 H
📖 第 1 页 / 共 2 页
字号:
#ifndef __QUICKCLUSTERING_H__
#define __QUICKCLUSTERING_H__

#include "FileManagement.h"
#include "includes.h"


#define XML_BUFFER_SIZE 724288
#define XML_BUFFER_HALF_SIZE 362144

#define LARGISH_CLUSTER_SIZE 6
#define NUM_CLUSTERS_PER_FILE 10000
#define DAT_BUFF_SIZE 10000000
#define DAT_FILE_INCREMENT 25.0


#define NUM_TOP_CLUSTER_PEAKS 5

#define MAX_SIZE_FOR_SQS_REP 4


// values at which the cluster should be remade
const int cluster_reset_values[]={2,3,4,6,9,15,30,50,100,200};
const int num_cluster_reset_values = sizeof(cluster_reset_values)/sizeof(int);


struct MassRankPair {
	
	bool operator< (const MassRankPair& other) const
	{
		return (rank<other.rank);
	}

	mass_t mass;
	int    rank;
};


struct QCPeak {
	QCPeak() : mass(-1), intensity(0), scaled_intensity(0), adjusted_inten(0),
			   num_occurences(1), max_num_occurences(0), source_spec_idx(-1) {};

	mass_t mass;
	intensity_t intensity;
	intensity_t scaled_intensity;
	float   adjusted_inten;  // holds the adjusted value of the peak intensity for
	int	    num_occurences;
	int		max_num_occurences;
	int	    source_spec_idx;
};


struct BasicSpectrum {
public:
	BasicSpectrum() : peaks(NULL), prm_peaks(NULL), num_peaks(-1), num_prm_peaks(-1),
		ssf(NULL), squared_adjusted_inten(-1),	signal_level(0) {};

	void output_to_mgf(ostream& mgf, Config *config, const char *seq=NULL) const;

	void output_to_mgf(FILE* mgf_stream, Config *config, const char *seq=NULL) const;

	// returns number of peaks that match the given masses
	int get_number_of_matching_peaks(mass_t tolerance, const vector<mass_t>& masses) const;

	float calc_signal_level();

	void  calc_peak_isotope_levels(mass_t tolerance, vector<float>& iso_levels) const;
	void  mark_all_possible_isotope_peaks(mass_t tolerance, vector<bool>& iso_inds) const;

	bool  select_strong_peak_idxs(const vector<float>& iso_levels, vector<bool>& indicators) const;
	
	void print_peaks() const;

	QCPeak *peaks;
	QCPeak *prm_peaks;

	int     num_peaks;
	int		num_prm_peaks;
	SingleSpectrumFile *ssf;
	float   squared_adjusted_inten;
	float	signal_level;
};





// Similar to FileSet but has minimum overhead and works with BasicSpectra
class BasicSpecReader {
public:

	BasicSpecReader() : max_peak_list_size(1000), 
						mgf_stream(NULL), current_mgf_file_idx(-1),
						mzxml_stream(NULL), current_mzxml_file_idx(-1),
						current_dat_file_idx(-1), current_ms2_file_idx(-1) {};

	~BasicSpecReader() { if (mgf_stream) fclose(mgf_stream);
						 if (mzxml_stream) fclose(mzxml_stream);}
						 

	// Reads the important info from the single spectrum
	// pretty much does what the get_next_spectrum() does with the FileSet
	// but without much of the overhead. Returns number of peaks read (after
	// joining close adjacent peaks)
	int read_basic_spec(Config *config, 
						const FileManager& fm, 
						SingleSpectrumFile *ssf, 
						QCPeak* peaks,
						bool override_file_idx = false,
						bool no_processing = false);

private:

	int max_peak_list_size;

	FILE *mgf_stream;          // the current MGF file being scanned (its open stream)
	int current_mgf_file_idx;  // the file index of the current mgf that is open

	FILE *mzxml_stream;          // the current MZXML file being scanned (its open stream)
	int current_mzxml_file_idx;  // the file index of the current MZXML that is open

	ifstream dat_file;
	int current_dat_file_idx;

	FILE *ms2_stream;
	int current_ms2_file_idx;

	vector<QCPeak> peak_list;      // used for temporary storage of a spectrum's peak list

	// these functions just extract the peak list from the spectrum file, return the actual
	// number of peaks (after joining)
	int get_peak_list_from_DTA(const char* dta_name);
	int get_peak_list_from_MGF(FILE *mgf_stream);
	int get_peak_list_from_MZXML(FILE *mzxml_stream);
	int get_peak_list_from_DAT(ifstream& dat_file, QCPeak *peaks);
	int get_peak_list_from_MS2(FILE *ms2_stream);
	int get_peak_list_from_PKL(const string& pkl_path);
};



struct PeakListPointer {
	QCPeak *peaks;
	int num_peaks;
};

struct CutProb {
	bool operator< (const CutProb& other) const
	{
		return (mass < other.mass);
	}

	float mass;
	float prob;
};

class ClusterSpectrum {
	friend class QCOutputter;
public:
	ClusterSpectrum() : tmp_cluster_idx(-1), sim_matrix_row_start(NULL), charge(0), config(NULL), m_over_z(-1), retention_time(-1),
		                tolerance(-1), 
						maximum_peaks_vector_size(0), maximum_good_peaks_to_output(0),
						num_spectra_in_cluster(0), best_sqs_spec_idx(-1) {};


	void create_new_cluster(Config *config, BasicSpectrum& bs,  int cluster_idx);
	
	// adds the spectrum to this cluster. If the total number of spectra 
	// is one of the "reset" values (2,3,4,6,9,15,30,50,100), the cluster is
	// recreated.
	void add_spectrum_to_cluster(BasicSpectrum& bs, 
								 const vector<int>& spec_top_idxs,
								 float top_x_masses[NUM_TOP_CLUSTER_PEAKS]);

	// tries to add the cluster
	// succeeds only if the similarity of the two originals to the new consensus
	// is above the sim_tresh (returns true if it made the addition, false otherwise)
	bool add_cluster(ClusterSpectrum& cs, float sim_thresh);

	void filter_peaks_with_slidinig_window();
	
	const vector<BasicSpectrum>& get_basic_spectra() const { return basic_spectra; }

	// sets the m_over_z as the average of the m_over_z of the basic_spectra
	void set_cluster_m_over_z();

	// checks that all basic spectra have the same charge, otherwise cahrge =0
	void set_charge();
	void set_charge(int c) { charge=c; }

	void set_title( const string& new_title) { title = new_title; }
	int  get_num_basic_spectra() const { return basic_spectra.size(); }
	int  get_tmp_cluster_idx() const { return tmp_cluster_idx; }
	void set_tmp_cluster_idx(int idx) { tmp_cluster_idx=-1; }

	unsigned char *get_sim_matrix_row_start() const { return sim_matrix_row_start; }
	void set_sim_matrix_row_start(unsigned char *pos) { sim_matrix_row_start = pos; }


	float get_retention_time() const { return retention_time; }
	void  set_retention_time(float r) { retention_time = r; }

	mass_t get_m_over_z() const { return m_over_z; }

	const string& get_title() const { return title; }
	BasicSpectrum& get_basic_spectrum(int idx) { return basic_spectra[idx]; }
	void set_basic_spectra(vector<BasicSpectrum>& new_spectra) { basic_spectra = new_spectra; }

	const vector<int>& get_top_ranked_idxs() const { return top_ranked_peak_idxs; }

	void set_top_ranked_idxs(const vector<int>& top_idxs) { top_ranked_peak_idxs = top_idxs; }

	void set_top_masses(float top_x_masses[NUM_TOP_CLUSTER_PEAKS]) 
	{
		int i;
		for (i=0; i<NUM_TOP_CLUSTER_PEAKS; i++)
			top_peak_masses[i]=top_x_masses[i];
	}


	float* get_top_peak_masses() const { return (float *)top_peak_masses; }

	bool find_match_in_top_masses(float top_x_masses[NUM_TOP_CLUSTER_PEAKS]) const
	{
		int a_idx=0;
		int b_idx=0;
		while (1)
		{
			float diff = top_x_masses[a_idx] - top_peak_masses[b_idx];
			if (diff<0.65 && diff>-0.65)
				return true;

			if (diff<0)
			{
				if (++a_idx==NUM_TOP_CLUSTER_PEAKS)
					return false;
				continue;
			}

			if (++b_idx==NUM_TOP_CLUSTER_PEAKS)
				return false;
		}
	}

	const QCPeak * get_peaks_pointer() const { return &peaks[0]; }
	int   get_num_peaks() const { return peaks.size(); }

	const string& get_peptide_str() const { return peptide_str; }
	void set_peptide_str(string& str) { peptide_str = str; }

	Config *get_config() { return config; }
	void    set_config(Config *c) { config = c; }

	int     get_charge() const { return charge; }

	// joins the spectra to form a new cluster spectrum
	void create_cluster_by_binning_basic_spectra();

	


	static void init_statics(Config *config)
	{
		init_min_num_occurences(config->get_tolerance());
	}

	static void set_num_top_peaks_per_1000_da(int val) { num_top_peaks_per_1000_da = val; }
	static int get_num_top_peaks_per_1000_da()  { return num_top_peaks_per_1000_da; }


	// sets the consensus spectrum to be the basic spectrum with the maximal similarity
	// to other spectra. Returns the idx of the most similar spectrum
	int select_max_similarity_spectrum_as_consensus();

	int select_max_sqs_spectrum_as_consensus();

	// creates the title (file name) for the cluster
	void make_title(string& name, int batch_idx, int cluster_idx);

	// writes the spectrum to the output file in the mgf format
	void write_spectrum_to_mgf(ostream& mgf, bool write_peak_count=false) const;

	void write_spectrum_to_pkl_single(string& file_name) const;

	void write_cluster_basic_spectra_to_mgf(char *mgf_file) const;

	// finds how many spectra have a peptide sequence that doesn't match the majority assignment
	int  get_num_misassigned_spectra(mass_t pm_tolerance= 0.08,
		 int *mismatched_with_pep = NULL) const;

	// returns true if there is a mjority annotation (with 75% of the annotated spectra)
	// if so, returns its string and mass in the variables
	bool has_majority_annotation(string& pep_str, mass_t& pep_mass) const;

	int  get_num_basic_spectra_with_peptide() const;

	void print_cluster_peptides() const;
	
	void print_cluster_similarities();

	void print_explained_intensity_stats(Peptide& pep) const;


private:
	int	   tmp_cluster_idx; // the idx given when the clustering is done, -1 means invalid cluster
	unsigned char  *sim_matrix_row_start;  // the address of this clusters entry in the sim_matrix
	int    charge;

	Config *config;  

	mass_t      m_over_z;
	float       retention_time; // the average time for the spectra in the cluster
	mass_t      tolerance;

	int		    maximum_peaks_vector_size;
	int		    maximum_good_peaks_to_output;
	int			num_spectra_in_cluster;

	int			best_sqs_spec_idx; // holds the basic spectrum idx for the spectrum with the
								   // highest sqs score, otherwise holds -1 (if the cluster is
								   // bigger than MAX_SIZE_FOR_SQS_REP

	string title;
	string peptide_str; // if the cluster as an assigned peptide

	float top_peak_masses[NUM_TOP_CLUSTER_PEAKS]; // the added spectrum needs to match one of these 
								             

	vector<QCPeak>        peaks;
	vector<BasicSpectrum> basic_spectra;
	vector<int>           top_ranked_peak_idxs;  // idxs of top ranked peaks (sorted list by idx)


	void create_consensus_by_binning_basic_spectra();
	
	// adds the given list to the clusters current list.
	// if the new cluster is larger than X, peak scores are weighted according
	// to the percentage in which the peaks appear
	// list of peaks is then filtered to remove excess peaks
	bool add_peak_list(const QCPeak *second_peaks, 
					   int num_second_peaks, 
					   mass_t tolerance,
					   int num_basic_spectra_added, 
					   bool need_to_scale = true);

	void create_consensus_sepctrum_from_peak_list_pointers(
				vector<PeakListPointer>& plp, 
				int total_num_peaks);

	// recursively merges the peak lists from the various spectra
	// performs a merge until the pointers list has only one entry
	void merge_peak_lists(vector<QCPeak>& org_peaks,
						  vector<QCPeak>& new_peaks,
						  vector<PeakListPointer>& pointers);


	// joins adjacent peaks, markes invaldiated peaks by assigning their mass to -1
	// and weeds them out
	void join_merged_peak_lists(PeakListPointer& plp,
								PeakListPointer& alt_plp,
								int num_merged_spectra,
								mass_t tolerance);


	// selects the consensus peaks - those that appear more than the expected cutoff
	// also takes some of the stronger peaks that didn't make the cutoff
	// writes the selected peaks into the peaks of the cluster spectrum
	void select_consensus_peaks(PeakListPointer& plp, 
								PeakListPointer& alt_plp,
								int num_org_spectra);


	// static
	static void increase_tmp_storage_size(int num_peaks);
	static void init_min_num_occurences(mass_t tolerance);
	

	static vector<QCPeak> tmp_peak_area1;  // used in peak merging
	static vector<QCPeak> tmp_peak_area2;
    
	static vector<int>   min_num_occurences;
	static int   num_top_peaks_per_1000_da;
	static float large_num_occurence_ratios;
	static int   large_cluster_size;

};




bool mark_top_peaks_with_sliding_window(const QCPeak *peaks, int num_peaks, mass_t window_size, 
					    int num_peaks_per_window, vector<bool>& indicators);


/************************************************************
// Functions for the dot product similarity distance
*************************************************************/
void select_top_peak_idxs(QCPeak *peaks, int num_peaks, 
						  mass_t m_over_z, mass_t tolerance, 
						  vector<int>& top_ranked_peak_idxs,  
						  float top_x_masses[NUM_TOP_CLUSTER_PEAKS]=NULL,
						  int top_peaks_per_100da = 20,
						  Config *config = NULL);


// sets the adjusted intensity of the peaks

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -