📄 quickclustering.h
字号:
#ifndef __QUICKCLUSTERING_H__
#define __QUICKCLUSTERING_H__
#include "FileManagement.h"
#include "includes.h"
#define XML_BUFFER_SIZE 724288
#define XML_BUFFER_HALF_SIZE 362144
#define LARGISH_CLUSTER_SIZE 6
#define NUM_CLUSTERS_PER_FILE 10000
#define DAT_BUFF_SIZE 10000000
#define DAT_FILE_INCREMENT 25.0
#define NUM_TOP_CLUSTER_PEAKS 5
#define MAX_SIZE_FOR_SQS_REP 4
// values at which the cluster should be remade
const int cluster_reset_values[]={2,3,4,6,9,15,30,50,100,200};
const int num_cluster_reset_values = sizeof(cluster_reset_values)/sizeof(int);
struct MassRankPair {
bool operator< (const MassRankPair& other) const
{
return (rank<other.rank);
}
mass_t mass;
int rank;
};
struct QCPeak {
QCPeak() : mass(-1), intensity(0), scaled_intensity(0), adjusted_inten(0),
num_occurences(1), max_num_occurences(0), source_spec_idx(-1) {};
mass_t mass;
intensity_t intensity;
intensity_t scaled_intensity;
float adjusted_inten; // holds the adjusted value of the peak intensity for
int num_occurences;
int max_num_occurences;
int source_spec_idx;
};
struct BasicSpectrum {
public:
BasicSpectrum() : peaks(NULL), prm_peaks(NULL), num_peaks(-1), num_prm_peaks(-1),
ssf(NULL), squared_adjusted_inten(-1), signal_level(0) {};
void output_to_mgf(ostream& mgf, Config *config, const char *seq=NULL) const;
void output_to_mgf(FILE* mgf_stream, Config *config, const char *seq=NULL) const;
// returns number of peaks that match the given masses
int get_number_of_matching_peaks(mass_t tolerance, const vector<mass_t>& masses) const;
float calc_signal_level();
void calc_peak_isotope_levels(mass_t tolerance, vector<float>& iso_levels) const;
void mark_all_possible_isotope_peaks(mass_t tolerance, vector<bool>& iso_inds) const;
bool select_strong_peak_idxs(const vector<float>& iso_levels, vector<bool>& indicators) const;
void print_peaks() const;
QCPeak *peaks;
QCPeak *prm_peaks;
int num_peaks;
int num_prm_peaks;
SingleSpectrumFile *ssf;
float squared_adjusted_inten;
float signal_level;
};
// Similar to FileSet but has minimum overhead and works with BasicSpectra
class BasicSpecReader {
public:
BasicSpecReader() : max_peak_list_size(1000),
mgf_stream(NULL), current_mgf_file_idx(-1),
mzxml_stream(NULL), current_mzxml_file_idx(-1),
current_dat_file_idx(-1), current_ms2_file_idx(-1) {};
~BasicSpecReader() { if (mgf_stream) fclose(mgf_stream);
if (mzxml_stream) fclose(mzxml_stream);}
// Reads the important info from the single spectrum
// pretty much does what the get_next_spectrum() does with the FileSet
// but without much of the overhead. Returns number of peaks read (after
// joining close adjacent peaks)
int read_basic_spec(Config *config,
const FileManager& fm,
SingleSpectrumFile *ssf,
QCPeak* peaks,
bool override_file_idx = false,
bool no_processing = false);
private:
int max_peak_list_size;
FILE *mgf_stream; // the current MGF file being scanned (its open stream)
int current_mgf_file_idx; // the file index of the current mgf that is open
FILE *mzxml_stream; // the current MZXML file being scanned (its open stream)
int current_mzxml_file_idx; // the file index of the current MZXML that is open
ifstream dat_file;
int current_dat_file_idx;
FILE *ms2_stream;
int current_ms2_file_idx;
vector<QCPeak> peak_list; // used for temporary storage of a spectrum's peak list
// these functions just extract the peak list from the spectrum file, return the actual
// number of peaks (after joining)
int get_peak_list_from_DTA(const char* dta_name);
int get_peak_list_from_MGF(FILE *mgf_stream);
int get_peak_list_from_MZXML(FILE *mzxml_stream);
int get_peak_list_from_DAT(ifstream& dat_file, QCPeak *peaks);
int get_peak_list_from_MS2(FILE *ms2_stream);
int get_peak_list_from_PKL(const string& pkl_path);
};
struct PeakListPointer {
QCPeak *peaks;
int num_peaks;
};
struct CutProb {
bool operator< (const CutProb& other) const
{
return (mass < other.mass);
}
float mass;
float prob;
};
class ClusterSpectrum {
friend class QCOutputter;
public:
ClusterSpectrum() : tmp_cluster_idx(-1), sim_matrix_row_start(NULL), charge(0), config(NULL), m_over_z(-1), retention_time(-1),
tolerance(-1),
maximum_peaks_vector_size(0), maximum_good_peaks_to_output(0),
num_spectra_in_cluster(0), best_sqs_spec_idx(-1) {};
void create_new_cluster(Config *config, BasicSpectrum& bs, int cluster_idx);
// adds the spectrum to this cluster. If the total number of spectra
// is one of the "reset" values (2,3,4,6,9,15,30,50,100), the cluster is
// recreated.
void add_spectrum_to_cluster(BasicSpectrum& bs,
const vector<int>& spec_top_idxs,
float top_x_masses[NUM_TOP_CLUSTER_PEAKS]);
// tries to add the cluster
// succeeds only if the similarity of the two originals to the new consensus
// is above the sim_tresh (returns true if it made the addition, false otherwise)
bool add_cluster(ClusterSpectrum& cs, float sim_thresh);
void filter_peaks_with_slidinig_window();
const vector<BasicSpectrum>& get_basic_spectra() const { return basic_spectra; }
// sets the m_over_z as the average of the m_over_z of the basic_spectra
void set_cluster_m_over_z();
// checks that all basic spectra have the same charge, otherwise cahrge =0
void set_charge();
void set_charge(int c) { charge=c; }
void set_title( const string& new_title) { title = new_title; }
int get_num_basic_spectra() const { return basic_spectra.size(); }
int get_tmp_cluster_idx() const { return tmp_cluster_idx; }
void set_tmp_cluster_idx(int idx) { tmp_cluster_idx=-1; }
unsigned char *get_sim_matrix_row_start() const { return sim_matrix_row_start; }
void set_sim_matrix_row_start(unsigned char *pos) { sim_matrix_row_start = pos; }
float get_retention_time() const { return retention_time; }
void set_retention_time(float r) { retention_time = r; }
mass_t get_m_over_z() const { return m_over_z; }
const string& get_title() const { return title; }
BasicSpectrum& get_basic_spectrum(int idx) { return basic_spectra[idx]; }
void set_basic_spectra(vector<BasicSpectrum>& new_spectra) { basic_spectra = new_spectra; }
const vector<int>& get_top_ranked_idxs() const { return top_ranked_peak_idxs; }
void set_top_ranked_idxs(const vector<int>& top_idxs) { top_ranked_peak_idxs = top_idxs; }
void set_top_masses(float top_x_masses[NUM_TOP_CLUSTER_PEAKS])
{
int i;
for (i=0; i<NUM_TOP_CLUSTER_PEAKS; i++)
top_peak_masses[i]=top_x_masses[i];
}
float* get_top_peak_masses() const { return (float *)top_peak_masses; }
bool find_match_in_top_masses(float top_x_masses[NUM_TOP_CLUSTER_PEAKS]) const
{
int a_idx=0;
int b_idx=0;
while (1)
{
float diff = top_x_masses[a_idx] - top_peak_masses[b_idx];
if (diff<0.65 && diff>-0.65)
return true;
if (diff<0)
{
if (++a_idx==NUM_TOP_CLUSTER_PEAKS)
return false;
continue;
}
if (++b_idx==NUM_TOP_CLUSTER_PEAKS)
return false;
}
}
const QCPeak * get_peaks_pointer() const { return &peaks[0]; }
int get_num_peaks() const { return peaks.size(); }
const string& get_peptide_str() const { return peptide_str; }
void set_peptide_str(string& str) { peptide_str = str; }
Config *get_config() { return config; }
void set_config(Config *c) { config = c; }
int get_charge() const { return charge; }
// joins the spectra to form a new cluster spectrum
void create_cluster_by_binning_basic_spectra();
static void init_statics(Config *config)
{
init_min_num_occurences(config->get_tolerance());
}
static void set_num_top_peaks_per_1000_da(int val) { num_top_peaks_per_1000_da = val; }
static int get_num_top_peaks_per_1000_da() { return num_top_peaks_per_1000_da; }
// sets the consensus spectrum to be the basic spectrum with the maximal similarity
// to other spectra. Returns the idx of the most similar spectrum
int select_max_similarity_spectrum_as_consensus();
int select_max_sqs_spectrum_as_consensus();
// creates the title (file name) for the cluster
void make_title(string& name, int batch_idx, int cluster_idx);
// writes the spectrum to the output file in the mgf format
void write_spectrum_to_mgf(ostream& mgf, bool write_peak_count=false) const;
void write_spectrum_to_pkl_single(string& file_name) const;
void write_cluster_basic_spectra_to_mgf(char *mgf_file) const;
// finds how many spectra have a peptide sequence that doesn't match the majority assignment
int get_num_misassigned_spectra(mass_t pm_tolerance= 0.08,
int *mismatched_with_pep = NULL) const;
// returns true if there is a mjority annotation (with 75% of the annotated spectra)
// if so, returns its string and mass in the variables
bool has_majority_annotation(string& pep_str, mass_t& pep_mass) const;
int get_num_basic_spectra_with_peptide() const;
void print_cluster_peptides() const;
void print_cluster_similarities();
void print_explained_intensity_stats(Peptide& pep) const;
private:
int tmp_cluster_idx; // the idx given when the clustering is done, -1 means invalid cluster
unsigned char *sim_matrix_row_start; // the address of this clusters entry in the sim_matrix
int charge;
Config *config;
mass_t m_over_z;
float retention_time; // the average time for the spectra in the cluster
mass_t tolerance;
int maximum_peaks_vector_size;
int maximum_good_peaks_to_output;
int num_spectra_in_cluster;
int best_sqs_spec_idx; // holds the basic spectrum idx for the spectrum with the
// highest sqs score, otherwise holds -1 (if the cluster is
// bigger than MAX_SIZE_FOR_SQS_REP
string title;
string peptide_str; // if the cluster as an assigned peptide
float top_peak_masses[NUM_TOP_CLUSTER_PEAKS]; // the added spectrum needs to match one of these
vector<QCPeak> peaks;
vector<BasicSpectrum> basic_spectra;
vector<int> top_ranked_peak_idxs; // idxs of top ranked peaks (sorted list by idx)
void create_consensus_by_binning_basic_spectra();
// adds the given list to the clusters current list.
// if the new cluster is larger than X, peak scores are weighted according
// to the percentage in which the peaks appear
// list of peaks is then filtered to remove excess peaks
bool add_peak_list(const QCPeak *second_peaks,
int num_second_peaks,
mass_t tolerance,
int num_basic_spectra_added,
bool need_to_scale = true);
void create_consensus_sepctrum_from_peak_list_pointers(
vector<PeakListPointer>& plp,
int total_num_peaks);
// recursively merges the peak lists from the various spectra
// performs a merge until the pointers list has only one entry
void merge_peak_lists(vector<QCPeak>& org_peaks,
vector<QCPeak>& new_peaks,
vector<PeakListPointer>& pointers);
// joins adjacent peaks, markes invaldiated peaks by assigning their mass to -1
// and weeds them out
void join_merged_peak_lists(PeakListPointer& plp,
PeakListPointer& alt_plp,
int num_merged_spectra,
mass_t tolerance);
// selects the consensus peaks - those that appear more than the expected cutoff
// also takes some of the stronger peaks that didn't make the cutoff
// writes the selected peaks into the peaks of the cluster spectrum
void select_consensus_peaks(PeakListPointer& plp,
PeakListPointer& alt_plp,
int num_org_spectra);
// static
static void increase_tmp_storage_size(int num_peaks);
static void init_min_num_occurences(mass_t tolerance);
static vector<QCPeak> tmp_peak_area1; // used in peak merging
static vector<QCPeak> tmp_peak_area2;
static vector<int> min_num_occurences;
static int num_top_peaks_per_1000_da;
static float large_num_occurence_ratios;
static int large_cluster_size;
};
bool mark_top_peaks_with_sliding_window(const QCPeak *peaks, int num_peaks, mass_t window_size,
int num_peaks_per_window, vector<bool>& indicators);
/************************************************************
// Functions for the dot product similarity distance
*************************************************************/
void select_top_peak_idxs(QCPeak *peaks, int num_peaks,
mass_t m_over_z, mass_t tolerance,
vector<int>& top_ranked_peak_idxs,
float top_x_masses[NUM_TOP_CLUSTER_PEAKS]=NULL,
int top_peaks_per_100da = 20,
Config *config = NULL);
// sets the adjusted intensity of the peaks
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -