⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 qcbasicspecreader.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 2 页
字号:
#include "QuickClustering.h"
#include "Isotopes.h"


int BasicSpecReader::read_basic_spec(Config* config, 
									 const FileManager& fm,
									 SingleSpectrumFile *ssf, 
									 QCPeak* peaks, 
									 bool override_file_idx,
									 bool no_processing)
{
	const mass_t tolerance = config->get_tolerance();
	int i,num_peaks = -1;

	if (ssf->num_peaks > peak_list.size())
	{
		max_peak_list_size = ssf->num_peaks * 2;
		if (max_peak_list_size<2000)
			max_peak_list_size = 2000;
	
		peak_list.resize(max_peak_list_size);
	}

//	cout << ssf->file_idx << " " << ssf->single_name  << " NUM PEAKS: " << ssf->num_peaks << endl;

	if (ssf->type == DTA)
	{
		num_peaks = get_peak_list_from_DTA(ssf->single_name.c_str());
		if (num_peaks != ssf->num_peaks)
		{
			cout << "Heh heh ... there is an error!!!" << endl;
			exit(1);
		}
	}
	else if (ssf->type == MGF)
	{
		if (this->current_mgf_file_idx != ssf->file_idx)
		{
			if (mgf_stream)
			{
				fclose(mgf_stream);
			}

			int file_idx = ssf->file_idx;
			if (override_file_idx)
				file_idx = 0;

			const string& fname = fm.get_mgf_file(file_idx).mgf_name.c_str();
			
			mgf_stream=fopen(fname.c_str(),"r");
			if (! mgf_stream)
			{
				cout << "Error: couldn't open mgf: " << fname << endl;
				exit(1);
				
			}
			this->current_mgf_file_idx = ssf->file_idx;
		}

		if ( fseek(mgf_stream,ssf->file_pos,SEEK_SET) )
		{
			cout << "Error: could not skip in file!" << endl;
			exit(1);
		}

		num_peaks = get_peak_list_from_MGF(mgf_stream);
		MGF_single *mgf_ssf = (MGF_single *)ssf;
		if (num_peaks>0 && peak_list[0].mass != mgf_ssf->first_peak_mass)
		{
			mgf_ssf->print_ssf_stats(config);
			cout << "Error: mismatch in first peak masses: " << setprecision(5) << mgf_ssf->first_peak_mass << " " << peak_list[0].mass << endl;
			cout << "This error could arise because of Unix/Windows issues." << endl;
			cout << "Try running unix2dos on the input files." << endl;
			exit(1);
		}
	}
	else if (ssf->type == MZXML)
	{
		// overide if the mzXML was read entirely into the MZXML_file
		// via peak buff. Should know if this happend because 
		MZXML_single *mzxml_ssf = (MZXML_single *)ssf;
		if (mzxml_ssf->peak_buff_start_idx>=0)
		{
			int i;
			float *ptr;
			fm.copy_mzxml_peak_buff_ptr(&ptr);

			float *mzxml_peak_buff = ptr + mzxml_ssf->peak_buff_start_idx;
			int idx=0;
			for (i=0; i<mzxml_ssf->num_peaks; i++)
			{
				peaks[i].mass     =mzxml_peak_buff[idx++];
				peaks[i].intensity=mzxml_peak_buff[idx++];
			}
			// peaks already filtered
			return mzxml_ssf->num_peaks;
		}
		else
		{
			if (this->current_mzxml_file_idx != ssf->file_idx)
			{
				if (mzxml_stream)
				{
					fclose(mzxml_stream);
				}

				int file_idx = ssf->file_idx;
				if (override_file_idx)
					file_idx = 0;

				const string& fname = fm.get_mzxml_file(file_idx).mzxml_name.c_str();
				
				mzxml_stream=fopen(fname.c_str(),"r");
				if (! mzxml_stream)
				{
					cout << "Error: couldn't open mzxml: " << fname << endl;
					exit(1);
					
				}
				this->current_mzxml_file_idx = ssf->file_idx;
			}

			if ( fseek(mzxml_stream,ssf->file_pos,0) )
			{
				cout << "Error: could not skip in file!" << endl;
				exit(1);
			}
			num_peaks = get_peak_list_from_MZXML(mzxml_stream);
		}
	}
	else if (ssf->type == DAT)
	{
		if (this->current_dat_file_idx != ssf->file_idx)
		{
			if (dat_file.is_open())
			{
				dat_file.close();
				
			}

			int file_idx = ssf->file_idx;
			if (override_file_idx)
				file_idx = 0;

			const string& fname = fm.get_dat_file(file_idx).dat_name.c_str();
			
			dat_file.open(fname.c_str(), ios::in | ios::binary);
	
			if (! dat_file.is_open())
			{
				cout << "Error: couldn't open dat: " << fname << endl;
				exit(1);
			}
			this->current_dat_file_idx = ssf->file_idx;
		}

		dat_file.seekg(ssf->file_pos);


		// If the peaks are read from a DAT file, there is no need to perform a joining of the peaks
	//	num_peaks = get_peak_list_from_DAT(dat_file, &peak_list[0]);
		num_peaks = get_peak_list_from_DAT(dat_file, peaks);
		if (num_peaks<3)
			num_peaks=-1;
		return num_peaks;	
	}
	else if (ssf->type == PKL)
	{
		int file_idx = ssf->file_idx;
		if (override_file_idx)
				file_idx = 0;

		const string& pkl_name = fm.get_pkl_dir(file_idx).dir_path;
		string file_path = pkl_name + "/" + ssf->single_name;

		num_peaks = get_peak_list_from_PKL(file_path);
	}
	else if (ssf->type == MS2)
	{
		if (this->current_ms2_file_idx != ssf->file_idx)
		{
			if (ms2_stream)
			{
				fclose(ms2_stream);
			}

			const string& fname = fm.get_ms2_file(ssf->file_idx).ms2_name.c_str();
			
			ms2_stream=fopen(fname.c_str(),"rb");
			if (! mgf_stream)
			{
				cout << "Error: couldn't open ms2: " << fname << endl;
				exit(1);
				
			}
			this->current_ms2_file_idx = ssf->file_idx;
		}

		if ( fseek(ms2_stream,ssf->file_pos,SEEK_SET) )
		{
			cout << "Error: could not skip in file!" << endl;
			exit(1);
		}

		num_peaks = get_peak_list_from_MS2(ms2_stream);
	
	}
	else
	{
		cout << "Error: invalid file type:" << ssf->type <<  endl;
		exit(1);
	}

	if (num_peaks<3)
		num_peaks=-1;


	// copy peaks as is
	if (no_processing)
	{
		int i;
		for (i=0; i<num_peaks; i++)
			peaks[i]=peak_list[i];

		return num_peaks;
	}


	// join adjacent peaks
	const mass_t join_tolerance = (tolerance < 0.05 ? tolerance : 0.6 * tolerance);
	int p_idx=0;
	i=1;
	while (i<num_peaks)
	{
		
		if (peak_list[i].mass - peak_list[p_idx].mass<=join_tolerance)
		{
			intensity_t inten_sum = peak_list[i].intensity + peak_list[p_idx].intensity;
			mass_t new_mass = (peak_list[i].intensity * peak_list[i].mass + 
							   peak_list[p_idx].intensity * peak_list[p_idx].mass ) / inten_sum;

			peak_list[p_idx].mass = new_mass;
			peak_list[p_idx].intensity = inten_sum;	
		}
		else
		{
			peak_list[++p_idx]=peak_list[i];
		}
		i++;
	}

	num_peaks = p_idx+1;
	


	// filter low intensity noise
	// and mark those that are good peaks
	int max_peak_idx = num_peaks -1;
	int min_idx=1;
	int max_idx=1;
	p_idx =1;

	vector<bool> indicators;

	mark_top_peaks_with_sliding_window(&peak_list[0], 
									   num_peaks,
									   config->get_local_window_size(),
									   config->get_max_number_peaks_per_local_window(),
									   indicators);

	if (ssf->precursor_intensity <=0)
	{

	//	cout << "PRecursor was zero!" << endl;
	//	exit(1);

		ssf->precursor_intensity=0;
		for (i=0; i<num_peaks; i++)
			ssf->precursor_intensity+=peak_list[i].intensity;
	}

	p_idx=0;
	for (i=0; i<num_peaks; i++)
		if (indicators[i])
			peaks[p_idx++]=peak_list[i];
	
	num_peaks = p_idx;



	// normalize intensities

	if (config->get_need_to_normalize() && ! config->get_itraq_mode())
	{
		intensity_t total_inten=0;
		for (i=0; i<num_peaks; i++)
			total_inten+=peaks[i].intensity;

		const mass_t one_over_total_inten = (1000.0 / total_inten);

		for (i=0; i<num_peaks; i++)
			peaks[i].intensity *= one_over_total_inten;
	}


	// sanity check
	if (peaks[0].mass<=0 && peaks[0].intensity<0)
	{
		cout << "Invalid mass or intensity at first peak: " << peaks[0].mass << "\t" << peaks[0].intensity << endl;
		exit(1);
	}

	for (i=1; i<num_peaks; i++)
	{
	
		if (peaks[i].intensity<0 || peaks[i].mass <0 || peaks[i].mass < peaks[i-1].mass)
		{
			cout << "Error: peak mismatches in file! (peak " << i << "/" << num_peaks-1 << ")" << endl;

			cout << fixed << setprecision(4);
			int j;
			for (j=0; j<num_peaks; j++)
				cout << peaks[j].mass << " " << peaks[j].intensity << endl;
			exit(1);
		}
	}

	return num_peaks;
}


// these functions just extract the peak list from the spectrum file, return the actual
// number of peaks (after joining)

int BasicSpecReader::get_peak_list_from_DTA(const char* dta_name)
{
	ifstream fs(dta_name,ios::in);
	if (! fs)
		return -1;

	char buff[256];

	fs.getline(buff,256);

	if (fs.bad())
		return false;

	while (buff[0] =='#') 
	{
		fs.getline(buff,256);
	}
	
	

	int p_count=0;
	while (fs.getline(buff,256))
	{
		istringstream is(buff);

		mass_t& mass = peak_list[p_count].mass;
		intensity_t& intensity = peak_list[p_count].intensity;
		is >> mass >> intensity;

		if (mass <0 || intensity==0)   // the peak probably got rounded down
			continue;

		p_count++;
	
	}

	fs.close();
	return p_count;
}

int BasicSpecReader::get_peak_list_from_MGF(FILE *mgf_stream)
{
	char buff[1024];
	
	mass_t peak_mass=-1;
	intensity_t peak_intensity=-1;

	while (1)
	{

		if( ! fgets(buff, 1024, mgf_stream) )
			return -1;

		if (! strncmp("END IONS",buff,7))
			return -1;

		istringstream is(buff);
		is >> peak_mass >> peak_intensity;

		if (peak_mass>0 &&  peak_intensity>0)
			break;
	}
	
	peak_list[0].mass = peak_mass;
	peak_list[0].intensity = peak_intensity;

	int p_count = 1;

	while (fgets(buff, 256, mgf_stream))
	{
		istringstream is(buff);
		
		if (! strncmp("END IONS",buff,7))
			break;

		
		mass_t mass=-1;
		intensity_t intensity=-1;

		is >> mass >> intensity;
	
		if (mass <=0 || intensity<=0)   // the peak probably got rounded down
			continue;

		peak_list[p_count].mass		= mass;
		peak_list[p_count].intensity= intensity;

		p_count++;
	}
	return p_count;
}



// 
int BasicSpecReader::get_peak_list_from_DAT(ifstream& dat_file, QCPeak *peaks)
{
	const int num_header_bytes_to_read = sizeof(mass_t) + 2* sizeof(float) + 4 * sizeof(int);
	const int num_peaks_pos = sizeof(mass_t) + 3 *sizeof(int);
	static char      header_buff[num_header_bytes_to_read];
	static vector<float> peak_buff;
	static int    peak_buff_size = 0;

	memset(header_buff,0,num_header_bytes_to_read);
	
	// read header info

	if (! dat_file.read(header_buff,num_header_bytes_to_read))

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -