⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filemanagement.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
{
	if (mgf_name.length() == 0)
	{
		printf("Error: must first copy name to MGF_file!\n");
		exit(1);
	}

	FILE *stream=fopen(mgf_name.c_str(),"r");
	if (! stream)
	{
		cout << "Error: couldn't open mgf file for reading: " << mgf_name << endl;
		exit(1);
	}

//	cout << "TTT Reading mgf idx: " << file_idx << endl;

	this->single_spectra.clear();
	this->single_spectra.reserve(5000);

	this->num_spectra.resize(100,0);
	
	Spectrum s;
	int counter=0;
	while (1)
	{
	
		MGF_single mg;
		
		long pos = ftell(stream);  // remember stream pointer

		mg.type = MGF;
		mg.file_pos = -1;
		mg.file_idx = file_idx;
		mg.idx_in_file = counter++;

		if (quick_flag)
		{
			
			if (! mg.scan_mgf_single(stream,config) )
			{
				static char tmp_buff[256];
				if (! fgets(tmp_buff,256,stream)) // move forwards, there was a problem with that position
					break;						  // reached eof
				
				continue;
			}

			// there was something wrong with this spectrum
			if (mg.org_pm_with_19<0)
				continue;

			if (mg.org_pm_with_19 < min_spec_mass)
				min_spec_mass = mg.org_pm_with_19;

			if (mg.org_pm_with_19 > max_spec_mass)
				max_spec_mass = mg.org_pm_with_19;

			if (mg.charge <min_charge)
				min_charge = mg.charge;

			if (mg.charge >max_charge)
				max_charge = mg.charge;

			num_spectra[mg.charge]++;
		}
		else
		{
			mg.file_pos = ftell(stream);

			if (! s.read_from_MGF_stream(config,stream) )
				break;

			s.init_spectrum();

			mg.peptide = s.get_peptide();
			mg.single_name = s.get_file_name();
			mg.org_pm_with_19 = s.get_org_pm_with_19();
			mg.charge = s.get_charge();
			mg.pm_with_19 = (s.get_corrected_pm_with_19()>0 ?s.get_corrected_pm_with_19() : s.get_org_pm_with_19());
			mg.num_peaks = s.get_num_peaks();
			
			if (mg.org_pm_with_19 < min_spec_mass)
				min_spec_mass = mg.org_pm_with_19;

			if (mg.org_pm_with_19 > max_spec_mass)
				max_spec_mass = mg.org_pm_with_19;

			if (mg.charge <min_charge)
				min_charge = mg.charge;

			if (mg.charge >max_charge)
				max_charge = mg.charge;

			num_spectra[mg.charge]++;
		}
	
		if (mg.file_pos>=0)
		{
			single_spectra.push_back(mg);
		}
		
	}
	fclose(stream);
	
}




void DAT_file::initial_read(Config *config, int file_idx)
{
	const int bytes_to_read = sizeof(mass_t) + 4 * sizeof(int) + 2 *sizeof(float);

	if (dat_name.length() == 0)
	{
		printf("Error: must first copy name to DAT_file!\n");
		exit(1);
	}

	ifstream dat_stream(dat_name.c_str(),ios::in | ios::binary);

	if (! dat_stream.is_open() || ! dat_stream.good())
	{
		cout << "Error: couldn't open dat file for reading: " << dat_name << endl;
		exit(1);
	}

	this->single_spectra.clear();
	this->single_spectra.reserve(20000);
	this->num_spectra.resize(100,0);
	

	int counter=0;
	while (1)
	{
	
		char header_buff[bytes_to_read];
		long pos = dat_stream.tellg();
		DAT_single dat;
		

		dat.type = DAT;
		dat.file_pos = pos;
		dat.file_idx = file_idx;

		dat_stream.read(header_buff,bytes_to_read);
		if (dat_stream.gcount() != bytes_to_read)
			break;
		
		char *b_pos = header_buff;
		
		dat.m_over_z = *(mass_t *)b_pos;
		b_pos += sizeof(mass_t);
		
		dat.charge = *(int *)b_pos;
		b_pos += sizeof(int);

		dat.mzxml_file_idx = *(int *)b_pos;
		b_pos += sizeof(int);

		dat.scan_number = *(int *)b_pos;
		b_pos += sizeof(int);

		dat.num_peaks = *(int *)b_pos;
		b_pos += sizeof(int);

		dat.retention_time = *(float *)b_pos;
		b_pos += sizeof(float);

		dat.precursor_intensity = *(float *)b_pos;
		b_pos += sizeof(float);

		// sanity checks
		if (dat.m_over_z<0.0 || dat.m_over_z>10000.0 || 
			dat.mzxml_file_idx<0 || dat.mzxml_file_idx>100000 ||
			dat.num_peaks<0  || dat.num_peaks>100000 ||
			dat.scan_number<0 || dat.scan_number>1000000)
		{
			cout << "Error in DAT file " << file_idx << " pos " << pos << "  #" << 
				single_spectra.size() << endl;
			
			cout << "m/z = " << dat.m_over_z << endl;
			cout << "mzxml idx = " << dat.mzxml_file_idx << endl;
			cout << "scan =  " << dat.scan_number << endl;
			cout << "num_peaks = " << dat.num_peaks << endl;
			exit(1);
		}

		dat_stream.seekg(pos  + bytes_to_read + 2*sizeof(float)*dat.num_peaks);

		single_spectra.push_back(dat);
		counter++;
	}

	dat_stream.close();
}



void MS2_file::initial_read(Config *config, int file_idx, bool quick_flag)
{
	if (ms2_name.length() == 0)
	{
		printf("Error: must first copy name to MS2_file!\n");
		exit(1);
	}

	FILE *stream=fopen(ms2_name.c_str(),"r");
	if (! stream)
	{
		cout << "Error: couldn't open mgf file for reading: " << ms2_name << endl;
		exit(1);
	}

	this->single_spectra.clear();
	this->single_spectra.reserve(5000);

	this->num_spectra.resize(100,0);
	
	Spectrum s;
	int counter=0;
	while (1)
	{
	
		MS2_single ms2;
		
		long pos = ftell(stream);  // remember stream pointer

		ms2.type = MS2;
		ms2.file_pos = pos;
		ms2.file_idx = file_idx;
		ms2.idx_in_file = counter++;

		if (quick_flag)
		{
			
			if (! ms2.scan_ms2_single(stream,config) )
				break;

			if (ms2.org_pm_with_19 < min_spec_mass)
				min_spec_mass = ms2.org_pm_with_19;

			if (ms2.org_pm_with_19 > max_spec_mass)
				max_spec_mass = ms2.org_pm_with_19;

			if (ms2.charge <min_charge)
				min_charge = ms2.charge;

			if (ms2.charge >max_charge)
				max_charge = ms2.charge;

			num_spectra[ms2.charge]++;
		}
		else
		{
			cout << "Reading MS2 not supported in this mode!" << endl;
			exit(0);
		/*	if (! s.read_from_MS2_stream(config,stream) )
				break;

			s.init_spectrum();

			ms2.peptide = s.get_peptide();
			ms2.single_name = s.get_file_name();
			ms2.org_pm_with_19 = s.get_org_pm_with_19();
			ms2.charge = s.get_charge();
			ms2.pm_with_19 = s.get_corrected_pm_with_19();
			ms2.num_peaks = s.get_num_peaks();
			
			if (ms2.org_pm_with_19 < min_spec_mass)
				min_spec_mass = ms2.org_pm_with_19;

			if (ms2.org_pm_with_19 > max_spec_mass)
				max_spec_mass = ms2.org_pm_with_19;

			if (ms2.charge <min_charge)
				min_charge = ms2.charge;

			if (ms2.charge >max_charge)
				max_charge = ms2.charge;

			num_spectra[ms2.charge]++;*/
		}
	
		single_spectra.push_back(ms2);
		
	}
	fclose(stream);
	
}




// reads the summary tsv file and stores stats
// extract all information from the tsv file
void PKL_dir::initial_read(Config *config, int dir_idx, const string& path, 
						   const string& tsv_file , mass_t min_m_over_z, mass_t max_m_over_z)
{
	this->dir_path = path;
	this->tsv_path = tsv_file;

	FILE *tsv_stream = fopen(tsv_file.c_str(),"r");

	if (! tsv_stream)
	{
		cout << "Error: couldn't open tsv file for reading: " << tsv_file << endl;
		exit(1);
	}

	char line_buff[2048];
	fgets(line_buff,2048,tsv_stream); // first row is headers

	while (fgets(line_buff,2048,tsv_stream))
	{
		int np=-1;
		PKL_single pkl;

		pkl.type = PKL;
		pkl.file_idx = dir_idx;
		pkl.m_over_z = -1;
		pkl.single_name = "";
		pkl.num_peaks=-1;

		istringstream is(line_buff);

		mass_t mz_sel,mz_cen,parent_MH;
		int tmp_charge, ns;
		float max_inten;

		is >> pkl.single_name >> pkl.scan_number >> pkl.retention_time >> np >> 
			pkl.num_peaks >> mz_sel >> pkl.m_over_z >> mz_cen >> parent_MH >> tmp_charge >> ns >>
			max_inten >> pkl.precursor_intensity;

		if (pkl.num_peaks<=5)
			continue;

		if (pkl.m_over_z < min_m_over_z || pkl.m_over_z>max_m_over_z)
			continue;


		// set charge from file name
		pkl.charge = 0;
		if (pkl.single_name.length()>4)
		{
			char charge_sym = pkl.single_name[pkl.single_name.length()-5];
			if (charge_sym<'0' || charge_sym>'9')
			{
				cout << "Error: couldn't extract charge from file name " << pkl.single_name << endl;
				exit(1);
			}
			pkl.charge = int(charge_sym - '0');
		}

		single_spectra.push_back(pkl);
			
	}

	fclose(tsv_stream);
}



// reads a list with dirs and paths to tsv files
void FileManager::init_from_pkl_dir_list(Config *config, const char *list_file, 
										 mass_t min_m_over_z, mass_t max_m_over_z)
{
	FILE *list_stream=fopen(list_file,"r");

	if (! list_stream)
	{
		cout << "Error: couldn't open pkl dir list for reading: " << list_file << endl;
		exit(1);
	}


	char line_buff[1024];
	int n=0;
	while (fgets(line_buff,1024,list_stream))
		n++;

	fclose(list_stream);

	pkl_dirs.resize(n);

	list_stream=fopen(list_file,"r");


	cout << "Read pkl files (idx path #spectra):" << endl;
	int pkl_dir_idx=0;
	while (fgets(line_buff,1024,list_stream))
	{
		istringstream is(line_buff);

		string pkl_dir = "";
		string tsv_file = "";

		is >> pkl_dir >> tsv_file;

		if (pkl_dir.length()>2 && tsv_file.length() >2)
		{
			pkl_dirs[pkl_dir_idx].initial_read(config,pkl_dir_idx,pkl_dir,tsv_file,
											   min_m_over_z, max_m_over_z);
			
			if (pkl_dirs[pkl_dir_idx].single_spectra.size()>0)
			{
				cout << pkl_dir_idx << "\t" << pkl_dir << "\t" << pkl_dirs[pkl_dir_idx].single_spectra.size() << endl;

				pkl_dir_idx++;
			}
		}
	}	
}


void FileManager::init_from_single_pkl_dir(Config *config, const string& pkl_dir_path, 
					const string& tsv_file, int pkl_dir_idx, mass_t min_m_over_z, mass_t max_m_over_z)
{

	pkl_dirs.resize(1);
	pkl_dirs[0].initial_read(config, pkl_dir_idx, pkl_dir_path, tsv_file, 
							 min_m_over_z, max_m_over_z);

}



/*************************************************************************
Select all file pointers from the FileManager
**************************************************************************/
void FileSet::select_all_files(const FileManager& fm, bool remove_duplicates)
{
	int i;

	this->ssf_pointers.clear();

	for (i=0; i<fm.dta_files.size(); i++)
	{
		ssf_pointers.push_back((struct DTA_file *)&fm.dta_files[i]);

		// check that this doesn't correpond to the immediate previous spec
		if (remove_duplicates && ssf_pointers.size()>1)
		{
			const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
			const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];

			if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
				 ( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
			{
				ssf_pointers.pop_back();
			}
		}
	}

	for (i=0; i<fm.mgf_files.size(); i++)
	{
		int j;
		for (j=0; j<fm.mgf_files[i].single_spectra.size(); j++)
		{
			const SingleSpectrumFile *ssf = &fm.mgf_files[i].single_spectra[j];

			if (ssf->num_peaks<5)
				continue;

			this->ssf_pointers.push_back((struct MGF_single *)ssf);

			// check that this doesn't correpond to the immediate previous spec
			if (remove_duplicates && ssf_pointers.size()>1)
			{
				const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
				const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];

				if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
				 ( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
				{
					ssf_pointers.pop_back();
					continue;
				}
			}

//			cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
		}
	}

	for (i=0; i<fm.mzxml_files.size(); i++)
	{
		int j;
		for (j=0; j<fm.mzxml_files[i].single_spectra.size(); j++)
		{
			const SingleSpectrumFile *ssf = &fm.mzxml_files[i].single_spectra[j];

			if (ssf->num_peaks<5)
				continue;

			this->ssf_pointers.push_back((struct MZXML_single *)ssf);

			// check that this doesn't correpond to the immediate previous spec
			if (remove_duplicates && ssf_pointers.size()>1)
			{
				const SingleSpectrumFile * current_ssf = ssf_pointers[ssf_pointers.size()-1];
				const SingleSpectrumFile * previous_ssf = ssf_pointers[ssf_pointers.size()-2];

				if ( ( previous_ssf->num_peaks == current_ssf->num_peaks ) &&
				 ( fabs(previous_ssf->m_over_z-current_ssf->m_over_z) < 0.005) )
				{
					ssf_pointers.pop_back();
					continue;
				}
			}

//			cout << setprecision(9)<< ssf->m_over_z << " " << ssf->num_peaks << endl;
		}
	}

	for (i=0; i<fm.dat_files.size(); i++)
	{
		int j;
		for (j=0; j<fm.dat_files[i].single_spectra.size(); j++)
		{
			const SingleSpectrumFile *ssf = &fm.dat_files[i].single_spectra[j];

		//	if (ssf->num_peaks<5)
		//		continue;

			this->ssf_pointers.push_back((struct DAT_single *)ssf);

			// check that this doesn't correpond to the immediate previous spec
			if (remove_duplicates && ssf_pointers.size()>1)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -