⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 filemanagement.cpp

📁 MS-Clustering is designed to rapidly cluster large MS/MS datasets. The program merges similar spectr
💻 CPP
📖 第 1 页 / 共 5 页
字号:
#include "FileManagement.h"
#include "auxfun.h"

void read_paths_into_list(const char *list_file, vector<string>& list)
{
	ifstream fs(list_file,ios::in);
	if (! fs.good())
	{
		cout << "Error: couldn't open file for reading: " << list_file << endl;
		exit(1);
	}

	char buff[512];
	list.clear();

	while ( fs.getline(buff,512) )
	{
		string path(buff);
		if (path.length()>3)
		{
			list.push_back(path);
		}
	}
}


// parses the type from the file name, -1 if not recognized
int get_file_extension_type(const string& fname)
{
	int length = fname.length();

	int last_pos = length-1;
	while (last_pos>0 && (
		   fname[last_pos] == '\n' || fname[last_pos] == '\r' || 
		   fname[last_pos] == '\t' || fname[last_pos] == '\f') )
		last_pos--;

	if (fname[last_pos-2]=='d' && fname[last_pos-1]=='t' && 
		fname[last_pos  ]=='a')
		return DTA;

	if (fname[last_pos-2]=='m' && fname[last_pos-1]=='g' && 
		fname[last_pos  ]=='f')
		return MGF;

	if (fname[last_pos-4] == 'm' &&
	    fname[last_pos-3] == 'z' &&
		fname[last_pos-2]=='X' && fname[last_pos-1]=='M' && 
		fname[last_pos  ]=='L')
		return MZXML;

	if (fname[last_pos-2]=='d' && fname[last_pos-1]=='a' && 
		fname[last_pos  ]=='t')
		return DAT;

	if (fname[last_pos-2]=='m' && fname[last_pos-1]=='s' && 
		fname[last_pos  ]=='2')
		return MS2;

	if (fname[last_pos-2]=='t' && fname[last_pos-1]=='x' && 
		fname[last_pos  ]=='t')
		return TXT;


	return -1;
}

int SingleSpectrumFile::get_scan() const
{
	if (type == MGF)
		return ((MGF_single *)this)->idx_in_file;
	if (type == MZXML)
		return ((MZXML_single *)this)->scan_number;
	if (type == DAT)
		return ((DAT_single *)this)->scan_number;

	return 0;
}


/****************************************************************
quickly extracts the charge, sequence, and pm_with_19 from dta
*****************************************************************/
bool DTA_file::scan_dta(const string& fname, const Config *config)
{
	char buff[1024];
	peptide.clear();

	ifstream fs(fname.c_str(),ios::in);
	if (! fs.good())
	{
		cout << "Error: couldn't open "<< fname << endl;
		exit(1);
	}
	
	fs.getline(buff,1024);

	while (buff[0] =='#') 
	{
		if (! strncmp(buff,"#SEQ",4))
			peptide.parse_from_string(config,buff+5);


		if (! fs.getline(buff,1024))
		{
			cout << "Error scanning " << fname << endl;
			exit(1);
		}
		continue;
	}
	
	istringstream is(buff);

	org_pm_with_19 = -1;
	charge = -1;
	is >> org_pm_with_19 >> charge;
//	charge = 0; // bad fix

	if (org_pm_with_19<0 || charge <0)
	{
		cout << "Error: couldn't find pm and charge in DTA file, pm: "
			 << org_pm_with_19 << " , charge: " << charge << endl;
		exit(1);
	}

	if (charge>0)
	{
		m_over_z = (org_pm_with_19 + charge -1 ) / (mass_t)charge;
	}

	if (peptide.get_num_aas()>0 && charge>0)
	{
		mass_t diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
		if (fabs(diff)>6.0)
		{
			// try and correct charge!
			for (charge=1; charge<=3; charge++)
			{
				org_pm_with_19 =  m_over_z * charge + MASS_PROTON * (1 - charge);
				diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
				if (diff<5)
					break;
			}

			if (diff>7)
			{
				cout << "Error: sequence mass doesn't add up: " << fname << " (diff: "
					 << diff << ")" << endl;
				cout << "Pepitde: " << peptide.as_string(config) << endl;
				cout << "Mass Cys = " << config->get_session_tables().get_aa2mass(Cys) << endl;
			}
		//	exit(1);
		}
	}

	num_peaks=0;
	while (! fs.eof())
	{
		fs.getline(buff,64);
		float mass,inten;
		if (sscanf(buff,"%f %f",&mass,&inten)==2)
		{
			if (mass<0 || inten<=0)
				continue;
			num_peaks++;
		}
	}

	fs.close();
	return true;
}


/****************************************************************
quickly extracts the charge, sequence, and pm_with_19 from MGF location
*****************************************************************/
bool MGF_single::scan_mgf_single(FILE *stream, const Config *config)
{
	char buff[1024];
	mass_t exp_pm=-1;
	
	peptide.clear();

	
	while (fgets(buff, 256, stream))
	{
		file_pos = ftell(stream);
		if (strncmp(buff,"BEGIN IONS",10) )
		{
			
			continue;
		}
		break;
	}


	charge=0;
	m_over_z=-1;

		// read header info and first peak
	while (1)
	{
	

		if( ! fgets(buff, 1024, stream))
			return false;

		if (! strncmp(buff,"END IONS",7))
			return false;

		if (! strncmp(buff,"TITLE=",6) )
		{
			int len = strlen(buff)-1;
			buff[len]='\0';
			if (buff[len-1]=='\r' || buff[len-1]=='\n' )
				buff[len-1]='\0';
			single_name = buff + 6;

			continue;
		}
		else
		if (! strncmp(buff,"SEQ=",4) )
		{
			string seq_string(buff+4);
			peptide.parse_from_string(config,seq_string);
			peptide.calc_mass(config);
			continue;		
		}
		else
		if (! strncmp(buff,"SCAN=",5) )
		{
			if (sscanf(buff+5,"%d",&scan_number) != 1)
			{
				cout << "Error: couldn't read scan number!" << endl;
				exit(1);
			}
			continue;
		}
		else
		if (! strncmp(buff,"RT=",3) )
		{
			if (sscanf(buff+3,"%f",&retention_time) != 1)
			{
				cout << "Error: couldn't read retention_time!" << endl;
				exit(1);
			}
			continue;
		}
		else
		if (! strncmp(buff,"CLUSTER_SIZE=",13) )
		{
			if (sscanf(buff+13,"%d",&cluster_size) != 1)
			{
				cout << "Error: couldn't read cluster size!" << endl;
				exit(1);
			}
			continue;
		}
		else	
		if ( ! strncmp(buff,"CHARGE=",6))
		{
			if (sscanf(buff,"CHARGE=%d",&charge) != 1)
			{
				cout <<  "Error: couldn't read charge!" << endl;
				return false;
			}
		}
		else
		if (! strncmp(buff,"PEPMASS=",8))
		{
			istringstream is(buff+8);
			is >> m_over_z;
			
			if (m_over_z < 0)
			{
				cout << "Error: reading pepmass:" << m_over_z << endl;
				return false;
			}		
		}
		else
		{
			istringstream is(buff);
			Peak p;
	
			is >> p.mass >> p.intensity;

			if (p.mass >0 && p.intensity>0)
			{
				first_peak_mass = p.mass;
				break;
			}
		}
	}

	if (charge<0 || m_over_z<0)
		return false;

	org_pm_with_19 = m_over_z * charge + MASS_PROTON * (1 - charge);
	if (org_pm_with_19<0)
	{
		org_pm_with_19=-1;
	}


	num_peaks=0;
	while ( fgets(buff, 1024, stream) )
	{
		if (! strncmp(buff,"END IONS",8) )
			break;
		num_peaks++;
	}

	if (peptide.get_num_aas()>0 && charge>0)
	{
		mass_t diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
		if (fabs(diff)>7.0)
		{
			// try and correct charge!
			int c;
			for (c=1; c<=4; c++)
			{
				mass_t new_pm_with_19 =  m_over_z * c + MASS_PROTON * (1 - c);
				diff = fabs(new_pm_with_19 - peptide.get_mass() - MASS_OHHH);
				if (diff<10)
				{
					charge = c;
					org_pm_with_19 = new_pm_with_19;
					return true;
				}
			}
		

			cout << "Error: MGF sequence mass doesn't add up: " << single_name << " (diff: "
				 << diff << ")" <<  endl;
			cout << "m/z: " << m_over_z << " org_pm_with_19: " << org_pm_with_19 << endl;
			cout << "Pepitde: " << peptide.as_string(config) << " (" << peptide.get_mass() << ")" << endl;
			cout << "Mass Cys = " << config->get_session_tables().get_aa2mass(Cys) << endl;
		//	org_pm_with_19 = -1;
			return false;
		}
	}

	return true;
}




/****************************************************************
quickly extracts the charge, sequence, and pm_with_19 from MS2 location
*****************************************************************/
bool MS2_single::scan_ms2_single(FILE *stream, const Config *config)
{
	char buff[128];
	mass_t exp_pm=-1;
	
	peptide.clear();

	
	while (fgets(buff, 256, stream))
	{
		if (! strncmp(buff,":",1) )
			break;
	}

	if (feof(stream))
		return true;

	buff[strlen(buff)-1]='\0';
	single_name = buff+1;

	charge=-1;
	org_pm_with_19=-1;

	// read header info and first peak
	if( ! fgets(buff, 1024, stream))
		return false;

	
	if (! strncmp(buff,"SEQ=",4) )
	{
		string seq_string(buff+4);
		peptide.parse_from_string(config,seq_string);
		if( ! fgets(buff, 1024, stream))
			return false;		
	}
	
	istringstream is(buff);
	is >> org_pm_with_19 >> charge;
	if (org_pm_with_19<0 || org_pm_with_19>10000)
	{
		cout << "Error: pm_with_19 in MS2 not in range: " << org_pm_with_19 << endl;
		exit(1);
	}

	if (charge<=0 || charge>10)
	{
		cout << "Error: bad charge state in MS2: " << charge << endl;
		exit(1);
	}


	m_over_z = (org_pm_with_19 -1 + charge) / charge;


	num_peaks=0;
	while ( fgets(buff, 128, stream) )
	{
		if (! strncmp(buff,":",1) || strlen(buff)< 3 )
			break;
		num_peaks++;
	}

	if (peptide.get_num_aas()>0 && charge>0)
	{
		mass_t diff = org_pm_with_19 - peptide.get_mass() - MASS_OHHH;
		if (fabs(diff)>7.0)
		{
			cout << "Error: sequence mass doesn't add up: " << single_name << " (diff: "
				 << diff << ")" <<  endl;
			cout << "m/z: " << m_over_z << " org_pm_with_19: " << org_pm_with_19 << endl;
			cout << "Pepitde: " << peptide.as_string(config) << " (" << peptide.get_mass() << ")" << endl;
			cout << "Mass Cys = " << config->get_session_tables().get_aa2mass(Cys) << endl;
			exit(1);
		}
	}

	return true;
}


/*******************************************************************
Special function to overcome mzMXL parsing problems.
Reads the peak lists into a local buff of floats.
********************************************************************/
void FileManager::init_and_read_single_mzXML(
						Config *config, 
						const char * file_name, 
						int file_idx,
						mass_t min_m_over_z, 
						mass_t max_m_over_z)
{

	min_charge =9999;
	max_charge =0;

	num_spectra.resize(100,0);
    
	mzxml_files.resize(1);

	mzxml_files[0].mzxml_name = file_name;

	int mzxml_file_idx = (file_idx>=0 ? file_idx : 0);

	mzxml_files[0].extract_peak_lists_from_mzXML(config,mzxml_files[0].mzxml_name,mzxml_file_idx,0,10000);

	min_spec_mass = mzxml_files[0].min_spec_mass;
	max_spec_mass = mzxml_files[0].max_spec_mass;
	min_charge = mzxml_files[0].min_charge;
	max_charge = mzxml_files[0].max_charge;

	count_num_spectra();
//	print_summary_stats();
}


void FileManager::init_from_list(Config *config, const vector<string>& list, bool quick_flag,
								 int file_idx)
{
	int i,num_dta=0,num_mgf=0,num_mzxml=0,num_dat=0,num_ms2=0;

	min_charge =9999;
	max_charge =0;

//	cout << "init from list: " << list.size() << endl;

	num_spectra.resize(100,0);
    
    for (i=0; i<list.size(); i++)
    {
		if (list[i][0] == '#')
			continue;

		int last_pos = list[i].length()-1;
	
		if (list[i][last_pos] == '\n' || list[i][last_pos] == '\r' || 
			list[i][last_pos] == '\t' || list[i][last_pos] == '\f')
			last_pos--;

		if (list[i][last_pos-2]=='d' && list[i][last_pos-1]=='t' && 
			list[i][last_pos  ]=='a')
		{
			num_dta++;
		}
		else if (list[i][last_pos-2]=='m' && list[i][last_pos-1]=='g' && 
			list[i][last_pos  ]=='f')
		{
			num_mgf++;
		}
		else if (list[i][last_pos-4] == 'm' &&
				 list[i][last_pos-3] == 'z' &&
			list[i][last_pos-2]=='X' && list[i][last_pos-1]=='M' && 
			list[i][last_pos  ]=='L')
		{
			num_mzxml++;
		}
		else if (
			list[i][last_pos-2]=='d' && list[i][last_pos-1]=='a' && 
			list[i][last_pos  ]=='t')
		{
			num_dat++;
		}
		else if (
			list[i][last_pos-2]=='m' && list[i][last_pos-1]=='s' && 
			list[i][last_pos  ]=='2')
		{
			num_ms2++;
		}
		else
		{
			cout << "Error::: couldn't recognize file type for: " << list[i] << endl;
			exit(1);
		}
    }

	dta_files.resize(num_dta);
	mgf_files.resize(num_mgf);
	mzxml_files.resize(num_mzxml);

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -