⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 myfileapp.cpp

📁 现代汉语预处理程序源码含断句、删除空格等
💻 CPP
📖 第 1 页 / 共 2 页
字号:
			//	CurTime="<TIME>"+CurLine.Mid(6)+"</TIME>";
				CurTime="<TIME>现代</TIME>"; // 文件夹9中的语料都是当代汉语
			//	CurTime="<TIME>当代</TIME>"; // 文件夹10中的语料都是当代汉语

				// 处理法律文献的篇名
/*
				inFile.ReadString(CurLine);
				inFile.ReadString(CurLine);
				CurLine.TrimLeft();
				CurLine.TrimRight();
				Title=CurLine;
*/
			}
			else{
				AfxMessageBox("Format Error!");
				return;
			}
			
			// 处理输出文件的文件名

			FILE *fp_out;
			CString NewFileName;

			// 处理编辑部的故事的文件名
			if (CurTitle.Find("编辑部的故事")>=0)
				Title="编辑部的故事_"+Title;

			// 处理北京话口语调查材料的文件名
			if (CurTitle.Find("北京话口语调查材料")>=0)
				Title="北京话口语调查材料_"+Title;
				
			// 一般语料的文件名
			
			Title=ChangeBracket(Title);

			// 处理篇名中可能包含的问号
			if (Title.Find("?")>=0) {
				int dd=Title.Find("?");
				while (dd>=0 && (Title.GetLength()>=(dd+1))) 
				{
					Title=Title.Left(dd)+Title.Mid(dd+1);
					dd=Title.Find("?");
				}				
			}

			NewFileName=Title+".xml";
			fp_out=fopen(NewFileName,"wt");
			CStdioFile outFile(fp_out);
			
			outFile.WriteString(XML_Initial+"\n");
//			outFile.WriteString(XML_Initial2+"\n");
			outFile.WriteString(text_begin+"\n");
			outFile.WriteString(text_head_begin+"\n");

			outFile.WriteString(CurTitle+"\n");
			outFile.WriteString(CurAuthor+"\n");
			outFile.WriteString(CurStyle+"\n");
			outFile.WriteString(CurTime+"\n");

			outFile.WriteString(text_head_end+"\n");
			outFile.WriteString(text_body_begin+"\n");
			// 以上处理文件头
			
			CurState=2; //  以下处理文件正文
			while(inFile.ReadString(CurLine) && CurState==2)
			{
				CurLine.TrimLeft(); 
				CurLine.TrimRight();
				
				int CurLineLength=0;

				CurLineLength=CurLine.GetLength();
				if (CurLineLength==1 && CurLine=="@")
				{
					CurState=1; // 扫描到“@”表示下面是一个新文件
					break;
				}
				else {
					if (CurLine!="") {
						if((CurLine.Find("@")==0) || (CurLine.Find("王朔")==0) || (CurLine=="【全文完】") || (CurLine=="(全文完)") || (CurLine=="(全文完)")
							|| (CurLine.Find("From")==0) || (CurLine.Find("Subject")==0)  || (CurLine.Find("To:")==0)  || (CurLine.Find("Date:")==0)
							|| (CurLine.Find("Source:")==0) || (CurLine.Find("发信人:")==0)|| (CurLine.Find("标题:")==0)|| (CurLine.Find("发信站:")==0)
							|| (CurLine.Find("[小说完]")==0) || (CurLine.Find("Newsgroups:")==0) || (CurLine.Find("ganization:")>=0) || (CurLine.Find("【完】")==0)
							|| (CurLine.Find("(完)")==0) || (CurLine.Find("提交者:")==0) ||  (CurLine.Find("答复:")==0))
						{
						// 处理编辑部的故事文本
						//if((CurLine.Find("@")==0) || (CurLine.Find("编辑部的故事·")==0) || (CurLine=="【全文完】") ){
						}
						else {
							if((CurLine.Find("亦凡书库")>=0) || (CurLine.Find("(原载")==0) || (CurLine.Find("原载")==0) || (CurLine.Find("摘自《")==0)) {
								CurLine="<FROM>"+CurLine+"</FROM>";
								outFile.WriteString(CurLine+"\n");
							}
							else {
								if ((CurLine.Find("【作者")==0))
								{
									CurLine="<BACKGROUND>"+CurLine+"</BACKGROUND>";
									outFile.WriteString(CurLine+"\n");
								}
								else
								{
									CurLine="<p>"+ChangeBracket(CurLine)+"</p>";
									outFile.WriteString(CurLine+"\n");
								}
							}
						}
					}				
				}
			} // 如果正文处理结束,就跳出循环
			outFile.WriteString(text_body_end+"\n");
			outFile.WriteString(text_end);

			outFile.Close();
		}
	}
	inFile.Close();
}

CString ChangeBracket(CString SourceString)
{// Normalize angle brackets in a string:
 //   "<<" becomes the Chinese title mark "《" and ">>" becomes "》";
 //   every remaining single "<" becomes "(" and every remaining ">" becomes ")".
 // The double forms are handled first so they are not consumed as two single brackets.
 // SourceString is passed by value, so the caller's string is not modified;
 // the converted copy is returned.

	// CString::Replace substitutes every occurrence left to right, which gives the
	// same result as repeatedly finding the first match and splicing the string,
	// without rebuilding the whole string once per match.
	SourceString.Replace("<<","《");
	SourceString.Replace(">>","》");
	SourceString.Replace("<","(");
	SourceString.Replace(">",")");

	return SourceString;
}

void GetRawFormatFromAnnotatedFile(CString FileName)
{// Intended to restore an already-annotated corpus file to unannotated (raw) text.
 // Not implemented: the body is empty, so calling this function does nothing.

}



int CompareTwoNum(const void *q1, const void *q2)
{// 比较两个数的大小,q1大返回-1,按降序排

	if (((struct SentLengthFreq *)q1)->SentFreq > ((struct SentLengthFreq *)q2)->SentFreq)
		return -1;
	else {
		if(((struct SentLengthFreq *)q1)->SentFreq==((struct SentLengthFreq *)q2)->SentFreq)
		{
			if (((struct SentLengthFreq *)q1)->SentLength >= ((struct SentLengthFreq *)q2)->SentLength)
				return -1;
			else
				return 1;
		}
		else
			return 1;
	}
}

void SumCharOfSentence(CString FileName)
{// For a file that holds one sentence per line, count how often each sentence
 // length occurs. Length is CString::GetLength() of the trimmed line
 // (the original comment describes this as a byte count).
 // The report is written to the file named by ChangeFileName(FileName,"-sta"):
 // a header, one "length<TAB>frequency" row per distinct length in the order
 // produced by qsort with CompareTwoNum, then the number of distinct lengths
 // and the total number of lines read.
 // Blank lines count toward the total but get no row in the table.

	const int MaxDistinctLengths=500; // capacity of the length/frequency table
	struct SentLengthFreq Sentence[MaxDistinctLengths];

	int init=0;
	while (init<MaxDistinctLengths) { // zero every table row
		Sentence[init].SentLength=0;
		Sentence[init].SentFreq=0;
		init++;
	}

	FILE *in, *out;
	in=fopen(FileName,"rt");
	if (in==NULL) {
		AfxMessageBox("Can not open the file");
		return;
	}

	CString Fname,CurLine;

	Fname=ChangeFileName(FileName,"-sta");
	out=fopen(Fname,"wt");
	if (out==NULL) { // never hand a NULL stream to CStdioFile
		AfxMessageBox("Can not write the file");
		fclose(in);
		return;
	}
	CStdioFile inFile(in), outFile(out);

	int curfoot=0;        // number of table rows currently in use
	int AllNumOfSent=0;   // number of lines read, blank ones included
	int OverflowWarned=0; // set once the "table full" message has been shown

	while (inFile.ReadString(CurLine))
	{
		AllNumOfSent++;

		CurLine.TrimLeft();
		CurLine.TrimRight();

		// CountWordOfOneSentence(CurLine) could be used here instead to measure
		// the length in words rather than with GetLength().
		int tmplength=CurLine.GetLength();

		if (tmplength==0)
			continue; // blank line: counted above, but length 0 is never tabulated

		// Search only the rows in use; unused rows still hold length 0 and
		// must not be matched or modified.
		int i=0;
		while (i<curfoot && Sentence[i].SentLength!=tmplength)
			i++;

		if (i<curfoot)
		{// this length is already recorded: bump its frequency
			Sentence[i].SentFreq++;
		}
		else if (curfoot<MaxDistinctLengths)
		{// first occurrence of this length: open a new row
			Sentence[curfoot].SentLength=tmplength;
			Sentence[curfoot].SentFreq=1;
			curfoot++;
		}
		else if (!OverflowWarned)
		{// table is full: writing another row would run past the array
			AfxMessageBox("Too many distinct sentence lengths; further new lengths are not counted");
			OverflowWarned=1;
		}
	}

	// Sort the rows in use with CompareTwoNum.
	qsort(Sentence,curfoot,sizeof(struct SentLengthFreq),CompareTwoNum);

	outFile.WriteString("句长	频度\n\n");
	CString tabspace="	";
	CString len,freq,NumOfSent,AllSent;

	int j=0;
	while (j<curfoot) // exactly the rows in use; none of them has length 0
	{
		len.Format("%d",Sentence[j].SentLength);
		freq.Format("%d",Sentence[j].SentFreq);
		outFile.WriteString(len+tabspace+freq+"\n");
		j++;
	}

	NumOfSent.Format("%d",curfoot);
	AllSent.Format("%d",AllNumOfSent);

	outFile.WriteString("\n");
	outFile.WriteString("不同句长的句子数:"+NumOfSent);
	outFile.WriteString("\n");
	outFile.WriteString("总句数:"+AllSent);

	inFile.Close();
	outFile.Close();
}


int CountWordOfOneSentence(CString Sentence)
{// Length of a sentence measured in words: returns the number of "/" characters
 // found in Sentence (0 for an empty or slash-free string).
 // NOTE(review): presumably every word in the annotated text is written as
 // word/tag, so one slash corresponds to one word — confirm against the corpus format.

	int SentLength=0;

	// Resume each search just past the previous hit and stop as soon as Find
	// reports no further "/". Text that follows the last slash is simply ignored;
	// waiting for the string to become empty would never terminate in that case.
	int dash_pos=Sentence.Find("/");
	while (dash_pos>=0)
	{
		SentLength++;
		dash_pos=Sentence.Find("/",dash_pos+1);
	}
	return SentLength;
}


int CompareLength(const void *q1, const void *q2)
{// 比较句长大小,q1大返回-1,按降序排

	if (((struct Sent_Length *)q1)->SLength > ((struct Sent_Length *)q2)->SLength)
		return -1;
	else {
		if(((struct Sent_Length *)q1)->SLength==((struct Sent_Length *)q2)->SLength)
		{
			if (((struct Sent_Length *)q1)->SLength >= ((struct Sent_Length *)q2)->SLength)
				return -1;
			else
				return 1;
		}
		else
			return 1;
	}
}

void SentLenSort(CString FileName)
{// Sort the lines of a file by length, longest first (the order CompareLength
 // defines), and write them back to the same file.
 // Each line is trimmed; its length is CString::GetLength() of the trimmed text.
 // At most MAXSENT lines can be handled. If the file holds more, a message is
 // shown and the file is left untouched, because rewriting it would discard
 // every line that did not fit in the table.
 // The relative order of lines with equal length is whatever qsort produces.
 // NOTE(review): qsort moves Sent_Length objects bytewise although they contain
 // a CString; this relies on CString tolerating such moves — confirm.
	struct Sent_Length mySentence[MAXSENT];

	int ddd=0;
	while (ddd<MAXSENT) { // give every table entry an initial value
		mySentence[ddd].SLength=0;
		mySentence[ddd].Sent="";
		ddd++;
	}

	FILE *in, *out;

	in=fopen(FileName,"rt");
	if (in==NULL) {
		AfxMessageBox("Can not open the file");
		return;
	}

	CString CurLine;

	CStdioFile inFile(in);

	int AllNumOfSent=0; // number of lines stored so far
	int TooMany=0;      // set when a line arrives after the table is full

	while (inFile.ReadString(CurLine))
	{
		if (AllNumOfSent==MAXSENT)
		{// Checked before storing, so a file with exactly MAXSENT lines still fits.
			TooMany=1;
			break;
		}

		CurLine.TrimLeft();
		CurLine.TrimRight();

		// CountWordOfOneSentence(CurLine) could be used here instead to measure
		// the length in words rather than with GetLength().
		mySentence[AllNumOfSent].SLength=CurLine.GetLength();
		mySentence[AllNumOfSent].Sent=CurLine;
		AllNumOfSent++;
	}

	inFile.Close();

	if (TooMany)
	{// Stop before the file is reopened for writing, so no input is lost.
		AfxMessageBox("当前处理的文件中句子数量超出范围");
		return;
	}

	// Sort the stored lines by length.
	qsort(mySentence,AllNumOfSent,sizeof(struct Sent_Length),CompareLength);

	out=fopen(FileName,"w+"); // reopening for writing discards the old contents
	if (out==NULL) { // never hand a NULL stream to CStdioFile
		AfxMessageBox("Can not write the file");
		return;
	}

	CStdioFile outFile(out);

	int kkk=0;
	while (kkk<AllNumOfSent)
	{
		outFile.WriteString(mySentence[kkk].Sent+'\n');
		kkk++;
	}
	outFile.Close();
}

void AddNoOfLine(CString FileName)
{// Copy a file, putting a line number (starting at 1) and one space in front of
 // every line. If each line of the file is one sentence, this numbers the sentences.
 // The numbered copy is written to the file named by ChangeFileName(FileName,"_").
	FILE * in, * out;

	in=fopen((const char *)FileName,"rt");
	if(in==NULL) { 
		AfxMessageBox("can't open the file"); 
		return;
	}
	
	FileName=ChangeFileName(FileName,"_");

	out=fopen((const char*)FileName,"wt");
	if(out==NULL) {
		AfxMessageBox("can't write the file");
		fclose(in);
		return;
	}
	CStdioFile inFile(in),outFile(out);

	char s[4000];
	CString line,chunk;

	int i=1;            // number of the line currently being copied
	char buffer[20];
	int AtLineStart=1;  // nonzero when the next chunk read begins a new line
  
	while(inFile.ReadString(s,4000)) {
		chunk=s;
		if (AtLineStart) {
			_itoa(i, buffer, 10 ); // decimal text of the line number
			line=buffer;
			line=line+ " ";
			line=line + chunk;
			i++;
		}
		else {
			// continuation of a line longer than the buffer: copy it as is
			line=chunk;
		}
		outFile.WriteString(line);

		// ReadString fills at most one buffer per call, so a very long line
		// arrives in several chunks and only the last one ends with '\n'.
		// Number the next chunk only if this one finished its line.
		AtLineStart=(chunk.Right(1)=="\n");
	}

	fclose(in);
	fclose(out);
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -