📄 myfileapp.cpp
字号:
// CurTime="<TIME>"+CurLine.Mid(6)+"</TIME>";
CurTime="<TIME>现代</TIME>"; // 文件夹9中的语料都是当代汉语
// CurTime="<TIME>当代</TIME>"; // 文件夹10中的语料都是当代汉语
// 处理法律文献的篇名
/*
inFile.ReadString(CurLine);
inFile.ReadString(CurLine);
CurLine.TrimLeft();
CurLine.TrimRight();
Title=CurLine;
*/
}
else{
AfxMessageBox("Format Error!");
return;
}
// 处理输出文件的文件名
FILE *fp_out;
CString NewFileName;
// 处理编辑部的故事的文件名
if (CurTitle.Find("编辑部的故事")>=0)
Title="编辑部的故事_"+Title;
// 处理北京话口语调查材料的文件名
if (CurTitle.Find("北京话口语调查材料")>=0)
Title="北京话口语调查材料_"+Title;
// 一般语料的文件名
Title=ChangeBracket(Title);
// 处理篇名中可能包含的问号
if (Title.Find("?")>=0) {
int dd=Title.Find("?");
while (dd>=0 && (Title.GetLength()>=(dd+1)))
{
Title=Title.Left(dd)+Title.Mid(dd+1);
dd=Title.Find("?");
}
}
NewFileName=Title+".xml";
fp_out=fopen(NewFileName,"wt");
CStdioFile outFile(fp_out);
outFile.WriteString(XML_Initial+"\n");
// outFile.WriteString(XML_Initial2+"\n");
outFile.WriteString(text_begin+"\n");
outFile.WriteString(text_head_begin+"\n");
outFile.WriteString(CurTitle+"\n");
outFile.WriteString(CurAuthor+"\n");
outFile.WriteString(CurStyle+"\n");
outFile.WriteString(CurTime+"\n");
outFile.WriteString(text_head_end+"\n");
outFile.WriteString(text_body_begin+"\n");
// 以上处理文件头
CurState=2; // 以下处理文件正文
while(inFile.ReadString(CurLine) && CurState==2)
{
CurLine.TrimLeft();
CurLine.TrimRight();
int CurLineLength=0;
CurLineLength=CurLine.GetLength();
if (CurLineLength==1 && CurLine=="@")
{
CurState=1; // 扫描到“@”表示下面是一个新文件
break;
}
else {
if (CurLine!="") {
if((CurLine.Find("@")==0) || (CurLine.Find("王朔")==0) || (CurLine=="【全文完】") || (CurLine=="(全文完)") || (CurLine=="(全文完)")
|| (CurLine.Find("From")==0) || (CurLine.Find("Subject")==0) || (CurLine.Find("To:")==0) || (CurLine.Find("Date:")==0)
|| (CurLine.Find("Source:")==0) || (CurLine.Find("发信人:")==0)|| (CurLine.Find("标题:")==0)|| (CurLine.Find("发信站:")==0)
|| (CurLine.Find("[小说完]")==0) || (CurLine.Find("Newsgroups:")==0) || (CurLine.Find("ganization:")>=0) || (CurLine.Find("【完】")==0)
|| (CurLine.Find("(完)")==0) || (CurLine.Find("提交者:")==0) || (CurLine.Find("答复:")==0))
{
// 处理编辑部的故事文本
//if((CurLine.Find("@")==0) || (CurLine.Find("编辑部的故事·")==0) || (CurLine=="【全文完】") ){
}
else {
if((CurLine.Find("亦凡书库")>=0) || (CurLine.Find("(原载")==0) || (CurLine.Find("原载")==0) || (CurLine.Find("摘自《")==0)) {
CurLine="<FROM>"+CurLine+"</FROM>";
outFile.WriteString(CurLine+"\n");
}
else {
if ((CurLine.Find("【作者")==0))
{
CurLine="<BACKGROUND>"+CurLine+"</BACKGROUND>";
outFile.WriteString(CurLine+"\n");
}
else
{
CurLine="<p>"+ChangeBracket(CurLine)+"</p>";
outFile.WriteString(CurLine+"\n");
}
}
}
}
}
} // 如果正文处理结束,就跳出循环
outFile.WriteString(text_body_end+"\n");
outFile.WriteString(text_end);
outFile.Close();
}
}
inFile.Close();
}
/**
 * Replaces ASCII angle brackets in a string so the text no longer contains
 * "<" or ">" characters.
 *
 * Substitutions, applied in this order:
 *   "<<" -> "《"   ">>" -> "》"   "<" -> "("   ">" -> ")"
 *
 * @param SourceString  Text to convert. Passed by value, so the caller's
 *                      string is not modified.
 * @return The converted copy.
 */
CString ChangeBracket(CString SourceString)
{
    // Doubled brackets must be handled first; otherwise the single-bracket
    // pass would consume them and no book-title marks would be produced.
    SourceString.Replace("<<", "《");
    SourceString.Replace(">>", "》");

    // Whatever angle brackets remain are single ones.
    SourceString.Replace("<", "(");
    SourceString.Replace(">", ")");

    return SourceString;
}
/**
 * Placeholder: according to the original note, this is meant to turn an
 * already-annotated corpus file back into its un-annotated form.
 * The body is empty, so calling it currently does nothing.
 *
 * @param FileName  Unused at present.
 */
void GetRawFormatFromAnnotatedFile(CString FileName)
{
}
int CompareTwoNum(const void *q1, const void *q2)
{// 比较两个数的大小,q1大返回-1,按降序排
if (((struct SentLengthFreq *)q1)->SentFreq > ((struct SentLengthFreq *)q2)->SentFreq)
return -1;
else {
if(((struct SentLengthFreq *)q1)->SentFreq==((struct SentLengthFreq *)q2)->SentFreq)
{
if (((struct SentLengthFreq *)q1)->SentLength >= ((struct SentLengthFreq *)q2)->SentLength)
return -1;
else
return 1;
}
else
return 1;
}
}
/**
 * Builds a sentence-length frequency table for a file that holds one sentence
 * per line and writes it to a companion report file.
 *
 * Each line is trimmed and its length taken from CString::GetLength() (the
 * original note describes this as a byte count). The table is sorted with
 * CompareTwoNum and written, followed by the number of distinct lengths and
 * the total number of lines read. Blank lines count toward the total but are
 * not entered in the table.
 *
 * The report file name is ChangeFileName(FileName, "-sta").
 *
 * At most 500 distinct lengths are tracked; if the input has more, a message
 * box is shown and reading stops at that line.
 *
 * @param FileName  Path of the input file.
 */
void SumCharOfSentence(CString FileName)
{
    const int kMaxLengths = 500;  // capacity of the length table
    struct SentLengthFreq Sentence[kMaxLengths];
    for (int init = 0; init < kMaxLengths; init++) {
        Sentence[init].SentLength = 0;
        Sentence[init].SentFreq = 0;
    }

    FILE *in = fopen(FileName, "rt");
    if (in == NULL) {
        AfxMessageBox("Can not open the file");
        return;
    }

    CString Fname = ChangeFileName(FileName, "-sta");
    FILE *out = fopen(Fname, "wt");
    if (out == NULL) {
        // Do not wrap a NULL stream; release the input before bailing out.
        AfxMessageBox("Can not write the file");
        fclose(in);
        return;
    }

    CStdioFile inFile(in), outFile(out);
    CString CurLine;
    int curfoot = 0;       // number of table slots in use
    int AllNumOfSent = 0;  // number of lines read
    while (inFile.ReadString(CurLine))
    {
        AllNumOfSent++;
        CurLine.TrimLeft();
        CurLine.TrimRight();
        // Alternative measure (by word count): CountWordOfOneSentence(CurLine)
        int tmplength = CurLine.GetLength();
        if (tmplength == 0)
            continue;  // blank line: part of the total only

        // Search only the slots that actually hold a recorded length.
        int slot = 0;
        while (slot < curfoot && Sentence[slot].SentLength != tmplength)
            slot++;

        if (slot < curfoot) {
            Sentence[slot].SentFreq++;
            continue;
        }

        // New length: make sure there is room before appending.
        if (curfoot == kMaxLengths) {
            AfxMessageBox("Too many different sentence lengths");
            break;
        }
        Sentence[curfoot].SentLength = tmplength;
        Sentence[curfoot].SentFreq = 1;
        curfoot++;
    }

    // Sort the used part of the table (see CompareTwoNum for the order).
    qsort(Sentence, curfoot, sizeof(struct SentLengthFreq), CompareTwoNum);

    outFile.WriteString("句长 频度\n\n");
    CString tabspace = " ";
    CString len, freq, NumOfSent, NumOfWord, AllSent;
    int CountWordNum = 0;  // sum of length * frequency; only used by the disabled output below
    for (int j = 0; j < curfoot; j++)
    {
        len.Format("%d", Sentence[j].SentLength);
        freq.Format("%d", Sentence[j].SentFreq);
        CountWordNum = CountWordNum + (Sentence[j].SentLength) * (Sentence[j].SentFreq);
        outFile.WriteString(len + tabspace + freq + "\n");
    }

    NumOfSent.Format("%d", curfoot);
    NumOfWord.Format("%d", CountWordNum);
    AllSent.Format("%d", AllNumOfSent);
    outFile.WriteString("\n");
    outFile.WriteString("不同句长的句子数:" + NumOfSent);
    outFile.WriteString("\n");
    /*
    outFile.WriteString("总词数:"+NumOfWord); // total accumulated length, currently not reported
    outFile.WriteString("\n");
    */
    outFile.WriteString("总句数:" + AllSent);

    inFile.Close();
    outFile.Close();
}
/**
 * Measures a sentence by counting the "/" separators it contains.
 *
 * Presumably each word in the annotated text is followed by "/" and a tag, so
 * the count equals the number of words -- confirm against the corpus format.
 *
 * The loop ends as soon as no further "/" is found, so trailing text after
 * the last separator (or a sentence with no separator at all) cannot make it
 * spin forever.
 *
 * @param Sentence  Sentence text. Passed by value; the local copy is consumed.
 * @return Number of "/" occurrences in the sentence (0 if there are none).
 */
int CountWordOfOneSentence(CString Sentence)
{
    int SentLength = 0;

    int dash_pos = Sentence.Find("/");
    while (dash_pos >= 0)
    {
        SentLength++;
        // Drop everything up to and including the separator just counted.
        Sentence = Sentence.Mid(dash_pos + 1);
        dash_pos = Sentence.Find("/");
    }

    return SentLength;
}
int CompareLength(const void *q1, const void *q2)
{// 比较句长大小,q1大返回-1,按降序排
if (((struct Sent_Length *)q1)->SLength > ((struct Sent_Length *)q2)->SLength)
return -1;
else {
if(((struct Sent_Length *)q1)->SLength==((struct Sent_Length *)q2)->SLength)
{
if (((struct Sent_Length *)q1)->SLength >= ((struct Sent_Length *)q2)->SLength)
return -1;
else
return 1;
}
else
return 1;
}
}
/**
 * Sorts the lines of a file by length (longest first, via CompareLength) and
 * writes the result back to the same file.
 *
 * Every line is trimmed before it is measured and stored, so the rewritten
 * file contains the trimmed text. Length comes from CString::GetLength() (the
 * original note describes this as a byte count).
 *
 * At most MAXSENT lines can be handled. If the file has more, a message box
 * is shown and the function returns WITHOUT touching the file, so no
 * sentences are lost.
 *
 * NOTE(review): the table lives on the stack and is reordered by qsort(),
 * which moves records as raw bytes; both depend on how MAXSENT and
 * Sent_Length are defined elsewhere -- confirm they are suitable.
 *
 * @param FileName  Path of the file to sort in place.
 */
void SentLenSort(CString FileName)
{
    struct Sent_Length mySentence[MAXSENT];
    for (int init = 0; init < MAXSENT; init++) {
        mySentence[init].SLength = 0;
        mySentence[init].Sent = "";
    }

    FILE *in = fopen(FileName, "rt");
    if (in == NULL) {
        AfxMessageBox("Can not open the file");
        return;
    }

    CString CurLine;
    CStdioFile inFile(in);
    int AllNumOfSent = 0;
    while (inFile.ReadString(CurLine))
    {
        // One more line than the table can hold: give up before the source
        // file is reopened for writing, otherwise the excess would be lost.
        if (AllNumOfSent == MAXSENT)
        {
            AfxMessageBox("当前处理的文件中句子数量超出范围");
            inFile.Close();
            return;
        }

        CurLine.TrimLeft();
        CurLine.TrimRight();
        // Alternative measure (by word count): CountWordOfOneSentence(CurLine)
        mySentence[AllNumOfSent].SLength = CurLine.GetLength();
        mySentence[AllNumOfSent].Sent = CurLine;
        AllNumOfSent++;
    }
    inFile.Close();

    // Order the collected sentences by length.
    qsort(mySentence, AllNumOfSent, sizeof(struct Sent_Length), CompareLength);

    // Reopening in "w+" mode discards the previous contents of the file.
    FILE *out = fopen(FileName, "w+");
    if (out == NULL) {
        AfxMessageBox("Can not write the file");
        return;
    }

    CStdioFile outFile(out);
    for (int k = 0; k < AllNumOfSent; k++)
        outFile.WriteString(mySentence[k].Sent + '\n');
    outFile.Close();
}
/**
 * Copies a text file, putting a running number and a space in front of every
 * line. If the file holds one sentence per line, this numbers the sentences.
 *
 * The copy is written to the file named by ChangeFileName(FileName, "_").
 *
 * Input is read in chunks of at most 4000 bytes. A line longer than one chunk
 * arrives in several pieces; only the first piece of a line gets a number, so
 * numbers always correspond to physical lines.
 *
 * @param FileName  Path of the input file.
 */
void AddNoOfLine(CString FileName)
{
    FILE *in = fopen(FileName, "rt");
    if (in == NULL) {
        AfxMessageBox("can't open the file");
        return;
    }

    FileName = ChangeFileName(FileName, "_");
    FILE *out = fopen(FileName, "wt");
    if (out == NULL) {
        AfxMessageBox("can't write the file");
        fclose(in);
        return;
    }

    CStdioFile inFile(in), outFile(out);
    char s[4000];
    CString chunk, prefix;
    int i = 1;                // number for the next line
    bool atLineStart = true;  // true when the next chunk begins a new line
    while (inFile.ReadString(s, sizeof(s))) {
        chunk = s;
        if (atLineStart) {
            prefix.Format("%d ", i);
            outFile.WriteString(prefix);
            i++;
        }
        outFile.WriteString(chunk);

        // A chunk that ends in '\n' completes its line; anything else means
        // the buffer filled up (or the file ended) in the middle of a line.
        int chunkLen = chunk.GetLength();
        atLineStart = (chunkLen > 0 && chunk[chunkLen - 1] == '\n');
    }

    fclose(in);
    fclose(out);
}
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -