// im.cpp
// (Web code-viewer residue, not part of the program: "Font size:" control label.)
#include<stdio.h>
#include<string.h>
#include<locale.h>
#include<stdlib.h>
#include<malloc.h>
#include<io.h>
#include<assert.h>
#include<math.h>
#include<wchar.h>
// One entry of the word table: a wide-character word and its occurrence count.
typedef struct word_item{
wchar_t *word; // NUL-terminated word text
unsigned freq; // number of times the word has been counted
}WordItem;
// Occurrence count per code point in 19968..40869, indexed by (code point - 19968).
int wfreqs[20902]={0};
// Relative frequency per code point, same indexing as wfreqs.
double wfreqfs[20902]={0};
// Word table; allocated and filled by creatdic().
WordItem *items=NULL;
// NOTE(review): only referenced from commented-out code in the visible part of the file.
int wordcount=2000000;
// Characters at which stringcmp() stops extending a common prefix (punctuation, digits, symbols, whitespace).
wchar_t *puncs=L"⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ*-./⑴⑵⑶⑷⑸⑹⑺⑻⑼⑽㈠㈡㈢㈣㈤㈥㈦㈧㈨㈩$‰∑§αβγ¥℃∏※±×÷□◆▲●★【】『』①②③④⑤⑥⑦⑧⑨⑩⒈⒉⒊⒋⒌⒍⒎⒏⒐⒑12345678901234567890〈〉○●△▲『』±%×~!@#$%^&*()_+|=][';,.?:\"<>{}!·#¥%……—*‘’()、——+|{}[]:“”;《》,。?\n\r\t ";
// Whitespace characters that chars() leaves out of its report.
wchar_t *puncs1=L"\n\t\r ";
int freq_cmp(const void *p,const void *q)//词频排序函数
{
WordItem *wp=(WordItem*)p,
*wq=(WordItem*)q;
return (wq->freq)-(wp->freq);
}
// qsort comparator for arrays of wide-string pointers: each argument points
// at a `wchar_t *` element, and the referenced strings are ordered by wcscmp().
int wcs_cmp(const void *p,const void *q)
{
    const wchar_t *const *left = (const wchar_t *const *)p;
    const wchar_t *const *right = (const wchar_t *const *)q;
    return wcscmp(*left, *right);
}
// Loads a text file into a freshly allocated, NUL-terminated wide-character
// buffer and counts every character in 19968..40869 into the global wfreqs.
//
// text     - receives the buffer on success; it is the allocation base, so the
//            caller may release it with free(). Left untouched on failure.
// filename - path of the file, opened in binary mode and read with fgetwc()
//            (the original comment requires a file saved as "Unicode" by Word).
// Returns the number of characters stored in text (a leading byte-order mark
// 0xFEFF is removed and not counted), or 0 on failure or for an empty file.
int LoadUnicText(wchar_t * &text, const char*filename)
{
    if (filename == NULL) {
        printf("Can't open file!");
        return 0;
    }
    FILE *in = fopen(filename, "rb");
    if (in == NULL) {
        printf("Can't open file!");
        return 0;
    }

    // Size the buffer from the file length.
    long byte_len = -1L;
    if (fseek(in, 0L, SEEK_END) == 0)
        byte_len = ftell(in);
    rewind(in);
    if (byte_len < 0) {
        printf("Can't open file!");
        fclose(in);
        return 0;
    }

    // One extra slot so the terminator never lands outside the allocation.
    const size_t capacity = (size_t)byte_len / sizeof(wchar_t);
    wchar_t *buf = (wchar_t*)calloc(capacity + 1, sizeof(wchar_t));
    if (buf == NULL) {
        printf("内存分配失败!\n");
        fclose(in);   // close only our own stream, not every open stream
        return 0;
    }

    // fgetwc() returns wint_t; keeping the result in that type makes the WEOF
    // test reliable. The copy is bounded by the capacity computed above.
    size_t count = 0;
    wint_t ch;
    while (count < capacity && (ch = fgetwc(in)) != WEOF) {
        buf[count++] = (wchar_t)ch;
        if (ch >= 19968 && ch <= 40869)
            wfreqs[ch - 19968]++;
    }
    buf[count] = L'\0';
    fclose(in);

    // Drop a leading byte-order mark in place so that text stays the base of
    // the allocation (the terminator is moved along with the characters).
    if (count > 0 && buf[0] == (wchar_t)65279) {
        memmove(buf, buf + 1, count * sizeof(wchar_t));
        count--;
    }

    text = buf;
    return (int)count;
}
// Builds a sorted suffix index over TEXT: element i initially points at
// TEXT + i, then the pointers are sorted with wcs_cmp (wcscmp order of the
// strings they start).
//
// TEXT     - NUL-terminated text; must hold at least char_num characters.
// char_num - number of leading characters to index.
// Returns a calloc'ed array of char_num + 1 pointers (release with free()).
// The extra last slot, which is not sorted, points at the marker string
// "END!". Returns NULL when TEXT is NULL/empty, char_num is negative, or the
// allocation fails.
wchar_t **GetUnicStringArray(wchar_t *TEXT,int char_num)
{
    // Writable storage for the end marker: a string literal cannot legally be
    // stored in a `wchar_t *` slot in standard C++.
    static wchar_t end_marker[] = L"END!";

    if (TEXT == NULL || *TEXT == L'\0' || char_num < 0) {
        printf("GetUnicStringArray_源文本为空!\n");
        return NULL;
    }

    // calloc either returns the full block or NULL, so no further size probe
    // is needed.
    wchar_t **index = (wchar_t**)calloc((size_t)char_num + 1, sizeof(wchar_t*));
    if (index == NULL) {
        printf("row内存分配失败!\n");
        return NULL;
    }

    for (int i = 0; i < char_num; i++)
        index[i] = TEXT + i;
    index[char_num] = end_marker;

    qsort(index, (size_t)char_num, sizeof(index[0]), wcs_cmp);
    return index;
}
// Builds a sorted suffix index over TEXT, like GetUnicStringArray.
// NOTE(review): the original comment says spaces, tabs and line breaks should
// be excluded, but no filtering has ever been implemented here; every one of
// the first char_num positions is indexed.
//
// TEXT     - NUL-terminated text; must hold at least char_num characters.
// char_num - number of leading characters to index.
// Returns a calloc'ed array (release with free()) whose first char_num
// entries are the sorted suffix pointers. One extra trailing slot points at
// the marker string "END!", so code that peeks at element [char_num] stays
// inside the allocation. Returns NULL when TEXT is NULL/empty, char_num is
// negative, or the allocation fails.
wchar_t **GetUnicStringArray2(wchar_t *TEXT,int char_num)
{
    static wchar_t end_marker[] = L"END!";

    if (TEXT == NULL || *TEXT == L'\0' || char_num < 0) {
        printf("GetUnicStringArray_源文本为空!\n");
        return NULL;
    }

    // calloc either returns the full block or NULL, so no further size probe
    // is needed.
    wchar_t **index = (wchar_t**)calloc((size_t)char_num + 1, sizeof(wchar_t*));
    if (index == NULL) {
        printf("row内存分配失败!\n");
        return NULL;
    }

    for (int i = 0; i < char_num; i++)
        index[i] = TEXT + i;
    index[char_num] = end_marker;

    qsort(index, (size_t)char_num, sizeof(index[0]), wcs_cmp);
    return index;
}
// Prints every occurrence of `string` together with surrounding context.
//
// string     - non-empty search key.
// array      - sorted suffix pointers into one contiguous text; the text is
//              assumed to be array_num characters long (the left context is
//              derived from array_num - wcslen(suffix)).
// array_num  - number of entries in array.
// ContextLen - maximum number of context characters on each side.
// output     - stream receiving one "No.<k>::[...]" line per occurrence. It
//              is flushed but stays open; the caller owns it.
// Returns the number of occurrences, or 0 when nothing was found or an
// argument is missing.
int RetrievalUnicString(const wchar_t *string, wchar_t**array,
                        int array_num,int ContextLen,FILE *output)
{
    if (string == NULL || *string == L'\0') {
        printf("没有输入要查找的字符串!\n");
        return 0;
    }
    if (array == NULL || output == NULL) return 0;
    if (ContextLen < 0) ContextLen = 0;

    const size_t key_len = wcslen(string);   // hoisted out of the scan
    int eg_num = 0;

    for (int i = 0; i < array_num; i++) {
        const wchar_t *hit = array[i];

        if (wcsncmp(hit, string, key_len) != 0) {
            // Matches are adjacent in a sorted index, so the first miss after
            // a hit ends the search. Testing the current entry (instead of
            // peeking at i + 1) keeps every read inside the array.
            if (eg_num > 0) break;
            continue;
        }
        eg_num++;

        const int suffix_len = (int)wcslen(hit);

        // Characters available to the left of the hit, capped by ContextLen.
        int headlen = array_num - suffix_len;
        if (headlen < 0) headlen = 0;
        if (ContextLen < headlen) headlen = ContextLen;

        // Characters available to the right of the key, capped by ContextLen.
        int rearlen = suffix_len - (int)key_len;
        if (ContextLen < rearlen) rearlen = ContextLen;

        const wchar_t *start = hit - headlen;
        const int total = headlen + (int)key_len + rearlen;

        fprintf(output,"No.%d::[",eg_num);
        for (int j = 0; j < total; j++)
            fprintf(output,"%lc",(wint_t)start[j]);
        fprintf(output,"]\n");
    }

    if (eg_num == 0) {
        printf("文本没有找到相应的字符(串)!");
        return 0;
    }
    fflush(output);
    return eg_num;
}
// Returns nonzero when p[offset] is a space and the string does not end
// before that position.
static int cutoff_space_at(const wchar_t *p, size_t offset)
{
    for (size_t k = 0; k < offset; k++)
        if (p[k] == L'\0') return 0;
    return p[offset] == L' ';
}

// Advances p by up to count characters without stepping over the terminator.
static const wchar_t *cutoff_advance(const wchar_t *p, size_t count)
{
    while (count-- > 0 && *p != L'\0') p++;
    return p;
}

// Copies source to output while dropping '[' brackets and part-of-speech tags.
// A tag starts at '/' and is recognised by a space found 5, 6, 2 or 3
// characters later (tested in that order, e.g. "/n]ns", "/nt]ns", "/n",
// "/ns"); the tag and the two characters after it are skipped.
//
// source - NUL-terminated tagged text; not modified.
// output - stream receiving the cleaned text.
// Always returns NULL.
wchar_t *cutoff(const wchar_t *source,FILE *output)
{
    if (source == NULL || output == NULL) return NULL;

    const wchar_t *p = source;
    while (*p != L'\0') {
        // Every look-ahead and skip is bounded by the terminator, so a tag at
        // the very end of the text cannot push the cursor out of the string.
        if (*p == L'/' && cutoff_space_at(p, 5)) p = cutoff_advance(p, 7);
        if (*p == L'/' && cutoff_space_at(p, 6)) p = cutoff_advance(p, 8);
        if (*p == L'/' && cutoff_space_at(p, 2)) p = cutoff_advance(p, 4);
        if (*p == L'/' && cutoff_space_at(p, 3)) p = cutoff_advance(p, 5);
        if (*p == L'[') p++;   // bracket removal has to come last
        if (*p == L'\0') break;
        fprintf(output,"%lc",(wint_t)*p);
        p++;
    }
    return NULL;
}
// Extracts single-character words from tagged text. Whenever a space or '['
// is followed by one character and then '/', that character is written to
// output on its own line.
// (The original author marked this attempt at building a word list directly
// from text as unsuccessful; a multi-character variant that copied tokens
// into `items` was left behind only as commented-out code.)
//
// source - NUL-terminated tagged text; NULL yields 0.
// output - stream receiving one character per line.
// Returns the number of characters written.
int Dicbulid(const wchar_t *source,FILE *output)
{
    if (source == NULL)
        return 0;

    int emitted = 0;
    const wchar_t *cur = source;
    while (*cur != L'\0') {
        const int opens_token = (cur[0] == L' ' || cur[0] == L'[');
        // cur[1] is tested before cur[2] so the look-ahead never passes the
        // terminator.
        if (opens_token && cur[1] != L'\0' && cur[2] == L'/') {
            fprintf(output,"%lc\n",cur[1]);
            cur += 3;
            ++emitted;
            continue;
        }
        ++cur;
    }
    return emitted;
}
// Writes a "pure entries" version of source to output: line breaks and every
// character at or above U+4E00 (the code of '一', used by the original as the
// lowest Chinese character) are kept; everything else is dropped.
//
// source - NUL-terminated text; not modified.
// output - stream receiving the filtered text.
// Always returns NULL.
wchar_t *cleartag(const wchar_t *source,FILE *output)
{
    if (source == NULL || output == NULL) return NULL;

    // The cursor advances on every character. Previously it advanced only on
    // characters >= U+4E00, so any other character stalled the loop forever,
    // and the kept characters were never written.
    for (const wchar_t *p = source; *p != L'\0'; p++) {
        if (*p == L'\n' || *p >= 0x4E00)
            fprintf(output,"%lc",(wint_t)*p);
    }
    return NULL;
}
// Builds a frequency-sorted word table from a text holding one entry per line.
//
// source - NUL-terminated text, entries separated by '\n'; not modified.
//          Empty lines are ignored.
// output - stream receiving each distinct entry followed by its count, most
//          frequent first.
// The table is stored in the global `items` (any previous value of `items`
// is overwritten without being freed).
// Returns the number of '\n' characters in source, or 0 when an argument is
// NULL or the table cannot be allocated.
int creatdic(const wchar_t *source,FILE *output)
{
    if (source == NULL || output == NULL) return 0;

    // Count the entries.
    int Wordcount = 0;
    for (const wchar_t *s = source; *s != L'\0'; s++)
        if (*s == L'\n') Wordcount++;
    printf("词条数[%d]",Wordcount);

    // One extra slot covers a final line that has no trailing '\n'.
    items = (WordItem*)calloc((size_t)Wordcount + 1, sizeof(WordItem));
    if (items == NULL) return 0;

    int unique = 0;
    const wchar_t *line = source;
    while (*line != L'\0') {
        // A line ends at '\n' or at the end of the text, whichever is first.
        const wchar_t *end = line;
        while (*end != L'\0' && *end != L'\n') end++;
        const size_t len = (size_t)(end - line);

        if (len > 0) {
            int found = 0;
            for (int j = 0; j < unique; j++) {
                // Lengths must agree as well; comparing only the first len
                // characters would merge an entry with any longer one that
                // starts with it.
                if (wcslen(items[j].word) == len &&
                    wcsncmp(items[j].word, line, len) == 0) {
                    items[j].freq++;
                    found = 1;
                    break;
                }
            }
            if (!found) {
                // Copy the entry instead of terminating it inside the
                // caller's (const) buffer.
                wchar_t *copy = (wchar_t*)malloc((len + 1) * sizeof(wchar_t));
                if (copy == NULL) break;   // out of memory: keep what we have
                memcpy(copy, line, len * sizeof(wchar_t));
                copy[len] = L'\0';
                items[unique].word = copy;
                items[unique].freq = 1;
                unique++;
            }
        }
        line = (*end == L'\n') ? end + 1 : end;
    }

    qsort(items, (size_t)unique, sizeof(WordItem), freq_cmp);   // by frequency
    for (int i = 0; i < unique; i++)
        fprintf(output,"%ls%d\n",items[i].word,(int)items[i].freq);
    return Wordcount;
}
// Prints every place where string1 is followed, after exactly intelen
// arbitrary characters, by string2, together with surrounding context.
//
// string1, string2 - non-empty search keys.
// array      - sorted suffix pointers into one contiguous text; the text is
//              assumed to be array_num characters long.
// array_num  - number of entries in array.
// intelen    - distance (in characters) between the end of string1 and the
//              start of string2; must not be negative.
// ContextLen - maximum number of context characters on each side.
// output     - stream receiving one "No.<k>::[...]" line per match. It is
//              flushed but stays open; the caller owns it.
// Returns the number of matches printed, or 0 when nothing was found or an
// argument is missing/invalid.
int RetrievalUnic2String(const wchar_t *string1, const wchar_t *string2,
                         wchar_t**array,int array_num,
                         int intelen,// required gap between the two keys
                         int ContextLen,FILE *output)
{
    if (string1 == NULL || *string1 == L'\0' || string2 == NULL || *string2 == L'\0') {
        printf("没有输入要查找的字符串!\n");
        return 0;
    }
    if (array == NULL || output == NULL || intelen < 0) return 0;
    if (ContextLen < 0) ContextLen = 0;

    const size_t len1 = wcslen(string1);
    const size_t len2 = wcslen(string2);
    const size_t span = len1 + (size_t)intelen + len2;   // key1 + gap + key2

    int eg_num = 0;
    int seen_first = 0;   // set once any suffix starting with string1 was met

    for (int i = 0; i < array_num; i++) {
        const wchar_t *hit = array[i];

        if (wcsncmp(hit, string1, len1) != 0) {
            // Suffixes starting with string1 are adjacent in a sorted index.
            if (seen_first) break;
            continue;
        }
        seen_first = 1;

        // string2 is checked directly at its expected offset inside the same
        // suffix; no second index needs to be built for that.
        const size_t suffix_len = wcslen(hit);
        if (suffix_len < span) continue;
        if (wcsncmp(hit + len1 + intelen, string2, len2) != 0) continue;
        eg_num++;

        int headlen = array_num - (int)suffix_len;
        if (headlen < 0) headlen = 0;
        if (ContextLen < headlen) headlen = ContextLen;

        int rearlen = (int)(suffix_len - span);
        if (ContextLen < rearlen) rearlen = ContextLen;

        const wchar_t *start = hit - headlen;
        const int total = headlen + (int)span + rearlen;

        fprintf(output,"No.%d::[",eg_num);
        for (int j = 0; j < total; j++)
            fprintf(output,"%lc",(wint_t)start[j]);
        fprintf(output,"]\n");
    }

    if (eg_num == 0) {
        printf("文本没有找到相应的字符(串)!");
        return 0;
    }
    fflush(output);
    return eg_num;
}
// Returns the length of the common prefix of string1 and string2, or -1 when
// either argument is NULL.
int stringcmp2(const wchar_t *string1,const wchar_t *string2)
{
    if(!string1 || !string2) return -1;

    const wchar_t *p = string1;
    const wchar_t *q = string2;
    // Stop at the end of string1 as well as at the first difference. Testing
    // the pointers themselves (as before) never ended the loop, so identical
    // strings were read past their terminators.
    while (*p != L'\0' && *p == *q) {
        p++;
        q++;
    }
    return (int)(p - string1);
}
// Returns the length of the common prefix of string1 and string2, cut short
// at the first character of string1 that occurs in `puncs`. The terminator of
// string1 also ends the scan, because wcschr() matches the terminating null
// of `puncs`. Returns -1 when either argument is NULL.
int stringcmp (const wchar_t *string1,const wchar_t *string2)
{
    if (string1 == NULL || string2 == NULL)
        return -1;

    const wchar_t *lhs = string1;
    const wchar_t *rhs = string2;
    for (;;) {
        const int differs = (*lhs != *rhs);
        if (differs || wcschr(puncs, *lhs) != NULL)
            return lhs - string1;
        ++lhs;
        ++rhs;
    }
}
// Mutual-information report over a sorted suffix index.
// Consecutive suffixes whose stringcmp() prefix length is at least 2 are
// grouped. For each group of size c with prefix length len (taken from the
// last pair of the group) the routine writes the prefix, c, and
//   log2( (c * len / array_num) / product of wfreqfs[ch] for each prefix
//         character ch in 19968..40869 )
// terminated by '\r'.
// NOTE(review): len is the prefix length of the LAST pair in the group, not
// the minimum over the group — confirm this is intended.
//
// array     - sorted suffix pointers; array_num - number of entries.
// output    - stream receiving the report.
// wfreqfs must already hold the relative character frequencies; a zero entry
// for a character that occurs makes the result infinite.
void IM(wchar_t**array,int array_num,FILE *output)
{
    if (array == NULL || output == NULL) return;

    for (int i = 0; i < array_num - 1; i++) {
        int extra = 0;        // group members beyond the first
        int shared_len = 0;   // prefix length of the last grouped pair

        // The bound keeps array[i + 1] inside the index even when a group
        // runs up to the very last entry.
        while (i + 1 < array_num) {
            const int samelen = stringcmp(array[i], array[i + 1]);
            if (samelen < 2) break;
            i++;
            extra++;
            shared_len = samelen;
        }
        if (extra == 0) continue;   // occurs only once

        double tmpfreq = (double)(extra + 1) * shared_len / array_num;
        for (int j = 0; j < shared_len; j++) {
            const wchar_t ch = array[i][j];
            fprintf(output,"%lc",(wint_t)ch);
            if (ch > 19967 && ch < 40870)   // only characters covered by wfreqfs
                tmpfreq = tmpfreq / (wfreqfs[ch - 19968]);
        }
        fprintf(output,"\t%d\t%lf\r",extra + 1,log(tmpfreq)/log(2));
    }
}
// Writes one line per distinct leading character of the sorted index:
// "[char]\t[code]\t[count]". Characters listed in `puncs1` are left out.
//
// array     - sorted suffix pointers, so equal leading characters are
//             adjacent; array_num - number of entries.
// output    - stream receiving the report.
void chars(wchar_t **array,int array_num,FILE *output)
{
    if (array == NULL || output == NULL) return;

    int i = 0;
    while (i < array_num) {
        const wchar_t ch = array[i][0];

        // Measure the run of entries starting with ch. The bound keeps the
        // scan inside the index, and walking up to array_num (not
        // array_num - 1) ensures the final run is reported as well.
        int count = 1;
        while (i + count < array_num && array[i + count][0] == ch)
            count++;
        i += count;

        if (wcschr(puncs1, ch)) continue;
        fprintf(output,"[%lc]\t[%d]\t[%d]\n",(wint_t)ch,(int)ch,count);
    }
}
void freqcmt(double *hzfreq,int *hzarray,int hzlen,int CorpusSize)
{//计算每个字的相对频率
for(int i=0;i<hzlen;i++){
if(hzarray[i]>0)
hzfreq[i]=(float)hzarray[i]/CorpusSize;
}
}
// Loads the corpus "199801unic.txt", computes relative character frequencies,
// builds the sorted suffix index and writes the mutual-information report to
// "im__re.txt". Returns 0 on success and 1 when a step fails.
int main()
{
    char filename[]="199801unic.txt";

    // The loader receives this pointer by reference; give it a defined,
    // non-NULL starting value instead of an indeterminate one.
    wchar_t placeholder[]=L"";
    wchar_t *text=placeholder;

    setlocale(LC_ALL,"chs");

    int char_num=LoadUnicText(text,filename);
    if(char_num<=0) return 1;   // nothing was loaded; no point in going on
    printf("从文本中成功加载字符%d个。\n",char_num);
    printf("正在处理中!\n");

    freqcmt(wfreqfs,wfreqs,20902,char_num);

    wchar_t **array=GetUnicStringArray(text,char_num);
    if(array==NULL) return 1;
    printf("排序结束!\n");

    FILE *output=fopen("im__re.txt","wt");
    if(output==NULL){
        printf("Can't open file!");
        free(array);
        return 1;
    }

    IM(array,char_num,output);
    printf("处理完毕!");

    fclose(output);
    free(array);
    return 0;
}
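/*
 * Web code-viewer residue (not part of the program): keyboard shortcut help.
 *   Copy code:            Ctrl + C
 *   Search code:          Ctrl + F
 *   Full-screen mode:     F11
 *   Switch theme:         Ctrl + Shift + D
 *   Show shortcuts:       ?
 *   Increase font size:   Ctrl + =
 *   Decrease font size:   Ctrl + -
 */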