📄 mycode.h

📁 专门处理UNICODE编码文件的工作区
💻 H
字号:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<locale.h>
#include<wchar.h>
#include<malloc.h>
#include<io.h>
// One entry of the word list: a wide-character word plus a frequency counter.
// NOTE(review): no code visible in this file fills these fields; the
// GetItems/GetItems2 routines only receive a pointer to such entries.
struct word_item{
	wchar_t *word;		// text of the entry (ownership not established here)
	unsigned int freq;	// frequency counter for the entry
};
typedef struct word_item WordItem;
// Separator characters (space, LF, CR, TAB) used when scanning backwards for
// the start of a token. The pointee is const because it refers to a string
// literal, which must never be written through.
const wchar_t *Puncs=L" \n\r\t";
int LoadUnicText(wchar_t * &text, const wchar_t *filename);//Read a file into memory via `text`; per the original note the file must be saved from Word as Unicode. Returns a character count, 0 on failure.
int wcs_cmp(const void *p,const void *q);//qsort-style comparator: compares the two pointed-to wide strings with wcscmp
unsigned int gbk2uni(const wchar_t *inname,const wchar_t *outname);
//Read a GBK text file (inname) and write it out as a Unicode file (outname), preceded by the 65279 marker
wchar_t **GetUnicStringArray(wchar_t *TEXT,int char_num);//Build the index: a sorted array of pointers to the start of each token in TEXT
int stringcmp (const wchar_t *string1,const wchar_t *string2);
//Compare the word part (text before '/') of two tokens; returns its length when identical, otherwise 0
int stringcmp2 (const wchar_t *string1,const wchar_t *string2);
//Compare word + part-of-speech tag of two tokens (e.g. "人们/n"); returns that length when identical, otherwise 0
void GetItems(WordItem *items,wchar_t**array,int array_num,FILE *output);	
//Collect entries from the sorted pointer array and write "word<TAB>count" lines.
//Parameters: word-list struct array, pointer array, number of entries, output stream
void GetItems2(WordItem *items,wchar_t**array,int array_num,FILE *output);
//Same as GetItems but groups by word + tag and writes "word<TAB>tag<TAB>count" lines.
//Parameters: word-list struct array, pointer array, number of entries, output stream
int writefile(const wchar_t *filename,FILE *out);
//Copy the contents of the named input file to the already-open stream `out`
int countfilenum(const wchar_t *filename);//Number of wide characters in the file (byte length / sizeof(wchar_t))

//////////////////////////////////////////////////
// Comparator for qsort() over an array of wide-string pointers.
// p and q each point at an array element (a `wchar_t *`); the result is
// whatever wcscmp() reports for the two strings, i.e. code-unit order.
int wcs_cmp(const void *p,const void *q)
{
	const wchar_t *left=*(const wchar_t *const *)p;
	const wchar_t *right=*(const wchar_t *const *)q;
	return wcscmp(left,right);
}
// Load a wide-character ("Unicode", as saved by Word) file into a newly
// allocated, NUL-terminated buffer.
//
//   text      out: receives the buffer on success. Any previous value is
//             overwritten, not freed. The pointer is the start of the
//             allocation, so the caller can release it with free().
//   filename  path of the file, opened in binary mode.
//
// Returns the number of characters stored (a leading 65279 / U+FEFF marker is
// stripped and not counted), or 0 on failure, in which case `text` is left
// unchanged.
int LoadUnicText(wchar_t * &text, const wchar_t *filename)
{
	FILE *in=_wfopen(filename,L"rb");
	if(in==NULL){
		AfxMessageBox(L"Can't open file!");
		return 0;
	}

	// _filelength reports -1 on error; treat that like an unreadable file.
	long byte_len=_filelength(_fileno(in));
	if(byte_len<0){
		AfxMessageBox(L"Can't open file!");
		fclose(in);
		return 0;
	}
	int char_num=(int)(byte_len/(long)sizeof(wchar_t));

	// One extra element for the terminating NUL written below.
	wchar_t *buf=(wchar_t*)calloc((size_t)char_num+1,sizeof(wchar_t));
	if(buf==NULL){
		AfxMessageBox(L"内存分配失败!\n");
		fclose(in);	// close only our own stream, not every open stream
		return 0;
	}

	// Copy at most char_num characters so the buffer can never be overrun,
	// and remember how many were really read.
	int count=0;
	wint_t ch;
	while(count<char_num && (ch=fgetwc(in))!=WEOF){
		buf[count++]=(wchar_t)ch;
	}
	buf[count]=L'\0';
	fclose(in);

	// Drop the leading marker by shifting the text (terminator included)
	// down one slot, instead of handing out a pointer into the middle of
	// the allocation.
	if(count>0 && buf[0]==0xFEFF){
		memmove(buf,buf+1,(size_t)count*sizeof(wchar_t));
		count--;
	}

	text=buf;
	return count;
}
// Convert a text file to a wide-character file.
//
// The input is opened in text mode after switching the process locale to
// "chs" (per the original note the input is GBK; the locale change is global
// and is not restored). The output is opened in binary mode and starts with
// the 65279 (U+FEFF) marker, followed by every character read.
//
// Returns the number of characters copied (marker not counted), or 0 if
// either file cannot be opened.
unsigned int gbk2uni(const wchar_t *inname,const wchar_t *outname)
{
	setlocale(LC_ALL,"chs");

	FILE *in=_wfopen(inname,L"rt");
	if(in==NULL){
		AfxMessageBox(L"Cant open file!");
		return 0;
	}
	FILE *out=_wfopen(outname,L"wb");
	if(out==NULL){
		AfxMessageBox(L"Cant open file!");
		fclose(in);	// do not leak the input stream
		return 0;
	}

	// Byte length is an upper bound on the number of characters to copy.
	long length=_filelength(_fileno(in));

	fputwc(0xFEFF,out);	// Unicode marker (65279)

	unsigned int copied=0;
	wint_t ch;
	while((long)copied<length && (ch=fgetwc(in))!=WEOF){
		fputwc((wchar_t)ch,out);
		copied++;
	}

	// Close only the two streams opened here; other open files are untouched.
	fclose(in);
	fclose(out);
	return copied;
}
// Build a sorted index of the tokens in TEXT.
//
// A token is recognised by a '/' that is followed by a character below 128;
// its start is found by walking backwards from the '/' to the nearest
// character contained in Puncs (or to the beginning of TEXT).
//
//   TEXT      NUL-terminated text; TEXT[char_num] is read, so the buffer must
//             hold at least char_num+1 elements.
//   char_num  number of characters to scan.
//
// Returns a calloc'ed array of char_num+1 pointers into TEXT. The first
// entries (as many as tokens were found) are sorted with wcs_cmp, the
// remaining slots are NULL, and slot [char_num] points at the literal "END!".
// Returns NULL when TEXT is empty, char_num is negative, or allocation fails.
// The token count is only reported through a message box.
wchar_t **GetUnicStringArray(wchar_t *TEXT,int char_num)
{
	if(TEXT==NULL || *TEXT==L'\0' || char_num<0) {
		AfxMessageBox(L"GetUnicStringArray_源文本为空!\n");
		return NULL;
	}

	wchar_t **pp=(wchar_t**)calloc((size_t)char_num+1,sizeof(wchar_t*));
	if(pp==NULL){
		AfxMessageBox(L"row内存分配失败!\n");
		return NULL;
	}
	pp[char_num]=(wchar_t*)L"END!";	// end marker; read-only literal

	int found=0;
	for(int j=0;j<char_num;j++){
		if(TEXT[j]!=L'/' || TEXT[j+1]>=128) continue;

		// Walk back to the separator in front of the token. The n_head<=j
		// bound stops at TEXT[0], so a text that does not begin with a
		// separator no longer reads in front of the buffer.
		int n_head=1;
		while(n_head<=j && !wcschr(Puncs,TEXT[j-n_head])){
			n_head++;
		}
		pp[found++]=&TEXT[j-n_head+1];	// first character of the token
	}

	CString msg;
	msg.Format(L"文本中有词例_%d个",found);
	AfxMessageBox(msg);

	qsort(pp,found,sizeof(pp[0]),wcs_cmp);
	return pp;
}
/*
int stringcmp (const wchar_t *string1,const wchar_t *string2)
{//比较两个字符串中相同词条的长度
	if(!string1 || !string2) return -1;
	wchar_t *p=(wchar_t *)string1,*q=(wchar_t *)string2;
	while(p && q){
		if(*p!=*q || *(p)==L'/' || *(q)==L'/') return p-string1;
		p++,q++;
	}
	return 0;
}
*/
// Length of one "word/tag" item at the start of `string`.
//
// The item runs from string[0] through the first '/' and on to (but not
// including) the next space, or to the end of the string when no space
// follows.
//
// Returns that length, or -1 when `string` is NULL or contains no '/'.
int itemlen(const wchar_t *string)
{
	if(!string) return -1;
	const wchar_t *p=string;

	// Find the '/' separating word and tag; never run past the terminator.
	while(*p!=L'\0' && *p!=L'/') p++;
	if(*p==L'\0') return -1;
	p++;

	// The tag ends at the next space or at the end of the string.
	while(*p!=L'\0' && *p!=L' ') p++;
	return (int)(p-string);
}
// Compare the word part (everything before the first '/') of two tokens,
// e.g. "人们/n" and "人们/v" share the word "人们".
//
// Returns the length of the word when both words are identical, 0 when they
// differ or when either string has no '/', and -1 when either argument is
// NULL. An empty word (string starting with '/') also yields 0.
int stringcmp(const wchar_t *string1,const wchar_t *string2)
{
	if(!string1 || !string2) return -1;

	// wcschr stops at the terminator, so a missing '/' cannot run off the end.
	const wchar_t *slash1=wcschr(string1,L'/');
	const wchar_t *slash2=wcschr(string2,L'/');
	if(!slash1 || !slash2) return 0;

	int len1=(int)(slash1-string1);
	int len2=(int)(slash2-string2);
	if(len1!=len2) return 0;

	return wcsncmp(string1,string2,(size_t)len1)==0 ? len1 : 0;
}
int stringcmp2 (const wchar_t *string1,const wchar_t *string2)
{//比较两个字符串相同词条+词性的长度，如，人们/n  ,返回整个的长度
	if(!string1 || !string2) return -1;
	wchar_t *p=(wchar_t *)string1,*q=(wchar_t *)string2;
	int len1=itemlen(p);
	int len2=itemlen(q);
	if (len1!=len2) return 0;
	while(p && q && (len1>-1) && (len2>-1)){
		if(len1==0) return p-string1;
		if(*p!=*q) return 0;
		p++,q++;
		len1--;
		len2--;
		
	}
	return 0;
}

// Write one "word<TAB>count" line per distinct word to `output`.
//
//   items      not used by this routine.
//   array      pointer array whose entries point at "word/tag" tokens;
//              identical words must be adjacent (i.e. the array is sorted).
//   array_num  number of entries in `array` to process.
//   output     open stream receiving the result.
//
// Consecutive entries for which stringcmp() reports the same word are counted
// as one group. At most the first 20 characters of a word are printed. The
// process locale is switched to "chs" for the wide-character output.
void GetItems(WordItem *items,wchar_t**array,int array_num,FILE *output)
{
	(void)items;
	if(array==NULL || output==NULL) return;
	setlocale(LC_ALL,"chs");

	// Visit every entry, including the last one, so a final word that
	// differs from its predecessor is not lost.
	for(int i=0;i<array_num;i++){
		if(array[i]==NULL) continue;	// unused slot

		// Absorb following entries that carry the same word; never look
		// beyond array_num. stringcmp() returns -1 for a NULL neighbour,
		// which ends the group as well.
		int freq=1;
		while(i+1<array_num && stringcmp(array[i],array[i+1])>=1){
			i++;
			freq++;
		}

		const wchar_t *word=array[i];
		for(int j=0;j<20;j++){
			if(word[j]==L'/' || word[j]==L'\0')
				break;
			fprintf(output,"%lc",word[j]);
		}
		fprintf(output,"\t%d\n",freq);
	}
}
// Write one "word<TAB>tag<TAB>count" line per distinct word+tag pair to
// `output`.
//
//   items      not used by this routine.
//   array      pointer array whose entries point at "word/tag " tokens;
//              identical items must be adjacent (i.e. the array is sorted).
//   array_num  number of entries in `array` to process.
//   output     open stream receiving the result.
//
// Consecutive entries for which stringcmp2() reports the same item are
// counted as one group. The word is printed up to its '/' (at most 20
// characters); the tag is printed from the following position up to the next
// space, stopping at index 30. The process locale is switched to "chs" for
// the wide-character output.
void GetItems2(WordItem *items,wchar_t**array,int array_num,FILE *output)
{
	(void)items;
	if(array==NULL || output==NULL) return;
	setlocale(LC_ALL,"chs");

	// Visit every entry, including the last one, so a final item that
	// differs from its predecessor is not lost.
	for(int i=0;i<array_num;i++){
		if(array[i]==NULL) continue;	// unused slot

		// Absorb following entries with the same word+tag; never look
		// beyond array_num. stringcmp2() returns -1 for a NULL neighbour,
		// which ends the group as well.
		int freq=1;
		while(i+1<array_num && stringcmp2(array[i],array[i+1])>=1){
			i++;
			freq++;
		}

		const wchar_t *item=array[i];
		int j=0;
		for(;j<20;j++){	// word part
			if(item[j]==L'/' || item[j]==L'\0')
				break;
			fprintf(output,"%lc",item[j]);
		}
		fprintf(output,"\t");

		// Tag part: starts one past the position where the word loop
		// stopped. Skipped entirely when that position is the terminator.
		if(item[j]!=L'\0'){
			for(j++;j<30;j++){
				if(item[j]==L' ' || item[j]==L'\0')
					break;
				fprintf(output,"%lc",item[j]);
			}
		}

		fprintf(output,"\t%d\n",freq);
	}
}
// Number of wide characters in a file, computed as its byte length divided by
// sizeof(wchar_t). The file is opened only to query its length; nothing is
// read, so a leading Unicode marker is included in the count.
//
// Returns 0 when the file cannot be opened or its length cannot be obtained.
int countfilenum(const wchar_t *filename)
{
	FILE *in=_wfopen(filename,L"rb");
	if(in==NULL){
		AfxMessageBox(L"Can't open file!");
		return 0;
	}
	long byte_len=_filelength(_fileno(in));
	fclose(in);

	// _filelength reports -1 on error; dividing that by the unsigned
	// sizeof would produce a meaningless count.
	if(byte_len<0) return 0;
	return (int)(byte_len/(long)sizeof(wchar_t));
}
// Copy the wide characters of the named file to the already-open stream
// `out`. The input is opened in binary mode and copied from its first
// character on, so a leading Unicode marker is copied too.
//
// Returns the number of characters actually copied; 0 when `out` is NULL, the
// file cannot be opened, or its length cannot be obtained.
int writefile(const wchar_t *filename,FILE *out)
{
	if(out==NULL) return 0;

	FILE *in=_wfopen(filename,L"rb");
	if(in==NULL){
		AfxMessageBox(L"Can't open file!");
		return 0;
	}

	long byte_len=_filelength(_fileno(in));
	if(byte_len<0){
		fclose(in);
		return 0;
	}
	int char_num=(int)(byte_len/(long)sizeof(wchar_t));

	int copied=0;
	while(copied<char_num){
		wint_t ch=fgetwc(in);
		// Stop on a real end-of-file or read error instead of writing the
		// WEOF value into the output; a stored 0xFFFF is still copied.
		if(ch==WEOF && (feof(in) || ferror(in))) break;
		fputwc((wchar_t)ch,out);
		copied++;
	}
	fclose(in);
	return copied;
}
💿 文件大小 5857 K
👤 上传用户 wuseyue
📂 所属分类多国语言处理
🏷️ 相关标签

#UNICODE #编码
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -