📄 分词算法源代码.txt
字号:
// Document.cpp: implementation of the Document class.
//
//////////////////////////////////////////////////////////////////////
#include "stdafx.h"
#include "frequency.h"
#include "Document.h"
#ifdef _DEBUG
#undef THIS_FILE
static char THIS_FILE[]=__FILE__;
#define new DEBUG_NEW
#endif
#include "Document.h"
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//词表操作
void Document::initTmpWordArray()
// Initialises the word table: every bucket head in tmpWordArray
// (indices 0..255) is set to NULL.
{
    int bucket = 0;
    while (bucket < 256)
    {
        tmpWordArray[bucket] = NULL;
        ++bucket;
    }
}
void Document::InCreaseWord(WordType *pWord)
// Adds one to the frequency counter of the given entry.
// A NULL entry is silently ignored.
{
    if (pWord != NULL)
    {
        ++pWord->count;
    }
}
void Document::ArrangeWord(WordType * pWord,int intMethod,int Direction)
// Copies *pWord into lastWordArray[Direction-1] and bumps
// intWordCount[Direction-1].
//   intMethod == 1 : keep the array in descending order of `count`
//   intMethod == 2 : keep the array in descending order of `weight_count`
//   intMethod == 0 : append at the end, no ordering
//   any other value: nothing is inserted
//   Direction      : 1 = forward, 2 = backward (selects the output slot)
// An entry whose key is equal to existing ones is placed before them.
// A NULL pWord or a Direction other than 1/2 is ignored instead of
// dereferencing NULL / indexing outside the two slots.
// NOTE(review): the capacity of lastWordArray is not visible from here, so
// no overflow check is made before inserting - confirm against the header.
{
    int pos;
    int k;

    if (pWord == NULL) return;
    if (Direction != 1 && Direction != 2) return;

    const int slot = Direction - 1;

    if (intMethod == 0)
    {
        // Unsorted mode: plain append.
        lastWordArray[slot][intWordCount[slot]] = *pWord;
        intWordCount[slot]++;
        return;
    }
    if (intMethod != 1 && intMethod != 2) return;

    // Find the first stored entry whose key is not greater than the new key.
    for (pos = 0; pos < intWordCount[slot]; pos++)
    {
        bool notGreater;
        if (intMethod == 1)
            notGreater = (lastWordArray[slot][pos].count <= pWord->count);
        else
            notGreater = (lastWordArray[slot][pos].weight_count <= pWord->weight_count);
        if (notGreater) break;
    }

    // Open a gap at `pos` by shifting the tail one place to the right
    // (no iterations when the new entry belongs at the very end).
    for (k = intWordCount[slot] - 1; k >= pos; k--)
        lastWordArray[slot][k + 1] = lastWordArray[slot][k];

    lastWordArray[slot][pos] = *pWord;
    intWordCount[slot]++;
}
void Document::AddNewWord(char * strWord,int intWordCount,int intWordLength,float weight)
//加入新的词汇
{
//if (IfExtendWordTable==2) return;
if (strlen(strWord)>48 ) return ;
WordType * pWordTmp,*pWordInsert,*pWordStart,*pTmp;
pWordTmp=(WordType *)malloc(sizeof(WordType));
pWordTmp->next=tmpWordArray[(unsigned char)(*strWord)];
pWordTmp->count=intWordCount;
pWordTmp->length=intWordLength;
pWordTmp->weight=weight; //???
strcpy(pWordTmp->word,strWord);
//插入排序
pWordStart=tmpWordArray[(unsigned char)(*strWord)];
pWordInsert=NULL;
while((pWordStart!=NULL)&&(pWordStart->length>intWordLength))
{
pWordInsert=pWordStart;
pWordStart=pWordStart->next;
}
if (pWordInsert==NULL)
{
pTmp=tmpWordArray[(unsigned char)(*strWord)];
tmpWordArray[(unsigned char)(*strWord)]=pWordTmp;
pWordTmp->next=pTmp;
}
else
{
pTmp=pWordInsert;
pWordInsert->next=pWordTmp;
pWordTmp->next =pWordStart;
}
}
void Document::freeWordTable(WordType * pWord)
// Releases one row (linked list) of the word table: every node reachable
// from pWord through `next` is passed to free().
{
    for (WordType *pNext = NULL; pWord != NULL; pWord = pNext)
    {
        // Fetch the successor before the current node is released.
        pNext = pWord->next;
        free(pWord);
    }
}
void Document::ArrangeWordInTable(int YuZhi,int intMethod,int Direction)
//intMethod=1 词条频率 =2 词条文内权重 =0 不排序
//输出词条排序
//tmpWordArray[]按一定的顺序插入到lastWordArray[]中
{
int i;
intWordCount[Direction-1]=0;
WordType *pWord;
for (i=0;i<=255;i++)
{ pWord=tmpWordArray[i];
while (pWord!=NULL)
{ if (pWord->count>=YuZhi) ArrangeWord(pWord,intMethod,Direction);
pWord=pWord->next;
}
}
}
void Document::ResetWordTable()//将词频设置为0
{
int i;
WordType *pWord;
for (i=0;i<=255;i++)
{ pWord=tmpWordArray[i];
while (pWord!=NULL)
{ pWord->count=0;
pWord->weight_count=0;
pWord=pWord->next;
}
}
}
void Document::ClearWordTable()//清空词表内的词条
{
int i;
WordType *pWord;
for (i=0;i<=255;i++)
{ pWord=tmpWordArray[i];
while (pWord!=NULL)
{ pWord->count=0;
pWord->weight_count=0;
pWord=pWord->next;
}
}
}
WordType * Document::FindWord(char *strWord)
// Looks a word up in the table.
// Returns a pointer to the matching entry, or NULL when the word is empty
// or not present. Only the bucket selected by the first byte of strWord
// is searched; the match is an exact strcmp() comparison.
{
    if (*strWord == '\0') return NULL;   // empty word: treated as an error

    WordType *pEntry = tmpWordArray[(unsigned char)(*strWord)];
    for (; pEntry != NULL; pEntry = pEntry->next)
    {
        if (strcmp(strWord, pEntry->word) == 0) break;   // found
    }
    return pEntry;   // NULL when the bucket was exhausted
}
void Document::freeTmpWordArray()
// Releases the whole word table: each bucket's list is handed to
// freeWordTable and the bucket head is then reset to NULL.
{
    int bucket = 0;
    while (bucket < 256)
    {
        freeWordTable(tmpWordArray[bucket]);
        tmpWordArray[bucket] = NULL;
        ++bucket;
    }
}
void Document::initArrangeTable()
// Resets the entry counters of both output slots (intWordCount[0] and
// intWordCount[1]) to zero. No memory is released here.
{
    for (int slot = 0; slot < 2; ++slot)
    {
        intWordCount[slot] = 0;
    }
}
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
//词条文件操作
//筛选后导出词条
void Document::WriteWordToFile(char * strFileName,int ifWriteFre,int Direction)
// Writes the entries of lastWordArray[Direction-1] to a text file, one per line.
//   strFileName : path opened with mode "w"; the function returns silently
//                 when the file cannot be opened
//   ifWriteFre  : 1 = write "<word> <count>", anything else = write the word only
//   Direction   : selects the slot; for Direction == 2 each word is passed
//                 through ReverseString before it is written
{
    FILE *fOut = fopen(strFileName, "w");
    if (fOut == NULL) return;

    const int slot = Direction - 1;
    const int total = intWordCount[slot];
    char strOut[50];

    int idx = 0;
    while (idx < total)
    {
        // Produce the text to print for this entry.
        if (Direction != 2)
            strcpy(strOut, lastWordArray[slot][idx].word);
        else
            ReverseString(lastWordArray[slot][idx].word, strOut);

        if (ifWriteFre != 1)
            fprintf(fOut, "%s\n", strOut);
        else
            fprintf(fOut, "%s %d\n", strOut, lastWordArray[slot][idx].count);

        ++idx;
    }

    fclose(fOut);
}
void Document::copylastword(WordType *pWordtype,int *wordnum)
// Copies the word text and frequency of every entry in slot 0 of
// lastWordArray into the caller's array.
//   pWordtype : destination array; when NULL nothing is copied
//   wordnum   : receives intWordCount[0]; may be NULL
// Only the `word` and `count` fields of each destination element are written.
// NOTE(review): the destination must have room for intWordCount[0]
// elements; its size cannot be checked from here.
{
    const int total = intWordCount[0];

    if (wordnum != NULL) *wordnum = total;
    if (pWordtype == NULL) return;

    for (int i = 0; i < total; i++)
    {
        strcpy(pWordtype[i].word, lastWordArray[0][i].word);
        pWordtype[i].count = lastWordArray[0][i].count;
    }
}
int Document::ReadFileFromText(char* strFileName)
// Reads a whole file into a freshly malloc'ed, NUL-terminated buffer with
// every '\r' and '\n' byte removed, and stores that buffer in strSource.
//   strFileName : path of the file, opened in binary mode
// Returns 1 on success. Returns 0 when the file cannot be opened, its size
// cannot be determined, or memory cannot be allocated; strSource is left
// unchanged in those cases.
// NOTE(review): a buffer previously held by strSource is not released here
// (the original had that free() commented out) - confirm who owns it.
{
    /* if (strSource!=NULL) free(strSource); */
    if (strFileName == NULL) return 0;

    FILE *fp = fopen(strFileName, "rb");
    if (fp == NULL)
    {
        //AfxMessageBox(strFileName);
        return 0;
    }

    // Determine the file size; ftell reports -1 on failure.
    long lFileLength = -1;
    if (fseek(fp, 0, SEEK_END) == 0) lFileLength = ftell(fp);
    if (lFileLength < 0)
    {
        fclose(fp);
        return 0;
    }
    rewind(fp);

    char *strBuffer = (char *)malloc((size_t)lFileLength + 1);
    if (strBuffer == NULL)
    {
        fclose(fp);
        return 0;
    }

    // Use the number of bytes actually read so that a short read never
    // exposes uninitialised memory.
    size_t nRead = fread(strBuffer, 1, (size_t)lFileLength, fp);
    fclose(fp);

    // Strip line breaks in place: nKept never runs ahead of i, so no second
    // buffer is needed and the terminator always lands inside the allocation.
    size_t nKept = 0;
    for (size_t i = 0; i < nRead; i++)
    {
        const char ch = strBuffer[i];
        if (ch != '\r' && ch != '\n')
        {
            strBuffer[nKept] = ch;
            nKept++;
        }
    }
    strBuffer[nKept] = '\0';

    strSource = strBuffer;
    return 1;
}
int Document::GetCharFromString(char * strSource,int intStartPlace,char * strReturn,int * intCharLength)
//取得源串中的从绝对地址开始的字符
//返回值:0 末尾;1 字符 ;2 符号 ;-1 出错 ;4 回车
// 汉语
{
unsigned char char1,char2;
char strVertueWord[1000];
int intStrLength;
WordType * pWord;
int i;
char1=(unsigned char)(*(strSource+intStartPlace));
if (char1==0) return 0;
char2=(unsigned char)(*(strSource+intStartPlace+1));
if (char1>128)
{
if (char2==0) return -1;
*strReturn=(char)(char1);
*(strReturn+1)=(char)(char2);
*(strReturn+2)='\0';
*intCharLength=2;
if (Direction==2) ReverseString("-—;/ 。,:、!?的了在你我她它他到是不和《〉》〈~【】〖〗×∥■▲○●→─·…★“”‘’()—",strVertueWord);
else strcpy(strVertueWord,";-—/ 。,:、!?的了在你我她它他到是不和《〉 》〈~【】〖〗×∥■▲○●→─·…★“”‘’()—");
if (strstr(strVertueWord,strReturn)==NULL) return 1;
else if ((strstr(strVertueWord,strReturn)-strVertueWord)%2==1) return 1;
else return 2;
}
*strReturn=(char)(char1);
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -