// File: ssps.h
// (Web code-viewer residue: "Font size:" label; not part of the original header.)
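/***************************************************************
* Project: Integrated Natural Language Processing System
* Author: CISTR BUPT
* Modified by: Li Yun (liyun@nlu.caai.cn)
* Description: Fast word segmentation with a single-scan algorithm (SSPS),
*              including dictionary management
* Main functions: see the declarations in this header
* Version: 1.0
* Changes: added DF statistics; encapsulated the feature-extraction functionality
* References: IMFS 1.0 related materials
**************************************************************/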
//#define FOR_LINUX
//#include "function.h"
#pragma warning(disable:4786)
#ifndef _SSPSNEW_H_051213_
#define _SSPSNEW_H_051213_
#define MAXCHWORDLEN 100
#define MAXWORDLISTCOUNT 5
#include <string>
#include <map>
#include <cstring>
#include <vector>
#include <utility>
#include <cstdio>
//#include "sys/direct.h"
#include "stdlib.h"
#include <sys/types.h>
#include <sys/stat.h>
#include "ctype.h"
//#include <time.h>
//#include <afxcoll.h>
using namespace std;
#include "EigenVectorSelect.h"
// CSSPS: single-scan fast word segmenter (SSPS) with dictionary management,
// DF counting and text-vectorization entry points. Only declarations are
// visible in this header; the behavior notes below are taken from the original
// comments and from the signatures.
//
// NOTE(review): the class holds raw FILE* and raw pointer members and declares
// a destructor, but declares no copy constructor or copy assignment — confirm
// that instances are never copied.
// NOTE(review): __int16 / __int32 are Microsoft-specific types; non-MSVC builds
// presumably need equivalent typedefs — confirm.
// NOTE(review): set<string> is used below, but this header does not include
// <set> directly; presumably it arrives via "EigenVectorSelect.h" — confirm.
class CSSPS
{
public:
CSSPS();
// Initialization.
// DicFileName: dictionary file name. method_in / kwdnum_in: feature-selection
// method and keyword count (defaults DF_EVS and 200). vecfiledir_in /
// testfiledir_in: directories, defaulting to the Windows-style relative paths
// "..\\Dic" and "..\\Test".
bool Init(const string& DicFileName,EvsMethod method_in= DF_EVS,unsigned int kwdnum_in=200 ,const string& vecfiledir_in="..\\Dic",const string& testfiledir_in="..\\Test");
virtual ~CSSPS();
// Open and close the dictionary.
int DicOpenDic(const char *DicFileName);
int DicCloseDic();
// Dictionary management: delete a word.
int WDel(unsigned char *cWord);
// Dictionary management: add a word (cPos is presumably its part-of-speech tag — confirm).
int WInsert(unsigned char* cWord,unsigned char* cPos);
// Fast segmentation: input is sBuffer, output is strResult; the delimiter is a space.
// NOTE(review): the roles of tag1/tag2 and of wordvec are not documented here.
void Segment(unsigned char* sBuffer, unsigned char* strResult,char tag1,char tag2);
void Segment(unsigned char* sBuffer, unsigned char* strResult,vector< pair<string,int> > &wordvec);
void SSPS(char* lpszBuffer,char* targetBuf);
long filelength1(int);
long get_runtime();
void *emalloc(__int32 i);
int OperateDic(const char *IFileName,const char *DicFileName);
unsigned char* loadkwd(unsigned char* filename);
int OperateDic( int flag,const char *IFileName,const char *DicFileName,char tag1,char tag2 );
// Overload of DicCloseDic that takes a dictionary file name.
int DicCloseDic(const char *DicFileName);
unsigned int GetVecInDoc(const unsigned char* sBuffer,map<string,unsigned int>&resmap)const;
bool exist(unsigned char * word, unsigned char len,unsigned char * pos);
// Obtains the text count while performing a single fast-segmentation pass.
//this function will call GetVecInDoc set onceonly = true
unsigned int CountDf(const char *path,const char *wordname,const char *ResultVecFileDir=NULL);
// Update the training data.
bool UpdateAllData();
// Vectorize a string (used for classification).
string GetVecStr(const string&strin)const;
// Read a text file and vectorize it.
string GetVecStrFromFile(const string&filename)const;
// Change the feature-selection method and keyword count (default 200).
bool ChangeEvsMethod(EvsMethod method_in,unsigned int kwdnum_in=200);
// Vectorize the texts of a corpus and train one class.
bool TrainFiles(const char *TrainFileDir,const char *ClassName);
// Vectorize the test texts of a corpus.
bool TrainTestFiles(const char *TrainFileDir,const char *ClassName);
// Get all current class names.
unsigned short GetClassNames(set<string>& namevec);
private:
EvsMethod method;
unsigned int kwdnum;
string vecfiledir;
string testfiledir;
const char D; // Special character used as the segmentation marker; inserted between Chinese words as a separator.
const char E; // NOTE(review): purpose is not documented in this header.
FILE* fLex;
// One dictionary entry; entries are chained through ptrNext.
struct aWORDdic{
unsigned char * sCIYU;// The word.
unsigned char Len;// Length of the word.
unsigned int iWordId[MAXWORDLISTCOUNT];
unsigned char * pos;
aWORDdic* ptrNext;
};
// Index-table cell: a count plus a pointer to a list of aWORDdic entries.
struct WIndexEntry{
__int16 WCount;
struct aWORDdic *WList;
}WIndexcom[94][94];// Pointer index table for Chinese characters. NOTE(review): the original comment says "global variable, 72 zones of 94 Hanzi each", but this is a class member dimensioned 94 x 94 — confirm.
struct WIndexEntry CIndexcom[128]; // Pointer index table for characters, digits, etc. (the original comment calls it a global variable; it is a class member).
// Per the original comment: the index table is read from the pointer file
// (Ssps.ptr in the current directory) and the dictionary body file
// (Ssps.lex in the current directory) is opened.
FILE *fwptr; // Dictionary pointer file.
FILE *fword; // Dictionary body (text) file.
map<string,unsigned int> allwords; //kwdid list
int isHanzi(unsigned char ch1,unsigned char ch2)const;
int isASCII(unsigned char ch)const;
int wbspace(unsigned char ch)const;
// NOTE(review): the name looks like a typo for "GetMethodStr"; left unchanged because the definition lives elsewhere.
const string GetMethoStr()const;
int findword(int count,char *buffer,struct aWORDdic *Wptr);
//read a word from a string
unsigned char* GetaWord(unsigned char*,unsigned int& ,unsigned char*,unsigned char*,char,char) const;
void insertwordtomap(const unsigned char*start,unsigned int len, map<string,unsigned int>&resmap)const;
void freeDic();
};
#endif
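// Keyboard shortcuts (web code-viewer residue; not part of the original header):
//   Copy code:          Ctrl + C
//   Search code:        Ctrl + F
//   Full-screen mode:   F11
//   Toggle theme:       Ctrl + Shift + D
//   Show shortcuts:     ?
//   Increase font size: Ctrl + =
//   Decrease font size: Ctrl + -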