// File: cataloglist.h
// CatalogList.h: interface for the CCatalogList class.
//
//////////////////////////////////////////////////////////////////////
#if !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_)
#define AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_
#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000
#include "wordlist.h"
#include <malloc.h>
#include "svm.h"
class CDocNode;

// Overload of the MFC SerializeElements collection helper for arrays of
// CDocNode elements (declaration only; the definition is not in this header).
void AFXAPI SerializeElements(CArchive& ar, CDocNode* pElements, int nCount);
// Plain record that groups a document node pointer with a document name,
// a class name and a similarity ratio.
// NOTE(review): how the fields are filled in, and who owns m_pDocNode, is not
// visible in this header -- confirm against the callers.
struct DocInfo
{
    CDocNode* m_pDocNode;   // pointer to a document node
    CString   m_sDocName;   // document name
    CString   m_sClsName;   // presumably the class (category) name -- confirm
    double    m_dSimRatio;  // presumably a similarity ratio -- confirm
};
// Records the weight of one feature (one dimension) of a document vector.
struct sWeightNode
{
    int    s_idxWord;   // ID of the feature
    short  s_tfi;       // number of times the feature occurs in the document
    double s_dWeight;   // weight of the feature
};
// Document node: holds one document's identifier, name, weight vector
// (m_sWeightSet) and per-category results (m_pResults).
// Only declarations are visible in this header; behaviour lives in the
// implementation file.
class CDocNode : public CObject
{
public:
    CDocNode();
    ~CDocNode();
    const CDocNode& operator=(const CDocNode& x);

    // Buffer management. NOTE(review): presumably these manage m_pTemp,
    // m_sWeightSet and m_pResults respectively -- confirm in the .cpp.
    static void AllocTempBuffer(int nLen);
    static void DeallocTempBuffer();
    void AllocBuffer(int nLen);
    void DeallocBuffer();
    void AllocResultsBuffer(short nLen);
    void DeallocResultsBuffer();

    // Build the document's feature attributes by means of word segmentation.
    int ScanChinese(char*, CWordList&, int, short idxCata = -1);
    int ScanEnglish(char*, CWordList&, int, short idxCata = -1, bool bStem = false);
    static int ScanChineseString(char*, CWordList&, int, long, short idxCata = -1);
    static int ScanEnglishString(char*, CWordList&, int, long, short idxCata = -1, bool bStem = false);

    // Compute the weight of every dimension of the document from the
    // dictionary wordList, forming the document vector and storing it in the
    // array m_sWeightSet.
    // These methods require that m_dWeight of every word node in wordList has
    // already been set to that feature's inverse document frequency.
    int ScanChineseWithDict(char*, CWordList&);
    int ScanEnglishWithDict(char*, CWordList&, bool bStem = false);
    static int ScanChineseStringWithDict(char* pPath, CWordList& wordList);
    static int ScanEnglishStringWithDict(char* pPath, CWordList& wordList, bool bStem);

    int GenDocVector();
    int GenDocVector(DOC& doc);

    // Compute the similarity against the vector stored in m_pTemp.
    double ComputeSimilarityRatio();

    void Serialize(CArchive& ar);
    CDocNode(const CDocNode& x);
    int GetWordNum();
    BOOL IsZero();
    double ComputeProbability(CWordList& wordList, int n);

    long         m_idxDoc;       // document identifier
    CString      m_strDocName;   // document name
    int          m_nAllocLen;    // length of the document vector, i.e. of the array m_sWeightSet
    sWeightNode* m_sWeightSet;   // list of word weights
    short        m_nClassNum;    // total number of categories in the training documents,
                                 // i.e. the size of the array m_pResults
    double*      m_pResults;     // similarity between the document and each category
    short        m_nCataID;      // category the current document belongs to; used only
                                 // during classification, so serialization does not
                                 // touch this attribute

    static sWeightNode* m_pTemp;          // temporary memory block needed while generating a document vector
    static int          m_nAllocTempLen;  // size of the temporary memory block

private:
    static char m_pSentence[MAX_PATH * 10];

    bool IsNumber(char* p);
    static int ParseFile(char*, int, int&);
};
class CCatalogNode;
class CCatalogList;

// Overload of the MFC SerializeElements collection helper for arrays of
// CCatalogNode elements (declaration only; the definition is not in this header).
void AFXAPI SerializeElements(CArchive& ar, CCatalogNode* pElements, int nCount);
// Category node: carries a category index, names, a word counter and a list of
// CDocNode objects (m_lstDocList).
// Only declarations are visible in this header; behaviour lives in the
// implementation file.
class CCatalogNode
{
public:
    CCatalogNode();
    ~CCatalogNode();
    CCatalogNode(const CCatalogNode& x);
    const CCatalogNode& operator=(const CCatalogNode& x);
    const CCatalogNode& operator+=(const CCatalogNode& x);

    void InitCatalogNode(int nMode = 0);
    void SetStartDocID(long lDocID);

    // POSITION-based access to documents.
    // NOTE(review): presumably these operate on m_lstDocList -- confirm in the .cpp.
    CDocNode& GetNext(POSITION& rPos);
    POSITION GetFirstPosition();
    POSITION AddDoc(CDocNode& docnode);

    void Serialize(CArchive& ar);
    UINT GetDocNum();

    // Scan all documents under the given path and add them to this category node.
    long ScanDirectory(CString);

    CDocNode& GetAt(POSITION position);

    short   m_idxCata;         // NOTE(review): presumably the category index -- confirm
    CString m_strCatalogName;  // NOTE(review): presumably the category name -- confirm
    CString m_strDirName;      // NOTE(review): presumably the category's directory -- confirm
    long    m_lTotalWordNum;   // NOTE(review): presumably a total word count -- confirm

private:
    CList<CDocNode, CDocNode&> m_lstDocList;  // list of CDocNode objects
    long                       m_lCurDocID;   // NOTE(review): presumably the current/next document ID -- confirm
};
// Category list: wraps a list of CCatalogNode objects (m_lstCatalogList) and
// declares file dump/load, lookup and counting operations.
// Only declarations are visible in this header; behaviour lives in the
// implementation file.
class CCatalogList
{
public:
    CCatalogList();
    virtual ~CCatalogList();
    const CCatalogList& operator=(const CCatalogList& x);
    const CCatalogList& operator+=(const CCatalogList& x);

    void InitCatalogList(int nMode = 0);

    // File-based operations; each takes a file or directory name.
    void DumpToFile(CString strFileName, int nSaveMode = 0);
    BOOL GetFromFile(CString strFileName);
    void DumpDocList(CString strFileName);
    long BuildLib(CString strDirName);

    static int GetSaveMode();
    void DumpCataList(CString strFileName);
    bool BuildCatalogID(CCatalogList&);

    // Lookups by name or by category index.
    CCatalogNode* GetCataByName(CString strCataName);
    short GetCataIDByName(CString strCataName);
    short GetCataIDArrayFromString(char* line, CArray<short, short>& aryCataID);
    CDocNode* GetDocByName(CString strDocName);
    CCatalogNode* GetCata(short idxCata);

    // POSITION-based access; the const overload returns a copy, the
    // non-const overload returns a reference.
    CCatalogNode GetAt(POSITION pos) const;
    CCatalogNode& GetAt(POSITION pos);
    CCatalogNode& GetNext(POSITION& rPos);
    POSITION GetFirstPosition();
    POSITION AddCata(CCatalogNode& catanode);

    bool GetDocName(short idxCata, long idxDoc, CString& strDocName);
    bool GetCataName(short idxCata, CString& strCataName);
    int GetCataNum();
    long GetDocNum();

private:
    void Serialize(CArchive& ar);
    long ScanDirectory(CString strDirName);

    static int m_nSaveMode;  // 0: save document vectors, 1: do not save document vectors
    CList<CCatalogNode, CCatalogNode&> m_lstCatalogList;  // list of CCatalogNode objects
};
#endif // !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_)
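// Code-viewer page residue (not part of the original header), kept as a comment:
// keyboard shortcuts -- copy code: Ctrl + C; search code: Ctrl + F;
// full-screen mode: F11; toggle theme: Ctrl + Shift + D; show shortcuts: ?;
// increase font size: Ctrl + =; decrease font size: Ctrl + -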