⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 cataloglist.h

📁 良好的代码实现
💻 H
字号:
// CatalogList.h: interface for the CCatalogList class.
//
//////////////////////////////////////////////////////////////////////

#if !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_)
#define AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_

#if _MSC_VER > 1000
#pragma once
#endif // _MSC_VER > 1000

#include "wordlist.h"
#include <malloc.h>
#include "svm.h"

// Forward declaration so the helper below can name CDocNode before the class is defined.
class CDocNode;
// Overload of the MFC collection helper SerializeElements for CDocNode elements.
// Only the prototype is visible here; the body lives elsewhere.
// NOTE(review): presumably used when the CList<CDocNode,CDocNode&> held by
// CCatalogNode is serialized -- confirm against the implementation file.
void AFXAPI SerializeElements(CArchive& ar,CDocNode* pElements,int nCount);

// Plain record that groups a document node pointer with a document name,
// a class name and a similarity ratio.
// NOTE(review): field meanings below are read off the member names; no use of
// this struct is visible in this header, so confirm against the callers.
// The struct declares no constructor, so none of the members is given an
// initial value by this header -- set every field before reading it.
struct DocInfo{
	CDocNode *m_pDocNode;   // pointer to the associated document node (ownership not shown here)
	CString m_sDocName;     // document name
	CString m_sClsName;     // class (category) name
	double m_dSimRatio;     // similarity ratio
};

// Records the weight of one feature (one dimension) of a document vector.
// CDocNode stores an array of these in m_sWeightSet.
struct	sWeightNode
{
	int    s_idxWord;    // ID of the feature
	short  s_tfi;        // number of times the feature occurs in the document
	double s_dWeight;    // weight of the feature
};

// Document node: a CObject-derived record for a single document.
// According to the member comments it carries the document's identifier and
// name, its feature-weight vector (m_sWeightSet) and its similarity to each
// category (m_pResults).
// Only declarations are visible in this header; notes marked "presumably" are
// inferred from names and must be confirmed against the implementation file.
class CDocNode:public CObject
{
public:
	CDocNode();
	~CDocNode();
	// Copy assignment. Note that it returns a const reference.
	const CDocNode& operator=(const CDocNode& x);
	// Static helpers for the shared temporary buffer
	// (presumably m_pTemp / m_nAllocTempLen -- confirm).
	static void AllocTempBuffer(int nLen);
	static void DeallocTempBuffer();
	// Per-document buffer helpers (presumably m_sWeightSet / m_nAllocLen -- confirm).
	void AllocBuffer(int nLen);
	void DeallocBuffer();
	// Result buffer helpers (presumably m_pResults / m_nClassNum -- confirm).
	void AllocResultsBuffer(short nLen);
	void DeallocResultsBuffer();
	// Build the document's feature attributes by means of word segmentation.
	// The unnamed parameters are not described in this header; idxCata defaults
	// to -1 and, for the English variants, bStem defaults to false.
	int ScanChinese(char *, CWordList&, int, short idxCata=-1);
	int ScanEnglish(char *, CWordList&, int, short idxCata=-1, bool bStem=false);
	static int ScanChineseString(char*, CWordList&, int, long, short idxCata=-1);
	static int ScanEnglishString(char*, CWordList&, int, long, short idxCata=-1, bool bStem=false);
	// Compute the weight of every dimension of the document from the dictionary
	// wordList, forming the document vector, and store it in the array m_sWeightSet.
	// These methods require that m_dWeight of every word node in wordList has
	// already been set to the inverse document frequency of that feature.
	int ScanChineseWithDict(char *,CWordList&);
	int ScanEnglishWithDict(char *,CWordList&, bool bStem=false);
	// Static string variants. Unlike ScanEnglishWithDict, the English one has
	// no default for bStem, so callers must pass it explicitly.
	static int ScanChineseStringWithDict(char *pPath,CWordList& wordList);
	static int ScanEnglishStringWithDict(char *pPath, CWordList &wordList, bool bStem);
	// Generate the document vector; the second overload takes a DOC
	// (a type not declared in this header; "svm.h" is included above).
	int GenDocVector();
	int GenDocVector(DOC &doc);
	double ComputeSimilarityRatio();  // computes the similarity against the vector stored in m_pTemp
	// Archive-based serialization. Per the m_nCataID comment below, that member is not serialized.
	void Serialize(CArchive& ar);
	// Copy constructor.
	CDocNode(const CDocNode& x);
	int GetWordNum();
	BOOL IsZero();
public:
	double ComputeProbability(CWordList& wordList,int n);
	long m_idxDoc;                // document identifier
	CString	m_strDocName;         // document name
	int m_nAllocLen;              // length of the document vector, i.e. length of the array m_sWeightSet
	sWeightNode	*m_sWeightSet;    // list of word weights
	short  m_nClassNum;           // total number of categories in the training documents, i.e. size of the array m_pResults
	double *m_pResults;           // similarity between the document and each category
	// Category the current document belongs to. It is only used during
	// classification, so serialization does not touch this member.
	short  m_nCataID;
	static sWeightNode *m_pTemp;  // block of temporary memory used while generating document vectors
	static int m_nAllocTempLen;   // size of the temporary memory
private:
	// Static character buffer of MAX_PATH*10 chars shared by all instances.
	static char m_pSentence[MAX_PATH*10];
private:
	// Presumably tests whether the string p is numeric -- confirm in the implementation.
	bool IsNumber(char * p);
	// Static helper; parameters are unnamed and undocumented in this header.
	static int ParseFile(char *, int, int &);
};

// Forward declarations so the helper below can name these classes before they are defined.
class CCatalogNode;
class CCatalogList;
// Overload of the MFC collection helper SerializeElements for CCatalogNode elements.
// Only the prototype is visible here; the body lives elsewhere.
// NOTE(review): presumably used when the CList<CCatalogNode,CCatalogNode&> held by
// CCatalogList is serialized -- confirm against the implementation file.
void AFXAPI SerializeElements(CArchive& ar,CCatalogNode* pElements,int nCount);

// Category node: one category together with the list of documents
// (CDocNode objects, kept in the private CList m_lstDocList) stored in it.
// Only declarations are visible in this header; notes marked "presumably" are
// inferred from names and must be confirmed against the implementation file.
class CCatalogNode
{
public:
	CCatalogNode();
	~CCatalogNode();
	// Copy constructor.
	CCatalogNode(const CCatalogNode& x);
	// Assignment and compound addition; both return a const reference.
	// What "+=" combines is not shown in this header.
	const CCatalogNode& operator = (const CCatalogNode& x);
	const CCatalogNode& operator += (const CCatalogNode& x);
public:
	// (Re)initializes the node; the meaning of nMode (default 0) is not documented here.
	void InitCatalogNode(int nMode=0);
	// Presumably sets the starting value used for document IDs (m_lCurDocID) -- confirm.
	void SetStartDocID(long lDocID);
	// POSITION-based access to the document list. GetNext takes rPos by
	// reference, so it can update the caller's position.
	CDocNode& GetNext(POSITION& rPos);
	POSITION GetFirstPosition();
	// Adds a document and returns a POSITION (presumably that of the new list element).
	POSITION AddDoc(CDocNode& docnode);
	// Archive-based serialization.
	void Serialize(CArchive& ar);
	UINT GetDocNum();
	// Scans all documents under the given path and adds them to the current category node.
	long ScanDirectory(CString);
public:
	// Returns a reference to the document at the given list position.
	CDocNode& GetAt( POSITION position );
	short m_idxCata;            // category index / ID
	CString	m_strCatalogName;   // category name
	CString m_strDirName;       // directory name (presumably the directory scanned for this category)
	long	m_lTotalWordNum;    // total word count (presumably over the category's documents -- confirm)
private:
	CList<CDocNode,CDocNode&>			m_lstDocList;   // documents belonging to this category
	long    m_lCurDocID;        // current document ID counter (see SetStartDocID; exact use not shown here)
};

// Category list: a collection of CCatalogNode objects kept in the private
// CList m_lstCatalogList, with declarations for building the collection from a
// directory, saving/loading it from files and looking up categories and
// documents by name or index.
// Only declarations are visible in this header; notes marked "presumably" are
// inferred from names and must be confirmed against the implementation file.
class CCatalogList  
{
public:
	CCatalogList();
	virtual ~CCatalogList();
	// Assignment and compound addition; both return a const reference.
	// What "+=" combines is not shown in this header.
	const CCatalogList& operator = (const CCatalogList& x);
	const CCatalogList& operator += (const CCatalogList& x);
	// (Re)initializes the list; the meaning of nMode (default 0) is not documented here.
	void InitCatalogList(int nMode=0);
	// File persistence. nSaveMode presumably takes the values described at
	// m_nSaveMode below (0 saves document vectors, 1 does not) -- confirm.
	void DumpToFile (CString strFileName, int nSaveMode=0);
	BOOL GetFromFile(CString strFileName);
	void DumpDocList(CString strFileName);
	// Presumably builds the category library from the directory strDirName -- confirm.
	long BuildLib(CString	strDirName);
public:
	// Returns the static save mode (see m_nSaveMode).
	static int GetSaveMode();
	void DumpCataList(CString strFileName);
	bool BuildCatalogID(CCatalogList &);
	// Lookups by name / index. The pointer-returning variants presumably return
	// NULL when nothing matches -- confirm before dereferencing.
	CCatalogNode* GetCataByName(CString strCataName);
	short GetCataIDByName(CString strCataName);
	short GetCataIDArrayFromString(char * line, CArray<short, short> &aryCataID);
	CDocNode* GetDocByName(CString strDocName);
	CCatalogNode* GetCata(short idxCata);
	// POSITION-based access. The const overload of GetAt returns the node by
	// value (a copy); the non-const overload returns a reference.
	CCatalogNode GetAt(POSITION pos) const;
	CCatalogNode& GetAt(POSITION pos);
	CCatalogNode& GetNext(POSITION& rPos);
	POSITION GetFirstPosition();
	// Adds a category and returns a POSITION (presumably that of the new list element).
	POSITION AddCata(CCatalogNode& catanode);
	// Name lookups that write the result into the CString output parameter and
	// return a bool (presumably success/failure).
	bool GetDocName(short idxCata,long idxDoc,CString& strDocName);
	bool GetCataName(short idxCata,CString& strCataName);
	int  GetCataNum();
	long GetDocNum();
private:
	// Archive-based serialization; private, so external code goes through the
	// public file functions rather than calling it directly.
	void Serialize(CArchive& ar);
	long ScanDirectory(CString strDirName);
private:
	static int m_nSaveMode;  // 0: save document vectors, 1: do not save document vectors
	CList<CCatalogNode,CCatalogNode&>	m_lstCatalogList;   // the categories held by this list
};

#endif // !defined(AFX_CATALOGLIST_H__4CF04BF3_9DA2_11D3_8433_00C04F722981__INCLUDED_)

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -