📄 gvsm.cpp

📁 一个非常有用的开源代码
💻 CPP
字号:
#include "GVSM.h"
#include "GHashTable.h"
#include "GStemmer.h"
#include "GArray.h"
#include <math.h>
#include <stddef.h>
#include <string.h>

// Words that are never added to the vocabulary. The table is compared
// against word stems (see GVSM::AddWordToVocabulary).
const char* g_szStopWords[] = {
	"a",
	"about",
	"all",
	"also",
	"although",
	"an",
	"and",
	"any",
	"are",
	"as",
	"at",
	"be",
	"but",
	"by",
	"can",
	"did",
	"each",
	"every",
	"for",
	"from",
	"had",
	"have",
	"he",
	"her",
	"him",
	"his",
	"how",
	"i",
	"if",
	"in",
	"is",
	"it",
	"its",
	"my",
	"nbsp",
	"next",
	"no",
	"not",
	"of",
	"on",
	"one",
	"or",
	"our",
	"out",
	"quite",
	"really",
	"so",
	"some",
	"that",
	"the",
	"them",
	"then",
	"there",
	"this",
	"to",
	"too",
	"use",
	"very",
	"was",
	"we",
	"what",
	"when",
	"where",
	"who",
	"will",
	"with",
	"you",
};

// Per-word statistics. One of these is stored immediately in front of the
// word's stem text in the block handed out by the string heap.
struct GVSMWordStats
{
	int m_nMaxFrequency;       // highest count of this word seen in any single document
	int m_nDocsContainingWord; // number of documents in which this word appeared
};

// Builds the stemmer, the stop-word table and the (initially empty) vocabulary.
GVSM::GVSM()
{
	m_pStemmer = new GStemmer();
	m_pStopWords = new GConstStringHashTable(113, false);
	int i;
	for(i = 0; i < (int)(sizeof(g_szStopWords) / sizeof(const char*)); i++)
		m_pStopWords->Add(g_szStopWords[i], NULL);
	m_pVocabulary = new GConstStringHashTable(1031, false);
	m_pStringHeap = new GStringHeap(1024);
	m_pWords = new GPointerArray(1024);
	m_pCurrentVector = NULL;
	m_nVocabSizeBeforeThisDoc = 0;
	m_pWordCount = NULL;
	m_nDocumentCount = 0;
}

GVSM::~GVSM()
{
	delete(m_pStemmer);
	delete(m_pStopWords);
	delete(m_pVocabulary);
	delete(m_pStringHeap);
	delete(m_pWords);
	// m_pWordCount is allocated with new[] in AddDocumentToVocabulary, so it
	// must be released with the array form of delete.
	delete[] m_pWordCount;
}

// Splits pFile[0..nSize) into words and calls pProcessWordFunc(pThis, pWord, nLen)
// once for each of them. Every char that compares less than 'A' (whitespace,
// digits, most ASCII punctuation) is a separator; everything else is part of
// a word. The word pointer passed to the callback points into pFile and is
// NOT null-terminated, so the callback must honor nLen.
/*static*/ void GVSM::ExtractWords(const char* pFile, int nSize, ProcessWordFunc pProcessWordFunc, void* pThis)
{
	int nPos = 0;
	while(nPos < nSize)
	{
		// Skip separators
		while(nPos < nSize && pFile[nPos] < 'A')
			nPos++;
		int nWordStart = nPos;

		// Find the end of the word
		while(nPos < nSize && pFile[nPos] >= 'A')
			nPos++;

		// Report the word. Empty input or trailing separators leave an empty
		// span here; there is no word in it, so the callback is not invoked.
		if(nPos > nWordStart)
			pProcessWordFunc(pThis, &pFile[nWordStart], nPos - nWordStart);
	}
}

// Records one occurrence of a word from the document currently being added by
// AddDocumentToVocabulary. Words shorter than 4 characters and words whose
// stem is in the stop-word table are ignored.
void GVSM::AddWordToVocabulary(const char* szWord, int nLen)
{
	if(nLen < 4)
		return;

	// Find the stem
	const char* szStem = m_pStemmer->GetStem(szWord, nLen);

	// Check for stop words
	void* pValue;
	if(m_pStopWords->Get(szStem, &pValue))
		return;

	// Check for existing words
	int nIndex = FindStemIndex(szStem);
	if(nIndex >= 0)
	{
		struct GVSMWordStats* pWordStats = (struct GVSMWordStats*)m_pWords->GetPointer(nIndex);
		if(nIndex < m_nVocabSizeBeforeThisDoc)
		{
			// This word was introduced by a previous document, so its count
			// within the current document is tracked in m_pWordCount.
			if(m_pWordCount[nIndex] == 0)
				pWordStats->m_nDocsContainingWord++;
			m_pWordCount[nIndex]++;
			if(m_pWordCount[nIndex] > pWordStats->m_nMaxFrequency)
				pWordStats->m_nMaxFrequency = m_pWordCount[nIndex];
		}
		else
		{
			// This word was introduced earlier in this same document, so its
			// max frequency is simply its running count in this document.
			pWordStats->m_nMaxFrequency++;
		}
	}
	else
	{
		// This is a new vocabulary word. The stats header and the stem text
		// share one allocation: [GVSMWordStats][stem chars]['\0'].
		// NOTE(review): this relies on the string heap returning memory that
		// is suitably aligned for int -- confirm against GStringHeap.
		char* pNewWord = m_pStringHeap->Allocate(sizeof(struct GVSMWordStats) + strlen(szStem) + 1);
		struct GVSMWordStats* pWordStats = (struct GVSMWordStats*)pNewWord;
		pWordStats->m_nMaxFrequency = 1;
		pWordStats->m_nDocsContainingWord = 1;
		strcpy(pNewWord + sizeof(struct GVSMWordStats), szStem);
		nIndex = m_pWords->GetSize();
		m_pWords->AddPointer(pNewWord);
		// The vocabulary index is stored in the hash table's pointer-sized
		// value slot; widen through size_t so the cast is size-correct.
		m_pVocabulary->Add(pNewWord + sizeof(struct GVSMWordStats), (const void*)(size_t)nIndex);
	}
}

// Trampoline that lets ExtractWords call GVSM::AddWordToVocabulary.
void AddWordToVocab(void* pThis, const char* szWord, int nLen)
{
	((GVSM*)pThis)->AddWordToVocabulary(szWord, nLen);
}

// Adds every word of the document szText[0..nLength) to the vocabulary and
// updates the per-word statistics and the training document count.
void GVSM::AddDocumentToVocabulary(const char* szText, int nLength)
{
	m_nVocabSizeBeforeThisDoc = GetVocabSize();
	m_pWordCount = new int[m_nVocabSizeBeforeThisDoc]; // todo: it would scale better if we didn't allocate this every time a document is added
	memset(m_pWordCount, '\0', sizeof(int) * m_nVocabSizeBeforeThisDoc);
	m_nDocumentCount++;
	ExtractWords(szText, nLength, AddWordToVocab, this);
	delete[] m_pWordCount; // allocated with new[]
	m_pWordCount = NULL;
}

// Returns the number of distinct stems in the vocabulary.
int GVSM::GetVocabSize()
{
	return m_pWords->GetSize();
}

// Returns the vocabulary index of szStem, or -1 if it is not in the vocabulary.
int GVSM::FindStemIndex(const char* szStem)
{
	// Read the stored value into a real void* first. Passing the address of
	// an int as void** would let the table write a full pointer-sized value
	// into a 4-byte int on 64-bit targets.
	void* pValue;
	if(m_pVocabulary->Get(szStem, &pValue))
		return (int)(size_t)pValue;
	else
		return -1;
}

// Returns the stem text of the vocabulary word at nIndex.
const char* GVSM::GetVocabWord(int nIndex)
{
	// The text follows the stats header within the same allocation
	return (const char*)m_pWords->GetPointer(nIndex) + sizeof(struct GVSMWordStats);
}

// Returns the largest number of times the word at nIndex occurred in any
// single training document.
int GVSM::GetMaxWordFrequency(int nIndex)
{
	return ((struct GVSMWordStats*)m_pWords->GetPointer(nIndex))->m_nMaxFrequency;
}

// Returns the number of training documents that contained the word at nIndex.
int GVSM::GetNumberOfDocsContainingWord(int nIndex)
{
	return ((struct GVSMWordStats*)m_pWords->GetPointer(nIndex))->m_nDocsContainingWord;
}

// Returns the number of documents passed to AddDocumentToVocabulary so far.
int GVSM::GetTrainingDocumentCount()
{
	return m_nDocumentCount;
}

// Adds the weight of one word occurrence to the vector currently being built
// by GetVector. Words whose stem is not in the vocabulary are ignored.
void GVSM::AddWordToVector(const char* szWord, int nLen)
{
	const char* szStem = m_pStemmer->GetStem(szWord, nLen);
	int nIndex = FindStemIndex(szStem);
	if(nIndex < 0)
		return;
	struct GVSMWordStats* pWordStats = (struct GVSMWordStats*)m_pWords->GetPointer(nIndex);

	// weight = (1 / max frequency) * log(total docs / docs containing word).
	// The ratio must be computed in floating point; dividing the two ints
	// first would truncate it (e.g. 3 / 2 == 1, giving log(1) == 0).
	double dDocRatio = (double)m_nDocumentCount / (double)pWordStats->m_nDocsContainingWord;
	m_pCurrentVector[nIndex] += (1.0 / pWordStats->m_nMaxFrequency) * log(dDocRatio);
}

// Trampoline that lets ExtractWords call GVSM::AddWordToVector.
void AddWordToVec(void* pThis, const char* szWord, int nLen)
{
	((GVSM*)pThis)->AddWordToVector(szWord, nLen);
}

// Computes the weighted term vector of szText[0..nLength) into pOutVector,
// which must have room for GetVocabSize() doubles. Every element is reset to
// zero before the words of the text are accumulated.
void GVSM::GetVector(double* pOutVector, const char* szText, int nLength)
{
	int nWords = GetVocabSize();
	int i;
	for(i = 0; i < nWords; i++)
		pOutVector[i] = 0;
	m_pCurrentVector = pOutVector;
	ExtractWords(szText, nLength, AddWordToVec, this);
	m_pCurrentVector = NULL;
}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -