📄 matcher.cpp

📁 可自动识别自然语言,句子匹配等功能,达到人工智能识别

💻 CPP

字号:

/////////////////////////////////////////////////////////////////////////////////
//
// Sentence matcher
// @author: Gonzales Cenelia
// homepage: www.ai-search.4t.com
//
// when applied to strings,the "Levenstein distance" is usualy used 
// to measure the distances between words but we could also use it to measure 
// the distance between sentences,the following code is a simple implementation 
// of an algorithm that trys to measure the distances between two sentences 
// by using the "Levenstein distance". //********************
//
// this code is copyrighted and has limited warranty. //***************
//////////////////////////////////////////////////////////////////////////
#include "matcher.h"

unsigned int tokenized_pattern_size;
unsigned int tokenized_text_size;

// tokenize a string and puts the result into a vector
void tokenize_string(const char *str, vstring &dest) {
	int len = strlen(str);
	char *szString = new char[len + 1];
	strcpy(szString, str);
	char *token = strtok(szString, seps);
	while(token != NULL) {
		dest.push_back(token);
		token = strtok(NULL, seps);
	}
	delete szString;
}

// search for a string inside a vector of strings
int search_string(const char *str, vstring v) {
	int size = v.size();
	for(int i = 0; i < size; ++i) {
		if(v[i] == str) {
			return i;
		}
	}
	return -1;
}

// compress sentences by replacing each word 
// by characters of the english alphabet
void compress_strings(char *pattern, char *text) {
	vstring v1, v2, temp;
	tokenize_string(pattern, v1);
	tokenize_string(text, v2);
	std::stringstream s1, s2;
	int size1 = v1.size();
	int size2 = v2.size();
	tokenized_pattern_size = size1;
	tokenized_text_size = size2;
	for(int i = 0, j = -1, prev_index; i < size1; ++i) {
		if(search_string(v1[i].c_str(), temp) == -1) {
			temp.push_back(v1[i].c_str());
			++j; prev_index = j;
		}
		s1 << symbols[j];
	}
	int index = prev_index, counter = 0, result;
	for(i = 0; i < size2; ++i) {
		result = search_string(v2[i].c_str(), temp);
		if(result == -1) {
			temp.push_back(v2[i].c_str());
			if(counter == 0) {
				index = prev_index + 1;
				++counter;
			}
			else {
				++index;
			}
		}
		else {
			if(index > prev_index) {
				prev_index = index;
				counter = 0;
			}
			index = result;
		}
		s2 << symbols[index];
	}
	strcpy(pattern, s1.str().c_str());
	strcpy(text, s2.str().c_str());
}

// measure the distance between two sentences by using 
// the "Levenshtein Distance"
int find_distance(const char *pattern, const char *text) {
	int patLen = strlen(pattern);
	int textLen = strlen(text);

	char *pszPattern = new char[patLen + 1];
	char *pszText = new char[textLen + 1];

	strcpy(pszPattern, pattern);
	strcpy(pszText, text);

	// compress the given sentences
	compress_strings(pszPattern, pszText);

	patLen = strlen(pszPattern);
	textLen = strlen(pszText);
	// measure the distances between the new compressed strings
	int retval = LD(pszText, textLen, pszPattern, patLen);

	delete pszPattern;
	delete pszText;
	return retval;
}

// Fuzzy Sentence Match
// function for matching two sentences
// returns a value between 0 and 1
// the closer that value is to 0 the closer the sentences are
float match(const char *pattern, const char *text) {
	int distance = find_distance(pattern, text);
	float max_size = max(tokenized_pattern_size, tokenized_text_size);
	return distance/max_size;
}

💿 文件大小 20 K

👤 上传用户 renyumeng

📂 所属分类人工智能/神经网络

🏷️ 相关标签

#自动识别 #人工智能 #语言 #识别

⌨️ 快捷键说明

复制代码 Ctrl + C

搜索代码 Ctrl + F

全屏模式 F11

切换主题 Ctrl + Shift + D

显示快捷键 ?

增大字号 Ctrl + =

减小字号 Ctrl + -