📄 ngrams.cpp
字号:
//====// Ngrams.cpp// - implements some methods declared in Ngrams.hpp// Notes// - this package is provided as is with no warranty// - the author is not responsible for any damage caused// either directly or indirectly by using this package// - anybody is free to do whatever he/she wants with this// package as long as this header section is preserved// - the author would like to thank Dr. Vlado Keselj for// providing his Perl module code as the main reference// Created on 2004-10-28 by// - Roger Zhang (rogerz@cs.dal.ca)// Modifications// - Roger Zhang on 2004-10-31// - a static code map is used for encoding non-printables// - Roger Zhang on 2004-11-06// - n-grams of size 1 to n-1 are always collected, but if// they will be included in output can be decided by the// user through the second argument of the store methods// - Roger Zhang on 2005-09-23// - whether n-grams of size 1 to n-1 are collected is now// decided at contruction time, which saves computation// time and memory when those n-grams are not interested// -// Last compiled under Linux with gcc-3//====#include "Ngrams.hpp"#include <iomanip>#include <sstream>#include <cctype>#include <list>namespace NLP{ using namespace std; map<char, string> Ngrams::encMap; // static member typedef map<string, int>::iterator MapIter; //==== // internal parse methods of Ngrams void Ngrams::parseQ() // parse *pQue for word n-grams of size 1 to n-1 { while (!pQue->empty()) { int j = 0; string ng = (*pQue)[0]; while (++j < pQue->size()) { if (++pTotal[j] && ++pMap[j][ng] > pMax[j]) { pMax[j] = pMap[j][ng]; } ng += " " + (*pQue)[j]; } if (++pTotal[j] && ++pMap[j][ng] > pMax[j]) { pMax[j] = pMap[j][ng]; } pQue->pop_front(); } } void Ngrams::parseR(string *s, bool once) // parse string for n-grams of size 1 to n-1 { if (once) { // single scan for (int k = s->length() - 1; k > 0; k--) { if (++pMap[k][s->erase(k, 1)] > pMax[k]) { pMax[k] = pMap[k][*s]; } pTotal[k]++; } } else { // multiple scan for (string ss(*s); !ss.empty(); ss = s->erase(0, 1)) { for (int k = ss.length(); k; ss.erase(--k, 1)) { if (++pMap[k][ss] > pMax[k]) { pMax[k] = pMap[k][ss]; } pTotal[k]++; } } } } void Ngrams::parseL(const string &s) // parse string for letter n-grams { for (int i = pStr->length(), j = s.length(), k = 0; k < j; k++) { if (isalpha(s[k]) && ++i) { // alphabetical letters are converted to upper case pStr->push_back(toupper(s[k])); } else if (i && (*pStr)[i - 1] != ' ' && i++) { // any other characters are treated as spaces, // and consecutive spaces are treated as one pStr->push_back(' '); } else { /* ignore s[k] */ } } parseB(s, false); } void Ngrams::parseB(const string &s, bool raw) // parse string for byte n-grams { if (raw) { pStr->append(s); } int i = 0, j = pStr->length() - size; while (i <= j) { string ng = pStr->substr(i++, size); if (++pMap[size][ng] > pMax[size]) { pMax[size] = pMap[size][ng]; } pTotal[size]++; if (incremental) { // collect shorter n-grams parseR(&ng, true); } } pStr->erase(0, i); // note that during file processing, the remainder of each // line except the last will be prepended to the next line if (!processingFile && incremental) { parseR(pStr, false); } } // Ngrams::parseB void Ngrams::parseW(const string &s) // parse string for word n-grams { int i, j, k = s.length(); for (i = j = 0; j < k; i = j) { if (isalpha(s[i])) { // hit left boundary of a word string word; do { word += tolower(s[j++]); } while (j < k && isalpha(s[j])); // find rite boundary pQue->push_back(word); } else if (isdigit(s[i])) { // hit left boundary of a number bool decimalPoint = false; while (++j < k) { // find rite boundary // there can be only 1 decimal point in a number // things like 10..20 are two numbers // 9.99. is 1 number but the 2nd point marks its end if (s[j] == '.' && !decimalPoint) { // first point decimalPoint = true; // ok, record its occurrence } else if (!isdigit(s[j])) { // any other non-digit break; // marks the end of the number } } pQue->push_back("<NUMBER>"); } else { // punctuation marks and/or white spaces ++j; // just skip over } } k = pQue->size() - size; for (i = 0; i <= k; i++) { // collect n-grams string ng = (*pQue)[0]; for (j = 1; j < size; ng += " " + (*pQue)[j++]) { if (incremental && ++pTotal[j] && ++pMap[j][ng] > pMax[j]) { pMax[j] = pMap[j][ng]; } } if (++pTotal[size] && ++pMap[size][ng] > pMax[size]) { pMax[size] = pMap[size][ng]; } // when loop ends, the words left in the queue are the ones pQue->pop_front(); // significant for consecutive processing } if (!processingFile && incremental) { // same reason as for parseR parseQ(); } } // Ngrams::parseW //==== // external helper methods for store operations static string numToStr(double num) // atof reversed { ostringstream oss; oss << setprecision(16) << num; return oss.str(); } void addHeader(void *buf, bool toFile, int k, int total, int max) // only useful when storing result of incremental operations { if (toFile) { ostream *o = (ostream*)buf; (*o) << "\n" << k << "-grams (total count = " << total << " top frequency = " << max << ")\n" << "------------------------------------------------\n"; } else { string *p = (string*)buf; p->append("\n"); p->append(numToStr(k) + "-grams (total count = "); p->append(numToStr(total) + " top frequency = "); p->append(numToStr(max) + ")\n"); p->append("------------------------------------------------\n"); } } void appendS(string &s, const MapIter &i, int n, map<char, string> *m) // append a (possibly normalized) map entry to string { for (int k = 0; k < i->first.length(); s += (*m)[i->first[k++]]); s += "\t "; if (n) { s += numToStr(i->second / double(n)) + "\n"; } else { s += numToStr(i->second) + "\n"; } } void appendF(ostream &o, const MapIter &i, int n, map<char, string> *m) // append a (possibly normalized) map entry to ostream { for (int k = 0; k < i->first.length(); o << (*m)[(i->first)[k++]]); o << "\t "; if (n) { o << setprecision(16) << i->second / double(n) << endl; } else { o << i->second << endl; } } static bool lessThan(MapIter i, MapIter j) // helper function for sorting { return i->second > j->second; } //==== // internal store methods of Ngrams void Ngrams::storeK(void *buf, int o, int f, int n, bool toFile, int k) // store k-grams to buffer { if (!f || f > pMap[k].size()) { // only store the top 'f' n-grams f = pMap[k].size(); } if (n == NORMALIZE_BY_MAX) { // normalize against top frequency n = pMax[k]; } else if (n == NORMALIZE_BY_SUM) { // normalize against total n-grams n = pTotal[k]; } else { // no normalization n = 0; } if (o == ORDER_BY_FREQUENCY) { // descending order by frequency // put contents of the map in a list sorted by frequency list<MapIter> t; for (MapIter i = pMap[k].begin(); i != pMap[k].end(); t.push_back(i++)); t.sort(lessThan); // store contents of the sorted list for (list<MapIter>::iterator j = t.begin(); f--; j++) { if (toFile) { appendF(*(ostream*)buf, *j, n, &encMap); } else { appendS(*(string*)buf, *j, n, &encMap); } } } else { // just store contents of the map for (MapIter i = pMap[k].begin(); f--; i++) { if (toFile) { appendF(*(ostream*)buf, i, n, &encMap); } else { appendS(*(string*)buf, i, n, &encMap); } } } } // Ngrams::storeK void Ngrams::storeB(void *buf, int o, int f, int n, bool toFile) // store to a buffer that can be either a string or an ostream { if (incremental) { for (int k = 1; k < size; k++) { addHeader(buf, toFile, k, pTotal[k], pMax[k]); storeK(buf, o, f, n, toFile, k); } addHeader(buf, toFile, size, pTotal[size], pMax[size]); } storeK(buf, o, f, n, toFile, size); } //==== // public methods of Ngrams Ngrams::Ngrams(int n, bool inc, int t) : size(n), incremental(inc), type(t), processingFile(false) { pMax = new int[size + 1]; pTotal = new int[size + 1]; pStr = new string(); pQue = new deque<string>(); pMap = new map<string, int>[size + 1]; for (int i = 0; i++ < size; pTotal[i] = pMax[i] = 0) { pMap[i].clear(); } if (encMap.empty()) { // initialize code table (only once) encMap['\0'] = "\\0", encMap[char(1)] = "\\1"; encMap[char(2)] = "\\2", encMap[char(3)] = "\\3"; encMap[char(4)] = "\\4", encMap[char(5)] = "\\5"; encMap[char(6)] = "\\A", encMap['\a'] = "\\a"; encMap['\b'] = "\\b", encMap['\t'] = "\\t"; encMap['\n'] = "\\n", encMap['\v'] = "\\v"; encMap['\f'] = "\\f", encMap['\r'] = "\\r"; encMap[char(14)] = "\\o", encMap[char(15)] = "\\i"; encMap[char(16)] = "\\l", encMap[char(17)] = "\\6"; encMap[char(18)] = "\\7", encMap[char(19)] = "\\8"; encMap[char(20)] = "\\9", encMap[char(21)] = "\\N"; encMap[char(22)] = "\\S", encMap[char(23)] = "\\T"; encMap[char(24)] = "\\c", encMap[char(25)] = "\\E"; encMap[char(26)] = "\\s", encMap[char(27)] = "\\e"; encMap[char(28)] = "\\F", encMap[char(29)] = "\\G"; encMap[char(30)] = "\\R", encMap[char(31)] = "\\U"; for (int i = 32; ++i < 127; encMap[char(i)] = char(i)); encMap['\\'] = "\\\\", encMap['^'] = "\\^"; encMap['_'] = "\\_", encMap[' '] = "_"; encMap['`'] = "\\`", encMap[char(127)] = "\\d"; } } // Ngrams::Ngrams Ngrams::~Ngrams() { delete pStr; delete pQue; delete [] pMap; delete [] pMax; delete [] pTotal; } void Ngrams::parse(istream &in) // parse file { processingFile = true; pStr->clear(), pQue->clear(); for (string s; getline(in, s); parse(s)) { if (type == BYTE_GRAM) { s += '\n'; } } if (incremental) { (type == WORD_GRAM) ? parseQ() : parseR(pStr, false); } processingFile = false; } void Ngrams::parse(const string &s) // parse string { if (!processingFile) { pStr->clear(), pQue->clear(); } if (type == WORD_GRAM) { parseW(s); } else if (type == BYTE_GRAM) { parseB(s, true); } else { // letter n-grams if (!pStr->empty() && *(pStr->rbegin()) != ' ') { pStr->push_back(' '); } parseL(s); } } void Ngrams::reset() // clear previous result { pStr->clear(), pQue->clear(); for (int i = 0; i++ < size; pTotal[i] = pMax[i] = 0) { pMap[i].clear(); } }} // namespace NLP
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -