📄 ngrams.cpp

📁 A C++ N-grams Package 2.0 This is a simple C++ n-grams package that includes a header, the correspo
💻 CPP
字号:
//====// Ngrams.cpp// - implements some methods declared in Ngrams.hpp// Notes// - this package is provided as is with no warranty// - the author is not responsible for any damage caused//   either directly or indirectly by using this package// - anybody is free to do whatever he/she wants with this//   package as long as this header section is preserved// - the author would like to thank Dr. Vlado Keselj for//   providing his Perl module code as the main reference// Created on 2004-10-28 by// - Roger Zhang (rogerz@cs.dal.ca)// Modifications// - Roger Zhang on 2004-10-31//   - a static code map is used for encoding non-printables// - Roger Zhang on 2004-11-06//   - n-grams of size 1 to n-1 are always collected, but if//     they will be included in output can be decided by the//     user through the second argument of the store methods// - Roger Zhang on 2005-09-23//   - whether n-grams of size 1 to n-1 are collected is now//     decided at contruction time, which saves computation//     time and memory when those n-grams are not interested// -// Last compiled under Linux with gcc-3//====#include "Ngrams.hpp"#include <iomanip>#include <sstream>#include <cctype>#include <list>namespace NLP{   using namespace std;   map<char, string> Ngrams::encMap; // static member   typedef map<string, int>::iterator MapIter;   //====   // internal parse methods of Ngrams   void Ngrams::parseQ()   // parse *pQue for word n-grams of size 1 to n-1   {      while (!pQue->empty()) {         int j = 0;         string ng = (*pQue)[0];         while (++j < pQue->size()) {            if (++pTotal[j] && ++pMap[j][ng] > pMax[j]) {               pMax[j] = pMap[j][ng];            }            ng += " " + (*pQue)[j];         }         if (++pTotal[j] && ++pMap[j][ng] > pMax[j]) {            pMax[j] = pMap[j][ng];         }         pQue->pop_front();      }   }   void Ngrams::parseR(string *s, bool once)   // parse string for n-grams of size 1 to n-1   {      if (once) { // single scan         for (int k = s->length() - 1; k > 0; k--) {            if (++pMap[k][s->erase(k, 1)] > pMax[k]) {               pMax[k] = pMap[k][*s];            }            pTotal[k]++;         }      } else { // multiple scan         for (string ss(*s); !ss.empty(); ss = s->erase(0, 1)) {            for (int k = ss.length(); k; ss.erase(--k, 1)) {               if (++pMap[k][ss] > pMax[k]) {                  pMax[k] = pMap[k][ss];               }               pTotal[k]++;            }         }      }   }   void Ngrams::parseL(const string &s)   // parse string for letter n-grams   {      for (int i = pStr->length(), j = s.length(), k = 0; k < j; k++) {         if (isalpha(s[k]) && ++i) {            // alphabetical letters are converted to upper case            pStr->push_back(toupper(s[k]));         } else if (i && (*pStr)[i - 1] != ' ' && i++) {            // any other characters are treated as spaces,            // and consecutive spaces are treated as one            pStr->push_back(' ');         } else { /* ignore s[k] */ }      }      parseB(s, false);   }   void Ngrams::parseB(const string &s, bool raw)   // parse string for byte n-grams   {      if (raw) {         pStr->append(s);      }      int i = 0, j = pStr->length() - size;      while (i <= j) {         string ng = pStr->substr(i++, size);         if (++pMap[size][ng] > pMax[size]) {            pMax[size] = pMap[size][ng];         }         pTotal[size]++;         if (incremental) { // collect shorter n-grams            parseR(&ng, true);         }      }      pStr->erase(0, i);      // note that during file processing, the remainder of each      // line except the last will be prepended to the next line      if (!processingFile && incremental) {         parseR(pStr, false);      }   } // Ngrams::parseB   void Ngrams::parseW(const string &s)   // parse string for word n-grams   {      int i, j, k = s.length();      for (i = j = 0; j < k; i = j) {         if (isalpha(s[i])) { // hit left boundary of a word            string word;            do {               word += tolower(s[j++]);            } while (j < k && isalpha(s[j])); // find rite boundary            pQue->push_back(word);         } else if (isdigit(s[i])) { // hit left boundary of a number            bool decimalPoint = false;            while (++j < k) { // find rite boundary               // there can be only 1 decimal point in a number               // things like 10..20 are two numbers               // 9.99. is 1 number but the 2nd point marks its end               if (s[j] == '.' && !decimalPoint) { // first point                  decimalPoint = true; // ok, record its occurrence               } else if (!isdigit(s[j])) { // any other non-digit                  break;             // marks the end of the number               }            }            pQue->push_back("<NUMBER>");         } else { // punctuation marks and/or white spaces            ++j; // just skip over         }      }      k = pQue->size() - size;      for (i = 0; i <= k; i++) { // collect n-grams         string ng = (*pQue)[0];         for (j = 1; j < size; ng += " " + (*pQue)[j++]) {            if (incremental && ++pTotal[j] && ++pMap[j][ng] > pMax[j]) {               pMax[j] = pMap[j][ng];            }         }         if (++pTotal[size] && ++pMap[size][ng] > pMax[size]) {            pMax[size] = pMap[size][ng];         }         // when loop ends, the words left in the queue are the ones         pQue->pop_front(); // significant for consecutive processing      }      if (!processingFile && incremental) { // same reason as for parseR         parseQ();      }   } // Ngrams::parseW   //====   // external helper methods for store operations   static string numToStr(double num) // atof reversed   {      ostringstream oss;      oss << setprecision(16) << num;      return oss.str();   }   void addHeader(void *buf, bool toFile, int k, int total, int max)   // only useful when storing result of incremental operations   {      if (toFile) {         ostream *o = (ostream*)buf;         (*o) << "\n" << k << "-grams  (total count = " << total              << "  top frequency = " << max << ")\n"              << "------------------------------------------------\n";      } else {         string *p = (string*)buf;         p->append("\n");         p->append(numToStr(k) + "-grams  (total count = ");         p->append(numToStr(total) + "  top frequency = ");         p->append(numToStr(max) + ")\n");         p->append("------------------------------------------------\n");      }   }   void appendS(string &s, const MapIter &i, int n, map<char, string> *m)   // append a (possibly normalized) map entry to string   {      for (int k = 0; k < i->first.length(); s += (*m)[i->first[k++]]);      s += "\t ";      if (n) {         s += numToStr(i->second / double(n)) + "\n";      } else {         s += numToStr(i->second) + "\n";      }   }   void appendF(ostream &o, const MapIter &i, int n, map<char, string> *m)   // append a (possibly normalized) map entry to ostream   {      for (int k = 0; k < i->first.length(); o << (*m)[(i->first)[k++]]);      o << "\t ";      if (n) {         o << setprecision(16) << i->second / double(n) << endl;      } else {         o << i->second << endl;      }   }   static bool lessThan(MapIter i, MapIter j) // helper function for sorting   {      return i->second > j->second;   }   //====   // internal store methods of Ngrams   void Ngrams::storeK(void *buf, int o, int f, int n, bool toFile, int k)   // store k-grams to buffer   {      if (!f || f > pMap[k].size()) { // only store the top 'f' n-grams         f = pMap[k].size();      }      if (n == NORMALIZE_BY_MAX) { // normalize against top frequency         n = pMax[k];      } else if (n == NORMALIZE_BY_SUM) { // normalize against total n-grams         n = pTotal[k];      } else { // no normalization         n = 0;      }      if (o == ORDER_BY_FREQUENCY) { // descending order by frequency         // put contents of the map in a list sorted by frequency         list<MapIter> t;         for (MapIter i = pMap[k].begin();                      i != pMap[k].end(); t.push_back(i++));         t.sort(lessThan);         // store contents of the sorted list         for (list<MapIter>::iterator j = t.begin(); f--; j++) {            if (toFile) {               appendF(*(ostream*)buf, *j, n, &encMap);            } else {               appendS(*(string*)buf, *j, n, &encMap);            }         }      } else { // just store contents of the map         for (MapIter i = pMap[k].begin(); f--; i++) {            if (toFile) {               appendF(*(ostream*)buf, i, n, &encMap);            } else {               appendS(*(string*)buf, i, n, &encMap);            }         }      }   } // Ngrams::storeK   void Ngrams::storeB(void *buf, int o, int f, int n, bool toFile)   // store to a buffer that can be either a string or an ostream   {      if (incremental) {         for (int k = 1; k < size; k++) {            addHeader(buf, toFile, k, pTotal[k], pMax[k]);            storeK(buf, o, f, n, toFile, k);         }         addHeader(buf, toFile, size, pTotal[size], pMax[size]);      }      storeK(buf, o, f, n, toFile, size);   }   //====   // public methods of Ngrams   Ngrams::Ngrams(int n, bool inc, int t)      : size(n), incremental(inc), type(t), processingFile(false)   {      pMax = new int[size + 1];      pTotal = new int[size + 1];      pStr = new string();      pQue = new deque<string>();      pMap = new map<string, int>[size + 1];      for (int i = 0; i++ < size; pTotal[i] = pMax[i] = 0) {         pMap[i].clear();      }      if (encMap.empty()) { // initialize code table (only once)         encMap['\0'] = "\\0", encMap[char(1)] = "\\1";         encMap[char(2)] = "\\2", encMap[char(3)] = "\\3";         encMap[char(4)] = "\\4", encMap[char(5)] = "\\5";         encMap[char(6)] = "\\A", encMap['\a'] = "\\a";         encMap['\b'] = "\\b", encMap['\t'] = "\\t";         encMap['\n'] = "\\n", encMap['\v'] = "\\v";         encMap['\f'] = "\\f", encMap['\r'] = "\\r";         encMap[char(14)] = "\\o", encMap[char(15)] = "\\i";         encMap[char(16)] = "\\l", encMap[char(17)] = "\\6";         encMap[char(18)] = "\\7", encMap[char(19)] = "\\8";         encMap[char(20)] = "\\9", encMap[char(21)] = "\\N";         encMap[char(22)] = "\\S", encMap[char(23)] = "\\T";         encMap[char(24)] = "\\c", encMap[char(25)] = "\\E";         encMap[char(26)] = "\\s", encMap[char(27)] = "\\e";         encMap[char(28)] = "\\F", encMap[char(29)] = "\\G";         encMap[char(30)] = "\\R", encMap[char(31)] = "\\U";         for (int i = 32; ++i < 127; encMap[char(i)] = char(i));         encMap['\\'] = "\\\\", encMap['^'] = "\\^";         encMap['_'] = "\\_", encMap[' '] = "_";         encMap['`'] = "\\`", encMap[char(127)] = "\\d";      }   } // Ngrams::Ngrams   Ngrams::~Ngrams()   {      delete pStr;      delete pQue;      delete [] pMap;      delete [] pMax;      delete [] pTotal;   }   void Ngrams::parse(istream &in) // parse file   {      processingFile = true;      pStr->clear(), pQue->clear();      for (string s; getline(in, s); parse(s)) {         if (type == BYTE_GRAM) {            s += '\n';         }      }      if (incremental) {         (type == WORD_GRAM) ? parseQ() : parseR(pStr, false);      }      processingFile = false;   }   void Ngrams::parse(const string &s) // parse string   {      if (!processingFile) {         pStr->clear(), pQue->clear();      }      if (type == WORD_GRAM) {         parseW(s);      } else if (type == BYTE_GRAM) {         parseB(s, true);      } else { // letter n-grams         if (!pStr->empty() && *(pStr->rbegin()) != ' ') {            pStr->push_back(' ');         }         parseL(s);      }   }   void Ngrams::reset() // clear previous result   {      pStr->clear(), pQue->clear();      for (int i = 0; i++ < size; pTotal[i] = pMax[i] = 0) {         pMap[i].clear();      }   }} // namespace NLP
💿 文件大小 7 K
👤 上传用户 kzdai22
📂 所属分类 Linux/Unix编程
🏷️ 相关标签

#includes #correspo #N-grams #Package
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -