📄 ngrams.hpp

📁 A C++ N-grams Package 2.0 This is a simple C++ n-grams package that includes a header, the correspo
💻 HPP
字号:
//====
// Ngrams.hpp
// - defines class Ngrams which is
//   - a statistical text n-grams extraction tool
//   - based on Dr. Vlado Keselj's Perl Ngrams module in CPAN
// Interface
// - object construction
//   - default is letter trigrams
//     Ngrams ng;
//   - letter 5-grams
//     Ngrams ng(5);
//   - letter unigrams to 5-grams (incremental)
//     Ngrams ng(5, true);
//   - word bigrams
//     Ngrams ng(2, false, Ngrams::WORD_GRAM);
//   - byte unigrams to 6-grams (incremental)
//     Ngrams ng(6, true, Ngrams::BYTE_GRAM);
// - n-gram collection
//   - from a string literal
//     ng.parse("this is a text");
//   - from a standard string variable
//     std::string str;
//     ng.parse(str);
//   - from an open text input stream
//     std::ifstream fin("file.txt");
//     ng.parse(fin);
// - result output
//   - output to a standard string
//     std::string str;
//     ng.store(str);
//   - output to an open text file
//     std::ofstream fout("myfile.txt");
//     ng.store(fout);
//   - output to the standard output
//     ng.store(cout);
//   - output in decreasing frequency order
//     ng.store(str, Ngrams::ORDER_BY_FREQUENCY);
//   - only output the 1000 most frequent n-grams
//     ng.store(cout, Ngrams::ORDER_BY_FREQUENCY, 1000);
//   - normalize frequency by total number of n-grams
//     ng.store(str, 0, 500, Ngrams::NORMALIZE_BY_SUM);
//   - normalize output by top frequency
//     ng.store(fout, 1, 100, Ngrams::NORMALIZE_BY_MAX);
// - self clean-up
//   - useful in multiple file processing
//     ng.parse(fin1);
//     ng.store(fout1);
//     ng.reset();
//     ng.parse(fin2);
//     ng.store(fout2);
//     ng.reset();
//     ...
// Notes
// - this package is provided as is with no warranty
// - the author is not responsible for any damage caused
//   either directly or indirectly by using this package
// - anybody is free to do whatever he/she wants with this
//   package as long as this header section is preserved
// - the author would like to thank Dr. Vlado Keselj for
//   providing his Perl module code as the main reference
// Created on 2004-10-28 by
// - Roger Zhang (rogerz@cs.dal.ca)
// Modifications
// - Roger Zhang on 2004-10-31
//   - a static code map is used for encoding non-printables
// - Roger Zhang on 2004-11-06
//   - n-grams of size 1 to n-1 are always collected, but if
//     they will be included in output can be decided by the
//     user through the second argument of the store methods
// - Roger Zhang on 2005-09-23
//   - whether n-grams of size 1 to n-1 are collected is now
//     decided at contruction time, which saves computation
//     time and memory when those n-grams are not interested
//   - a detailed class interface description is added
// -
// Last compiled under Linux with gcc-3
//====

// The guard name deliberately avoids a leading underscore followed by an
// upper-case letter, since such identifiers are reserved for the implementation.
#ifndef NLP_NGRAMS_HPP
#define NLP_NGRAMS_HPP

#include <iostream>
#include <string>
#include <deque>
#include <map>

namespace NLP
{
   //====
   // Ngrams
   // - collects n-grams from text via parse() and writes the resulting
   //   frequency table(s) via store(); reset() clears the collected data
   //   so the same object can be reused for another input
   // notes
   // - NOTE(review): the class holds raw pointer members and declares a
   //   destructor, but no copy constructor / copy assignment operator;
   //   if the destructor releases those pointers (the implementation is
   //   not visible in this header) copies would share and double-release
   //   them, so avoid copying Ngrams objects until this is confirmed
   class Ngrams
   {
      //====
      // data members
      // - declaration order is kept as in the original because it fixes
      //   the member initialization order in the constructor
      // - the per-member roles below are inferred from the names and the
      //   constructor arguments only; confirm against the implementation
      int size;             // presumably the n-gram size "n"
      int type;             // presumably 0 (letter), BYTE_GRAM or WORD_GRAM
      int *pMax;            // presumably top frequency (cf. NORMALIZE_BY_MAX)
      int *pTotal;          // presumably n-gram totals (cf. NORMALIZE_BY_SUM)
      bool processingFile;  // presumably set while parsing from a stream
      bool incremental;     // presumably the "inc" constructor argument
      std::string *pStr;
      std::deque<std::string> *pQue;
      std::map<std::string, int> *pMap;

      // static code map used for encoding non-printables
      static std::map<char, std::string> encMap;

      //====
      // internal helper routines
      // - defined in the implementation file; the suffixes presumably map
      //   to Letter / Byte / Word parsing and Key-wise / Bulk storing
      //   (TODO confirm against the implementation)
      void parseL(const std::string &s);
      void parseB(const std::string &s, bool raw);
      void parseW(const std::string &s);
      void parseR(std::string *s, bool once);
      void parseQ();
      void storeK(void *buf, int o, int f, int n, bool toFile, int k);
      // "buf" is a std::string* when toFile is false and a std::ostream*
      // when toFile is true (see the two public store() overloads)
      void storeB(void *buf, int o, int f, int n, bool toFile);

   public:
      //====
      // n-gram types
      // - default is letter n-grams, which never needs to be specified
      //   only alphabetical characters count in letter based n-grams
      // - every single byte counts in byte based n-grams
      // - the words in word n-grams consist of letters only, decimal
      //   digits are replaced by <NUMBER>, everything else is ignored
      static const int BYTE_GRAM = 1;
      static const int WORD_GRAM = 2;

      //====
      // options on output order
      // - default is ascending alphabetical order
      // - by frequency means descending frequency order
      static const int ORDER_BY_ALPHABET = 0;
      static const int ORDER_BY_FREQUENCY = 1;

      //====
      // frequency normalization options
      // - default is no normalization, which never needs to be specified
      // - by max means dividing by top frequency
      // - by sum means dividing by total number of (not unique) n-grams
      static const int NORMALIZE_BY_MAX = 1;
      static const int NORMALIZE_BY_SUM = 2;

      //====
      // constructor
      // - n   : n-gram size, default 3 (trigrams)
      // - inc : incremental collection flag, default false
      // - t   : n-gram type, 0 (letter, default), BYTE_GRAM or WORD_GRAM
      // examples
      // - Ngrams ng3letter;
      // - Ngrams ng2letter(2);
      // - Ngrams ng5letter(5, false);
      // - Ngrams ng3byte(3, true, Ngrams::BYTE_GRAM);
      // - Ngrams ng2word(2, false, Ngrams::WORD_GRAM);
      // notes
      // - when "inc" is true, incremental collection starting from
      //   unigrams will be performed, i.e. output will be n tables
      explicit Ngrams(int n = 3, bool inc = false, int t = 0);
      ~Ngrams();

      //====
      // collects n-grams from an in-memory text
      // examples
      // - std::string text;
      //   std::cin >> text;
      //   ng.parse(text);
      // - ng.parse("here's the text, blah blah ...");
      void parse(const std::string &text);

      //====
      // collects n-grams from an input stream
      // examples
      // - std::ifstream fin("myfile.txt");
      //   ng.parse(fin);
      // - ng.parse(cin);
      // notes
      // - supplied input stream must be readable
      // - this method reads until the end of file
      // - this method does not close the input stream
      void parse(std::istream &in);

      //====
      // appends the n-grams table to a string
      // - order     : ORDER_BY_ALPHABET (default) or ORDER_BY_FREQUENCY
      // - first     : number of leading entries to output, e.g. 1000 for
      //               the 1000 most frequent n-grams when ordered by
      //               frequency; the default 0 presumably means "no
      //               limit" (TODO confirm against the implementation)
      // - normalize : 0 (none, default), NORMALIZE_BY_MAX or
      //               NORMALIZE_BY_SUM
      // examples
      // - std::string result;
      //   ng.store(result);
      // - ng.store(result, Ngrams::ORDER_BY_FREQUENCY);
      // - ng.store(result, 0, 100);
      // - ng.store(result, 0, 1000, Ngrams::NORMALIZE_BY_MAX);
      // notes
      // - existing contents of "str", if any, will not be touched
      // - this method stores the n-grams table to the end of "str"
      void store(
         std::string &str,
         int order = ORDER_BY_ALPHABET,
         int first = 0,
         int normalize = 0
      )
      {
         storeB(&str, order, first, normalize, false);
      }

      //====
      // writes the n-grams table to an output stream
      // - order, first and normalize have the same meaning as in the
      //   string overload of store()
      // examples
      // - std::ofstream fout("myfile.txt");
      //   ng.store(fout);
      // - ng.store(cout, 0, 10, Ngrams::NORMALIZE_BY_SUM);
      // notes
      // - supplied output stream must be writable
      // - this method does not close the output stream
      void store(
         std::ostream &out,
         int order = ORDER_BY_ALPHABET,
         int first = 0,
         int normalize = 0
      )
      {
         storeB(&out, order, first, normalize, true);
      }

      //====
      // self clean-up, useful when one object processes several inputs
      // examples
      // - ng.parse("some text ...");
      //   ng.store(someString);
      //   ng.reset();
      //   ng.parse("some other text ...");
      void reset();
   }; // class Ngrams
} // namespace NLP

#endif // NLP_NGRAMS_HPP
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -