📄 ngrams.hpp
字号:
//====
// Ngrams.hpp
// - defines class Ngrams which is
//   - a statistical text n-grams extraction tool
//   - based on Dr. Vlado Keselj's Perl Ngrams module in CPAN
// Interface
// - object construction
//   - default is letter trigrams
//       Ngrams ng;
//   - letter 5-grams
//       Ngrams ng(5);
//   - letter unigrams to 5-grams (incremental)
//       Ngrams ng(5, true);
//   - word bigrams
//       Ngrams ng(2, false, Ngrams::WORD_GRAM);
//   - byte unigrams to 6-grams (incremental)
//       Ngrams ng(6, true, Ngrams::BYTE_GRAM);
// - n-gram collection
//   - from a string literal
//       ng.parse("this is a text");
//   - from a standard string variable
//       std::string str;
//       ng.parse(str);
//   - from an open text input stream
//       std::ifstream fin("file.txt");
//       ng.parse(fin);
// - result output
//   - output to a standard string
//       std::string str;
//       ng.store(str);
//   - output to an open text file
//       std::ofstream fout("myfile.txt");
//       ng.store(fout);
//   - output to the standard output
//       ng.store(cout);
//   - output in decreasing frequency order
//       ng.store(str, Ngrams::ORDER_BY_FREQUENCY);
//   - only output the 1000 most frequent n-grams
//       ng.store(cout, Ngrams::ORDER_BY_FREQUENCY, 1000);
//   - normalize frequency by total number of n-grams
//       ng.store(str, 0, 500, Ngrams::NORMALIZE_BY_SUM);
//   - normalize output by top frequency
//       ng.store(fout, 1, 100, Ngrams::NORMALIZE_BY_MAX);
// - self clean-up
//   - useful in multiple file processing
//       ng.parse(fin1);
//       ng.store(fout1);
//       ng.reset();
//       ng.parse(fin2);
//       ng.store(fout2);
//       ng.reset();
//       ...
// Notes
// - this package is provided as is with no warranty
// - the author is not responsible for any damage caused
//   either directly or indirectly by using this package
// - anybody is free to do whatever he/she wants with this
//   package as long as this header section is preserved
// - the author would like to thank Dr. Vlado Keselj for
//   providing his Perl module code as the main reference
// Created on 2004-10-28 by
// - Roger Zhang (rogerz@cs.dal.ca)
// Modifications
// - Roger Zhang on 2004-10-31
//   - a static code map is used for encoding non-printables
// - Roger Zhang on 2004-11-06
//   - n-grams of size 1 to n-1 are always collected, but if
//     they will be included in output can be decided by the
//     user through the second argument of the store methods
// - Roger Zhang on 2005-09-23
//   - whether n-grams of size 1 to n-1 are collected is now
//     decided at contruction time, which saves computation
//     time and memory when those n-grams are not interested
//   - a detailed class interface description is added
// -
// Last compiled under Linux with gcc-3
//====

// The guard name deliberately has no leading underscore: identifiers that
// begin with an underscore followed by an uppercase letter are reserved
// to the implementation.
#ifndef NLP_NGRAMS_HPP
#define NLP_NGRAMS_HPP

#include <iostream>
#include <string>
#include <deque>
#include <map>

namespace NLP
{
    // Collects n-grams from strings or input streams (parse) and writes
    // the resulting table to a string or an output stream (store).
    // All non-inline members are defined in the implementation file,
    // which is not part of this header.
    //
    // NOTE(review): this class holds raw pointer members and declares a
    // destructor but no copy constructor / copy assignment. If the
    // destructor releases those pointers (not visible from this header),
    // copying an Ngrams object is unsafe - confirm against the
    // implementation and consider disabling copies.
    class Ngrams
    {
        //====
        // data members
        // (declaration order is kept exactly as in the original so the
        //  object layout seen by the implementation file is unchanged)
        int size;
        int type;
        int *pMax;
        int *pTotal;
        bool processingFile;
        bool incremental;
        std::string *pStr;
        std::deque<std::string> *pQue;
        std::map<std::string, int> *pMap;

        // per the modification log: static code map used for encoding
        // non-printables
        static std::map<char, std::string> encMap;

        //====
        // internal helper routines
        // (defined in the implementation file; the suffix letters
        //  presumably denote Letter / Byte / Word / ... variants -
        //  TODO confirm against the implementation)
        void parseL(const std::string &s);
        void parseB(const std::string &s, bool raw);
        void parseW(const std::string &s);
        void parseR(std::string *s, bool once);
        void parseQ();
        void storeK(void *buf, int o, int f, int n, bool toFile, int k);
        void storeB(void *buf, int o, int f, int n, bool toFile);

    public:

        //====
        // n-gram types
        // - default is letter n-grams, which never needs to be specified
        //   only alphabetical characters count in letter based n-grams
        // - every single byte counts in byte based n-grams
        // - the words in word n-grams consist of letters only, decimal
        //   digits are replaced by <NUMBER>, everything else is ignored
        static const int BYTE_GRAM = 1;
        static const int WORD_GRAM = 2;

        //====
        // options on output order
        // - default is ascending alphabetical order
        // - by frequency means descending frequency order
        static const int ORDER_BY_ALPHABET = 0;
        static const int ORDER_BY_FREQUENCY = 1;

        //====
        // frequency normalization options
        // - default is no normalization, which never needs to be specified
        // - by max means dividing by top frequency
        // - by sum means dividing by total number of (not unique) n-grams
        static const int NORMALIZE_BY_MAX = 1;
        static const int NORMALIZE_BY_SUM = 2;

        //====
        // construction
        // parameters
        // - n   : n-gram size (default 3)
        // - inc : incremental collection flag (default false)
        // - t   : n-gram type, 0 (letter), BYTE_GRAM or WORD_GRAM
        // examples
        // - Ngrams ng3letter;
        // - Ngrams ng2letter(2);
        // - Ngrams ng5letter(5, false);
        // - Ngrams ng3byte(3, true, Ngrams::BYTE_GRAM);
        // - Ngrams ng2word(2, false, Ngrams::WORD_GRAM);
        // notes
        // - when "inc" is true, incremental collection starting from
        //   unigrams will be performed, i.e. output will be n tables
        explicit Ngrams(int n = 3, bool inc = false, int t = 0);
        ~Ngrams();

        //====
        // collect n-grams from a string
        // examples
        // - std::string text;
        //   std::cin >> text;
        //   ng.parse(text);
        // - ng.parse("here's the text, blah blah ...");
        void parse(const std::string &text);

        //====
        // collect n-grams from an input stream
        // examples
        // - std::ifstream fin("myfile.txt");
        //   ng.parse(fin);
        // - ng.parse(cin);
        // notes
        // - supplied input stream must be readable
        // - this method reads until the end of file
        // - this method does not close the input stream
        void parse(std::istream &in);

        //====
        // append the n-grams table to a string
        // parameters
        // - order     : ORDER_BY_ALPHABET or ORDER_BY_FREQUENCY
        // - first     : number of leading entries to output
        //               (0 presumably means "all" - TODO confirm)
        // - normalize : 0 (none), NORMALIZE_BY_MAX or NORMALIZE_BY_SUM
        // examples
        // - std::string result;
        //   ng.store(result);
        // - ng.store(result, Ngrams::ORDER_BY_FREQUENCY);
        // - ng.store(result, 0, 100);
        // - ng.store(result, 0, 1000, Ngrams::NORMALIZE_BY_MAX);
        // notes
        // - existing contents of "str", if any, will not be touched
        // - this method stores the n-grams table to the end of "str"
        void store(
            std::string &str,
            int order = ORDER_BY_ALPHABET,
            int first = 0,
            int normalize = 0
        ) {
            // toFile == false: the buffer is passed on as a std::string
            storeB(&str, order, first, normalize, false);
        }

        //====
        // write the n-grams table to an output stream
        // parameters
        // - same meaning as for the string overload above
        // examples
        // - std::ofstream fout("myfile.txt");
        //   ng.store(fout);
        // - ng.store(cout, 0, 10, Ngrams::NORMALIZE_BY_SUM);
        // notes
        // - supplied output stream must be writable
        // - this method does not close the output stream
        void store(
            std::ostream &out,
            int order = ORDER_BY_ALPHABET,
            int first = 0,
            int normalize = 0
        ) {
            // toFile == true: the buffer is passed on as a std::ostream
            storeB(&out, order, first, normalize, true);
        }

        //====
        // self clean-up between inputs
        // examples
        // - ng.parse("some text ...");
        //   ng.store(someString);
        //   ng.reset();
        //   ng.parse("some other text ...");
        void reset();

    }; // class Ngrams

} // namespace NLP

#endif // NLP_NGRAMS_HPP
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -