⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 stawords.cpp

📁 一个很好的贝叶斯分类器
💻 CPP
字号:


#pragma warning(disable:4786)
#include"stdafx.h"
#include"EngWordSout.h"

// Shorthand for the (key, value) entry type of map<string, int>.
typedef map<string, int>::value_type sival_type;


//vector<string> *retrieve_text(string file_name)

// Reads the file named by file_name line by line and appends each line
// (without its terminating '\n') to the container that lines_of_text
// points to. lines_of_text is declared outside this function -- presumably
// a vector<string>*; confirm against its declaration.
//
// file_name: path of the text file to load.
//
// If the file cannot be opened, a diagnostic is written to cerr and the
// program terminates through exit(1).
void retrieve_text(string file_name)
{
    ifstream article_file( file_name.c_str(), ios::in );

    if ( !article_file )
    {
        // Diagnostics go to cerr, like the other error paths in this file,
        // so they are not mixed into the regular program output.
        cerr << "Can't open " << file_name.c_str() << " !" << endl;
        exit (1);
    }

    string textline;
    while ( getline( article_file, textline, '\n' ) )
    {
        lines_of_text->push_back( textline );
    }
}

 

// Converts every ASCII capital letter (A-Z) in every line to lower case,
// modifying the lines in place. All other characters are left untouched.
//
// text_file: the lines to convert; it is dereferenced unconditionally, so
//            it must not be null.
void strip_caps( vector<string> *text_file )
{
    const string upper_letters( "ABCDEFGHIJKLMNOPQRSTUVWXYZ" );

    for ( vector<string>::size_type row = 0; row < text_file->size(); ++row )
    {
        string &line = (*text_file)[ row ];

        for ( string::size_type col = 0; col < line.size(); ++col )
        {
            // Only characters that appear in upper_letters are rewritten.
            if ( upper_letters.find( line[ col ] ) != string::npos )
            {
                line[ col ] = tolower( line[ col ] );
            }
        }
    }
}

 

// Splits every line of text_file into words and returns them in order of
// appearance.
//
// A word is a maximal run of the lower-case letters 'a'..'z'. Every other
// character (capitals, digits, punctuation, blanks) acts as a separator,
// so text containing capitals should be lower-cased before calling this.
//
// text_file: the lines to tokenize; it is dereferenced unconditionally, so
//            it must not be null.
// Returns:   a vector allocated with new; the caller owns it and is
//            responsible for deleting it.
vector<string> *separate_words( const vector<string> *text_file )
{
    // The complete lower-case alphabet, 'z' included, so that words such as
    // "lazy" are not broken apart at the 'z'.
    const string letters( "abcdefghijklmnopqrstuvwxyz" );

    vector<string> *words = new vector<string>;

    // The index uses the container's size_type: a short would overflow once
    // the input has more than 32767 lines.
    for ( vector<string>::size_type line_no = 0;
          line_no < text_file->size(); ++line_no )
    {
        const string &textline = (*text_file)[ line_no ];

        // word_begin always sits on the first letter of the next word, or
        // is npos when no letters remain in the line.
        string::size_type word_begin = textline.find_first_of( letters );

        while ( word_begin != string::npos )
        {
            // One past the last letter of the current word.
            string::size_type word_end =
                textline.find_first_not_of( letters, word_begin );

            if ( word_end == string::npos )
            {
                // The word runs up to the end of the line.
                word_end = textline.size();
            }

            words->push_back(
                textline.substr( word_begin, word_end - word_begin ) );

            word_begin = textline.find_first_of( letters, word_end );
        }
    }

    return words;
}

 

// Counts how often each word occurs in *words.
//
// Words shorter than three characters, and words listed in the exclusion
// file "pkg95.txt" (one word per line, opened relative to the current
// directory), are skipped.
//
// words:   the words to count; it is dereferenced unconditionally, so it
//          must not be null.
// Returns: a map from word to occurrence count, allocated with new; the
//          caller owns it and is responsible for deleting it.
//
// If "pkg95.txt" cannot be opened, a diagnostic is written to cerr and the
// program terminates through exit(1).
map< string, int > *appear_total( const vector<string> *words )
{
    // Build the set of excluded words.
    set<string> exclusion_set;

    ifstream exclusion_file( "pkg95.txt", ios::in );

    if ( !exclusion_file )
    {
        // Diagnostics go to cerr, like the other error paths in this file.
        cerr << "Can't open pkg95.txt !" << endl;
        exit (1);
    }

    string textline;

    while ( getline( exclusion_file, textline, '\n' ) )
    {
        // A list saved with CRLF line endings leaves a '\r' at the end of
        // each line when it is not translated by the stream; drop it so the
        // entry can still match a word.
        if ( !textline.empty() && textline[ textline.size() - 1 ] == '\r' )
        {
            textline.erase( textline.size() - 1 );
        }

        exclusion_set.insert( textline );
    }

    map<string, int> *word_map = new map<string, int>;

    for ( vector<string>::const_iterator iter = words->begin();
          iter != words->end(); ++iter )
    {
        // Skip words that are too short or explicitly excluded.
        if ( (*iter).size() < 3 || exclusion_set.count( *iter ) )
        {
            continue;
        }

        // operator[] creates a missing entry with count 0, so a single
        // lookup covers both the first and every later occurrence.
        ++(*word_map)[ *iter ];
    }

    return word_map;
}

 

// Re-indexes a word -> count map by count, highest count first, and writes
// the resulting ranking to "3_1_2out.txt" in the current directory.
//
// A multimap is used because, unlike a map, it allows duplicate keys:
// several words may share the same count.
//
// Each output line holds the word, padded with blanks to 15 columns when it
// is shorter than that, followed by its count. Longer words are written
// unpadded.
//
// text_map: word -> count map to rank; it is dereferenced unconditionally,
//           so it must not be null.
// Returns:  the count -> word multimap, allocated with new; the caller owns
//           it and is responsible for deleting it. It is returned even when
//           the output file could not be written.
multimap< int, string, greater<int> > * multimap_total( map<string, int> *text_map )
{
    multimap<int, string, greater<int> > *word_map =
        new multimap< int, string, greater<int> >;

    for ( map< string, int >::iterator src = text_map->begin();
          src != text_map->end(); ++src )
    {
        word_map->insert( make_pair( (*src).second, (*src).first ) );
    }

    const string ofile( "3_1_2out.txt" );
    const string::size_type column_width = 15;

    ofstream outfile( ofile.c_str() );

    if ( !outfile )
    {
        cerr << "error: unable to open output file: "
             << ofile << endl;

        // Nothing was written, so do not announce success below.
        return word_map;
    }

    for ( multimap< int, string, greater<int> >::iterator entry = word_map->begin();
          entry != word_map->end(); ++entry )
    {
        const string &word = (*entry).second;

        outfile << word;

        // Pad only when the word is shorter than the column. The subtraction
        // is unsigned, so evaluating it for a longer word would wrap around
        // to a huge padding length.
        if ( word.size() < column_width )
        {
            outfile << string( column_width - word.size(), ' ' );
        }

        outfile << "出现 " << (*entry).first << "\t次" << endl;
    }

    cout << "程序已将处理结果写入3_1_2out.txt,该文件保存在当前目录"
         << endl;

    return word_map;
}

 

// Writes every word of text_map with its count to "3_1_1out.txt" in the
// current directory, in the map's own (key) order.
//
// Each output line holds the word, padded with blanks to 15 columns when it
// is shorter than that, followed by its count. Longer words are written
// unpadded.
//
// text_map: word -> count map to print; it is dereferenced unconditionally,
//           so it must not be null.
//
// If the output file cannot be opened, an error is written to cerr and the
// function returns without writing anything.
void map_output( map<string, int> *text_map )
{
    const string ofile( "3_1_1out.txt" );
    const string::size_type column_width = 15;

    ofstream outfile( ofile.c_str() );

    if ( !outfile )
    {
        cerr << "error: unable to open output file: "
             << ofile << endl;

        // Nothing was written, so do not announce success below.
        return;
    }

    for ( map< string, int >::iterator entry = text_map->begin();
          entry != text_map->end(); ++entry )
    {
        const string &word = (*entry).first;

        outfile << word;

        // Pad only when the word is shorter than the column. The subtraction
        // is unsigned, so evaluating it for a longer word would wrap around
        // to a huge padding length.
        if ( word.size() < column_width )
        {
            outfile << string( column_width - word.size(), ' ' );
        }

        outfile << "出现 " << (*entry).second << "\t次" << endl;
    }

    cout << "程序已将处理结果写入3_1_1out.txt,该文件保存在当前目录"
         << endl;
}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -