📄 preprocess.cpp

📁 Gaussian Mixture Algorithm
💻 CPP
字号:
// -*- C++ -*-// SVM with stochastic gradient (preprocessing)// Copyright (C) 2007- Leon Bottou// This program is free software; you can redistribute it and/or// modify it under the terms of the GNU Lesser General Public// License as published by the Free Software Foundation; either// version 2.1 of the License, or (at your option) any later version.//// This program is distributed in the hope that it will be useful,// but WITHOUT ANY WARRANTY; without even the implied warranty of// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the// GNU General Public License for more details.//// You should have received a copy of the GNU General Public License// along with this program; if not, write to the Free Software// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA// $Id: preprocess.cpp,v 1.2 2008/10/29 23:58:58 cp71 Exp $#include "preprocess.h"classes_t classes;voidreadClasses(const char *fname){  cerr << "# Reading " << fname << endl;  igzstream f;  f.open(fname);  if (! f.good()) {    cerr << "ERROR: cannot open file " << fname << endl;    ::exit(10);  }  classes.clear();  for(;;) {    string topic;    int id, rev;    f >> topic >> id >> rev;    if (! f.good())      break;    if (topic == "CCAT")      classes[id] = true;    else if (classes.find(id) == classes.end())      classes[id] = false;  }  if (!f.eof()) {    cerr << "ERROR: failed reading " << fname << endl;    ::exit(10);  }  int pcount = 0;  int ncount = 0;  for (classes_t::const_iterator it=classes.begin(); it!=classes.end(); it++)    if (it->second)      pcount++;    else      ncount++;  cerr << "# Done reading "       << pcount << " positives and "       << ncount << " negatives. " << endl;}dico_t dico;docs_t train;docs_t test;void readDocs(const char *fname, docs_t &docs, bool freezedico){  cerr << "# Reading " << fname << endl;  igzstream f;  f.open(fname);  if (! f.good()) {    cerr << "ERROR: cannot open file " << fname << endl;    ::exit(10);  }  string token;  f >> token;  if (token != ".I")    {      cerr << "ERROR: Cannot read initial .I in " << fname << endl;      ::exit(10);    }  int id = 0;  int count = 0;  while(f.good())    {      f >> id >> token;      count += 1;      if (! f.good() || token != ".W")        {          cerr << "ERROR (" << id << "): "               << "Cannot read \"<id> .W\"." << endl;          ::exit(10);        }      int wid = -1;      string otoken;      SVector s;      for(;;)        {          f >> token;          if (!f.good() || token == ".I")            break;          if (token != otoken)            {              dico_t::iterator it = dico.find(token);              if (it != dico.end())                wid = it->second;              else if (freezedico)                continue;              else                {                  wid = dico.size() + 1;                  dico[token] = wid;                }              otoken = token;            }          s.set(wid, s.get(wid)+1.0);        }      if (s.npairs() <= 0)        {          cerr << "ERROR (" << id << "): "               << "Empty vector " << id << "?" << endl;          ::exit(10);        }      docs[id] = s;    }  if (!f.eof())    {      cerr << "ERROR (" << id << "): "           << "Failed reading words" << endl;      ::exit(10);    }  cerr << "# Done reading " << count << " documents." << endl;}intvector_t trainid;intvector_t testid;void listKeys(docs_t &docs, intvector_t &ivec, bool shuffle){  ivec.clear();  for (docs_t::iterator it = docs.begin(); it != docs.end(); it++)    ivec.push_back(it->first);  if (shuffle)    random_shuffle(ivec.begin(), ivec.end());}voidcomputeNormalizedTfIdf(){  cerr << "# Computing document frequencies" << endl;  int terms = dico.size();  vector<double> nt(terms+1);  double nd = trainid.size();  for(int i=0; i<terms+1; i++)    nt[i] = 0;  for(int i=0; i<(int)trainid.size(); i++)    {      int id = trainid[i];      SVector s = train[id];      for (const SVector::Pair *p = s; p->i >= 0; p++)        if (p->v > 0)          nt[p->i] += 1;    }  cerr << "# Computing TF/IDF for training set" << endl;  for(int i=0; i<(int)trainid.size(); i++)    {      int id = trainid[i];      SVector s = train[id];      SVector v;      for (const SVector::Pair *p = s; p->i >= 0; p++)        if (nt[p->i] > 0)          v.set(p->i, (1.0 + log(p->v)) * log(nd/nt[p->i]));      double norm = dot(v,v);      v.scale(1.0 / sqrt(norm));      train[id] = v;    }  cerr << "# Computing TF/IDF for testing set" << endl;  for(int i=0; i<(int)testid.size(); i++)    {      int id = testid[i];      SVector s = test[id];      SVector v;      for (const SVector::Pair *p = s; p->i >= 0; p++)        if (nt[p->i] > 0)          v.set(p->i, (1.0 + log(p->v)) * log(nd/nt[p->i]));      double norm = dot(v,v);      v.scale(1.0 / sqrt(norm));      test[id] = v;    }  cerr << "# Done." << endl;}voidsaveSvmLight(const char *fname, docs_t &docs, intvector_t &ids){  cerr << "# Writing " << fname << "."  << endl;  ogzstream f;  f.open(fname);  if (! f.good())    {      cerr << "ERROR: cannot open " << fname << " for writing." << endl;      ::exit(10);    }  for(int i=0; i<(int)ids.size(); i++)    {      int id = ids[i];      bool y = classes[id];      SVector s = docs[id];      int p = s.npairs();      if (p <= 0)        {          cerr << "ERROR: empty vector " << id << "." << endl;          ::exit(10);        }      f << ((y) ? +1 : -1);      f << s;      if (! f.good())        {          cerr << "ERROR: writing " << fname << " for writing." << endl;          ::exit(10);        }    }  cerr << "# Done. Wrote " << ids.size() << " examples." << endl;}void preprocess(string path){  string file1 = path + "/rcv1-v2.topics.qrels.gz";  readClasses(file1.c_str());  string file2 = path + "/lyrl2004_tokens_train.dat.gz";  readDocs(file2.c_str(), test);  cerr << "# Dictionary size (so far) " << dico.size() << endl;  // We freeze the dictionary at this point.  // As a result we only use features common to both the training and testing set.  // This is consistent with joachims svmperf experiments.  string file3 = path + "/lyrl2004_tokens_test_pt0.dat.gz";  string file4 = path + "/lyrl2004_tokens_test_pt1.dat.gz";  string file5 = path + "/lyrl2004_tokens_test_pt2.dat.gz";  string file6 = path + "/lyrl2004_tokens_test_pt3.dat.gz";  readDocs(file3.c_str(), train, true);  readDocs(file4.c_str(), train, true);  readDocs(file5.c_str(), train, true);  readDocs(file6.c_str(), train, true);  cerr << "# Got " << test.size() << " testing documents." << endl;  cerr << "# Got " << train.size() << " training documents." << endl;  cerr << "# Dictionary size " << dico.size() << endl;  listKeys(train, trainid, true);  listKeys(test, testid);  computeNormalizedTfIdf();  string file7 = path + "/train.dat.gz";  string file8 = path + "/test.dat.gz";  saveSvmLight(file7.c_str(), train, trainid);  saveSvmLight(file8.c_str(), test, testid);  cerr << "# The End." << endl;}
⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -