⭐ 欢迎来到虫虫下载站! | 📦 资源下载 📁 资源专辑 ℹ️ 关于我们
⭐ 虫虫下载站

📄 feature_index.cpp

📁 Conditional Random Fields的训练识别工具
💻 CPP
字号:
/*  CRF++ -- Yet Another CRF toolkit  $Id: feature_index.cpp 1558 2006-11-25 04:59:20Z taku $;  Copyright(C) 2005 Taku Kudo <taku@chasen.org>  This is free software with ABSOLUTELY NO WARRANTY.  This library is free software; you can redistribute it and/or  modify it under the terms of the GNU Lesser General Public  License as published by the Free Software Foundation; either  version 2.1 of the License, or(at your option) any later version.  This library is distributed in the hope that it will be useful,  but WITHOUT ANY WARRANTY; without even the implied warranty of  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU  Lesser General Public License for more details.  You should have received a copy of the GNU Lesser General Public  License along with this library; if not, write to the Free Software  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA*/#include <iostream>#include <fstream>#include <cstring>#include <set>#include "common.h"#include "feature_index.h"namespace CRFPP{  static inline char *read_ptr(char **ptr, size_t size)  {    char *r = *ptr;    *ptr += size;    return r;  }  template <class T> static inline void read_static(char **ptr, T& value)  {    char *r = read_ptr(ptr, sizeof(T));    memcpy(&value, r, sizeof(T));  }  int DecoderFeatureIndex::getID(const char *key)  {    return da_.exactMatchSearch<Darts::DoubleArray::result_type> (key);  }  int EncoderFeatureIndex::getID(const char *key)  {    std::map <std::string, std::pair<int,unsigned int> >::iterator      it = dic_.find(key);    if (it == dic_.end()) {      dic_.insert(std::make_pair<std::string, std::pair<int, unsigned int> > (key, std::make_pair<int, unsigned int> (maxid_, 1)));      int n = maxid_;      maxid_ += (key[0] == 'U' ? y_.size() : y_.size() * y_.size());      return n;    } else {      it->second.second++;      return it->second.first;    }    return -1;  }  bool EncoderFeatureIndex::open(const char *filename1,                                 const char *filename2)  {    return openTemplate(filename1) && openTagSet(filename2);  }  bool EncoderFeatureIndex::openTemplate(const char *filename)  {    std::ifstream ifs(filename);    CHECK_FALSE(ifs) << "open failed: "  << filename;    std::string line;    while (std::getline(ifs, line)) {      if (! line[0] || line[0] == '#') continue;      if (line[0] == 'U') {        unigram_templs_.push_back(this->strdup(line.c_str()));      } else if (line[0] == 'B') {        bigram_templs_.push_back(this->strdup(line.c_str()));      } else {        CHECK_FALSE(true) << "unknown type: " << line << " " << filename;      }    }    return true;  }  bool EncoderFeatureIndex::openTagSet(const char *file)  {    std::ifstream ifs(file);    CHECK_FALSE(ifs) << "no such file or directory: " << file ;    char  line[8192];    char* column[1024];    size_t max_size = 0;    std::set<std::string> candset;    while (ifs.getline(line, sizeof(line))) {      if (line[0] == '\0' || line[0] == ' ' || line[0] == '\t') continue;      size_t size = tokenize2(line, "\t ", column, 1024);      if (max_size == 0) max_size = size;      CHECK_FALSE(max_size == size)        << "inconsistent column size: " << max_size << " " << size << " " << file;      xsize_ = size - 1;      candset.insert(column[max_size-1]);    }    y_.clear();    for (std::set<std::string>::iterator it = candset.begin(); it != candset.end(); ++it)      y_.push_back(this->strdup(it->c_str()));    ifs.close();    return true;  }  bool DecoderFeatureIndex::open(const char *filename1,                                 const char *filename2)  {    CHECK_FALSE(mmap_.open(filename1)) << mmap_.what();    char *ptr = mmap_.begin();    unsigned int version_ = 0;    read_static<unsigned int> (&ptr, version_);    CHECK_FALSE(version_ / 100 == version / 100)      << "model version is different: " << version_      << " vs " << version << " : " << filename1;    int type = 0;    read_static<int> (&ptr, type);    read_static<double> (&ptr, cost_factor_);    read_static<unsigned int> (&ptr, maxid_);    read_static<unsigned int> (&ptr, xsize_);    unsigned int dsize = 0;    read_static<unsigned int> (&ptr, dsize);    unsigned int y_str_size;    read_static<unsigned int> (&ptr, y_str_size);    char *y_str = read_ptr(&ptr, y_str_size);    size_t pos = 0;    while (pos < y_str_size) {      y_.push_back(y_str + pos);      while (y_str[pos++] != '\0') {};    }    unsigned int tmpl_str_size;    read_static<unsigned int> (&ptr, tmpl_str_size);    char *tmpl_str = read_ptr(&ptr, tmpl_str_size);    pos = 0;    while (pos < tmpl_str_size) {      char *v = tmpl_str + pos;      if (v[0] == '\0') {        ++pos;      } else if (v[0] == 'U') {        unigram_templs_.push_back(v);      } else if (v[0] == 'B') {        bigram_templs_.push_back(v);      } else {        CHECK_FALSE(true) << "unknown type: " << v;      }      while (tmpl_str[pos++] != '\0') {};    }    da_.set_array(ptr);    ptr += dsize;    alpha_float_ = (float *)ptr;    ptr += sizeof(float) * maxid_;    CHECK_FALSE(ptr == mmap_.end()) <<      "model file is broken: " << filename1;    return true;  }  void EncoderFeatureIndex::shrink(size_t freq)  {    if (freq <= 1) return;    std::map<int,int> old2new;    int new_maxid = 0;    for (std::map<std::string, std::pair<int, unsigned int> >::iterator           it = dic_.begin(); it != dic_.end();) {      const std::string &key = it->first;      if (it->second.second >= freq) {        old2new.insert(std::make_pair<int,int> (it->second.first, new_maxid));        it->second.first = new_maxid;        new_maxid += (key[0] == 'U' ? y_.size() : y_.size() * y_.size());        ++it;      } else {        dic_.erase(it++);      }    }    feature_cache_.shrink(old2new);    maxid_ = new_maxid;    return;  }  void DecoderFeatureIndex::clear()  {    char_freelist_.free();    feature_cache_.clear();    for (size_t i = 0; i < thread_num_; ++i) {      node_freelist_[i].free();      path_freelist_[i].free();    }  }  void EncoderFeatureIndex::clear() {}  bool EncoderFeatureIndex::convert(const char *filename1,                                    const char *filename2)  {    std::ifstream ifs(filename1);    y_.clear();    dic_.clear();    unigram_templs_.clear();    bigram_templs_.clear();    xsize_ = 0;    maxid_ = 0;    CHECK_FALSE(ifs) << "open failed: " << filename1;    char line[8192];    char *column[8];    // read header    while (true) {      CHECK_FALSE(ifs.getline(line, sizeof(line)))        << " format error: " << filename1;      if (std::strlen(line) == 0) break;      size_t size = tokenize(line, "\t ", column, 2);      CHECK_FALSE(size == 2) << "format error: " << filename1;      if (std::strcmp(column[0], "xsize:") == 0)        xsize_ = std::atoi(column[1]);      if (std::strcmp(column[0], "maxid:") == 0)        maxid_ = std::atoi(column[1]);    }    CHECK_FALSE(maxid_ > 0) << "maxid is not defined: " << filename1;    CHECK_FALSE(xsize_ > 0) << "xsize is not defined: " << filename1;    while (true) {      CHECK_FALSE(ifs.getline(line, sizeof(line)))        << "format error: " << filename1;      if (std::strlen(line) == 0) break;      y_.push_back(this->strdup(line));    }    while (true) {      CHECK_FALSE(ifs.getline(line, sizeof(line)))        << "format error: " << filename1;      if (std::strlen(line) == 0) break;      if (line[0] == 'U') {        unigram_templs_.push_back(this->strdup(line));      } else if (line[0] == 'B') {        bigram_templs_.push_back(this->strdup(line));      } else {        CHECK_FALSE(true) << "unknown type: " << line << " " << filename1;      }    }    while (true) {      CHECK_FALSE(ifs.getline(line, sizeof(line)))        << "format error: " << filename1;      if (std::strlen(line) == 0) break;      size_t size = tokenize(line, "\t ", column, 2);      CHECK_FALSE(size == 2) << "format error: " << filename1;      dic_.insert(std::make_pair<std::string, std::pair<int, unsigned int> > (column[1], std::make_pair<int, unsigned int> (std::atoi(column[0]), 1)));    }    std::vector<double> alpha;    while (ifs.getline(line, sizeof(line)))      alpha.push_back(std::atof(line));    alpha_ = &alpha[0];    CHECK_FALSE(alpha.size() == maxid_) << " file is broken: "  << filename1;    return save(filename2, false);  }  bool EncoderFeatureIndex::save(const char *filename, bool textmodelfile)  {    std::vector <char *> key;    std::vector <int>    val;    std::string y_str;    for (size_t i = 0; i < y_.size(); ++i) {      y_str += std::string(y_[i]);      y_str += '\0';    }    std::string templ_str;    for (size_t i = 0; i < unigram_templs_.size(); ++i) {      templ_str += std::string(unigram_templs_[i]);      templ_str += '\0';    }    for (size_t i = 0; i < bigram_templs_.size(); ++i) {      templ_str += std::string(bigram_templs_[i]);      templ_str += '\0';    }    while ((y_str.size() + templ_str.size()) % 4 != 0) templ_str += '\0';    for (std::map<std::string, std::pair<int, unsigned int> >::iterator it = dic_.begin();         it != dic_.end(); ++it) {      key.push_back((char *)(it->first.c_str()));      val.push_back(it->second.first);    }    Darts::DoubleArray da;    CHECK_FALSE(da.build(key.size(), &key[0], 0, &val[0]) == 0)      << "cannot build double-array";    std::ofstream bofs;    bofs.open(filename, OUTPUT_MODE);    CHECK_FALSE(bofs) << "open failed: " << filename;    unsigned int version_ = version;    bofs.write((char *)(&version_),sizeof(unsigned int));    int type = 0;    bofs.write((char *)(&type),sizeof(int));    bofs.write((char *)(&cost_factor_),sizeof(double));    bofs.write((char *)(&maxid_), sizeof(unsigned int));    xsize_ = _min(xsize_, max_xsize_);    bofs.write((char *)(&xsize_), sizeof(unsigned int));    unsigned int dsize = da.unit_size() * da.size();    bofs.write((char *)(&dsize), sizeof(unsigned int));    unsigned int size = y_str.size();    bofs.write((char *)(&size),  sizeof(unsigned int));    bofs.write((char *)y_str.data(),    y_str.size());    size = templ_str.size();    bofs.write((char *)(&size),  sizeof(unsigned int));    bofs.write((char *)templ_str.data(), templ_str.size());    bofs.write((char *)da.array(), dsize);    for (size_t i  = 0; i < maxid_; ++i) {      float alpha = (float)alpha_[i];      bofs.write((char *)(&alpha), sizeof(float));    }    bofs.close();    if (textmodelfile) {      std::string filename2 = filename;      filename2 += ".txt";      std::ofstream tofs(filename2.c_str());      CHECK_FALSE(tofs) << " no such file or directory: " << filename2;      // header      tofs << "version: "     << version_ << std::endl;      tofs << "cost-factor: " << cost_factor_ << std::endl;      tofs << "maxid: "       << maxid_ << std::endl;      tofs << "xsize: "       << xsize_ << std::endl;      tofs << std::endl;      // y      for (size_t i = 0; i < y_.size(); ++i)        tofs << y_[i] << std::endl;      tofs << std::endl;      // template      for (size_t i = 0; i < unigram_templs_.size(); ++i)        tofs << unigram_templs_[i] << std::endl;      for (size_t i = 0; i < bigram_templs_.size(); ++i)        tofs << bigram_templs_[i] << std::endl;      tofs << std::endl;      // dic      for (std::map<std::string, std::pair<int, unsigned int> >::iterator it = dic_.begin();           it != dic_.end(); ++it) {        tofs << it->second.first << " " << it->first << std::endl;      }      tofs << std::endl;      tofs.setf(std::ios::fixed, std::ios::floatfield);      tofs.precision(16);      for (size_t i  = 0; i < maxid_; ++i)        tofs << alpha_[i] << std::endl;    }    return true;  }  char *FeatureIndex::strdup(const char *p)  {    size_t len = std::strlen(p);    char *q = char_freelist_.alloc(len+1);    std::strcpy(q, p);    return q;  }  void FeatureIndex::calcCost(Node *n)  {    n->cost = 0.0;#define ADD_COST(T, A) \      { T c = 0; \        for (int *f = n->fvector; *f != -1; ++f) c += (A)[*f + n->y]; \        n->cost =cost_factor_ *(T)c; }    if (alpha_float_) ADD_COST(float,  alpha_float_)                        else             ADD_COST(double, alpha_);#undef ADD_COST  }  void FeatureIndex::calcCost(Path *p)  {    p->cost = 0.0;#define ADD_COST(T, A) \      { T c = 0.0; \        for (int *f = p->fvector; *f != -1; ++f) \          c += (A)[*f + p->lnode->y * y_.size() + p->rnode->y]; \        p->cost =cost_factor_*(T)c; }    if (alpha_float_) ADD_COST(float,  alpha_float_)                        else             ADD_COST(double, alpha_);  }#undef ADD_COST}

⌨️ 快捷键说明

复制代码 Ctrl + C
搜索代码 Ctrl + F
全屏模式 F11
切换主题 Ctrl + Shift + D
显示快捷键 ?
增大字号 Ctrl + =
减小字号 Ctrl + -