learnmodel.cpp
/** @file
 *  $Id: learnmodel.cpp 2538 2006-01-08 10:01:17Z ling $
 */

#include <assert.h>
#include <cmath>
#include <sstream>
#include <stdio.h>
#include "learnmodel.h"

namespace lemga {

/** A local helper for load_data */
static DataSet*
load_data (DataSet* pd, std::istream& is, UINT n, UINT in, UINT out) {
    for (UINT i = 0; i < n; ++i) {
        Input x(in);
        Output y(out);
        for (UINT j = 0; j < in; ++j)
            if (!(is >> x[j])) return pd;
        for (UINT j = 0; j < out; ++j)
            if (!(is >> y[j])) return pd;
        pd->append(x, y);
    }
    return pd;
}

/** Each sample consists of first the input and then the output.
 *  Numbers are separated by spaces.
 *  @param is the input stream
 *  @param n gives the number of samples
 *  @param in is the dimension of input
 *  @param out is the dimension of output
 *  @todo documentation: why separate function
 */
DataSet* load_data (std::istream& is, UINT n, UINT in, UINT out) {
    DataSet* pd = new DataSet();
    return load_data(pd, is, n, in, out);
}

/** An easier-to-use version, where the output dimension is fixed
 *  at 1 and the input dimension is auto-detected. This version
 *  requires that each row of the stream @a is be one sample.
 */
DataSet* load_data (std::istream& is, UINT n) {
    assert(n > 0);
    /* read the first line and infer the input dimension */
    Input x;
    do {
        char line[1024*10];
        is.getline(line, 1024*10);
        std::istringstream iss(line);
        REAL xi;
        while (iss >> xi) x.push_back(xi);
    } while (x.empty() && !is.eof());
    if (x.empty()) return 0;

    Output y(1, x.back());
    x.pop_back();
    DataSet* pd = new DataSet();
    pd->append(x, y);
    return load_data(pd, is, n-1, x.size(), 1);
}

/** @param n_in is the dimension of input.
 *  @param n_out is the dimension of output.
 */
LearnModel::LearnModel (UINT n_in, UINT n_out)
    : Object(), _n_in(n_in), _n_out(n_out), n_samples(0), logf(NULL)
{ /* empty */ }

bool LearnModel::serialize (std::ostream& os, ver_list& vl) const {
    SERIALIZE_PARENT(Object, os, vl, 1);
    return (os << _n_in << ' ' << _n_out << '\n');
}

bool LearnModel::unserialize (std::istream& is, ver_list& vl,
                              const id_t& _id) {
    assert(_id == empty_id);
    UNSERIALIZE_PARENT(Object, is, vl, 1, v);
    if (v == 0) return true;
    return (is >> _n_in >> _n_out);
}

/** @todo if we don't make a new logf, this is not needed */
LearnModel::LearnModel (const LearnModel& lm)
    : Object(lm), _n_in(lm._n_in), _n_out(lm._n_out),
      ptd(lm.ptd), ptw(lm.ptw), n_samples(lm.n_samples),
      logf(lm.logf) /* is this reasonable? */
{ /* empty */ }

/** @param out is the output from the learned hypothesis.
 *  @param y is the real output.
 *  @return Regression error between @a out and @a y.
 *  A commonly used measure is the squared error.
 */
REAL LearnModel::r_error (const Output& out, const Output& y) const {
    assert(out.size() == n_output());
    assert(y.size() == n_output());
    REAL err = 0;
    for (UINT i = 0; i < _n_out; ++i) {
        REAL dif = out[i] - y[i];
        err += dif * dif;
    }
    return err / 2;
}
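/* Worked example (illustrative, not in the original source): with
 * n_output() == 1, out = {0.8} and y = {1.0}, r_error returns
 * (0.8 - 1.0)^2 / 2 = 0.02. The division by 2 makes the derivative
 * with respect to out[i] simply out[i] - y[i]. */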
/** @param out is the output from the learned hypothesis.
 *  @param y is the real output.
 *  @return Classification error between @a out and @a y.
 *  The error measure is not necessarily symmetric. A commonly used
 *  measure is @a out != @a y.
 */
REAL LearnModel::c_error (const Output& out, const Output& y) const {
    assert(n_output() == 1);
    assert(std::fabs(std::fabs(y[0]) - 1) < INFINITESIMAL);
    return (out[0]*y[0] <= 0);
}

REAL LearnModel::train_r_error () const {
    assert(ptw != 0);
    REAL err = 0;
    for (UINT i = 0; i < n_samples; ++i)
        err += (*ptw)[i] * r_error(get_output(i), ptd->y(i));
    return err;
}

REAL LearnModel::train_c_error () const {
    assert(ptw != 0);
    REAL err = 0;
    for (UINT i = 0; i < n_samples; ++i)
        err += (*ptw)[i] * c_error(get_output(i), ptd->y(i));
    return err;
}

REAL LearnModel::test_r_error (const pDataSet& pd) const {
    UINT n = pd->size();
    REAL err = 0;
    for (UINT i = 0; i < n; ++i)
        err += r_error((*this)(pd->x(i)), pd->y(i));
    return err / n;
}

REAL LearnModel::test_c_error (const pDataSet& pd) const {
    UINT n = pd->size();
    REAL err = 0;
    for (UINT i = 0; i < n; ++i)
        err += c_error((*this)(pd->x(i)), pd->y(i));
    return err / n;
}

/** If the learning model/algorithm can only do training with uniform
 *  sample weights, i.e., support_weighted_data() returns @c false, a
 *  ``bootstrapped'' copy of the original data set will be generated and
 *  used in the following training. The bootstrapping is done by randomly
 *  picking samples (with replacement) w.r.t. the given weight @a pw.
 *
 *  To make life easier, when support_weighted_data() returns
 *  @c true, a @c NULL @a pw will be replaced by a uniformly distributed
 *  probability vector. So we have the following invariant
 *  @invariant support_weighted_data() == (@a ptw != 0)
 *
 *  @param pd gives the data set.
 *  @param pw gives the sample weight, whose default value is @c NULL.
 *  @sa support_weighted_data(), train()
 */
void LearnModel::set_train_data (const pDataSet& pd, const pDataWgt& pw) {
    n_samples = pd->size();
    assert(n_samples > 0);

    if (support_weighted_data()) {
        ptd = pd;
        ptw = (pw != 0)? pw : new DataWgt(n_samples, 1.0 / n_samples);
    } else {
        ptd = (!pw)? pd : pd->random_sample(*pw, n_samples);
        ptw = 0;
    }
    assert(!ptw || n_samples == ptw->size());
    assert(support_weighted_data() == (ptw != NULL));

#ifndef NDEBUG
    // assert: ptw is a probability vector
    if (ptw != 0) {
        REAL wsum = 0;
        for (UINT i = 0; i < n_samples; i++) {
            assert((*ptw)[i] >= 0);
            wsum += (*ptw)[i];
        }
        assert(wsum-1 > -EPSILON && wsum-1 < EPSILON);
    }
#endif

    UINT din = pd->x(0).size(), dout = pd->y(0).size();
    if (_n_in == 0) _n_in = din;
    else if (_n_in != din) {
        std::cerr << id() << "::set_train_data: Error: "
            "Wrong input dimension.\n";
        std::exit(-1);
    }
    if (_n_out == 0) _n_out = dout;
    else if (_n_out != dout) {
        std::cerr << id() << "::set_train_data: Error: "
            "Wrong output dimension.\n";
        std::exit(-1);
    }
}

REAL LearnModel::margin_of (const Input&, const Output&) const {
    OBJ_FUNC_UNDEFINED("margin_of");
}

REAL LearnModel::min_margin () const {
    REAL min_m = INFINITY;
    for (UINT i = 0; i < n_samples; ++i) {
        // assume all examples count (in computing the minimum)
        assert((*ptw)[i] > INFINITESIMAL);
        REAL m = margin(i);
        if (min_m > m) min_m = m;
    }
    return min_m;
}

} // namespace lemga
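/* A minimal usage sketch, not part of the original library. Compiling this
 * translation unit with -DLEMGA_LEARNMODEL_DEMO (a guard name assumed here)
 * yields a tiny driver that exercises load_data(is, n): each row of the
 * stream is one sample "x1 x2 y", the output dimension is fixed at 1, and
 * the input dimension is inferred from the first row. */
#ifdef LEMGA_LEARNMODEL_DEMO
#include <iostream>

int main () {
    // two samples, each with a 2-dimensional input and a +/-1 label
    std::istringstream iss("0.5 1.2 1\n-0.3 0.7 -1\n");
    lemga::DataSet* pd = lemga::load_data(iss, 2);
    if (pd == 0 || pd->size() != 2) {
        std::cerr << "failed to parse the sample stream\n";
        return 1;
    }
    std::cout << pd->size() << " samples, input dimension "
              << pd->x(0).size() << '\n';   // expected: 2 samples, dimension 2
    delete pd;
    return 0;
}
#endif // LEMGA_LEARNMODEL_DEMO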