learnmodel.h
// -*- C++ -*-
#ifndef __LEMGA_LEARNMODEL_H__
#define __LEMGA_LEARNMODEL_H__

/** @file
 *  @brief Type definitions, @link lemga::DataSet DataSet@endlink, and
 *  @link lemga::LearnModel LearnModel@endlink class.
 *
 *  @todo Add an input_dim check to dataset and learnmodel
 *
 *  $Id: learnmodel.h 2538 2006-01-08 10:01:17Z ling $
 */

#include <assert.h>
#include <vector>
#include "object.h"
#include "dataset.h"
#include "shared_ptr.h"

#define VERBOSE_OUTPUT 1

namespace lemga {

typedef std::vector<REAL> Input;
typedef std::vector<REAL> Output;
typedef dataset<Input,Output> DataSet;
typedef std::vector<REAL> DataWgt;
typedef const_shared_ptr<DataSet> pDataSet;
typedef const_shared_ptr<DataWgt> pDataWgt;

/// Load a data set from a stream
DataSet* load_data (std::istream&, UINT, UINT, UINT);
DataSet* load_data (std::istream&, UINT);

/** @brief A unified interface for learning models.
 *
 *  Two error measures are provided: for regression problems, r_error()
 *  should be defined; for classification problems, c_error() should be
 *  defined. Both errors may be present in the same model.
 *
 *  The training data is stored with the learning model (as a pointer).
 *  @todo Explain why the data is stored with the model (the benefit of
 *        keeping a pointer), and whether a pointer is the right choice.
 *  @todo Explain the impact of doing this (what changes compared with
 *        a normal implementation).
 *  @todo Document the sample weight @c wgt: it could be NULL if the
 *        model doesn't support weighted data; otherwise it should be a
 *        probability vector (see random_sample).
 *
 *  @anchor learnmodel_training_order
 *  The usual order of calls when learning:
 *  -# <code>lm->initialize();</code>\n
 *     Prepare for training
 *  -# <code>lm->set_train_data(sample_data);</code>\n
 *     Specify the training data
 *  -# <code>err = lm->train();</code>\n
 *     Usually, the return value has no meaning
 *  -# <code>y = (*lm)(x);</code>\n
 *     Apply the learning model to new data.
 *
 *  @todo documentation
 *  @todo Do we really need two errors?
 */
class LearnModel : public Object {
protected:
    UINT _n_in;     ///< dimension of input
    UINT _n_out;    ///< dimension of output
    pDataSet ptd;   ///< pointer to the training data set
    pDataWgt ptw;   ///< pointer to the sample weight (for training)
    UINT n_samples; ///< equal to @c ptd->size()
    FILE* logf;     ///< file to record train/validate error

public:
    LearnModel (UINT n_in = 0, UINT n_out = 0);
    LearnModel (const LearnModel&);

    //@{ @name Basic
    virtual LearnModel* create () const = 0;
    virtual LearnModel* clone () const = 0;

    UINT n_input  () const { return _n_in; }
    UINT n_output () const { return _n_out; }

    void set_log_file (FILE* f) { logf = f; }
    //@}

    //@{ @name Training related
    /** @brief Whether the learning model/algorithm supports unequally
     *  weighted data.
     *  @return @c true if supported; @c false otherwise. The default is
     *  @c false, just for safety.
     *  @sa set_train_data()
     */
    virtual bool support_weighted_data () const { return false; }

    /// Error measure for regression problems
    virtual REAL r_error (const Output& out, const Output& y) const;
    /// Error measure for classification problems
    virtual REAL c_error (const Output& out, const Output& y) const;

    /// Training error (regression)
    REAL train_r_error () const;
    /// Training error (classification)
    REAL train_c_error () const;
    /// Test error (regression)
    REAL test_r_error (const pDataSet&) const;
    /// Test error (classification)
    REAL test_c_error (const pDataSet&) const;

    /** @brief Initialize the model for training.
     *  @todo Whether (or when) to initialize is a big problem. If we
     *  allow "continue_on_learning", the initialization is sometimes
     *  not needed.
     *  @note This was a virtual function; I changed it to be empty.
     */
    virtual void initialize () {}

    /// Set the data set and sample weight to be used in training
    virtual void set_train_data (const pDataSet&, const pDataWgt& = 0);

    /// Return a pointer to the embedded training data set
    const pDataSet& train_data () const { return ptd; }
    const pDataWgt& data_weight () const { return ptw; }

    /** @brief Train with the preset data set and sample weight.
     *  @return Probably the training error.
     *  @todo Make the return type @c void
     */
    virtual REAL train () = 0;
    //@}

    virtual Output operator() (const Input&) const = 0;

    /** @brief Get the output of the hypothesis on the @a idx-th input.
     *  @note It is possible to cache results to save computational effort.
     */
    virtual Output get_output (UINT idx) const {
        assert(ptw != NULL); // no data sampling
        return operator()(ptd->x(idx));
    }

    //@{ @name Margin related
    /** @brief The normalization term for margins.
     *
     *  The margin concept can be normalized or unnormalized. For
     *  example, for a perceptron model, the unnormalized margin would
     *  be the weighted sum of the input features, while the normalized
     *  margin would be the distance to the hyperplane; the
     *  normalization term is then the norm of the hyperplane weight.
     *
     *  Since the normalization term is usually a constant, it is more
     *  efficient to precompute it instead of calculating it every time
     *  a margin is asked for. The best way would be to use a cache.
     *  Here I use an easier way: let the users decide when to compute
     *  the normalization term.
     */
    virtual REAL margin_norm () const { return 1; }

    /// Report the (unnormalized) margin of an example (@a x, @a y).
    virtual REAL margin_of (const Input& x, const Output& y) const;

    /** @brief Report the (unnormalized) margin of the example @a i.
     *  @note It is possible to cache results to save computational effort.
     */
    virtual REAL margin (UINT i) const {
        assert(ptw != NULL); // no data sampling
        return margin_of(ptd->x(i), ptd->y(i));
    }

    /// The minimal (unnormalized) in-sample margin.
    REAL min_margin () const;
    //@}

protected:
    virtual bool serialize (std::ostream&, ver_list&) const;
    virtual bool unserialize (std::istream&, ver_list&,
                              const id_t& = empty_id);
};

} // namespace lemga

#ifdef __LEARNMODEL_H__
#warning "This header file may conflict with another `learnmodel.h' file."
#endif
#define __LEARNMODEL_H__

#endif
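
For reference, a minimal usage sketch following the training order documented in the class comment above. `MyModel` and "train.dat" are hypothetical stand-ins, and the argument meanings of the four-argument load_data() overload are assumed here; only the LearnModel interface itself comes from this header.

// Usage sketch only -- not part of learnmodel.h.  `MyModel' is a
// hypothetical concrete subclass of lemga::LearnModel; load_data()'s
// arguments are assumed to be (stream, #examples, input dim, output dim).
#include <fstream>
#include "learnmodel.h"
#include "mymodel.h"   // hypothetical concrete model

int main ()
{
    std::ifstream ifs("train.dat");
    lemga::pDataSet ptd(lemga::load_data(ifs, 100, 2, 1));

    MyModel lm;                       // construct a concrete model
    lm.initialize();                  // 1. prepare for training
    lm.set_train_data(ptd);           // 2. training data, equal weights
    lm.train();                       // 3. train; return value rarely useful
    lemga::Output y = lm(ptd->x(0));  // 4. apply the model to an input
    return 0;
}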
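
Likewise, a sketch of how the margin members fit together: margin() and margin_of() report unnormalized margins, so a caller divides by margin_norm() to get a distance-like quantity. This assumes a trained model `lm` whose subclass has already computed its normalization term, as the margin_norm() comment suggests.

// Sketch: normalized in-sample margins of a trained model `lm'.
// For a perceptron, margin(i) / margin_norm() would be the signed
// distance of example i to the separating hyperplane.
const lemga::pDataSet& ptd = lm.train_data();
const REAL norm = lm.margin_norm();
for (UINT i = 0; i < ptd->size(); ++i) {
    REAL nm = lm.margin(i) / norm;
    // ... use nm, e.g., to find the minimal normalized margin
}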