// optimize.h — from Lemga, a C++ machine-learning package (381-line header).
// -*- C++ -*-
#ifndef __LEMGA_OPTIMIZE_H__
#define __LEMGA_OPTIMIZE_H__

/** @file
 *  @brief Some generic optimization algorithms.
 *
 *  $Id: optimize.h 1917 2005-01-07 01:25:30Z ling $
 */

#include <assert.h>
#include <iostream>
#include <utility>
#include "vectorop.h"

namespace lemga {

using namespace op;

/** @brief Interface used in iterative optimization algorithms.
 *
 * This template is used jointly with the generic optimization
 * algorithm iterative_optimize(). It describes functions that a
 * class has to provide in order to use iterative_optimize().
 *
 * Some arithmetic operations on gradient are needed for optimization.
 * The full set required consists of
 *  - @a u += @a v
 *  - -@a v
 *  - @a v *= @a r
 *  - inner_product<@a typename> (@a u, @a v)
 * where @a u and @a v are weights/gradients and @a r the step length.
 *
 * Using such interface improves the flexibility of code. However,
 * there are always concerns about the performance. Here are
 * several considerations for the sake of performance:
 *
 *  - The weight(s) is the variable to be ``searched'' in order to
 *    optimize the cost. It is assumed to be stored somewhere (say,
 *    in a derived class) and is not passed as a parameter to these
 *    functions.
 *  - direction() returns a const reference to a class member.
 *  - The inheritance of different methods doesn't mean the parent
 *    class could be changed without taking care of the classes
 *    derived from it. This is just used as a way to share functions.
 *  - step_length() returns a non-positive step length if the direction
 *    is improper. (It could have returned a pair<bool,REAL>.)
 *
 * @todo Does making these functions pure virtual sacrifice
 * the performance?
 */
template <class Dir, class Step>
struct _search {
    typedef Dir direction_type;     ///< type of the search direction
    typedef Step step_length_type;  ///< scalar type of the step length

    /// Initialize local variables.
    void initialize () {}
    /// Search direction at @a w.
    const Dir& direction () { return dir; }
    /// Should we go in direction @a d? How far?
    /// The default says ``stop'' (first == false, zero step).
    std::pair<bool,Step> step_length (const Dir& d)
    { return std::make_pair(false, 0); }
    /// Update the weight.
    void update_weight (const Dir& d, const Step& s) {}
    /// Stopping criteria.
    bool satisfied () { return true; }

protected:
    Dir dir;   ///< current search direction, returned by direction()
};

/// Main search routine: the generic loop shared by all the
/// concrete optimizers below. @a s is taken by value, so the
/// caller's copy is not modified.
template <class SEARCH>
void iterative_optimize (SEARCH s) {
    s.initialize();
    while (!s.satisfied()) {
        const typename SEARCH::direction_type& pd = s.direction();
        const std::pair<bool, typename SEARCH::step_length_type>
            stp = s.step_length(pd);
        // a false first component means no acceptable step exists
        if (!stp.first) break;
        s.update_weight(pd, stp.second);
    }
}

/** @brief Gradient descent.
 * @todo Add options for on-line or batch mode; documentation
 * Why use get_weight() and set_weight(), not update_weight()?
 * (how much is the performance decrease)
 */
template <class LM, class Dir, class Step>
struct _gradient_descent : public _search<Dir,Step> {
    LM* plm;            ///< the model being optimized (not owned)
    Step learning_rate; ///< fixed step length @f$\eta@f$

    _gradient_descent (LM* lm, const Step& lr)
        : _search<Dir,Step>(), plm(lm), learning_rate(lr) {}

    /// Reset the step counter and cache a local copy of the weights.
    void initialize () { stp_cnt = 0; w = plm->weight(); }

    /// Steepest descent: go against the gradient.
    const Dir& direction () {
        using namespace op;
        // this-> is required: dir lives in the dependent base _search
        return (this->dir = -plm->gradient());
    }

    /// Always accept the fixed learning rate as the step.
    std::pair<bool,Step> step_length (const Dir&) {
        return std::make_pair(true, learning_rate);
    }

    /// w += s*d, then push the new weights into the model.
    /// @a d is passed by value since it is scaled in place.
    void update_weight (Dir d, const Step& s) {
        using namespace op;
        w += (d *= s);
        ++stp_cnt;
        plm->set_weight(w);
    }

    /// Delegate the stopping decision to the model.
    bool satisfied () { return plm->stop_opt(stp_cnt, plm->cost()); }

protected:
    Dir w;                ///< local copy of the model weights
    unsigned int stp_cnt; ///< number of updates performed so far
};

/** @brief Gradient descent with weight decay.
 *
 * The cost function includes a regularization term which prefers
 * smaller weight values. That is, the cost to be minimized is
 * @f[ \hat{E} = E + \frac{\lambda}2 \Vert w\Vert^2. @f]
 * The updating rule thus becomes @f[
 *   w \leftarrow w - \eta \frac{\partial\hat{E}}{\partial w}
 *     = (1-\eta\lambda)w - \eta\frac{\partial E}{\partial w}.
 * @f]
 * @f$\lambda@f$ usually ranges from 0.01 to 0.1.
 *
 * @todo Well, I guess weight decay is more a term in the cost
 * than an optimization technique. Though it is easy to add it
 * here (as an optimization technique).
 */
template <class LM, class Dir, class Step>
struct _gd_weightdecay : public _gradient_descent<LM,Dir,Step> {
    Step decay;  ///< the decay coefficient @f$\lambda@f$

    _gd_weightdecay (LM* lm, const Step& lr, const Step& dcy)
        : _gradient_descent<LM,Dir,Step>(lm,lr), decay(dcy) {}

    /// w <- (1 - s*lambda) w + s*d; requires s*lambda < 1 so the
    /// shrink factor stays positive.
    void update_weight (Dir d, const Step& s) {
        using namespace op;
        assert(s*decay < 1);
        this->w *= (1 - s*decay);
        this->w += (d *= s);
        ++this->stp_cnt;
        this->plm->set_weight(this->w);
    }
};

/** @brief Gradient descent with momentum.
 *
 * With momentum, the weight change is accumulated: @f[
 *   \Delta w\leftarrow-\eta\frac{\partial E}{\partial w}+\mu\Delta w.
 * @f]
 * When the derivative is roughly constant, this speeds up the
 * training by using an effectively larger learning rate @f[
 *   \Delta w \approx -\frac{\eta}{1-\mu}\frac{\partial E}{\partial w}.
 * @f]
 * When the derivative somehow ``fluctuates'', the momentum stabilizes
 * the optimization to some extent by effectively decreasing the
 * learning rate.
 */
template <class LM, class Dir, class Step>
struct _gd_momentum : public _gradient_descent<LM,Dir,Step> {
    Step momentum;  ///< the momentum coefficient @f$\mu\in[0,1)@f$

    _gd_momentum (LM* lm, const Step& lr, const Step& m)
        : _gradient_descent<LM,Dir,Step>(lm,lr), momentum(m) {}

    /// dir <- mu*dir - gradient; on the very first step there is
    /// no accumulated direction yet, so dir is just -gradient.
    const Dir& direction () {
        assert(momentum >= 0 && momentum < 1);
        using namespace op;
        if (this->stp_cnt > 0) {
            this->dir *= momentum;
            this->dir += -this->plm->gradient();
        } else
            this->dir = -this->plm->gradient();
        return this->dir;
    }
};

/** @brief Gradient descent with an adaptive learning rate
 *  (the ``bold driver'' heuristic): grow the rate by @a alpha after a
 *  successful step, shrink it by @a beta until the cost decreases.
 */
template <class LM, class Dir, class Step, class Cost>
struct _gd_adaptive : public _gradient_descent<LM,Dir,Step> {
    using _gradient_descent<LM,Dir,Step>::plm;
    using _gradient_descent<LM,Dir,Step>::learning_rate;
    using _gradient_descent<LM,Dir,Step>::w;
    using _gradient_descent<LM,Dir,Step>::stp_cnt;

    Step alpha, beta;  ///< rate growth (>= 1) and shrink (< 1) factors

    _gd_adaptive (LM* lm, const Step& lr, const Step& a, const Step& b)
        : _gradient_descent<LM,Dir,Step>(lm,lr), alpha(a), beta(b) {}

    void initialize () {
        _gradient_descent<LM,Dir,Step>::initialize();
        old_cost = plm->cost();  // baseline cost for the first trial
    }

    /// Try the current learning rate; on failure shrink it until the
    /// cost decreases (or the rate drops below the hard-coded 1e-6
    /// floor). Note: on success the step taken is the OLD rate @a lr,
    /// while learning_rate *= alpha only affects the next iteration.
    /// @a d is taken by value since it is scaled in place.
    std::pair<bool,Step> step_length (Dir d) {
        assert(alpha >= 1 && beta < 1);
        using namespace op;
        Step lr = learning_rate;
        d *= learning_rate;
        Dir wd = w;
        plm->set_weight(wd += d);       // trial step: w + lr*d
        Cost c = plm->cost();
        if (c < old_cost)
            learning_rate *= alpha;     // success: be bolder next time
        else {
            do {                        // failure: back off geometrically
                learning_rate *= beta;
                d *= beta;
                wd = w;
                plm->set_weight(wd += d);
                c = plm->cost();
            } while (!(c < old_cost) && learning_rate > 1e-6);
            lr = learning_rate;
        }
        const bool cont = (c < old_cost);
        if (cont) old_cost = c;
        else plm->set_weight(w);        // give up: restore the weights
        return std::make_pair(cont, lr);
    }

    void update_weight (Dir d, const Step& s) {
        ++stp_cnt;
        using namespace op;
        w += (d *= s);
        // DO NOT set_weight; done in step_length()
    }

    /// Reuse the cached cost instead of recomputing plm->cost().
    bool satisfied () { return plm->stop_opt(stp_cnt, old_cost); }

protected:
    Cost old_cost;  ///< cost at the current weights @a w
};

namespace details {

/** @brief One-dimensional minimization of the cost along @a dir.
 *
 * Three phases, visible below:
 *  1. halve @a step until it actually descends (floor 2e-7);
 *  2. double the stride to bracket a minimum with stp1 < stp3 < stp5
 *     and cst3 the smallest of the three costs;
 *  3. shrink the bracket by probing the two mid-points until the
 *     bracket is tight (within a factor of 1.01).
 * Progress marks ('-', '*', '.') and the result go to std::cout.
 *
 * @param lm    the model; its weights are modified during the search
 *              and left at @a w + stp3 * @a dir on success.
 * @param w     the starting weights (must equal lm.weight() on entry).
 * @param cst3  out: the cost at the returned step.
 * @param dir   the search direction.
 * @param step  the initial trial step length.
 * @return the chosen step length, or 0 (with weights restored) if
 *         @a dir is not a descending direction.
 */
template <class LM, class Dir, class Step, class Cost>
Step line_search (LM& lm, const Dir& w, Cost& cst3,
                  const Dir& dir, Step step) {
    using namespace op;
    assert(w == lm.weight());
    cst3 = lm.cost();
    Step stp3 = 0;
    Dir d = dir, wd = w;
    d *= step;
    lm.set_weight(wd += d);
    Cost cst5 = lm.cost();
    // phase 1: shrink until the step descends
    while (cst5 > cst3 && step > 2e-7) {
        std::cout << '-';
        step *= 0.5; d *= 0.5;
        wd = w;
        lm.set_weight(wd += d);
        cst5 = lm.cost();
    }
    if (cst5 > cst3) {
        std::cerr << "\tWarning: not a descending direction\n";
        lm.set_weight(w); // undo
        return 0;
    }
    // phase 2: expand until the cost rises again, bracketing a minimum
    // (stp1 is assigned inside the loop before it is ever read)
    Step stp1, stp5 = step;
    do {
        std::cout << '*';
        step += step;
        stp1 = stp3; stp3 = stp5; cst3 = cst5;
        stp5 += step;
        d = dir; d *= stp5;
        wd = w;
        lm.set_weight(wd += d);
        cst5 = lm.cost();
    } while (cst5 < cst3);
    // phase 3: refine [stp1, stp5] around stp3 via the two mid-points
    while (stp3 > stp1*1.01 || stp5 > stp3*1.01) {
        std::cout << '.';
        Step stp2 = (stp1 + stp3) / 2;
        Step stp4 = (stp3 + stp5) / 2;
        d = dir; d *= stp2;
        wd = w; lm.set_weight(wd += d);
        Cost cst2 = lm.cost();
        d = dir; d *= stp4;
        wd = w; lm.set_weight(wd += d);
        Cost cst4 = lm.cost();
        if (cst4 < cst2 && cst4 < cst3) {
            stp1 = stp3; stp3 = stp4; cst3 = cst4;
        } else if (cst2 < cst3) {
            stp5 = stp3; stp3 = stp2; cst3 = cst2;
        } else {
            stp1 = stp2; stp5 = stp4;
        }
    }
    std::cout << "\tcost = " << cst3 << ", step = " << stp3 << '\n';
    return stp3;
}

} // namespace lemga::details

/** @brief Steepest descent where each step length is chosen by
 *  details::line_search() instead of a fixed learning rate. */
template <class LM, class Dir, class Step, class Cost>
struct _line_search : public _gradient_descent<LM,Dir,Step> {
    using _gradient_descent<LM,Dir,Step>::plm;
    using _gradient_descent<LM,Dir,Step>::learning_rate;
    using _gradient_descent<LM,Dir,Step>::w;
    using _gradient_descent<LM,Dir,Step>::stp_cnt;

    _line_search (LM* lm, const Step& lr)
        : _gradient_descent<LM,Dir,Step>(lm,lr) {}

    void initialize () {
        _gradient_descent<LM,Dir,Step>::initialize();
        cost_w = plm->cost();
    }

    /// line_search() returns 0 when @a d does not descend,
    /// which signals the main loop to stop.
    std::pair<bool,Step> step_length (const Dir& d) {
        const Step stp =
            details::line_search(*plm, w, cost_w, d, learning_rate);
        return std::make_pair((stp>0), stp);
    }

    /// Reuse the cost computed inside the line search.
    bool satisfied () { return plm->stop_opt(stp_cnt, cost_w); }

protected:
    Cost cost_w;  ///< cost at the current weights, updated by line_search()
};

/** @brief Nonlinear conjugate gradient with line search.
 *
 * The direction update uses the Polak-Ribiere formula
 * @f$\beta = (\Vert g\Vert^2 - g\cdot g_{old}) / \Vert g_{old}\Vert^2@f$
 * clipped at 0 (i.e. restarting with steepest descent when
 * @f$\beta<0@f$). */
template <class LM, class Dir, class Step, class Cost>
struct _conjugate_gradient : public _line_search<LM,Dir,Step,Cost> {
    using _gradient_descent<LM,Dir,Step>::plm;
    using _gradient_descent<LM,Dir,Step>::w;
    using _gradient_descent<LM,Dir,Step>::stp_cnt;
    using _search<Dir,Step>::dir;

    _conjugate_gradient (LM* lm, const Step& lr)
        : _line_search<LM,Dir,Step,Cost>(lm,lr) {}

    const Dir& direction () {
        // get g
        const Dir g = plm->gradient();
        const Step g_norm = op::inner_product<Step>(g, g);
        using namespace op;
        // get d
        if (stp_cnt == 0)
            dir = -g;   // first step: plain steepest descent
        else {
            const Step g_dot_old = op::inner_product<Step>(g, g_old);
            assert(g_norm_old > 0);
            Step beta = (g_norm - g_dot_old) / g_norm_old;
            if (beta < 0) beta = 0;  // restart when beta would be negative
            dir *= beta;
            dir += -g;
        }
        g_old = g;
        g_norm_old = g_norm;
        return dir;
    }

private:
    Dir g_old;       ///< gradient from the previous iteration
    Step g_norm_old; ///< squared norm of @a g_old
};

} // namespace lemga

// Sentinel to detect a clash with some other project's `optimize.h'.
#ifdef __OPTIMIZE_H__
#warning "This header file may conflict with another `optimize.h' file."
#endif
#define __OPTIMIZE_H__

#endif
// (web-viewer keyboard-shortcut chrome removed — not part of the source file)