trainer.cpp
/*
 * Copyright (C) 2004 - 2005 by
 * Hieu Xuan Phan & Minh Le Nguyen {hieuxuan, nguyenml}@jaist.ac.jp
 * Graduate School of Information Science,
 * Japan Advanced Institute of Science and Technology (JAIST)
 *
 * trainer.cpp - this file is part of FlexCRFs.
 *
 * Begin:       Dec. 15, 2004
 * Last change: Nov. 01, 2005
 *
 * FlexCRFs is a free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published
 * by the Free Software Foundation; either version 2 of the License,
 * or (at your option) any later version.
 *
 * FlexCRFs is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with FlexCRFs; if not, write to the Free Software Foundation,
 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
 */

#include <stdio.h>
#include <math.h>
#include <time.h>

#include "../../../include/trainer.h"
#include "../../../include/model.h"
#include "../../../include/evaluation.h"
#include "../../../include/mathlib.h"

using namespace std;

/*--------------------------------------------------------------------------------*/

extern "C" {
    // interface to the LBFGS optimization routine written in FORTRAN
    extern void lbfgs(int * n, int * m, double * x, double * f, double * g,
                      int * diagco, double * diag, int * iprint, double * eps,
                      double * xtol, double * w, int * iflag);
}

/*--------------------------------------------------------------------------------*/

// constructor
trainer::trainer() {
}

// destructor
trainer::~trainer() {
    if (gradlogli) { delete [] gradlogli; }
    if (diag) { delete [] diag; }
    if (Mi) { delete Mi; }
    if (Vi) { delete Vi; }
    if (alpha) { delete alpha; }
    if (next_alpha) { delete next_alpha; }
    if (temp) { delete temp; }
    for (int i = 0; i < betas.size(); i++) {
        delete betas[i];
    }
    if (ExpF) { delete [] ExpF; }
    if (ws) { delete [] ws; }
    if (iprint) { delete [] iprint; }
    if (temp_lambda) { delete [] temp_lambda; }
}

// initialize
void trainer::init() {
    popt = pmodel->popt;
    pdata = pmodel->pdata;
    pdict = pmodel->pdict;
    pfgen = pmodel->pfgen;

    if (popt->order == FIRST_ORDER) {
        num_labels = popt->num_labels;
    } else if (popt->order == SECOND_ORDER) {
        num_labels = popt->num_2orderlabels;
    }

    num_features = popt->num_features;

    lambda = pmodel->lambda;
    temp_lambda = new double[num_features];

    is_logging = popt->is_logging;

    // allocate memory for the gradient vector of the log-likelihood function
    gradlogli = new double[num_features];
    // diag is only for LBFGS optimization
    diag = new double[num_features];

    Mi = new doublematrix(num_labels, num_labels);
    Vi = new doublevector(num_labels);
    alpha = new doublevector(num_labels);
    next_alpha = new doublevector(num_labels);
    temp = new doublevector(num_labels);

    // allocate memory for the vector of feature expectations
    ExpF = new double[num_features];

    // allocate memory for ws (workspace)
    // this memory is only for LBFGS optimization
    int ws_size = num_features * (2 * popt->m_for_hessian + 1) + 2 * popt->m_for_hessian;
    ws = new double[ws_size];

    iprint = new int[2];
}

// compute the Euclidean norm of a vector
double trainer::norm(int len, double * vect) {
    double res = 0.0;
    for (int i = 0; i < len; i++) {
        res += vect[i] * vect[i];
    }
    return sqrt(res);
}
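/*
 * Note on the LBFGS interface used by train() below: the Fortran routine
 * declared at the top of this file (Nocedal's lbfgs.f) works in
 * reverse-communication mode. The caller evaluates the objective f and its
 * gradient g at the current point x, calls lbfgs(), and inspects iflag on
 * return: iflag == 1 means "evaluate f and g again and call back",
 * iflag == 0 means convergence, and iflag < 0 signals an error. The workspace
 * ws must hold n*(2m+1) + 2m doubles, which is what init() allocates above.
 *
 * A minimal sketch of such a driver loop (evaluate() and the variable names
 * here are illustrative only, not part of FlexCRFs):
 *
 *     int iflag = 0;
 *     do {
 *         double f = evaluate(x, g);   // fill f and g at the current x
 *         lbfgs(&n, &m, x, &f, g, &diagco, diag, iprint, &eps, &xtol, w, &iflag);
 *     } while (iflag == 1);            // 0: converged, < 0: error
 */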
// the training method
void trainer::train(FILE * fout) {
    // for timing
    time_t s_train_time, e_train_time;
    time_t s_iter_time, e_iter_time;
    int hours, minutes, seconds;

    // initialization
    init();

    double f = 0.0;             // log-likelihood function value
    double old_f;               // only for tracking the termination condition
    double xtol = 1.0e-16;      // machine precision
    int num_iters = 0;          // the iteration counter

    // for lbfgs optimization
    iprint[0] = popt->debug_level - 2;
    iprint[1] = popt->debug_level - 1;

    int iflag = 0;

    // indicates whether or not the user provides the diagonal matrix Hk0
    // at each iteration (here we chose "not provide", i.e., diagco = false)
    int diagco = 0;

    // counter variable
    int i;

    // get initial values for lambda components
    for (i = 0; i < num_features; i++) {
        lambda[i] = popt->init_lambda_val;
        temp_lambda[i] = popt->init_lambda_val;
    }

    // get the start time
    s_train_time = time(NULL);

    // logging
    if (is_logging) {
        popt->write_options(fout);
        fprintf(fout, "Start to train ...\n");
    }

    double max_f1 = 0.0;
    int max_iter = -1;

    // the main loop for training the CRF
    do {
        // get the start time of the current iteration
        s_iter_time = time(NULL);

        // call this function to compute two things:
        // 1. the value of the log-likelihood at the current lambda
        // 2. the gradient vector of the log-likelihood function
        if (popt->order == FIRST_ORDER) {
            f = compute_logli_gradient_1order(lambda, gradlogli, num_iters + 1, fout);
        } else if (popt->order == SECOND_ORDER) {
            f = compute_logli_gradient_2order(lambda, gradlogli, num_iters + 1, fout);
        }

        // negate f and the gradient vector because the LBFGS routine below minimizes
        // the objective function while we would like to maximize it
        f *= -1;
        for (i = 0; i < num_features; i++) {
            gradlogli[i] *= -1;
        }

        // calling the LBFGS optimization routine
        lbfgs(&num_features, &(popt->m_for_hessian), lambda, &f, gradlogli, &diagco,
              diag, iprint, &(popt->eps_for_convergence), &xtol, ws, &iflag);

        // checking after calling LBFGS
        if (iflag < 0) {
            // LBFGS error
            printf("LBFGS routine encounters an error\n");
            if (is_logging) {
                fprintf(fout, "LBFGS routine encounters an error\n");
            }
            break;
        }

        // increase the iteration counter
        num_iters++;

        // get the end time of the current iteration
        e_iter_time = time(NULL);
        e_iter_time -= s_iter_time;

        // display the elapsed time
        printf("\tIteration elapsed: %d seconds\n", (int)e_iter_time);
        if (is_logging) {
            fprintf(fout, "\tIteration elapsed: %d seconds\n", (int)e_iter_time);
        }

        // evaluate during training
        if (popt->evaluate_during_training) {
            pmodel->apply_tstdata();
            printf("\n");

            double total_f1 = pmodel->peval->evaluate(fout);
            if (total_f1 > max_f1) {
                max_f1 = total_f1;
                max_iter = num_iters;
                for (i = 0; i < num_features; i++) {
                    temp_lambda[i] = lambda[i];
                }
            }

            if (popt->chunk_evaluate_during_training) {
                printf("\tCurrent max chunk-based F1: %6.2f (iteration %d)\n", max_f1, max_iter);
            } else {
                printf("\tCurrent max tag-based F1: %6.2f (iteration %d)\n", max_f1, max_iter);
            }

            if (is_logging) {
                fprintf(fout, "\n");
                if (popt->chunk_evaluate_during_training) {
                    fprintf(fout, "\tCurrent max chunk-based F1: %6.2f (iteration %d)\n", max_f1, max_iter);
                } else {
                    fprintf(fout, "\tCurrent max tag-based F1: %6.2f (iteration %d)\n", max_f1, max_iter);
                }
            }

            // get the end time of the current iteration (including testing & evaluation)
            e_iter_time = time(NULL);
            e_iter_time -= s_iter_time;

            // display the elapsed time
            printf("\tTraining iteration elapsed (including testing & evaluation time): %d seconds\n", (int)e_iter_time);
            if (is_logging) {
                fprintf(fout, "\tTraining iteration elapsed (including testing & evaluation time): %d seconds\n", (int)e_iter_time);
            }
        }

    } while (iflag != 0 && num_iters < popt->num_iterations);

    // get the end time
    e_train_time = time(NULL);
    e_train_time -= s_train_time;

    // display the training time
    printf("\nThe training process elapsed: %d seconds\n\n", (int)e_train_time);
    if (is_logging) {
        fprintf(fout, "\nThe training process elapsed: %d seconds\n\n", (int)e_train_time);
    }

    // keep the lambda values of the best evaluated iteration
    if (popt->evaluate_during_training) {
        for (i = 0; i < num_features; i++) {
            lambda[i] = temp_lambda[i];
        }
    }
}
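/*
 * What compute_logli_gradient_1order() below computes (a sketch of the
 * standard penalized CRF objective, written here for reference;
 * sigma_square is the Gaussian prior variance from the options):
 *
 *   L(lambda) = sum_k [ sum_i lambda_i * F_i(y_k, x_k) - log Z(x_k) ]
 *               - sum_i lambda_i^2 / (2 * sigma^2)
 *
 *   dL/dlambda_i = (observed count of feature i)
 *                  - (expected count of feature i under the model, ExpF[i])
 *                  - lambda_i / sigma^2
 *
 * The prior terms are the first thing added to logli and gradlogli in the
 * function body; the expectations ExpF[i] are accumulated per sequence using
 * the scaled forward-backward (alpha/beta) recursions.
 */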
// compute the log-likelihood and its gradient vector (first-order Markov)
double trainer::compute_logli_gradient_1order(double * lambda, double * gradlogli,
                                              int num_iters, FILE * fout) {
    double logli = 0.0;

    // counter variables
    int i, j, k;

    // start from the Gaussian prior terms
    for (i = 0; i < num_features; i++) {
        gradlogli[i] = -1 * lambda[i] / popt->sigma_square;
        logli -= (lambda[i] * lambda[i]) / (2 * popt->sigma_square);
    }

    dataset::iterator datait;
    sequence::iterator seqit;

    int seq_count = 0;

    // go through all training data sequences
    for (datait = pdata->ptrndata->begin(); datait != pdata->ptrndata->end(); datait++) {
        seq_count++;

        int seq_len = datait->size();

        *alpha = 1;

        for (i = 0; i < num_features; i++) {
            ExpF[i] = 0;
        }

        int betassize = betas.size();
        if (betassize < seq_len) {
            // allocate more beta vectors
            for (i = 0; i < seq_len - betassize; i++) {
                betas.push_back(new doublevector(num_labels));
            }
        }

        int scalesize = scale.size();
        if (scalesize < seq_len) {
            // allocate more scale elements
            for (i = 0; i < seq_len - scalesize; i++) {
                scale.push_back(1.0);
            }
        }

        // compute beta values in a backward fashion;
        // also scale the beta values to avoid numerical problems
        scale[seq_len - 1] = (popt->is_scaling) ? num_labels : 1;
        betas[seq_len - 1]->assign(1.0 / scale[seq_len - 1]);

        // start to compute beta values in backward fashion
        for (int i = seq_len - 1; i > 0; i--) {
            // compute the Mi matrix and Vi vector
            compute_log_Mi_1order(*datait, i, Mi, Vi, 1);
            *temp = *(betas[i]);
            temp->comp_mult(Vi);
            mathlib::mult(num_labels, betas[i - 1], Mi, temp, 0);

            // scale for the next (backward) beta values
            scale[i - 1] = (popt->is_scaling) ? betas[i - 1]->sum() : 1;
            betas[i - 1]->comp_mult(1.0 / scale[i - 1]);
        }
        // end of beta values computation

        // start to compute the log-likelihood of the current sequence
        double seq_logli = 0;
        for (j = 0; j < seq_len; j++) {
            compute_log_Mi_1order(*datait, j, Mi, Vi, 1);
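/*
 * Note on the scaling above: when popt->is_scaling is set, each beta vector
 * is divided by the sum of its components (stored in scale[i]) right after it
 * is computed, so the backward recursion stays within floating-point range on
 * long sequences. The scale factors are not lost: in a scaled forward-backward
 * implementation the normalizer Z(x) is typically recovered from the product
 * (equivalently, the sum of logs) of the per-position scale values, so the
 * log-likelihood and the feature expectations are unchanged by the rescaling.
 */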