📄 loqo_solver.cpp
字号:
{ a = new LOQOfloat[m*local_n]; ierr = PLA_Matrix_create(MPIfloat, m, n, templ, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &a_global); CheckError(ierr);}void Solver_Parallel_LOQO::allocate_d(){ d = new LOQOfloat[m]; ierr = PLA_Mvector_create(MPIfloat, m, 1, templ, PLA_ALIGN_FIRST, &d_global); CheckError(ierr);}void Solver_Parallel_LOQO::allocate_work_space(){ ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &x); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &dist); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &g); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &t); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, m, 1, templ, PLA_ALIGN_FIRST, &yy); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &z); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &s); CheckError(ierr);}void Solver_Parallel_LOQO::setup_a(int *work_set){ for(int i=n_low_loc; i<n_up_loc; ++i) a[i-n_low_loc] = y[work_set[i]]; ierr = PLA_Obj_set_to_zero(a_global_view); CheckError(ierr); ierr = PLA_API_begin(); CheckError(ierr); ierr = PLA_Obj_API_open(a_global_view); CheckError(ierr); ierr = PLA_API_axpy_matrix_to_global(1, local_n, plus_one, (void *)a, 1, a_global_view, 0, n_low_loc); CheckError(ierr); ierr = PLA_Obj_API_close(a_global_view); CheckError(ierr); ierr = PLA_API_end(); CheckError(ierr);}void Solver_Parallel_LOQO::setup_d(int *not_work_set){ d[0] = 0; for(int i=0; i<lmn; ++i) { if(fabs(alpha[not_work_set[i]]) > TOL_ZERO) d[0] -= y[not_work_set[i]]*alpha[not_work_set[i]]; } ierr = PLA_Obj_set_to_zero(d_global); CheckError(ierr); ierr = PLA_API_begin(); CheckError(ierr); ierr = PLA_Obj_API_open(d_global); CheckError(ierr); if(rank == 0) ierr = PLA_API_axpy_vector_to_global(1, plus_one, (void *) d, 1, d_global, 0); CheckError(ierr); ierr = PLA_Obj_API_close(d_global); CheckError(ierr); ierr = PLA_API_end(); CheckError(ierr);}void Solver_Parallel_LOQO::setup_up(int *work_set){ for(int i=n_low_loc; i<n_up_loc; ++i) up[i-n_low_loc] = get_C(work_set[i]); ierr = PLA_Obj_set_to_zero(up_global_view); CheckError(ierr); ierr = PLA_API_begin(); CheckError(ierr); ierr = PLA_Obj_API_open(up_global_view); CheckError(ierr); ierr = PLA_API_axpy_vector_to_global(local_n, plus_one, (void *) up, 1, up_global_view, n_low_loc); CheckError(ierr); ierr = PLA_Obj_API_close(up_global_view); CheckError(ierr); ierr = PLA_API_end(); CheckError(ierr);}void Solver_Parallel_LOQO::setup_low(){ for(int i=n_low_loc; i<n_up_loc; ++i) low[i-n_low_loc] = 0; ierr = PLA_Obj_set_to_zero(low_global_view); CheckError(ierr); ierr = PLA_API_begin(); CheckError(ierr); ierr = PLA_Obj_API_open(low_global_view); CheckError(ierr); ierr = PLA_API_axpy_vector_to_global(local_n, plus_one, (void *) low, 1, low_global_view, n_low_loc); CheckError(ierr); ierr = PLA_Obj_API_close(low_global_view); CheckError(ierr); ierr = PLA_API_end(); CheckError(ierr);}void Solver_Parallel_LOQO::setup_problem(int *work_set, int *not_work_set){ // Note: We have to store Q_bb in column major format // since this is storage layout expected by PLAPACK-API. int start_i = 0; for(int i=0; i<n; ++i, ++start_i) { if(work_set[i] >= l_low_loc) break; } num_rows = 0; for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i) ++num_rows; for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i) //for(int i=n_low_loc; i<n_up_loc; ++i) { const Qfloat *Q_i = Q->get_Q_subset(work_set[i], work_set, n); //c[i-n_low_loc] = G[work_set[i]]; c[i-start_i] = G[work_set[i]]; for(int j=0; j<n; ++j) { //Q_bb[j*local_n+(i-n_low_loc)] = (LOQOfloat) Q_i[work_set[j]]; Q_bb[j*num_rows+(i-start_i)] = (LOQOfloat) Q_i[work_set[j]]; if(alpha[work_set[j]] > TOL_ZERO) // c[i-n_low_loc] -= Q_bb[j*local_n+(i-n_low_loc)]*alpha[work_set[j]]; c[i-start_i] -= Q_bb[j*num_rows+(i-start_i)]*alpha[work_set[j]]; } } ierr = MPI_Barrier(comm); CheckError(ierr); // Setup distributed objects. ierr = PLA_Obj_set_to_zero(Q_bb_global_view); CheckError(ierr); ierr = PLA_Obj_set_to_zero(c_global_view); CheckError(ierr); ierr = PLA_API_begin(); CheckError(ierr); ierr = PLA_Obj_API_open(Q_bb_global_view); CheckError(ierr); ierr = PLA_Obj_API_open(c_global_view); CheckError(ierr);// ierr = PLA_API_axpy_matrix_to_global(local_n, n, plus_one, (void *) Q_bb,// local_n, Q_bb_global_view, n_low_loc, // 0); CheckError(ierr); ierr = PLA_API_axpy_matrix_to_global(num_rows, n, plus_one, (void *) Q_bb, num_rows, Q_bb_global_view, start_i, 0); CheckError(ierr);// ierr = PLA_API_axpy_vector_to_global(local_n, plus_one, (void *) c, 1, // c_global_view, n_low_loc); // CheckError(ierr); ierr = PLA_API_axpy_vector_to_global(num_rows, plus_one, (void *) c, 1, c_global_view, start_i); CheckError(ierr); ierr = PLA_Obj_API_close(Q_bb_global_view); CheckError(ierr); ierr = PLA_Obj_API_close(c_global_view); CheckError(ierr); ierr = PLA_API_end(); CheckError(ierr); setup_a(work_set); setup_d(not_work_set);}int Solver_Parallel_LOQO::solve_inner(int *work_set){ int result = -1; double sigdig; double epsilon_loqo = 1e-10; double margin; int iteration; for(margin=init_margin, iteration=init_iter; margin <= 0.9999999 && result != OPTIMAL_SOLUTION;) { sigdig = -log10(opt_precision); // run pr_loqo result = parallel_loqo(c_global_view, Q_bb_global_view, a_global_view, d_global, low_global_view, up_global_view, &x_view, &g_view, &t_view, &yy, &z_view, &s_view, &dist_view, 3, sigdig, iteration, margin, up[0]/4, 0); // Make solution known to all processors, local access // occurs via alpha_new dist_ and dual ierr = PLA_Copy(x_view, x_loc_view); CheckError(ierr); ierr = PLA_Copy(dist_view, dist_loc_view); CheckError(ierr); ierr = PLA_Copy(yy, yy_loc); CheckError(ierr); // TODO: What happens if we have a choldc problem in the parallel // version? Should we implement Manteuffel shifting or find a // workaround here? if(result != OPTIMAL_SOLUTION) { // increase number of iterations iteration += 2000; init_iter += 10; // reduce precision opt_precision *= 10.0; info("NOTICE: Reducing precision of PR_LOQO!\n"); info_flush(); } } // Check precision of alphas for(int i=0; i<n; ++i) { // TODO: Should use up[i] and low[i], but they are not global if(alpha_new[i] < get_C(i)-epsilon_loqo && dist_[i] < -eps) epsilon_loqo = 2*(get_C(i)-alpha_new[i]); else if(alpha_new[i] > 0+epsilon_loqo && dist_[i] > eps) epsilon_loqo = 2*alpha_new[i]; } info("Using epsilon_loqo= %g\n", epsilon_loqo); info_flush(); // Clip alphas to bounds for(int i=0; i<n; ++i) { if(fabs(alpha_new[i]) < epsilon_loqo) { alpha_new[i] = 0; } if(fabs(alpha_new[i]-get_C(i)) < epsilon_loqo) { alpha_new[i] = get_C(i); } } // Compute obj after optimization double obj_after = 0.0; int start_i = 0; for(int i=0; i<n; ++i, ++start_i) { if(work_set[i] >= l_low_loc) break; } for(int i=start_i; work_set[i] < l_up_loc && i<n; ++i) // for(int i=n_low_loc; i<n_up_loc; ++i) { // obj_after += alpha_new[i]*c[i-n_low_loc]; obj_after += alpha_new[i]*c[i-start_i]; for(int j=0; j<n; ++j)// obj_after += 0.5*alpha_new[i]*Q_bb[j*local_n+(i-n_low_loc)]// *alpha_new[j]; obj_after += 0.5*alpha_new[i]*Q_bb[j*num_rows+(i-start_i)] *alpha_new[j]; } // Get local changes for obj_after from other processors double obj_after_loc=0.0; double obj_after_other=0.0; for(int i=0; i<size; ++i) { if(i == rank) ierr = MPI_Bcast(&obj_after, 1, MPI_DOUBLE, i, comm); else { ierr = MPI_Bcast(&obj_after_loc, 1, MPI_DOUBLE, i, comm); obj_after_other += obj_after_loc; } } obj_after += obj_after_other; printf("obj after/before = %g / %g\n", obj_after, obj_before); fflush(stdout); printf("delta a/b = %g\n", obj_after-obj_before); fflush(stdout); // Check for progress if(obj_after >= obj_before) { // Increase precision opt_precision /= 100.0; ++precision_violations; info("NOTICE: Increasing Precision of PR_LOQO.\n"); info_flush(); } if(precision_violations > 500) { // Relax stopping criterion eps *= 10.0; precision_violations = 0; info("WARNING: Relaxing epsilon on KKT-Conditions.\n"); info_flush(); } return result;}void Solver_Parallel_LOQO::Solve(int l, const QMatrix& Q, const double *b_, const schar *y_, double *alpha_, double Cp, double Cn, double eps, SolutionInfo* si, int shrinking){ double total_time = MPI_Wtime(); double problem_setup_time = 0; double inner_solver_time = 0; double gradient_updating_time = 0; double time; // Initialization this->l = l; this->Q = &Q; this->si = si; clone(b,b_,l); clone(y,y_,l); clone(alpha,alpha_,l); this->Cp = Cp; this->Cn = Cn; this->eps = eps; this->lmn = l - n; int *work_set = new int[n]; int *not_work_set = new int[l]; double *delta_alpha = new double[n]; this->G_n = new double[lmn]; this->p_cache_status = new char[size*l]; // Setup alpha status and work status { alpha_status = new char[l]; work_status = new char[l]; work_count = new int[l]; for(int i=0; i<l; ++i) { update_alpha_status(i); work_status[i] = WORK_N; work_count[i] = -1; for(int k=0; k<size; ++k) p_cache_status[k*size+i] = NOT_CACHED; } } // Setup local index ranges this->l_low = new int[size]; this->l_up = new int[size]; this->n_low = new int[size]; this->n_up = new int[size]; this->lmn_low = new int[size]; this->lmn_up = new int[size]; setup_range(l_low, l_up, l); setup_range(n_low, n_up, n); setup_range(lmn_low, lmn_up, lmn); this->l_low_loc = l_low[rank]; this->l_up_loc = l_up[rank]; this->n_low_loc = n_low[rank]; this->n_up_loc = n_up[rank]; this->lmn_up_loc = lmn_up[rank]; this->lmn_low_loc = lmn_low[rank]; this->local_l = l_up_loc - l_low_loc; this->local_n = n_up_loc - n_low_loc; this->local_lmn = lmn_up_loc - lmn_low_loc; // Setup gradient { info("Initializing gradient..."); info_flush(); G = new double[l]; double *G_send = new double[l]; double *G_recv = new double[l]; for(int i=0; i<l; ++i) { G[i] = b[i]; G_send[i] = 0; } // Compute local portion of gradient for(int i=l_low_loc; i<l_up_loc; ++i) { if(!is_lower_bound(i)) { const Qfloat *Q_i = Q.get_Q(i,l); double alpha_i = alpha[i]; for(int j=0; j<l; ++j) G_send[j] += alpha_i * Q_i[j]; } } // Get contributions from other processors for(int k=0; k<size; ++k) { if(rank == k) { ierr = MPI_Bcast(G_send, l, MPIfloat, k, comm); CheckError(ierr); for(int i=0; i<l; ++i) G[i] += G_send[i]; } else { ierr = MPI_Bcast(G_recv, l, MPIfloat, k, comm); CheckError(ierr); for(int i=0; i<l; ++i) G[i] += G_recv[i]; } } delete[] G_recv; delete[] G_send;// for(int i=0; i<l; ++i)// {// if(!is_lower_bound(i))// {// const Qfloat *Q_i = Q.get_Q(i,l); // double alpha_i = alpha[i];// for(int j=0; j<l; ++j)// {// G[j] += alpha_i * Q_i[j];// }// }// } info("done.\n"); info_flush(); } // Allocate local & global space for problem setup //Q_bb = new LOQOfloat[local_n * n]; Q_bb = new LOQOfloat[n*n]; // c = new LOQOfloat[local_n]; c = new LOQOfloat[n]; up = new LOQOfloat[local_n]; low = new LOQOfloat[local_n]; allocate_a(); allocate_d(); allocate_work_space(); ierr = PLA_Matrix_create(MPIfloat, n, n, templ, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &Q_bb_global); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &c_global); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &up_global); CheckError(ierr); ierr = PLA_Mvector_create(MPIfloat, n, 1, templ, PLA_ALIGN_FIRST, &low_global); CheckError(ierr); ierr = PLA_Mscalar_create(MPIfloat, PLA_ALL_ROWS, PLA_ALL_COLS, n, 1, templ, &x_loc); CheckError(ierr); ierr = PLA_Mscalar_create(MPIfloat, PLA_ALL_ROWS, PLA_ALL_COLS, n, 1, templ, &dist_loc); CheckError(ierr); ierr = PLA_Mscalar_create(MPIfloat, PLA_ALL_ROWS, PLA_ALL_COLS, m, 1, templ, &yy_loc); CheckError(ierr); ierr = PLA_Mscalar_create(MPIfloat, PLA_ALL_ROWS, PLA_ALL_COLS, 1, 1, templ, &plus_one); CheckError(ierr); ierr = PLA_Obj_set_to_one(plus_one); CheckError(ierr); iter=0; // Optimization loop while(1) { // Only one processor does the working set selection. int status = 0; if(rank == 0) { if(iter > 0) { status = select_working_set(work_set, not_work_set); } else { init_working_set(work_set, not_work_set); } } // Send status to other processors. ierr = MPI_Bcast(&status, 1, MPI_INT, 0, comm); // Now check for optimality if(status != 0) break; // Send new opt_precision, as select_working_set might have // changed it. ierr = MPI_Bcast(&opt_precision, 1, MPI_DOUBLE, 0, comm); // Send new working set size and working set to other processors. ierr = MPI_Bcast(&n, 1, MPI_INT, 0, comm); ierr = MPI_Bcast(work_set, n, MPI_INT, 0, comm); lmn = l - n; ierr = MPI_Bcast(not_work_set, lmn, MPI_INT, 0, comm);// info("working set proc[%d]=", rank);// for(int i=0; i<n; ++i)// info(" %d", work_set[i]);// info("\n"); info_flush();// info("not workset proc[%d]=", rank);// for(int i=0; i<lmn; ++i)// info(" %d", not_work_set[i]);// info("\n"); info_flush(); // Recompute ranges, as n and lmn might have changed setup_range(n_low, n_up, n); this->n_low_loc = n_low[rank]; this->n_up_loc = n_up[rank]; setup_range(lmn_low, lmn_up, lmn); this->lmn_low_loc = lmn_low[rank]; this->lmn_up_loc = lmn_up[rank]; this->local_n = n_up_loc - n_low_loc; this->local_lmn = lmn_up_loc - lmn_low_loc; // Create new views into existing objs, as n might // have changed ierr = PLA_Obj_view(Q_bb_global, n, n, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &Q_bb_global_view); CheckError(ierr); ierr = PLA_Obj_view(c_global, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &c_global_view); CheckError(ierr); ierr = PLA_Obj_view(a_global, m, n, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &a_global_view); CheckError(ierr); ierr = PLA_Obj_view(up_global, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &up_global_view); CheckError(ierr); ierr = PLA_Obj_view(low_global, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &low_global_view); CheckError(ierr); ierr = PLA_Obj_view(x, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &x_view); CheckError(ierr); ierr = PLA_Obj_view(dist, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &dist_view); CheckError(ierr); ierr = PLA_Obj_view(g, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &g_view); CheckError(ierr); ierr = PLA_Obj_view(t, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &t_view); CheckError(ierr); ierr = PLA_Obj_view(z, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &z_view); CheckError(ierr); ierr = PLA_Obj_view(s, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &s_view); CheckError(ierr); ierr = PLA_Obj_view(x_loc, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &x_loc_view); CheckError(ierr); ierr = PLA_Obj_view(dist_loc, n, 1, PLA_ALIGN_FIRST, PLA_ALIGN_FIRST, &dist_loc_view); CheckError(ierr); void *addr_buf = NULL;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -