📄 lmbc_core.c
字号:
*jacTe, /* J^T e_i mx1 */ *jac, /* nxm */ *jacTjac, /* mxm */ *Dp, /* mx1 */ *diag_jacTjac, /* diagonal of J^T J, mx1 */ *pDp; /* p + Dp, mx1 */register LM_REAL mu, /* damping constant */ tmp; /* mainly used in matrix & vector multiplications */LM_REAL p_eL2, jacTe_inf, pDp_eL2; /* ||e(p)||_2, ||J^T e||_inf, ||e(p+Dp)||_2 */LM_REAL p_L2, Dp_L2=LM_REAL_MAX, dF, dL;LM_REAL tau, eps1, eps2, eps2_sq, eps3;LM_REAL init_p_eL2;int nu=2, nu2, stop=0, nfev, njev=0, nlss=0;const int nm=n*m;/* variables for constrained LM */struct FUNC_STATE fstate;LM_REAL alpha=LM_CNST(1e-4), beta=LM_CNST(0.9), gamma=LM_CNST(0.99995), gamma_sq=gamma*gamma, rho=LM_CNST(1e-8);LM_REAL t, t0;LM_REAL steptl=LM_CNST(1e3)*(LM_REAL)sqrt(LM_REAL_EPSILON), jacTeDp;LM_REAL tmin=LM_CNST(1e-12), tming=LM_CNST(1e-18); /* minimum step length for LS and PG steps */const LM_REAL tini=LM_CNST(1.0); /* initial step length for LS and PG steps */int nLMsteps=0, nLSsteps=0, nPGsteps=0, gprevtaken=0;int numactive;int (*linsolver)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m)=NULL; mu=jacTe_inf=t=0.0; tmin=tmin; /* -Wall */ if(n<m){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): cannot solve a problem with fewer measurements [%d] than unknowns [%d]\n"), n, m); return LM_ERROR; } if(!jacf){ fprintf(stderr, RCAT("No function specified for computing the Jacobian in ", LEVMAR_BC_DER) RCAT("().\nIf no such function is available, use ", LEVMAR_BC_DIF) RCAT("() rather than ", LEVMAR_BC_DER) "()\n"); return LM_ERROR; } if(!LEVMAR_BOX_CHECK(lb, ub, m)){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): at least one lower bound exceeds the upper one\n")); return LM_ERROR; } if(opts){ tau=opts[0]; eps1=opts[1]; eps2=opts[2]; eps2_sq=opts[2]*opts[2]; eps3=opts[3]; } else{ // use default values tau=LM_CNST(LM_INIT_MU); eps1=LM_CNST(LM_STOP_THRESH); eps2=LM_CNST(LM_STOP_THRESH); eps2_sq=LM_CNST(LM_STOP_THRESH)*LM_CNST(LM_STOP_THRESH); eps3=LM_CNST(LM_STOP_THRESH); } if(!work){ worksz=LM_BC_DER_WORKSZ(m, n); //2*n+4*m + n*m + m*m; work=(LM_REAL *)malloc(worksz*sizeof(LM_REAL)); /* allocate a big chunk in one step */ if(!work){ fprintf(stderr, LCAT(LEVMAR_BC_DER, "(): memory allocation request failed\n")); return LM_ERROR; } freework=1; } /* set up work arrays */ e=work; hx=e + n; jacTe=hx + n; jac=jacTe + m; jacTjac=jac + nm; Dp=jacTjac + m*m; diag_jacTjac=Dp + m; pDp=diag_jacTjac + m; fstate.n=n; fstate.hx=hx; fstate.x=x; fstate.adata=adata; fstate.nfev=&nfev; /* see if starting point is within the feasile set */ for(i=0; i<m; ++i) pDp[i]=p[i]; BOXPROJECT(p, lb, ub, m); /* project to feasible set */ for(i=0; i<m; ++i) if(pDp[i]!=p[i]) fprintf(stderr, RCAT("Warning: component %d of starting point not feasible in ", LEVMAR_BC_DER) "()! [%g projected to %g]\n", i, pDp[i], p[i]); /* compute e=x - f(p) and its L2 norm */ (*func)(p, hx, m, n, adata); nfev=1; /* ### e=x-hx, p_eL2=||e|| */#if 1 p_eL2=LEVMAR_L2NRMXMY(e, x, hx, n);#else for(i=0, p_eL2=0.0; i<n; ++i){ e[i]=tmp=x[i]-hx[i]; p_eL2+=tmp*tmp; }#endif init_p_eL2=p_eL2; if(!LM_FINITE(p_eL2)) stop=7; for(k=0; k<itmax && !stop; ++k){ /* Note that p and e have been updated at a previous iteration */ if(p_eL2<=eps3){ /* error is small */ stop=6; break; } /* Compute the Jacobian J at p, J^T J, J^T e, ||J^T e||_inf and ||p||^2. * Since J^T J is symmetric, its computation can be sped up by computing * only its upper triangular part and copying it to the lower part */ (*jacf)(p, jac, m, n, adata); ++njev; /* J^T J, J^T e */ if(nm<__BLOCKSZ__SQ){ // this is a small problem /* J^T*J_ij = \sum_l J^T_il * J_lj = \sum_l J_li * J_lj. * Thus, the product J^T J can be computed using an outer loop for * l that adds J_li*J_lj to each element ij of the result. Note that * with this scheme, the accesses to J and JtJ are always along rows, * therefore induces less cache misses compared to the straightforward * algorithm for computing the product (i.e., l loop is innermost one). * A similar scheme applies to the computation of J^T e. * However, for large minimization problems (i.e., involving a large number * of unknowns and measurements) for which J/J^T J rows are too large to * fit in the L1 cache, even this scheme incures many cache misses. In * such cases, a cache-efficient blocking scheme is preferable. * * Thanks to John Nitao of Lawrence Livermore Lab for pointing out this * performance problem. * * Note that the non-blocking algorithm is faster on small * problems since in this case it avoids the overheads of blocking. */ register int l, im; register LM_REAL alpha, *jaclm; /* looping downwards saves a few computations */ for(i=m*m; i-->0; ) jacTjac[i]=0.0; for(i=m; i-->0; ) jacTe[i]=0.0; for(l=n; l-->0; ){ jaclm=jac+l*m; for(i=m; i-->0; ){ im=i*m; alpha=jaclm[i]; //jac[l*m+i]; for(j=i+1; j-->0; ) /* j<=i computes lower triangular part only */ jacTjac[im+j]+=jaclm[j]*alpha; //jac[l*m+j] /* J^T e */ jacTe[i]+=alpha*e[l]; } } for(i=m; i-->0; ) /* copy to upper part */ for(j=i+1; j<m; ++j) jacTjac[i*m+j]=jacTjac[j*m+i]; } else{ // this is a large problem /* Cache efficient computation of J^T J based on blocking */ LEVMAR_TRANS_MAT_MAT_MULT(jac, jacTjac, n, m); /* cache efficient computation of J^T e */ for(i=0; i<m; ++i) jacTe[i]=0.0; for(i=0; i<n; ++i){ register LM_REAL *jacrow; for(l=0, jacrow=jac+i*m, tmp=e[i]; l<m; ++l) jacTe[l]+=jacrow[l]*tmp; } } /* Compute ||J^T e||_inf and ||p||^2. Note that ||J^T e||_inf * is computed for free (i.e. inactive) variables only. * At a local minimum, if p[i]==ub[i] then g[i]>0; * if p[i]==lb[i] g[i]<0; otherwise g[i]=0 */ for(i=j=numactive=0, p_L2=jacTe_inf=0.0; i<m; ++i){ if(ub && p[i]==ub[i]){ ++numactive; if(jacTe[i]>0.0) ++j; } else if(lb && p[i]==lb[i]){ ++numactive; if(jacTe[i]<0.0) ++j; } else if(jacTe_inf < (tmp=FABS(jacTe[i]))) jacTe_inf=tmp; diag_jacTjac[i]=jacTjac[i*m+i]; /* save diagonal entries so that augmentation can be later canceled */ p_L2+=p[i]*p[i]; } //p_L2=sqrt(p_L2);#if 0if(!(k%100)){ printf("Current estimate: "); for(i=0; i<m; ++i) printf("%.9g ", p[i]); printf("-- errors %.9g %0.9g, #active %d [%d]\n", jacTe_inf, p_eL2, numactive, j);}#endif /* check for convergence */ if(j==numactive && (jacTe_inf <= eps1)){ Dp_L2=0.0; /* no increment for p in this case */ stop=1; break; } /* compute initial damping factor */ if(k==0){ if(!lb && !ub){ /* no bounds */ for(i=0, tmp=LM_REAL_MIN; i<m; ++i) if(diag_jacTjac[i]>tmp) tmp=diag_jacTjac[i]; /* find max diagonal element */ mu=tau*tmp; } else mu=LM_CNST(0.5)*tau*p_eL2; /* use Kanzow's starting mu */ } /* determine increment using a combination of adaptive damping, line search and projected gradient search */ while(1){ /* augment normal equations */ for(i=0; i<m; ++i) jacTjac[i*m+i]+=mu; /* solve augmented equations */#ifdef HAVE_LAPACK /* 5 alternatives are available: LU, Cholesky, 2 variants of QR decomposition and SVD. * Cholesky is the fastest but might be inaccurate; QR is slower but more accurate; * SVD is the slowest but most accurate; LU offers a tradeoff between accuracy and speed */ issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU; //issolved=AX_EQ_B_CHOL(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_CHOL; //issolved=AX_EQ_B_QR(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_QR; //issolved=AX_EQ_B_QRLS(jacTjac, jacTe, Dp, m, m); ++nlss; linsolver=(int (*)(LM_REAL *A, LM_REAL *B, LM_REAL *x, int m))AX_EQ_B_QRLS; //issolved=AX_EQ_B_SVD(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_SVD;#else /* use the LU included with levmar */ issolved=AX_EQ_B_LU(jacTjac, jacTe, Dp, m); ++nlss; linsolver=AX_EQ_B_LU;#endif /* HAVE_LAPACK */ if(issolved){ for(i=0; i<m; ++i) pDp[i]=p[i] + Dp[i]; /* compute p's new estimate and ||Dp||^2 */ BOXPROJECT(pDp, lb, ub, m); /* project to feasible set */ for(i=0, Dp_L2=0.0; i<m; ++i){ Dp[i]=tmp=pDp[i]-p[i]; Dp_L2+=tmp*tmp; } //Dp_L2=sqrt(Dp_L2); if(Dp_L2<=eps2_sq*p_L2){ /* relative change in p is small, stop */ stop=2; break; } if(Dp_L2>=(p_L2+eps2)/(LM_CNST(EPSILON)*LM_CNST(EPSILON))){ /* almost singular */ stop=4; break; } (*func)(pDp, hx, m, n, adata); ++nfev; /* evaluate function at p + Dp */ /* ### hx=x-hx, pDp_eL2=||hx|| */#if 1 pDp_eL2=LEVMAR_L2NRMXMY(hx, x, hx, n);#else for(i=0, pDp_eL2=0.0; i<n; ++i){ /* compute ||e(pDp)||_2 */ hx[i]=tmp=x[i]-hx[i]; pDp_eL2+=tmp*tmp; }#endif if(!LM_FINITE(pDp_eL2)){ stop=7; break; } if(pDp_eL2<=gamma_sq*p_eL2){ for(i=0, dL=0.0; i<m; ++i) dL+=Dp[i]*(mu*Dp[i]+jacTe[i]);#if 1 if(dL>0.0){ dF=p_eL2-pDp_eL2; tmp=(LM_CNST(2.0)*dF/dL-LM_CNST(1.0)); tmp=LM_CNST(1.0)-tmp*tmp*tmp; mu=mu*( (tmp>=LM_CNST(ONE_THIRD))? tmp : LM_CNST(ONE_THIRD) ); } else mu=(mu>=pDp_eL2)? pDp_eL2 : mu; /* pDp_eL2 is the new pDp_eL2 */#else mu=(mu>=pDp_eL2)? pDp_eL2 : mu; /* pDp_eL2 is the new pDp_eL2 */#endif nu=2; for(i=0 ; i<m; ++i) /* update p's estimate */ p[i]=pDp[i]; for(i=0; i<n; ++i) /* update e and ||e||_2 */ e[i]=hx[i]; p_eL2=pDp_eL2;
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -