MultiLaplacianInferenceMethod.cpp
/*
 * Copyright (c) The Shogun Machine Learning Toolbox
 * Written (w) 2014 Wu Lin
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 * list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * The views and conclusions contained in the software and documentation are those
 * of the authors and should not be interpreted as representing official policies,
 * either expressed or implied, of the Shogun Development Team.
 *
 *
 * Code adapted from
 * https://gist.github.com/yorkerlin/14ace49f2278f3859614
 * Gaussian Process Machine Learning Toolbox
 * http://www.gaussianprocess.org/gpml/code/matlab/doc/
 * and
 * GPstuff - Gaussian process models for Bayesian analysis
 * http://becs.aalto.fi/en/research/bayes/gpstuff/
 *
 * The reference pseudo code is Algorithm 3.3 of the GPML textbook.
 */

#include <shogun/machine/gp/MultiLaplacianInferenceMethod.h>

#ifdef HAVE_EIGEN3

#include <shogun/labels/MulticlassLabels.h>
#include <shogun/mathematics/Math.h>
#include <shogun/mathematics/eigen3.h>
#include <shogun/lib/external/brent.h>

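/* A hypothetical usage sketch (added commentary, not part of the original
 * file): how this inference method is typically wired together for
 * multiclass GP classification. Class names are assumed from the Shogun 4.0
 * API; the feature matrix `data` and label vector `label_vec` are placeholders:
 *
 *   CDenseFeatures<float64_t>* feats=new CDenseFeatures<float64_t>(data);
 *   CMulticlassLabels* labs=new CMulticlassLabels(label_vec);
 *   CGaussianKernel* kernel=new CGaussianKernel(10, 2.0);
 *   CZeroMean* mean=new CZeroMean();
 *   CSoftMaxLikelihood* lik=new CSoftMaxLikelihood();
 *   CMultiLaplacianInferenceMethod* inf=new CMultiLaplacianInferenceMethod(
 *       kernel, feats, mean, labs, lik);
 *   float64_t nlZ=inf->get_negative_log_marginal_likelihood();
 *   SG_UNREF(inf);
 */
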
using namespace shogun;
using namespace Eigen;

namespace shogun
{

#ifndef DOXYGEN_SHOULD_SKIP_THIS

/** Wrapper class used for Brent's minimizer */
class CMultiPsiLine : public func_base
{
public:
    float64_t scale;
    MatrixXd K;
    VectorXd dalpha;
    VectorXd start_alpha;
    Map<VectorXd>* alpha;
    SGVector<float64_t>* dlp;
    SGVector<float64_t>* f;
    SGVector<float64_t>* m;
    CLikelihoodModel* lik;
    CLabels* lab;

    virtual double operator() (double x)
    {
        const index_t C=((CMulticlassLabels*)lab)->get_num_classes();
        const index_t n=((CMulticlassLabels*)lab)->get_num_labels();
        Map<VectorXd> eigen_f(f->vector, f->vlen);
        Map<VectorXd> eigen_m(m->vector, m->vlen);

        // compute alpha=alpha+x*dalpha and f=K*alpha+m
        (*alpha)=start_alpha+x*dalpha;

        float64_t result=0;
        for(index_t bl=0; bl<C; bl++)
        {
            eigen_f.block(bl*n,0,n,1)=K*alpha->block(bl*n,0,n,1)*CMath::sq(scale);
            result+=alpha->block(bl*n,0,n,1).dot(eigen_f.block(bl*n,0,n,1))/2.0;
            eigen_f.block(bl*n,0,n,1)+=eigen_m;
        }

        // get first derivative of the log likelihood wrt f
        (*dlp)=lik->get_log_probability_derivative_f(lab, (*f), 1);

        result-=SGVector<float64_t>::sum(lik->get_log_probability_f(lab, *f));

        return result;
    }
};
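
/* Added commentary: along the fixed direction dalpha, the functor above
 * evaluates the line-search objective
 *
 *   Psi(x) = alpha(x)'*(f(x)-m)/2 - log p(y|f(x)),
 *
 * with alpha(x)=start_alpha+x*dalpha and f(x)=K*alpha(x)*scale^2+m, computed
 * block-wise over the C classes, so that Brent's method (local_min below) can
 * pick the step size x that minimizes Psi along the Newton direction.
 */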

#endif /* DOXYGEN_SHOULD_SKIP_THIS */

CMultiLaplacianInferenceMethod::CMultiLaplacianInferenceMethod() : CLaplacianInferenceBase()
{
    init();
}

CMultiLaplacianInferenceMethod::CMultiLaplacianInferenceMethod(CKernel* kern,
        CFeatures* feat, CMeanFunction* m, CLabels* lab, CLikelihoodModel* mod)
        : CLaplacianInferenceBase(kern, feat, m, lab, mod)
{
    init();
}

void CMultiLaplacianInferenceMethod::init()
{
    m_nlz=0;
    SG_ADD(&m_nlz, "nlz", "negative log marginal likelihood ", MS_NOT_AVAILABLE);
    SG_ADD(&m_U, "U", "the matrix used to compute gradient wrt hyperparameters", MS_NOT_AVAILABLE);
}

CMultiLaplacianInferenceMethod::~CMultiLaplacianInferenceMethod()
{
}

void CMultiLaplacianInferenceMethod::check_members() const
{
    CInferenceMethod::check_members();

    REQUIRE(m_labels->get_label_type()==LT_MULTICLASS,
        "Labels must be type of CMulticlassLabels\n");
    REQUIRE(m_model->supports_multiclass(),
        "likelihood model should support multi-classification\n");
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_diagonal_vector()
{
    if (parameter_hash_changed())
        update();

    get_dpi_helper();

    return SGVector<float64_t>(m_W);
}
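
/* Added commentary: for the softmax likelihood the negative Hessian of
 * log p(y|f) is W = diag(pi) - Pi*Pi', with pi the stacked vector of
 * predicted class probabilities (see GPML, section 3.5). m_W stores pi
 * itself, as filled in by get_dpi_helper(), so the vector returned here is
 * the diagonal part of W.
 */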

float64_t CMultiLaplacianInferenceMethod::get_negative_log_marginal_likelihood()
{
    if (parameter_hash_changed())
        update();

    return m_nlz;
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_likelihood_model(
        const TParameter* param)
{
    //the SoftMax likelihood does not have this kind of derivative
    SG_ERROR("Not Implemented!\n");
    return SGVector<float64_t>();
}

void CMultiLaplacianInferenceMethod::update_approx_cov()
{
    //Sigma=K-K*(E-E*R(M*M')^{-1}*R'*E)*K
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);

    m_Sigma=SGMatrix<float64_t>(C*n, C*n);
    Map<MatrixXd> eigen_Sigma(m_Sigma.matrix, m_Sigma.num_rows, m_Sigma.num_cols);
    eigen_Sigma.fill(0);

    MatrixXd eigen_U(C*n,n);
    for(index_t bl=0; bl<C; bl++)
    {
        eigen_U.block(bl*n,0,n,n)=eigen_K*CMath::sq(m_scale)*eigen_E.block(0,bl*n,n,n);
        eigen_Sigma.block(bl*n,bl*n,n,n)=(MatrixXd::Identity(n,n)-eigen_U.block(bl*n,0,n,n))*(eigen_K*CMath::sq(m_scale));
    }
    MatrixXd eigen_V=eigen_M.triangularView<Upper>().adjoint().solve(eigen_U.transpose());
    eigen_Sigma+=eigen_V.transpose()*eigen_V;
}
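
/* Added commentary: the code above implements the covariance identity noted
 * in the comment at the top of the function,
 *
 *   Sigma = K - K*(E - E*R*(M*M')^{-1}*R'*E)*K,
 *
 * which is the multiclass Laplace form of Sigma=(K^{-1}+W)^{-1}: E holds the
 * per-class blocks E_c, M is the upper Cholesky factor of sum_c E_c (stored
 * in m_L), and R stacks C copies of the n x n identity. The triangular solve
 * into eigen_V realizes the (M*M')^{-1} term without forming an inverse.
 */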

void CMultiLaplacianInferenceMethod::update_chol()
{
}

void CMultiLaplacianInferenceMethod::get_dpi_helper()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_dpi_matrix(eigen_dpi.data(),n,C);

    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);
    Map<MatrixXd> eigen_mu_matrix(eigen_mu.data(),n,C);
    // softmax probabilities pi, computed with the log_sum_exp trick:
    // pi_ic = exp(f_ic - max_c f_i) / sum_c' exp(f_ic' - max_c f_i)
    VectorXd max_coeff=eigen_mu_matrix.rowwise().maxCoeff();
    eigen_dpi_matrix=eigen_mu_matrix.array().colwise()-max_coeff.array();
    VectorXd log_sum_exp=((eigen_dpi_matrix.array().exp()).rowwise().sum()).array().log();
    eigen_dpi_matrix=(eigen_dpi_matrix.array().colwise()-log_sum_exp.array()).exp();

    // equivalent, but numerically unstable, version without the log_sum_exp trick:
    //eigen_dpi_matrix=eigen_mu_matrix.array().exp();
    //VectorXd tmp_for_dpi=eigen_dpi_matrix.rowwise().sum();
    //eigen_dpi_matrix=eigen_dpi_matrix.array().colwise()/tmp_for_dpi.array();
}

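/* Added sketch of the reference pseudo code (Algorithm 3.3 in Rasmussen and
 * Williams, "Gaussian Processes for Machine Learning"), which update_alpha()
 * below follows; the notation is the book's, not the member names:
 *
 *   repeat until Psi converges:
 *     compute pi from f via softmax                // get_dpi_helper()
 *     for each class c:
 *       L_c := chol(I + sqrt(pi_c)*K*sqrt(pi_c))
 *       E_c := sqrt(pi_c)*L_c'\(L_c\sqrt(pi_c))    // blocks of eigen_E
 *     M := chol(sum_c E_c)                         // eigen_M
 *     b := W*(f-m) + grad log p(y|f)               // eigen_b
 *     c := E*K*b                                   // eigen_c
 *     a := b - c + E'*(M\(M'\(R'*c)))              // Newton target
 *     f := K*a + m
 *
 * One departure from the book: instead of jumping straight to the Newton
 * target, the step alpha + x*(a - alpha) is line-searched with Brent's
 * method via CMultiPsiLine.
 */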
void CMultiLaplacianInferenceMethod::update_alpha()
{
    float64_t Psi_Old = CMath::INFTY;
    float64_t Psi_New;
    float64_t Psi_Def;
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    // get mean vector and create eigen representation of it
    SGVector<float64_t> mean=m_mean->get_mean_vector(m_features);
    Map<VectorXd> eigen_mean_bl(mean.vector, mean.vlen);
    VectorXd eigen_mean=eigen_mean_bl.replicate(C,1);

    // create eigen representation of kernel matrix
    Map<MatrixXd> eigen_ktrtr(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    // create shogun and eigen representation of function vector
    m_mu=SGVector<float64_t>(mean.vlen*C);
    Map<VectorXd> eigen_mu(m_mu, m_mu.vlen);

    // f = mean as default value
    eigen_mu=eigen_mean;

    Psi_Def=-SGVector<float64_t>::sum(m_model->get_log_probability_f(
        m_labels, m_mu));

    if (m_alpha.vlen!=C*n)
    {
        // set alpha a zero vector
        m_alpha=SGVector<float64_t>(C*n);
        m_alpha.zero();
        Psi_New=Psi_Def;
        m_E=SGMatrix<float64_t>(n,C*n);
        m_L=SGMatrix<float64_t>(n,n);
        m_W=SGVector<float64_t>(C*n);
    }
    else
    {
        Map<VectorXd> alpha(m_alpha.vector, m_alpha.vlen);
        for(index_t bl=0; bl<C; bl++)
            eigen_mu.block(bl*n,0,n,1)=eigen_ktrtr*CMath::sq(m_scale)*alpha.block(bl*n,0,n,1);

        //alpha'*(f-m)/2.0
        Psi_New=alpha.dot(eigen_mu)/2.0;
        // compute f = K * alpha + m
        eigen_mu+=eigen_mean;

        Psi_New-=SGVector<float64_t>::sum(m_model->get_log_probability_f(m_labels, m_mu));

        // if default is better, then use it
        if (Psi_Def < Psi_New)
        {
            m_alpha.zero();
            eigen_mu=eigen_mean;
            Psi_New=Psi_Def;
        }
    }

    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    Map<VectorXd> eigen_W(m_W.vector, m_W.vlen);
    Map<MatrixXd> eigen_L(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);

    // get first derivative of log probability function
    m_dlp=m_model->get_log_probability_derivative_f(m_labels, m_mu, 1);

    index_t iter=0;
    Map<MatrixXd>& eigen_M=eigen_L;

    while (Psi_Old-Psi_New>m_tolerance && iter<m_iter)
    {
        Map<VectorXd> eigen_dlp(m_dlp.vector, m_dlp.vlen);

        get_dpi_helper();
        Map<VectorXd> eigen_dpi(m_W.vector, m_W.vlen);

        Psi_Old = Psi_New;
        iter++;

        m_nlz=0;

        // per-class Cholesky factors and the E_c blocks
        for(index_t bl=0; bl<C; bl++)
        {
            VectorXd eigen_sD=eigen_dpi.block(bl*n,0,n,1).cwiseSqrt();
            LLT<MatrixXd> chol_tmp((eigen_sD*eigen_sD.transpose()).cwiseProduct(eigen_ktrtr*CMath::sq(m_scale))+
                MatrixXd::Identity(m_ktrtr.num_rows, m_ktrtr.num_cols));
            MatrixXd eigen_L_tmp=chol_tmp.matrixU();
            MatrixXd eigen_E_bl=eigen_L_tmp.triangularView<Upper>().adjoint().solve(MatrixXd(eigen_sD.asDiagonal()));
            eigen_E_bl=eigen_E_bl.transpose()*eigen_E_bl;
            eigen_E.block(0,bl*n,n,n)=eigen_E_bl;
            if (bl==0)
                eigen_M=eigen_E_bl;
            else
                eigen_M+=eigen_E_bl;
            m_nlz+=eigen_L_tmp.diagonal().array().log().sum();
        }

        LLT<MatrixXd> chol_tmp(eigen_M);
        eigen_M=chol_tmp.matrixU();
        m_nlz+=eigen_M.diagonal().array().log().sum();

        VectorXd eigen_b=eigen_dlp;
        Map<VectorXd>& tmp1=eigen_dlp;
        tmp1=eigen_dpi.array()*(eigen_mu-eigen_mean).array();
        Map<MatrixXd> m_tmp(tmp1.data(),n,C);
        VectorXd tmp2=m_tmp.array().rowwise().sum();

        // b = W*(f-m) + dlp
        for(index_t bl=0; bl<C; bl++)
            eigen_b.block(bl*n,0,n,1)+=eigen_dpi.block(bl*n,0,n,1).cwiseProduct(eigen_mu.block(bl*n,0,n,1)-eigen_mean_bl-tmp2);

        Map<VectorXd>& eigen_c=eigen_W;
        for(index_t bl=0; bl<C; bl++)
            eigen_c.block(bl*n,0,n,1)=eigen_E.block(0,bl*n,n,n)*(eigen_ktrtr*CMath::sq(m_scale)*eigen_b.block(bl*n,0,n,1));

        Map<MatrixXd> c_tmp(eigen_c.data(),n,C);

        VectorXd tmp3=c_tmp.array().rowwise().sum();
        VectorXd tmp4=eigen_M.triangularView<Upper>().adjoint().solve(tmp3);

        VectorXd& eigen_dalpha=eigen_b;
        eigen_dalpha+=eigen_E.transpose()*(eigen_M.triangularView<Upper>().solve(tmp4))-eigen_c-eigen_alpha;

        // perform Brent's optimization
        CMultiPsiLine func;

        func.scale=m_scale;
        func.K=eigen_ktrtr;
        func.dalpha=eigen_dalpha;
        func.start_alpha=eigen_alpha;
        func.alpha=&eigen_alpha;
        func.dlp=&m_dlp;
        func.f=&m_mu;
        func.m=&mean;
        func.lik=m_model;
        func.lab=m_labels;

        float64_t x;
        Psi_New=local_min(0, m_opt_max, m_opt_tolerance, func, x);
        m_nlz+=Psi_New;
    }
}
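
/* Added commentary: the iteration above never forms K^{-1}. The Newton step
 * is taken in the dual variable alpha (with f=K*alpha*scale^2+m), so every
 * solve reduces to triangular solves against the per-class Cholesky factors.
 * m_nlz accumulates the approximate negative log marginal likelihood as the
 * sum of the log-determinant terms (the log-diagonal sums of the Cholesky
 * factors) plus the line-search objective Psi at the accepted step.
 */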

void CMultiLaplacianInferenceMethod::update_deriv()
{
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    m_U=SGMatrix<float64_t>(n, n*C);
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_M(m_L.matrix, m_L.num_rows, m_L.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    eigen_U=eigen_M.triangularView<Upper>().adjoint().solve(eigen_E);
}

float64_t CMultiLaplacianInferenceMethod::get_derivative_helper(SGMatrix<float64_t> dK)
{
    Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();
    Map<MatrixXd> eigen_U(m_U.matrix, m_U.num_rows, m_U.num_cols);
    Map<MatrixXd> eigen_E(m_E.matrix, m_E.num_rows, m_E.num_cols);
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    float64_t result=0;
    //currently only the explicit term is computed
    for(index_t bl=0; bl<C; bl++)
    {
        result+=((eigen_E.block(0,bl*n,n,n)-eigen_U.block(0,bl*n,n,n).transpose()*eigen_U.block(0,bl*n,n,n)).array()
            *eigen_dK.array()).sum();
        result-=(eigen_dK*eigen_alpha.block(bl*n,0,n,1)).dot(eigen_alpha.block(bl*n,0,n,1));
    }

    return result/2.0;
}
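
/* Added commentary: for a perturbation dK applied to every class block of
 * the covariance, the function above computes the explicit derivative
 *
 *   d nlZ = 1/2 * sum_c ( tr((E_c - U_c'*U_c)*dK) - alpha_c'*dK*alpha_c ),
 *
 * where U=M'\E comes from update_deriv(); E_c - U_c'*U_c is the multiclass
 * counterpart of the (K+W^{-1})^{-1} factor in the binary Laplace gradient.
 * The implicit term (through the dependence of the mode on the
 * hyperparameters) is not computed, as the in-function comment notes.
 */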

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_inference_method(
        const TParameter* param)
{
    REQUIRE(!strcmp(param->m_name, "scale"), "Can't compute derivative of "
            "the negative log marginal likelihood wrt %s.%s parameter\n",
            get_name(), param->m_name)

    Map<MatrixXd> eigen_K(m_ktrtr.matrix, m_ktrtr.num_rows, m_ktrtr.num_cols);

    SGVector<float64_t> result(1);

    SGMatrix<float64_t> dK(m_ktrtr.num_rows, m_ktrtr.num_cols);
    Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);

    // derivative of the effective covariance scale^2*K wrt scale: 2*scale*K
    eigen_dK=eigen_K*m_scale*2.0;

    result[0]=get_derivative_helper(dK);

    return result;
}

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_kernel(
        const TParameter* param)
{
    SGVector<float64_t> result;

    if (param->m_datatype.m_ctype==CT_VECTOR ||
            param->m_datatype.m_ctype==CT_SGVECTOR)
    {
        REQUIRE(param->m_datatype.m_length_y,
                "Length of the parameter %s should not be NULL\n", param->m_name)
        result=SGVector<float64_t>(*(param->m_datatype.m_length_y));
    }
    else
    {
        result=SGVector<float64_t>(1);
    }

    for (index_t i=0; i<result.vlen; i++)
    {
        SGMatrix<float64_t> dK;

        if (result.vlen==1)
            dK=m_kernel->get_parameter_gradient(param);
        else
            dK=m_kernel->get_parameter_gradient(param, i);

        Map<MatrixXd> eigen_dK(dK.matrix, dK.num_rows, dK.num_cols);
        eigen_dK=eigen_dK*CMath::sq(m_scale);

        result[i]=get_derivative_helper(dK);
    }

    return result;
}
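
/* Added commentary: the effective covariance used throughout this class is
 * scale^2*K, so for any kernel hyperparameter theta the chain rule gives
 * d(scale^2*K)/dtheta = scale^2 * dK/dtheta; this is why each gradient
 * matrix returned by the kernel is multiplied by CMath::sq(m_scale) above
 * before being passed to get_derivative_helper().
 */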

SGVector<float64_t> CMultiLaplacianInferenceMethod::get_derivative_wrt_mean(
        const TParameter* param)
{
    // create eigen representation of alpha
    Map<VectorXd> eigen_alpha(m_alpha.vector, m_alpha.vlen);
    const index_t C=((CMulticlassLabels*)m_labels)->get_num_classes();
    const index_t n=m_labels->get_num_labels();

    SGVector<float64_t> result;

    if (param->m_datatype.m_ctype==CT_VECTOR ||
            param->m_datatype.m_ctype==CT_SGVECTOR)
    {
        REQUIRE(param->m_datatype.m_length_y,
                "Length of the parameter %s should not be NULL\n", param->m_name)
        result=SGVector<float64_t>(*(param->m_datatype.m_length_y));
    }
    else
    {
        result=SGVector<float64_t>(1);
    }

    for (index_t i=0; i<result.vlen; i++)
    {
        SGVector<float64_t> dmu;

        if (result.vlen==1)
            dmu=m_mean->get_parameter_derivative(m_features, param);
        else
            dmu=m_mean->get_parameter_derivative(m_features, param, i);

        Map<VectorXd> eigen_dmu(dmu.vector, dmu.vlen);

        result[i]=0;
        //currently only the explicit term is computed
        for(index_t bl=0; bl<C; bl++)
            result[i]-=eigen_alpha.block(bl*n,0,n,1).dot(eigen_dmu);
    }

    return result;
}

}

#endif /* HAVE_EIGEN3 */