SHOGUN  6.0.0
LinalgBackendViennaCL.h
/*
 * Copyright (c) 2016, Shogun-Toolbox e.V. <shogun-team@shogun-toolbox.org>
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Authors: 2016 Pan Deng, Soumyajit De, Heiko Strathmann, Viktor Gal
 */

#ifndef LINALG_BACKEND_VIENNACL_H__
#define LINALG_BACKEND_VIENNACL_H__

#include <shogun/mathematics/linalg/LinalgBackendGPUBase.h>
#include <shogun/mathematics/linalg/GPUMemoryViennaCL.h>

#ifdef HAVE_VIENNACL

#include <viennacl/linalg/inner_prod.hpp>
#include <viennacl/linalg/prod.hpp>
#include <viennacl/vector.hpp>
#include <viennacl/matrix.hpp>

#if VIENNACL_VERSION >= 10700
#include <viennacl/linalg/sum.hpp>
#endif

namespace shogun
{

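/** @brief Linalg backend implemented on the GPU with ViennaCL (OpenCL). */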
class LinalgBackendViennaCL : public LinalgBackendGPUBase
{
    template <typename T>
    friend struct GPUMemoryViennaCL;

public:
    #define DEFINE_FOR_ALL_PTYPE(METHODNAME, Container) \
    METHODNAME(char, Container); \
    METHODNAME(uint8_t, Container); \
    METHODNAME(int16_t, Container); \
    METHODNAME(uint16_t, Container); \
    METHODNAME(int32_t, Container); \
    METHODNAME(uint32_t, Container); \
    METHODNAME(float32_t, Container); \
    METHODNAME(float64_t, Container); \

    #define DEFINE_FOR_NON_INTEGER_PTYPE(METHODNAME, Container) \
    METHODNAME(float32_t, Container); \
    METHODNAME(float64_t, Container); \

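
    /* For illustration: each BACKEND_GENERIC_* macro below defines one virtual
     * override, and the DEFINE_FOR_*_PTYPE helpers stamp it out once per scalar
     * type. For example,
     *
     *   DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_DOT, SGVector)
     *
     * expands to eight overrides of the form
     *
     *   virtual float32_t dot(const SGVector<float32_t>& a,
     *       const SGVector<float32_t>& b) const { return dot_impl(a, b); }
     *
     * one for each of char, uint8_t, int16_t, uint16_t, int32_t, uint32_t,
     * float32_t and float64_t.
     */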
    #define BACKEND_GENERIC_IN_PLACE_ADD(Type, Container) \
    virtual void add(Container<Type>& a, Container<Type>& b, Type alpha, \
        Type beta, Container<Type>& result) const \
    { \
        add_impl(a, b, alpha, beta, result); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_ADD, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_ADD, SGMatrix)
    #undef BACKEND_GENERIC_IN_PLACE_ADD

    #define BACKEND_GENERIC_DOT(Type, Container) \
    virtual Type dot(const Container<Type>& a, const Container<Type>& b) const \
    { \
        return dot_impl(a, b); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_DOT, SGVector)
    #undef BACKEND_GENERIC_DOT

    #define BACKEND_GENERIC_IN_PLACE_ELEMENT_PROD(Type, Container) \
    virtual void element_prod(Container<Type>& a, Container<Type>& b, \
        Container<Type>& result) const \
    { \
        element_prod_impl(a, b, result); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_ELEMENT_PROD, SGMatrix)
    #undef BACKEND_GENERIC_IN_PLACE_ELEMENT_PROD

    #define BACKEND_GENERIC_LOGISTIC(Type, Container) \
    virtual void logistic(Container<Type>& a, Container<Type>& result) const \
    { \
        logistic_impl(a, result); \
    }
    DEFINE_FOR_NON_INTEGER_PTYPE(BACKEND_GENERIC_LOGISTIC, SGMatrix)
    #undef BACKEND_GENERIC_LOGISTIC

    #define BACKEND_GENERIC_IN_PLACE_MATRIX_PROD(Type, Container) \
    virtual void matrix_prod(SGMatrix<Type>& a, Container<Type>& b, \
        Container<Type>& result, bool transpose_A, bool transpose_B) const \
    { \
        matrix_prod_impl(a, b, result, transpose_A, transpose_B); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_MATRIX_PROD, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_MATRIX_PROD, SGMatrix)
    #undef BACKEND_GENERIC_IN_PLACE_MATRIX_PROD

    #define BACKEND_GENERIC_MAX(Type, Container) \
    virtual Type max(const Container<Type>& a) const \
    { \
        return max_impl(a); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MAX, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MAX, SGMatrix)
    #undef BACKEND_GENERIC_MAX

    #define BACKEND_GENERIC_MEAN(Type, Container) \
    virtual float64_t mean(const Container<Type>& a) const \
    { \
        return mean_impl(a); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MEAN, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MEAN, SGMatrix)
    #undef BACKEND_GENERIC_MEAN

    #define BACKEND_GENERIC_IN_PLACE_SCALE(Type, Container) \
    virtual void scale(Container<Type>& a, Type alpha, Container<Type>& result) const \
    { \
        scale_impl(a, result, alpha); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_SCALE, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_SCALE, SGMatrix)
    #undef BACKEND_GENERIC_IN_PLACE_SCALE

    #define BACKEND_GENERIC_SET_CONST(Type, Container) \
    virtual void set_const(Container<Type>& a, const Type value) const \
    { \
        set_const_impl(a, value); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SET_CONST, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SET_CONST, SGMatrix)
    #undef BACKEND_GENERIC_SET_CONST

    #define BACKEND_GENERIC_SUM(Type, Container) \
    virtual Type sum(const Container<Type>& a, bool no_diag) const \
    { \
        return sum_impl(a, no_diag); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SUM, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SUM, SGMatrix)
    #undef BACKEND_GENERIC_SUM

    #define BACKEND_GENERIC_SYMMETRIC_SUM(Type, Container) \
    virtual Type sum_symmetric(const Container<Type>& a, bool no_diag) const \
    { \
        return sum_symmetric_impl(a, no_diag); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SYMMETRIC_SUM, SGMatrix)
    #undef BACKEND_GENERIC_SYMMETRIC_SUM

    #define BACKEND_GENERIC_COLWISE_SUM(Type, Container) \
    virtual SGVector<Type> colwise_sum(const Container<Type>& a, bool no_diag) const \
    { \
        return colwise_sum_impl(a, no_diag); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_COLWISE_SUM, SGMatrix)
    #undef BACKEND_GENERIC_COLWISE_SUM

    #define BACKEND_GENERIC_ROWWISE_SUM(Type, Container) \
    virtual SGVector<Type> rowwise_sum(const Container<Type>& a, bool no_diag) const \
    { \
        return rowwise_sum_impl(a, no_diag); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_ROWWISE_SUM, SGMatrix)
    #undef BACKEND_GENERIC_ROWWISE_SUM

    #define BACKEND_GENERIC_TO_GPU(Type, Container) \
    virtual GPUMemoryBase<Type>* to_gpu(const Container<Type>& a) const \
    { \
        return to_gpu_impl(a); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_TO_GPU, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_TO_GPU, SGMatrix)
    #undef BACKEND_GENERIC_TO_GPU

    #define BACKEND_GENERIC_FROM_GPU(Type, Container) \
    virtual void from_gpu(const Container<Type>& a, Type* data) const \
    { \
        return from_gpu_impl(a, data); \
    }
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_FROM_GPU, SGVector)
    DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_FROM_GPU, SGMatrix)
    #undef BACKEND_GENERIC_FROM_GPU

    #undef DEFINE_FOR_ALL_PTYPE

private:
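    /** Casts a container's opaque GPU pointer to the ViennaCL-specific memory type. */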
    template <typename T, template<typename> class Container>
    GPUMemoryViennaCL<T>* cast_to_viennacl(const Container<T> &a) const
    {
        return static_cast<GPUMemoryViennaCL<T>*>(a.gpu_ptr.get());
    }

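    /** SGVector addition: result = alpha*a + beta*b, computed on the GPU. */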
    template <typename T>
    void add_impl(SGVector<T>& a, SGVector<T>& b, T alpha, T beta, SGVector<T>& result) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        result_gpu->data_vector(a.size()) =
            alpha * a_gpu->data_vector(a.size()) + beta * b_gpu->data_vector(b.size());
    }

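    /** SGMatrix addition: result = alpha*a + beta*b, computed on the GPU. */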
    template <typename T>
    void add_impl(SGMatrix<T>& a, SGMatrix<T>& b, T alpha, T beta, SGMatrix<T>& result) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        result_gpu->data_matrix(a.num_rows, a.num_cols) =
            alpha * a_gpu->data_matrix(a.num_rows, a.num_cols)
            + beta * b_gpu->data_matrix(b.num_rows, b.num_cols);
    }

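    /** Dot product of two GPU vectors via viennacl::linalg::inner_prod. */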
    template <typename T>
    T dot_impl(const SGVector<T>& a, const SGVector<T>& b) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);

        return viennacl::linalg::inner_prod(
            a_gpu->data_vector(a.size()), b_gpu->data_vector(b.size()));
    }

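    /** Element-wise (Hadamard) product of two matrices. */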
    template <typename T>
    void element_prod_impl(SGMatrix<T>& a, SGMatrix<T>& b, SGMatrix<T>& result) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        result_gpu->data_matrix(a.num_rows, a.num_cols) =
            viennacl::linalg::element_prod(
                a_gpu->data_matrix(a.num_rows, a.num_cols),
                b_gpu->data_matrix(a.num_rows, a.num_cols));
    }

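    /** Applies the logistic function 1/(1+exp(-x)) element-wise through a generated OpenCL kernel. */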
    template <typename T>
    void logistic_impl(SGMatrix<T>& a, SGMatrix<T>& result) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        const std::string operation = "return 1.0/(1+exp(-1*element));";

        std::string kernel_name = "logistic_" + linalg::implementation::ocl::get_type_string<T>();
        viennacl::ocl::kernel& kernel =
            linalg::implementation::ocl::
            generate_single_arg_elementwise_kernel<T>(kernel_name, operation);

        kernel.global_work_size(0,
            linalg::implementation::ocl::align_to_multiple_1d(a.num_rows*a.num_cols));

        viennacl::ocl::enqueue(kernel(a_gpu->data_matrix(a.num_rows, a.num_cols),
            cl_int(a.num_rows*a.num_cols), cl_int(a_gpu->m_offset),
            result_gpu->data_matrix(a.num_rows, a.num_cols), cl_int(result_gpu->m_offset)));

        result.gpu_ptr = std::shared_ptr<GPUMemoryBase<T>>(
            result_gpu->clone_vector(result_gpu, a.num_rows*a.num_cols));
    }

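    /** Matrix-vector product: result = a*b, or trans(a)*b when transpose is set. */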
    template <typename T>
    void matrix_prod_impl(SGMatrix<T>& a, SGVector<T>& b, SGVector<T>& result,
        bool transpose, bool transpose_B=false) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        if (transpose)
            result_gpu->data_vector(result.vlen) = viennacl::linalg::prod(
                viennacl::trans(a_gpu->data_matrix(a.num_rows, a.num_cols)),
                b_gpu->data_vector(b.vlen));
        else
            result_gpu->data_vector(result.vlen) = viennacl::linalg::prod(
                a_gpu->data_matrix(a.num_rows, a.num_cols), b_gpu->data_vector(b.vlen));
    }

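    /** Matrix-matrix product with optional transposition of either operand. */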
    template <typename T>
    void matrix_prod_impl(SGMatrix<T>& a, SGMatrix<T>& b, SGMatrix<T>& result,
        bool transpose_A, bool transpose_B) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        if (transpose_A && transpose_B)
            result_gpu->data_matrix(result.num_rows, result.num_cols) =
                viennacl::linalg::prod(
                    viennacl::trans(a_gpu->data_matrix(a.num_rows, a.num_cols)),
                    viennacl::trans(b_gpu->data_matrix(b.num_rows, b.num_cols)));

        else if (transpose_A)
            result_gpu->data_matrix(result.num_rows, result.num_cols) =
                viennacl::linalg::prod(
                    viennacl::trans(a_gpu->data_matrix(a.num_rows, a.num_cols)),
                    b_gpu->data_matrix(b.num_rows, b.num_cols));

        else if (transpose_B)
            result_gpu->data_matrix(result.num_rows, result.num_cols) =
                viennacl::linalg::prod(
                    a_gpu->data_matrix(a.num_rows, a.num_cols),
                    viennacl::trans(b_gpu->data_matrix(b.num_rows, b.num_cols)));

        else
            result_gpu->data_matrix(result.num_rows, result.num_cols) =
                viennacl::linalg::prod(
                    a_gpu->data_matrix(a.num_rows, a.num_cols),
                    b_gpu->data_matrix(b.num_rows, b.num_cols));
    }

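    /** Maximum element, computed by a reduction kernel; the scalar is read back into a temporary host buffer. */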
    template <typename T, template<typename> class Container>
    T max_impl(const Container<T>& a) const
    {
        typedef typename std::aligned_storage<sizeof(T), alignof(T)>::type aligned_t;

        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(1);

        viennacl::ocl::kernel& kernel = generate_max_kernel<T>();
        viennacl::ocl::enqueue(kernel(a_gpu->data_vector(a.size()),
            cl_int(a.size()), cl_int(a_gpu->m_offset),
            result_gpu->data_vector(1)));

        T* result = reinterpret_cast<T*>(SG_MALLOC(aligned_t, 1));
        viennacl::backend::memory_read(*(result_gpu->m_data),
            result_gpu->m_offset*sizeof(T), sizeof(T), result);

        return result[0];
    }

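    /** Mean of all elements: sum(a) / a.size(). */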
    template <typename T, template <typename> class Container>
    float64_t mean_impl(const Container<T>& a) const
    {
        return sum_impl(a)/float64_t(a.size());
    }

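    /** Scales a vector: result = alpha*a. */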
    template <typename T>
    void scale_impl(SGVector<T>& a, SGVector<T>& result, T alpha) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        result_gpu->data_vector(a.size()) = alpha * a_gpu->data_vector(a.size());
    }

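    /** Scales a matrix: result = alpha*a. */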
    template <typename T>
    void scale_impl(SGMatrix<T>& a, SGMatrix<T>& result, T alpha) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

        result_gpu->data_matrix(a.num_rows, a.num_cols) =
            alpha * a_gpu->data_matrix(a.num_rows, a.num_cols);
    }

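    /** Sets every element of the container to a constant value. */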
    template <typename T, template <typename> class Container>
    void set_const_impl(Container<T>& a, T value) const
    {
        GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
        typename GPUMemoryViennaCL<T>::VCLVectorBase vcl_vector =
            a_gpu->data_vector(a.size());
        viennacl::linalg::vector_assign(vcl_vector, value);
    }

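    /** Sum of all matrix elements (optionally skipping the diagonal), via a reduction kernel. */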
    template <typename T>
    T sum_impl(const SGMatrix<T>& mat, bool no_diag=false) const
    {
        typedef typename std::aligned_storage<sizeof(T), alignof(T)>::type aligned_t;

        GPUMemoryViennaCL<T>* mat_gpu = cast_to_viennacl(mat);
        GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(1);

        viennacl::ocl::kernel& kernel = generate_sum_kernel<T>(no_diag);
        viennacl::ocl::enqueue(kernel(mat_gpu->data_matrix(mat.num_rows, mat.num_cols),
            cl_int(mat.num_rows), cl_int(mat.num_cols), cl_int(mat_gpu->m_offset),
            result_gpu->data_vector(1)));

        T* result = reinterpret_cast<T*>(SG_MALLOC(aligned_t, 1));
        viennacl::backend::memory_read(*(result_gpu->m_data),
            result_gpu->m_offset*sizeof(T), sizeof(T), result);

        return result[0];
    }

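    /** Sum of vector elements; uses viennacl::linalg::sum when available (ViennaCL >= 1.7.0). */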
    template <typename T>
    T sum_impl(const SGVector<T>& vec, bool no_diag=false) const
    {
#if VIENNACL_VERSION >= 10700
        GPUMemoryViennaCL<T>* vec_gpu = cast_to_viennacl(vec);
        return viennacl::linalg::sum(vec_gpu->data_vector(vec.size()));
#else
        return sum_impl(SGMatrix<T>(vec));
#endif
    }

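    /** Sum of a symmetric matrix; on this backend it simply forwards to sum_impl. */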
    template <typename T>
    T sum_symmetric_impl(const SGMatrix<T>& mat, bool no_diag=false) const
    {
        return sum_impl(mat, no_diag);
    }

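    /** Column-wise sums, returned as a GPU-backed SGVector of length num_cols. */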
    template <typename T>
    SGVector<T> colwise_sum_impl(const SGMatrix<T>& mat, bool no_diag) const
    {
        GPUMemoryViennaCL<T>* mat_gpu = cast_to_viennacl(mat);
        GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(mat.num_cols);

        viennacl::ocl::kernel& kernel = generate_colwise_sum_kernel<T>(no_diag);
        kernel.global_work_size(0,
            linalg::implementation::ocl::align_to_multiple_1d(mat.num_cols));

        viennacl::ocl::enqueue(kernel(mat_gpu->data_matrix(mat.num_rows, mat.num_cols),
            cl_int(mat.num_rows), cl_int(mat.num_cols), cl_int(mat_gpu->m_offset),
            result_gpu->data_vector(mat.num_cols), cl_int(result_gpu->m_offset)));

        return SGVector<T>(result_gpu, mat.num_cols);
    }

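    /** Row-wise sums, returned as a GPU-backed SGVector of length num_rows. */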
    template <typename T>
    SGVector<T> rowwise_sum_impl(const SGMatrix<T>& mat, bool no_diag) const
    {
        GPUMemoryViennaCL<T>* mat_gpu = cast_to_viennacl(mat);
        GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(mat.num_rows);

        viennacl::ocl::kernel& kernel = generate_rowwise_sum_kernel<T>(no_diag);
        kernel.global_work_size(0,
            linalg::implementation::ocl::align_to_multiple_1d(mat.num_rows));

        viennacl::ocl::enqueue(kernel(mat_gpu->data_matrix(mat.num_rows, mat.num_cols),
            cl_int(mat.num_rows), cl_int(mat.num_cols), cl_int(mat_gpu->m_offset),
            result_gpu->data_vector(mat.num_rows), cl_int(result_gpu->m_offset)));

        return SGVector<T>(result_gpu, mat.num_rows);
    }

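    /** Copies host data into newly allocated ViennaCL (OpenCL) device memory. */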
    template <typename T, template<typename> class Container>
    GPUMemoryBase<T>* to_gpu_impl(const Container<T>& a) const
    {
        GPUMemoryViennaCL<T>* gpu_ptr = new GPUMemoryViennaCL<T>();

        viennacl::backend::memory_create(*(gpu_ptr->m_data), sizeof(T)*a.size(),
            viennacl::context());
        viennacl::backend::memory_write(*(gpu_ptr->m_data), 0,
            a.size()*sizeof(T), a.data());

        return gpu_ptr;
    }

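    /** Reads device memory back into the supplied host buffer. */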
    template <typename T, template<typename> class Container>
    void from_gpu_impl(const Container<T>& a, T* data) const
    {
        GPUMemoryViennaCL<T>* gpu_ptr = cast_to_viennacl(a);
        viennacl::backend::memory_read(*(gpu_ptr->m_data),
            gpu_ptr->m_offset*sizeof(T), a.size()*sizeof(T), data);
    }

#undef DEFINE_FOR_ALL_PTYPE
#undef DEFINE_FOR_NON_INTEGER_PTYPE
};

}
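
/*
 * Usage sketch (illustrative only): this backend is not called directly. It is
 * registered with Shogun's linalg frontend and dispatched to once a container
 * has been transferred to the GPU. Assuming the frontend exposes transfer
 * helpers named linalg::to_gpu()/linalg::from_gpu() (an assumption about this
 * Shogun version, not defined in this file), a call might look like:
 *
 *   SGVector<float32_t> a(4), b(4);
 *   a.set_const(1.0); b.set_const(2.0);
 *   linalg::to_gpu(a);               // ends up in to_gpu_impl() above
 *   linalg::to_gpu(b);
 *   float32_t d = linalg::dot(a, b); // runs dot_impl() on this ViennaCL backend
 *
 * Only the *_impl methods and their virtual wrappers are defined here.
 */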

#endif //HAVE_VIENNACL

#endif //LINALG_BACKEND_VIENNACL_H__