#ifndef LINALG_BACKEND_VIENNACL_H__
#define LINALG_BACKEND_VIENNACL_H__

#include <shogun/lib/common.h>
#include <shogun/lib/SGVector.h>
#include <shogun/lib/SGMatrix.h>
#include <shogun/mathematics/linalg/LinalgBackendGPUBase.h>

#ifdef HAVE_VIENNACL

#include <shogun/mathematics/linalg/GPUMemoryViennaCL.h>
#include <viennacl/linalg/inner_prod.hpp>
#include <viennacl/linalg/prod.hpp>
#include <viennacl/vector.hpp>
#include <viennacl/matrix.hpp>

#if VIENNACL_VERSION >= 10700
#include <viennacl/linalg/sum.hpp>
#endif

namespace shogun
{

/** @brief Linalg methods with ViennaCL backend */
class LinalgBackendViennaCL : public LinalgBackendGPUBase
{
	template <typename T>
	friend struct GPUMemoryViennaCL;

public:
	/** Expand METHODNAME for all primitive types handled by this backend */
	#define DEFINE_FOR_ALL_PTYPE(METHODNAME, Container) \
	METHODNAME(char, Container); \
	METHODNAME(uint8_t, Container); \
	METHODNAME(int16_t, Container); \
	METHODNAME(uint16_t, Container); \
	METHODNAME(int32_t, Container); \
	METHODNAME(uint32_t, Container); \
	METHODNAME(float32_t, Container); \
	METHODNAME(float64_t, Container);

	/** Expand METHODNAME for floating point types only */
	#define DEFINE_FOR_NON_INTEGER_PTYPE(METHODNAME, Container) \
	METHODNAME(float32_t, Container); \
	METHODNAME(float64_t, Container);

	/** Implementation of @see LinalgBackendBase::add */
	#define BACKEND_GENERIC_IN_PLACE_ADD(Type, Container) \
	virtual void add(Container<Type>& a, Container<Type>& b, Type alpha, \
		Type beta, Container<Type>& result) const \
	{ \
		add_impl(a, b, alpha, beta, result); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_ADD, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_ADD, SGMatrix)
	#undef BACKEND_GENERIC_IN_PLACE_ADD

	/** Implementation of @see LinalgBackendBase::dot */
	#define BACKEND_GENERIC_DOT(Type, Container) \
	virtual Type dot(const Container<Type>& a, const Container<Type>& b) const \
	{ \
		return dot_impl(a, b); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_DOT, SGVector)
	#undef BACKEND_GENERIC_DOT

	/** Implementation of @see LinalgBackendBase::element_prod */
	#define BACKEND_GENERIC_IN_PLACE_ELEMENT_PROD(Type, Container) \
	virtual void element_prod(Container<Type>& a, Container<Type>& b, \
		Container<Type>& result) const \
	{ \
		element_prod_impl(a, b, result); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_ELEMENT_PROD, SGMatrix)
	#undef BACKEND_GENERIC_IN_PLACE_ELEMENT_PROD

	/** Implementation of @see LinalgBackendBase::logistic */
	#define BACKEND_GENERIC_LOGISTIC(Type, Container) \
	virtual void logistic(Container<Type>& a, Container<Type>& result) const \
	{ \
		logistic_impl(a, result); \
	}
	DEFINE_FOR_NON_INTEGER_PTYPE(BACKEND_GENERIC_LOGISTIC, SGMatrix)
	#undef BACKEND_GENERIC_LOGISTIC

	/** Implementation of @see LinalgBackendBase::matrix_prod */
	#define BACKEND_GENERIC_IN_PLACE_MATRIX_PROD(Type, Container) \
	virtual void matrix_prod(SGMatrix<Type>& a, Container<Type>& b, \
		Container<Type>& result, bool transpose_A, bool transpose_B) const \
	{ \
		matrix_prod_impl(a, b, result, transpose_A, transpose_B); \
	}
	DEFINE_FOR_NON_INTEGER_PTYPE(BACKEND_GENERIC_IN_PLACE_MATRIX_PROD, SGVector)
	DEFINE_FOR_NON_INTEGER_PTYPE(BACKEND_GENERIC_IN_PLACE_MATRIX_PROD, SGMatrix)
	#undef BACKEND_GENERIC_IN_PLACE_MATRIX_PROD

	/** Implementation of @see LinalgBackendBase::max */
	#define BACKEND_GENERIC_MAX(Type, Container) \
	virtual Type max(const Container<Type>& a) const \
	{ \
		return max_impl(a); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MAX, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MAX, SGMatrix)
	#undef BACKEND_GENERIC_MAX

	/** Implementation of @see LinalgBackendBase::mean */
	#define BACKEND_GENERIC_MEAN(Type, Container) \
	virtual float64_t mean(const Container<Type>& a) const \
	{ \
		return mean_impl(a); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MEAN, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_MEAN, SGMatrix)
	#undef BACKEND_GENERIC_MEAN

	/** Implementation of @see LinalgBackendBase::scale */
	#define BACKEND_GENERIC_IN_PLACE_SCALE(Type, Container) \
	virtual void scale(Container<Type>& a, Type alpha, Container<Type>& result) const \
	{ \
		scale_impl(a, result, alpha); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_SCALE, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_IN_PLACE_SCALE, SGMatrix)
	#undef BACKEND_GENERIC_IN_PLACE_SCALE

	/** Implementation of @see LinalgBackendBase::set_const */
	#define BACKEND_GENERIC_SET_CONST(Type, Container) \
	virtual void set_const(Container<Type>& a, const Type value) const \
	{ \
		set_const_impl(a, value); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SET_CONST, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SET_CONST, SGMatrix)
	#undef BACKEND_GENERIC_SET_CONST

	/** Implementation of @see LinalgBackendBase::sum */
	#define BACKEND_GENERIC_SUM(Type, Container) \
	virtual Type sum(const Container<Type>& a, bool no_diag) const \
	{ \
		return sum_impl(a, no_diag); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SUM, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SUM, SGMatrix)
	#undef BACKEND_GENERIC_SUM

	/** Implementation of @see LinalgBackendBase::sum_symmetric */
	#define BACKEND_GENERIC_SYMMETRIC_SUM(Type, Container) \
	virtual Type sum_symmetric(const Container<Type>& a, bool no_diag) const \
	{ \
		return sum_symmetric_impl(a, no_diag); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_SYMMETRIC_SUM, SGMatrix)
	#undef BACKEND_GENERIC_SYMMETRIC_SUM

	/** Implementation of @see LinalgBackendBase::colwise_sum */
	#define BACKEND_GENERIC_COLWISE_SUM(Type, Container) \
	virtual SGVector<Type> colwise_sum(const Container<Type>& a, bool no_diag) const \
	{ \
		return colwise_sum_impl(a, no_diag); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_COLWISE_SUM, SGMatrix)
	#undef BACKEND_GENERIC_COLWISE_SUM

	/** Implementation of @see LinalgBackendBase::rowwise_sum */
	#define BACKEND_GENERIC_ROWWISE_SUM(Type, Container) \
	virtual SGVector<Type> rowwise_sum(const Container<Type>& a, bool no_diag) const \
	{ \
		return rowwise_sum_impl(a, no_diag); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_ROWWISE_SUM, SGMatrix)
	#undef BACKEND_GENERIC_ROWWISE_SUM

	/** Implementation of @see LinalgBackendGPUBase::to_gpu */
	#define BACKEND_GENERIC_TO_GPU(Type, Container) \
	virtual GPUMemoryBase<Type>* to_gpu(const Container<Type>& a) const \
	{ \
		return to_gpu_impl(a); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_TO_GPU, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_TO_GPU, SGMatrix)
	#undef BACKEND_GENERIC_TO_GPU
	/** Implementation of @see LinalgBackendGPUBase::from_gpu */
	#define BACKEND_GENERIC_FROM_GPU(Type, Container) \
	virtual void from_gpu(const Container<Type>& a, Type* data) const \
	{ \
		from_gpu_impl(a, data); \
	}
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_FROM_GPU, SGVector)
	DEFINE_FOR_ALL_PTYPE(BACKEND_GENERIC_FROM_GPU, SGMatrix)
	#undef BACKEND_GENERIC_FROM_GPU

private:
	/** Static cast of the GPUMemoryBase pointer stored in a container to the
	 * ViennaCL-specific GPUMemoryViennaCL type.
	 */
	template <typename T, template<typename> class Container>
	GPUMemoryViennaCL<T>* cast_to_viennacl(const Container<T>& a) const
	{
		return static_cast<GPUMemoryViennaCL<T>*>(a.gpu_ptr.get());
	}
	/** ViennaCL vector result = alpha*A + beta*B method */
	template <typename T>
	void add_impl(SGVector<T>& a, SGVector<T>& b, T alpha, T beta, SGVector<T>& result) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		result_gpu->data_vector(a.size()) =
			alpha * a_gpu->data_vector(a.size()) + beta * b_gpu->data_vector(b.size());
	}
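	/* For reference only: the proxy expression built above corresponds to the
	 * following operation on plain ViennaCL types (a standalone sketch, not
	 * used by this backend directly); the right-hand side is evaluated lazily
	 * via ViennaCL expression templates:
	 *
	 *   viennacl::vector<float> x(n), y(n), r(n);
	 *   float alpha = 2.0f, beta = 0.5f;
	 *   r = alpha * x + beta * y;
	 */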
	/** ViennaCL matrix result = alpha*A + beta*B method */
	template <typename T>
	void add_impl(SGMatrix<T>& a, SGMatrix<T>& b, T alpha, T beta, SGMatrix<T>& result) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		result_gpu->data_matrix(a.num_rows, a.num_cols) =
			alpha * a_gpu->data_matrix(a.num_rows, a.num_cols)
			+ beta * b_gpu->data_matrix(b.num_rows, b.num_cols);
	}
	/** ViennaCL vector dot-product method */
	template <typename T>
	T dot_impl(const SGVector<T>& a, const SGVector<T>& b) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);

		return viennacl::linalg::inner_prod(
			a_gpu->data_vector(a.size()), b_gpu->data_vector(b.size()));
	}
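	/* viennacl::linalg::inner_prod() returns a lazy scalar expression; using it
	 * as a T, as in the return statement above, triggers the device-side
	 * reduction and copies the scalar back to the host. Standalone sketch:
	 *
	 *   viennacl::vector<float> x(1000), y(1000);
	 *   float d = viennacl::linalg::inner_prod(x, y);
	 */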
	/** ViennaCL matrix in-place elementwise product method */
	template <typename T>
	void element_prod_impl(SGMatrix<T>& a, SGMatrix<T>& b, SGMatrix<T>& result) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		result_gpu->data_matrix(a.num_rows, a.num_cols) =
			viennacl::linalg::element_prod(a_gpu->data_matrix(a.num_rows,
			a.num_cols), b_gpu->data_matrix(a.num_rows, a.num_cols));
	}
	/** ViennaCL logistic method: result[i,j] = 1/(1+exp(-a[i,j])) */
	template <typename T>
	void logistic_impl(SGMatrix<T>& a, SGMatrix<T>& result) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		const std::string operation = "return 1.0/(1+exp(-1*element));";

		std::string kernel_name =
			"logistic_" + linalg::implementation::ocl::get_type_string<T>();
		viennacl::ocl::kernel& kernel =
			linalg::implementation::ocl::
				generate_single_arg_elementwise_kernel<T>(kernel_name, operation);

		kernel.global_work_size(0,
			linalg::implementation::ocl::align_to_multiple_1d(a.num_rows*a.num_cols));

		viennacl::ocl::enqueue(kernel(a_gpu->data_matrix(a.num_rows, a.num_cols),
			cl_int(a.num_rows*a.num_cols), cl_int(a_gpu->m_offset),
			result_gpu->data_matrix(a.num_rows, a.num_cols), cl_int(result_gpu->m_offset)));

		result.gpu_ptr = std::shared_ptr<GPUMemoryBase<T>>(
			result_gpu->clone_vector(result_gpu, a.num_rows*a.num_cols));
	}
	/** ViennaCL matrix * vector in-place product method */
	template <typename T>
	void matrix_prod_impl(SGMatrix<T>& a, SGVector<T>& b, SGVector<T>& result,
		bool transpose, bool transpose_B=false) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		if (transpose)
			result_gpu->data_vector(result.vlen) = viennacl::linalg::prod(
				viennacl::trans(a_gpu->data_matrix(a.num_rows, a.num_cols)),
				b_gpu->data_vector(b.vlen));
		else
			result_gpu->data_vector(result.vlen) = viennacl::linalg::prod(
				a_gpu->data_matrix(a.num_rows, a.num_cols), b_gpu->data_vector(b.vlen));
	}
	/** ViennaCL matrix * matrix in-place product method */
	template <typename T>
	void matrix_prod_impl(SGMatrix<T>& a, SGMatrix<T>& b, SGMatrix<T>& result,
		bool transpose_A, bool transpose_B) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* b_gpu = cast_to_viennacl(b);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		if (transpose_A && transpose_B)
			result_gpu->data_matrix(result.num_rows, result.num_cols) =
				viennacl::linalg::prod(viennacl::trans(a_gpu->data_matrix(
				a.num_rows, a.num_cols)), viennacl::trans(b_gpu->data_matrix(
				b.num_rows, b.num_cols)));

		else if (transpose_A)
			result_gpu->data_matrix(result.num_rows, result.num_cols) =
				viennacl::linalg::prod(viennacl::trans(a_gpu->data_matrix(
				a.num_rows, a.num_cols)), b_gpu->data_matrix(b.num_rows,
				b.num_cols));

		else if (transpose_B)
			result_gpu->data_matrix(result.num_rows, result.num_cols) =
				viennacl::linalg::prod(a_gpu->data_matrix(a.num_rows, a.num_cols),
				viennacl::trans(b_gpu->data_matrix(b.num_rows, b.num_cols)));

		else
			result_gpu->data_matrix(result.num_rows, result.num_cols) =
				viennacl::linalg::prod(a_gpu->data_matrix(a.num_rows, a.num_cols),
				b_gpu->data_matrix(b.num_rows, b.num_cols));
	}
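	/* All four branches above map onto ViennaCL's GEMM entry point; transposed
	 * operands are simply wrapped in viennacl::trans(). Standalone sketch on
	 * plain ViennaCL matrices:
	 *
	 *   viennacl::matrix<float> A(m, k), B(k, n), C(m, n);
	 *   C = viennacl::linalg::prod(A, B);
	 */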
	/** Return the largest element of a vector or matrix */
	template <typename T, template<typename> class Container>
	T max_impl(const Container<T>& a) const
	{
		typedef typename std::aligned_storage<sizeof(T), alignof(T)>::type aligned_t;

		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(1);

		viennacl::ocl::kernel& kernel = generate_max_kernel<T>();
		viennacl::ocl::enqueue(kernel(a_gpu->data_vector(a.size()),
			cl_int(a.size()), cl_int(a_gpu->m_offset),
			result_gpu->data_vector(1)));

		T* result = reinterpret_cast<T*>(SG_MALLOC(aligned_t, 1));
		viennacl::backend::memory_read(*(result_gpu->m_data),
			result_gpu->m_offset*sizeof(T), sizeof(T), result);

		T max_value = *result;
		SG_FREE(result);

		return max_value;
	}
	/** Return the mean of the vector or matrix elements */
	template <typename T, template <typename> class Container>
	float64_t mean_impl(const Container<T>& a) const
	{
		return sum_impl(a)/float64_t(a.size());
	}

	/** ViennaCL vector in-place scale method: result = alpha * A */
	template <typename T>
	void scale_impl(SGVector<T>& a, SGVector<T>& result, T alpha) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		result_gpu->data_vector(a.size()) = alpha * a_gpu->data_vector(a.size());
	}
	/** ViennaCL matrix in-place scale method: result = alpha * A */
	template <typename T>
	void scale_impl(SGMatrix<T>& a, SGMatrix<T>& result, T alpha) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		GPUMemoryViennaCL<T>* result_gpu = cast_to_viennacl(result);

		result_gpu->data_matrix(a.num_rows, a.num_cols) =
			alpha * a_gpu->data_matrix(a.num_rows, a.num_cols);
	}
	template <typename T, template <typename> class Container>
	void set_const_impl(Container<T>& a, T value) const
	{
		GPUMemoryViennaCL<T>* a_gpu = cast_to_viennacl(a);
		typename GPUMemoryViennaCL<T>::VCLVectorBase vcl_vector =
			a_gpu->data_vector(a.size());
		viennacl::linalg::vector_assign(vcl_vector, value);
	}
	/** Sum of all matrix elements, optionally excluding the diagonal */
	template <typename T>
	T sum_impl(const SGMatrix<T>& mat, bool no_diag=false) const
	{
		typedef typename std::aligned_storage<sizeof(T), alignof(T)>::type aligned_t;

		GPUMemoryViennaCL<T>* mat_gpu = cast_to_viennacl(mat);
		GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(1);

		viennacl::ocl::kernel& kernel = generate_sum_kernel<T>(no_diag);
		viennacl::ocl::enqueue(kernel(mat_gpu->data_matrix(mat.num_rows, mat.num_cols),
			cl_int(mat.num_rows), cl_int(mat.num_cols), cl_int(mat_gpu->m_offset),
			result_gpu->data_vector(1)));

		T* result = reinterpret_cast<T*>(SG_MALLOC(aligned_t, 1));
		viennacl::backend::memory_read(*(result_gpu->m_data),
			result_gpu->m_offset*sizeof(T), sizeof(T), result);

		T sum_value = *result;
		SG_FREE(result);

		return sum_value;
	}
	/** Sum of all vector elements */
	template <typename T>
	T sum_impl(const SGVector<T>& vec, bool no_diag=false) const
	{
#if VIENNACL_VERSION >= 10700
		GPUMemoryViennaCL<T>* vec_gpu = cast_to_viennacl(vec);
		return viennacl::linalg::sum(vec_gpu->data_vector(vec.vlen));
#else
		return sum_impl(SGMatrix<T>(vec));
#endif
	}
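	/* viennacl::linalg::sum() (available since ViennaCL 1.7.0, hence the guard
	 * above) reduces on the device and converts to T on use. Standalone sketch:
	 *
	 *   viennacl::vector<float> x(1000);
	 *   float s = viennacl::linalg::sum(x);
	 *
	 * Older ViennaCL versions fall back to the custom OpenCL kernel via the
	 * matrix overload above.
	 */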
	/** Sum of elements of a symmetric matrix, optionally excluding the diagonal */
	template <typename T>
	T sum_symmetric_impl(const SGMatrix<T>& mat, bool no_diag=false) const
	{
		return sum_impl(mat, no_diag);
	}
	/** Column-wise sum of matrix elements, optionally excluding the diagonal */
	template <typename T>
	SGVector<T> colwise_sum_impl(const SGMatrix<T>& mat, bool no_diag) const
	{
		GPUMemoryViennaCL<T>* mat_gpu = cast_to_viennacl(mat);
		GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(mat.num_cols);

		viennacl::ocl::kernel& kernel = generate_colwise_sum_kernel<T>(no_diag);
		kernel.global_work_size(0,
			linalg::implementation::ocl::align_to_multiple_1d(mat.num_cols));

		viennacl::ocl::enqueue(kernel(mat_gpu->data_matrix(mat.num_rows, mat.num_cols),
			cl_int(mat.num_rows), cl_int(mat.num_cols), cl_int(mat_gpu->m_offset),
			result_gpu->data_vector(mat.num_cols), cl_int(result_gpu->m_offset)));

		return SGVector<T>(result_gpu, mat.num_cols);
	}
	/** Row-wise sum of matrix elements, optionally excluding the diagonal */
	template <typename T>
	SGVector<T> rowwise_sum_impl(const SGMatrix<T>& mat, bool no_diag) const
	{
		GPUMemoryViennaCL<T>* mat_gpu = cast_to_viennacl(mat);
		GPUMemoryViennaCL<T>* result_gpu = new GPUMemoryViennaCL<T>(mat.num_rows);

		viennacl::ocl::kernel& kernel = generate_rowwise_sum_kernel<T>(no_diag);
		kernel.global_work_size(0,
			linalg::implementation::ocl::align_to_multiple_1d(mat.num_rows));

		viennacl::ocl::enqueue(kernel(mat_gpu->data_matrix(mat.num_rows, mat.num_cols),
			cl_int(mat.num_rows), cl_int(mat.num_cols), cl_int(mat_gpu->m_offset),
			result_gpu->data_vector(mat.num_rows), cl_int(result_gpu->m_offset)));

		return SGVector<T>(result_gpu, mat.num_rows);
	}
	/** Transfer data to GPU memory and return a ViennaCL memory handle */
	template <typename T, template<typename> class Container>
	GPUMemoryBase<T>* to_gpu_impl(const Container<T>& a) const
	{
		GPUMemoryViennaCL<T>* gpu_ptr = new GPUMemoryViennaCL<T>();

		viennacl::backend::memory_create(*(gpu_ptr->m_data), sizeof(T)*a.size(),
			viennacl::context());
		viennacl::backend::memory_write(*(gpu_ptr->m_data), 0,
			a.size()*sizeof(T), a.data());

		return gpu_ptr;
	}
	/** Read data back from GPU memory into a host buffer */
	template <typename T, template<typename> class Container>
	void from_gpu_impl(const Container<T>& a, T* data) const
	{
		GPUMemoryViennaCL<T>* gpu_ptr = cast_to_viennacl(a);
		viennacl::backend::memory_read(*(gpu_ptr->m_data),
			gpu_ptr->m_offset*sizeof(T), a.size()*sizeof(T), data);
	}
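	/* Round-trip usage sketch for the two transfer methods above (assumes a
	 * valid ViennaCL/OpenCL context is active; the container's gpu_ptr member
	 * holds the device handle, as used by cast_to_viennacl()):
	 *
	 *   LinalgBackendViennaCL backend;
	 *   SGVector<float32_t> vec(16);
	 *   vec.set_const(1.0);
	 *   vec.gpu_ptr = std::shared_ptr<GPUMemoryBase<float32_t>>(backend.to_gpu(vec));
	 *
	 *   float32_t host[16];
	 *   backend.from_gpu(vec, host);   // host[i] == 1.0 for all i
	 */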
	#undef DEFINE_FOR_ALL_PTYPE
	#undef DEFINE_FOR_NON_INTEGER_PTYPE
};

}

#endif //HAVE_VIENNACL

#endif //LINALG_BACKEND_VIENNACL_H__