// NOTE(review): this listing looks like it lost lines during extraction - the
// numbers fused into the text skip, and braces / return statements / string
// delimiters are not visible. Confirm against the upstream header before
// changing any code here.
//
// generate_max_kernel: returns a reference to a viennacl::ocl::kernel whose
// name is "max_" followed by get_type_string<T>(). T is used as a template
// argument below; its declaration is not visible in this listing.
33 #ifndef LINALG_BACKEND_VIENNACL_KERNELS_H__ 34 #define LINALG_BACKEND_VIENNACL_KERNELS_H__ 48 static viennacl::ocl::kernel& generate_max_kernel()
// The kernel name is built from the fixed prefix and the type string of T.
50 std::string kernel_name =
"max_" + linalg::implementation::ocl::get_type_string<T>();
// If a kernel with this name already exists, return it rather than
// compiling the source again.
52 if (linalg::implementation::ocl::kernel_exists(kernel_name))
53 return linalg::implementation::ocl::get_kernel(kernel_name);
// The source string starts from the preamble generated for T and this name.
// NOTE(review): presumably the preamble defines KERNEL_NAME, DATATYPE and
// WORK_GROUP_SIZE_1D used by the kernel text below - confirm.
55 std::string source = linalg::implementation::ocl::generate_kernel_preamble<T>(kernel_name);
// Visible kernel text: each work-item starts thread_max at -INFINITY, visits
// vec[i+offset] for i = local_id, local_id + WORK_GROUP_SIZE_1D, ... while
// i < size, and stores its maximum in the __local buffer at local_id. A loop
// then halves j from WORK_GROUP_SIZE_1D/2 down to 1, with a local-memory
// barrier before each step, folding buffer[local_id + j] into
// buffer[local_id] with max().
// NOTE(review): a guard such as "local_id < j" before the fold, and the
// statement controlled by "if (get_global_id(0)==0)", are not visible here -
// presumably dropped lines; verify.
// After compile_kernel, both the local and the global work size of
// dimension 0 are set to OCL_WORK_GROUP_SIZE_1D.
// The end of this line holds the start of the generate_sum_kernel signature.
59 __kernel void KERNEL_NAME( 60 __global DATATYPE* vec, int size, int offset, 61 __global DATATYPE* result) 63 __local DATATYPE buffer[WORK_GROUP_SIZE_1D]; 64 int local_id = get_local_id(0); 65 DATATYPE thread_max = -INFINITY; 66 for (int i=local_id; i<size; i+=WORK_GROUP_SIZE_1D) 68 DATATYPE v = vec[i+offset]; 69 thread_max = max(v, thread_max); 71 buffer[local_id] = thread_max; 72 for (int j = WORK_GROUP_SIZE_1D/2; j > 0; j = j>>1) 74 barrier(CLK_LOCAL_MEM_FENCE); 76 buffer[local_id] = max(buffer[local_id], buffer[local_id + j]); 78 barrier(CLK_LOCAL_MEM_FENCE); 79 if (get_global_id(0)==0) 85 viennacl::ocl::kernel& kernel = linalg::implementation::ocl::compile_kernel(kernel_name, source); 87 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_1D); 88 kernel.global_work_size(0, OCL_WORK_GROUP_SIZE_1D); 99 static viennacl::ocl::kernel& generate_sum_kernel(
bool no_diag)
// generate_sum_kernel (its signature begins at the end of the previous line):
// returns a reference to a viennacl::ocl::kernel named
// "sum_" + get_type_string<T>(), with "_no_diag" appended when no_diag is true.
// NOTE(review): lines appear to be missing from this listing (embedded numbers
// skip; braces and the return statement are not visible) - confirm upstream.
101 std::string kernel_name =
"sum_" + linalg::implementation::ocl::get_type_string<T>();
// The no_diag variant gets its own name, so the lookup below treats the two
// variants as different kernels.
102 if (no_diag) kernel_name.append(
"_no_diag");
// If a kernel with this name already exists, return it rather than
// compiling the source again.
104 if (linalg::implementation::ocl::kernel_exists(kernel_name))
105 return linalg::implementation::ocl::get_kernel(kernel_name);
// Source starts from the generated preamble; when no_diag is set, a
// "#define NO_DIAG" line is appended ahead of the kernel text.
107 std::string source = linalg::implementation::ocl::generate_kernel_preamble<T>(kernel_name);
108 if (no_diag) source.append(
"#define NO_DIAG\n");
// Visible kernel text: size = nrows*ncols; each work-item accumulates
// mat[i+offset] for i = local_id, local_id + WORK_GROUP_SIZE_1D, ... while
// i < size, stores the partial sum in the __local buffer, then a halving loop
// (j from WORK_GROUP_SIZE_1D/2 down to 1, barrier before each step) adds
// buffer[local_id + j] into buffer[local_id].
// The test "!(i/nrows == i%nrows)" skips elements whose two index components
// are equal.
// NOTE(review): presumably that test is compiled in only when NO_DIAG is
// defined, and i%nrows / i/nrows are row / column of a column-major matrix -
// the conditional-compilation lines are not visible; confirm.
// NOTE(review): the statement controlled by "if (get_global_id(0)==0)" and any
// "local_id < j" guard are not visible here - verify.
// After compile_kernel, both the local and the global work size of
// dimension 0 are set to OCL_WORK_GROUP_SIZE_1D.
// The end of this line holds the start of the generate_colwise_sum_kernel
// signature.
112 __kernel void KERNEL_NAME( 113 __global DATATYPE* mat, int nrows, int ncols, int offset, 114 __global DATATYPE* result) 116 __local DATATYPE buffer[WORK_GROUP_SIZE_1D]; 117 int size = nrows*ncols; 119 int local_id = get_local_id(0); 121 DATATYPE thread_sum = 0; 122 for (int i=local_id; i<size; i+=WORK_GROUP_SIZE_1D) 125 if (!(i/nrows == i%nrows)) 127 thread_sum += mat[i+offset]; 130 buffer[local_id] = thread_sum; 132 for (int j = WORK_GROUP_SIZE_1D/2; j > 0; j = j>>1) 134 barrier(CLK_LOCAL_MEM_FENCE); 136 buffer[local_id] += buffer[local_id + j]; 139 barrier(CLK_LOCAL_MEM_FENCE); 141 if (get_global_id(0)==0) 147 viennacl::ocl::kernel& kernel = 148 linalg::implementation::ocl::compile_kernel(kernel_name, source); 150 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_1D); 151 kernel.global_work_size(0, OCL_WORK_GROUP_SIZE_1D); 162 static viennacl::ocl::kernel& generate_colwise_sum_kernel(
bool no_diag)
// generate_colwise_sum_kernel (its signature begins at the end of the previous
// line): returns a reference to a viennacl::ocl::kernel named
// "colwise_sum_" + get_type_string<T>(), with "_no_diag" appended when
// no_diag is true.
// NOTE(review): lines appear to be missing from this listing (embedded numbers
// skip; braces and the return statement are not visible) - confirm upstream.
164 std::string kernel_name =
"colwise_sum_" + linalg::implementation::ocl::get_type_string<T>();
// The no_diag variant is looked up and compiled under its own name.
165 if (no_diag) kernel_name.append(
"_no_diag");
// If a kernel with this name already exists, return it rather than
// compiling the source again.
167 if (linalg::implementation::ocl::kernel_exists(kernel_name))
168 return linalg::implementation::ocl::get_kernel(kernel_name);
// Source starts from the generated preamble; when no_diag is set, a
// "#define NO_DIAG" line is appended ahead of the kernel text.
170 std::string source = linalg::implementation::ocl::generate_kernel_preamble<T>(kernel_name);
171 if (no_diag) source.append(
"#define NO_DIAG\n");
// Visible kernel text: the work-item's global id is used as j; it loops i
// over 0..nrows-1 adding mat[offset+i+j*nrows] into sum, then writes sum to
// result[j+result_offset].
// NOTE(review): the indexing i+j*nrows suggests column-major storage with j
// as the column index - confirm.
// NOTE(review): the declaration of sum, any bounds check on j against ncols,
// and how NO_DIAG changes the loop are not visible here - verify.
// Only the local work size (OCL_WORK_GROUP_SIZE_1D) is set in the visible
// text; presumably the global work size is chosen elsewhere - confirm.
// The end of this line holds the start of the generate_rowwise_sum_kernel
// signature.
175 __kernel void KERNEL_NAME( 176 __global DATATYPE* mat, int nrows, int ncols, int offset, 177 __global DATATYPE* result, int result_offset) 179 int j = get_global_id(0); 185 for (int i=0; i<nrows; i++) 190 sum += mat[offset+i+j*nrows]; 193 result[j+result_offset] = sum; 198 viennacl::ocl::kernel& kernel = 199 linalg::implementation::ocl::compile_kernel(kernel_name, source); 201 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_1D); 212 static viennacl::ocl::kernel& generate_rowwise_sum_kernel(
bool no_diag)
// generate_rowwise_sum_kernel (its signature begins at the end of the previous
// line): returns a reference to a viennacl::ocl::kernel named
// "rowwise_sum_" + get_type_string<T>(), with "_no_diag" appended when
// no_diag is true.
// NOTE(review): lines appear to be missing from this listing (embedded numbers
// skip; braces and the return statement are not visible) - confirm upstream.
214 std::string kernel_name =
"rowwise_sum_" + linalg::implementation::ocl::get_type_string<T>();
// The no_diag variant is looked up and compiled under its own name.
215 if (no_diag) kernel_name.append(
"_no_diag");
// If a kernel with this name already exists, return it rather than
// compiling the source again.
217 if (linalg::implementation::ocl::kernel_exists(kernel_name))
218 return linalg::implementation::ocl::get_kernel(kernel_name);
// Source starts from the generated preamble; when no_diag is set, a
// "#define NO_DIAG" line is appended ahead of the kernel text.
220 std::string source = linalg::implementation::ocl::generate_kernel_preamble<T>(kernel_name);
221 if (no_diag) source.append(
"#define NO_DIAG\n");
// Visible kernel text: the work-item's global id is used as i; it loops j
// over 0..ncols-1 adding mat[offset+i+j*nrows] into sum, then writes sum to
// result[i+result_offset].
// NOTE(review): the indexing i+j*nrows suggests column-major storage with i
// as the row index - confirm.
// NOTE(review): the declaration of sum, any bounds check on i against nrows,
// and how NO_DIAG changes the loop are not visible here - verify.
// Only the local work size (OCL_WORK_GROUP_SIZE_1D) is set in the visible
// text; presumably the global work size is chosen elsewhere - confirm.
// The line ends with the "#endif" for HAVE_VIENNACL (its opening directive is
// not visible in this listing) and the "#endif" closing the include guard.
// NOTE(review): the trailing sentence about the shogun namespace is not C++
// and looks like documentation residue from extraction - verify.
225 __kernel void KERNEL_NAME( 226 __global DATATYPE* mat, int nrows, int ncols, int offset, 227 __global DATATYPE* result, int result_offset) 229 int i = get_global_id(0); 235 for (int j=0; j<ncols; j++) 240 sum += mat[offset+i+j*nrows]; 243 result[i+result_offset] = sum; 248 viennacl::ocl::kernel& kernel = linalg::implementation::ocl::compile_kernel(kernel_name, source); 250 kernel.local_work_size(0, OCL_WORK_GROUP_SIZE_1D); 256 #endif // HAVE_VIENNACL 258 #endif // LINALG_BACKEND_VIENNACL_KERNELS_H__ all of classes and functions are contained in the shogun namespace