ergo
mm_kernel_inner_sse2_A.h
Go to the documentation of this file.
1 /* Ergo, version 3.8, a program for linear scaling electronic structure
2  * calculations.
3  * Copyright (C) 2019 Elias Rudberg, Emanuel H. Rubensson, Pawel Salek,
4  * and Anastasia Kruchinina.
5  *
6  * This program is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program. If not, see <http://www.gnu.org/licenses/>.
18  *
19  * Primary academic reference:
20  * Ergo: An open-source program for linear-scaling electronic structure
21  * calculations,
22  * Elias Rudberg, Emanuel H. Rubensson, Pawel Salek, and Anastasia
23  * Kruchinina,
24  * SoftwareX 7, 107 (2018),
25  * <http://dx.doi.org/10.1016/j.softx.2018.03.005>
26  *
27  * For further information about Ergo, see <http://www.ergoscf.org>.
28  */
29 
39 #ifndef MM_KERNEL_INNER_SSE2_A_H
40 #define MM_KERNEL_INNER_SSE2_A_H
41 #include "common.h"
42 #include "vector_intrin.h"
43 
62 template<typename T_real, typename T_reg, int T_M, int T_N, int T_K>
64  public:
65  typedef T_real real;
66  static int const M = T_M;
67  static int const N = T_N;
68  static int const K = T_K;
69  protected:
70  static int const floats_per_register = ( sizeof(T_reg) / sizeof(real) );
73  private:
75  template<int T_ROWS_kernel, int T_COLS_kernel, typename T_ordering_kernel, int T_repetitions>
76  class Pack;
77  public:
84  // Consider passing derived class from std::vector as arguments
85  // that have compile time constant length that can be checked at
86  // compile time using static asserts
90  static void exec( real const * const * const A,
91  real const * const * const B,
92  real * const C,
93  int const i = 1,
94  int const offset_A = 0,
95  int const offset_B = 0,
96  int const offset_C = 0 );
97 
98  template<int T_offset_A, int T_offset_B, int T_offset_C>
99  static void exec( real const * const * const A,
100  real const * const * const B,
101  real * const C,
102  int const i = 1 );
103 
104 
105 
106  protected:
107  template<int T_loop_index, int T_end>
108  struct Loop {
109  static inline void ALWAYS_INLINE set_to_zero( Vector_intrin<real, T_reg> * X_reg ) {
110  X_reg[T_loop_index].set_to_zero();
112  }
113  static inline void ALWAYS_INLINE inner( int const row_A_reg, // == row_C_reg
114  int const row_B,
115  Vector_intrin<real, T_reg> const & A_reg,
117  real const * B_packed ) {
119  B_reg.load_p( &B_packed[row_B * T_N * floats_per_register +
120  T_loop_index * floats_per_register] );
121  B_reg *= A_reg;
122  C_reg[row_A_reg + T_loop_index * T_M / floats_per_register] += B_reg;
123  Loop<T_loop_index+1, T_end>::inner( row_A_reg, row_B,
124  A_reg, C_reg,
125  B_packed );
126  }
127  static inline void ALWAYS_INLINE middle( int const col_A, // == row_B
129  real const * A,
130  real const * B_packed ) {
132  A_reg.load_p( &A[col_A * T_M + T_loop_index * floats_per_register] );
133  // Loop over cols of B
134  Loop<0, T_N>::inner( T_loop_index, // == row_A_reg == row_C_reg
135  col_A, // == row_B
136  A_reg,
137  C_reg,
138  B_packed );
139  Loop<T_loop_index+1, T_end>::middle( col_A, C_reg, A, B_packed );
140  }
141  static inline void ALWAYS_INLINE outer( int const start_i,
143  real const * A,
144  real const * B_packed ) {
145  // Loop over (register) rows of A and C
146  Loop<0, T_M/floats_per_register>::middle( start_i + T_loop_index,
147  C_reg,
148  A,
149  B_packed );
150  Loop<T_loop_index+1, T_end>::outer( start_i, C_reg, A, B_packed );
151  }
152  static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
153  real const * X ) {
154  X_reg[T_loop_index] += &X[T_loop_index * floats_per_register];
156  }
157  static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
158  real * X ) {
159  X_reg[T_loop_index].store_p( &X[T_loop_index * floats_per_register] );
161  }
162 
164  real const * const * const A,
165  real const * const * const B ) {
166  // Loop over columns of A and rows of B^T
167  Loop<0, T_K>::outer( 0, C_reg, A[T_loop_index], B[T_loop_index] );
169  }
170  };
171 
172  template<int T_end>
173  struct Loop<T_end, T_end> {
174  static inline void ALWAYS_INLINE set_to_zero( Vector_intrin<real, T_reg> * X_reg ) {}
175  static inline void ALWAYS_INLINE inner( int const row_A_reg, // == row_C_reg
176  int const row_B,
177  Vector_intrin<real, T_reg> const & A_reg,
179  real const * B_packed ) {}
180  static inline void ALWAYS_INLINE middle( int const col_A, // == row_B
182  real const * A,
183  real const * B_packed ) {}
184  static inline void ALWAYS_INLINE outer( int const start_i,
186  real const * A,
187  real const * B_packed ) {}
188  static inline void ALWAYS_INLINE add( Vector_intrin<real, T_reg> * X_reg,
189  real const * X ) {}
190  static inline void ALWAYS_INLINE store( Vector_intrin<real, T_reg> const * X_reg,
191  real * X ) {}
193  real const * const * const A,
194  real const * const * const B ) {}
195  };
196 };
197 
198 // Doesn't matter if inlined or not... same performance for 4, 4, 5
199 // IT DOES MATTER WHEN OUTER CONTROL STRUCTURE IS ADDED ON TOP!!!
200 // THEN, INLINING IS BAD
201 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
203  real const * const * const B,
204  real * const C,
205  int const i,
206  int const offset_A,
207  int const offset_B,
208  int const offset_C) {
209  STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
210  Vector_intrin<real, T_reg> C_reg[T_M * T_N / floats_per_register];
211  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::set_to_zero( C_reg );
212 #if 1 // I loose a bit performance because of the offsets
213  for (int ind = 0; ind < i; ++ind)
214  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + offset_A, B[ind] + offset_B );
216  // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
217  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C + offset_C);
218  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C + offset_C);
219 #else
220  for (int ind = 0; ind < i; ++ind)
223  // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
224  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C);
225  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C);
226 #endif
227 } // end exec
228 
229 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
230  template<int T_offset_A, int T_offset_B, int T_offset_C>
232  real const * const * const B,
233  real * const C,
234  int const i ) {
235  STATIC_ASSERT_DEBUG(!(T_M%floats_per_register), TEMPLATE_ARGUMENT_T_M_MUST_BE_MULTIPLE_OF_floats_per_register);
236  Vector_intrin<real, T_reg> C_reg[T_M * T_N / floats_per_register];
237  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::set_to_zero( C_reg );
238  for (int ind = 0; ind < i; ++ind)
239  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A[ind] + T_offset_A, B[ind] + T_offset_B );
241  // MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_K>::outer( 0, C_reg, A, B );
242  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::add( C_reg, C + T_offset_C);
243  MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::template Loop<0, T_M * T_N / floats_per_register>::store( C_reg, C + T_offset_C);
244 } // end exec template
245 
246 
247 
248 
249 
250 
252 template<typename real, typename T_reg, int T_M, int T_N, int T_K>
253  template<int T_rows, int T_cols, typename T_ordering_kernel, int T_repetitions>
254  class MM_kernel_inner_sse2_A<real, T_reg, T_M, T_N, T_K>::Pack {
255  public:
256  static int const size_packed = T_rows * T_cols * T_repetitions;
257  static int const rows = T_rows;
258  static int const cols = T_cols;
259 
260  template<typename T_ordering_matrix>
262  typedef real * PtrTypePacked;
263  typedef real const * const PtrType;
264  inline static void exec( PtrType X, PtrTypePacked X_packed,
265  int const row_k,
266  int const col_k,
267  int const rows_total_matrix,
268  int const cols_total_matrix ) {
269  for ( int ir = 0; ir < T_repetitions; ++ir)
270  X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ]
271  = X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ];
272  }
273  };
274 
275  template<typename T_ordering_matrix>
277  typedef real const * const PtrTypePacked;
278  typedef real * PtrType;
279  inline static void exec( PtrType X, PtrTypePacked X_packed,
280  int const row_k,
281  int const col_k,
282  int const rows_total_matrix,
283  int const cols_total_matrix ) {
284  for ( int ir = 0; ir < T_repetitions; ++ir)
285  X[ T_ordering_matrix::get(row_k, col_k, rows_total_matrix, cols_total_matrix) ] =
286  X_packed[ T_ordering_kernel::get( row_k, col_k, T_rows, T_cols ) * T_repetitions + ir ];
287  }
288  };
289 
290  template<template<typename T_ordering> class T_assign, typename T_ordering_matrix>
291  static void exec(typename T_assign<T_ordering_matrix>::PtrType X,
292  typename T_assign<T_ordering_matrix>::PtrTypePacked X_packed,
293  int const rows_total_matrix, int const cols_total_matrix) {
294  // Loop over columns of kernel
295  for ( int col_k = 0; col_k < T_cols; ++col_k ) {
296  // Loop over rows of kernel
297  for ( int row_k = 0; row_k < T_rows; ++row_k ) {
298  T_assign<T_ordering_matrix>::exec( X, X_packed,
299  row_k, col_k,
300  rows_total_matrix, cols_total_matrix );
301  }
302  }
303  } // end exec()
304 
309  template<typename T_ordering_matrix>
310  inline static void pack(real const * const X, real * X_packed,
311  int const rows_total_matrix, int const cols_total_matrix) {
312  exec< Assign_to_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
313  }
318  template<typename T_ordering_matrix>
319  inline static void unpack(real * X, real const * const X_packed,
320  int const rows_total_matrix, int const cols_total_matrix) {
321  exec< Extract_from_packed, T_ordering_matrix >(X, X_packed, rows_total_matrix, cols_total_matrix);
322  }
323 };
324 #endif
MM_kernel_inner_sse2_A::Pack_type_A
Pack< M, K, Ordering_col_wise, 1 > Pack_type_A
Type that can (should) be used to pack A.
Definition: mm_kernel_inner_sse2_A.h:76
MM_kernel_inner_sse2_A::Loop< T_end, T_end >::inner
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:175
MM_kernel_inner_sse2_A::Pack::Assign_to_packed
Definition: mm_kernel_inner_sse2_A.h:261
MM_kernel_inner_sse2_A::Pack::Assign_to_packed::PtrTypePacked
real * PtrTypePacked
Type of packed pointer - note the absence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:262
MM_kernel_inner_sse2_A::Pack::Extract_from_packed::PtrType
real * PtrType
Type of matrix pointer - note the absence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:278
MM_kernel_inner_sse2_A::Pack::Extract_from_packed
Definition: mm_kernel_inner_sse2_A.h:276
MM_kernel_inner_sse2_A::Pack::Assign_to_packed::exec
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:264
MM_kernel_inner_sse2_A::Loop< T_end, T_end >::store
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:190
MM_kernel_inner_sse2_A::Loop< T_end, T_end >::multiple_loop
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:192
MM_kernel_inner_sse2_A::real
T_real real
Real number type (usually float or double)
Definition: mm_kernel_inner_sse2_A.h:65
MM_kernel_inner_sse2_A::Pack::exec
static void exec(typename T_assign< T_ordering_matrix >::PtrType X, typename T_assign< T_ordering_matrix >::PtrTypePacked X_packed, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:291
MM_kernel_inner_sse2_A::Loop::multiple_loop
static void ALWAYS_INLINE multiple_loop(Vector_intrin< real, T_reg > *C_reg, real const *const *const A, real const *const *const B)
Definition: mm_kernel_inner_sse2_A.h:163
MM_kernel_inner_sse2_A::Loop< T_end, T_end >::set_to_zero
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:174
real
ergo_real real
Definition: test.cc:46
rows
mat::SizesAndBlocks rows
Definition: test.cc:51
MM_kernel_inner_sse2_A::Loop< T_end, T_end >::outer
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:184
Vector_intrin::load_p
void ALWAYS_INLINE load_p(Treal const *ptr)
Definition: vector_intrin.h:62
B
#define B
MM_kernel_inner_sse2_A::Loop::middle
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:127
MM_kernel_inner_sse2_A::Pack::Assign_to_packed::PtrType
real const *const PtrType
Type of matrix pointer - note the presence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:263
MM_kernel_inner_sse2_A::Pack::unpack
static void unpack(real *X, real const *const X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for extracting matrix from packed matrix.
Definition: mm_kernel_inner_sse2_A.h:319
MM_kernel_inner_sse2_A::Loop< T_end, T_end >::add
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:188
MM_kernel_inner_sse2_A::Pack
Template for packing of matrix elements.
Definition: mm_kernel_inner_sse2_A.h:76
MM_kernel_inner_sse2_A::N
static int const N
Number of columns of B and C.
Definition: mm_kernel_inner_sse2_A.h:67
Vector_intrin
Vector class template for access to SIMD operations.
Definition: vector_intrin.h:60
MM_kernel_inner_sse2_A::Loop::add
static void ALWAYS_INLINE add(Vector_intrin< real, T_reg > *X_reg, real const *X)
Definition: mm_kernel_inner_sse2_A.h:152
STATIC_ASSERT_DEBUG
#define STATIC_ASSERT_DEBUG(expr, msg)
Definition: common.h:72
MM_kernel_inner_sse2_A::Pack::pack
static void pack(real const *const X, real *X_packed, int const rows_total_matrix, int const cols_total_matrix)
Convenience function for assignments to packed matrix.
Definition: mm_kernel_inner_sse2_A.h:310
MM_kernel_inner_sse2_A::Loop< T_end, T_end >::middle
static void ALWAYS_INLINE middle(int const col_A, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:180
MM_kernel_inner_sse2_A::floats_per_register
static int const floats_per_register
Number of real numbers that fit in one register.
Definition: mm_kernel_inner_sse2_A.h:70
Vector_intrin::store_p
void ALWAYS_INLINE store_p(Treal *ptr) const
Definition: vector_intrin.h:68
MM_kernel_inner_sse2_A::Loop
Definition: mm_kernel_inner_sse2_A.h:108
cols
mat::SizesAndBlocks cols
Definition: test.cc:52
A
#define A
MM_kernel_inner_sse2_A::Loop::outer
static void ALWAYS_INLINE outer(int const start_i, Vector_intrin< real, T_reg > *C_reg, real const *A, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:141
MM_kernel_inner_sse2_A::exec
static void exec(real const *const *const A, real const *const *const B, real *const C, int const i=1, int const offset_A=0, int const offset_B=0, int const offset_C=0)
Executes the matrix-matrix multiply C += A B with the three matrices A, B, and C stored according to ...
Definition: mm_kernel_inner_sse2_A.h:202
MM_kernel_inner_sse2_A::Pack::Extract_from_packed::exec
static void exec(PtrType X, PtrTypePacked X_packed, int const row_k, int const col_k, int const rows_total_matrix, int const cols_total_matrix)
Definition: mm_kernel_inner_sse2_A.h:279
ALWAYS_INLINE
#define ALWAYS_INLINE
Definition: common.h:45
MM_kernel_inner_sse2_A::Pack_type_C
Pack< M, N, Ordering_col_wise, 1 > Pack_type_C
Type that can (should) be used to pack C.
Definition: mm_kernel_inner_sse2_A.h:80
common.h
Macros for inlining and static assertions and structs for access to matrix elements specifying the la...
Vector_intrin::set_to_zero
void ALWAYS_INLINE set_to_zero()
Definition: vector_intrin.h:90
MM_kernel_inner_sse2_A::Loop::inner
static void ALWAYS_INLINE inner(int const row_A_reg, int const row_B, Vector_intrin< real, T_reg > const &A_reg, Vector_intrin< real, T_reg > *C_reg, real const *B_packed)
Definition: mm_kernel_inner_sse2_A.h:113
MM_kernel_inner_sse2_A
Matrix multiplication template for architectures with SSE2 or higher and compilers that support C++ i...
Definition: mm_kernel_inner_sse2_A.h:63
MM_kernel_inner_sse2_A::M
static int const M
Number of rows of A and C.
Definition: mm_kernel_inner_sse2_A.h:66
vector_intrin.h
Vector template for convenient access to SIMD operations.
MM_kernel_inner_sse2_A::Loop::store
static void ALWAYS_INLINE store(Vector_intrin< real, T_reg > const *X_reg, real *X)
Definition: mm_kernel_inner_sse2_A.h:157
MM_kernel_inner_sse2_A::K
static int const K
Number of columns of A and rows of B.
Definition: mm_kernel_inner_sse2_A.h:68
MM_kernel_inner_sse2_A::Pack::Extract_from_packed::PtrTypePacked
real const *const PtrTypePacked
Type of packed pointer - note the presence of const qualifiers.
Definition: mm_kernel_inner_sse2_A.h:277
MM_kernel_inner_sse2_A::Loop::set_to_zero
static void ALWAYS_INLINE set_to_zero(Vector_intrin< real, T_reg > *X_reg)
Definition: mm_kernel_inner_sse2_A.h:109
MM_kernel_inner_sse2_A::Pack_type_B
Pack< K, N, Ordering_row_wise, floats_per_register > Pack_type_B
Type that can (should) be used to pack B.
Definition: mm_kernel_inner_sse2_A.h:79