Home  · Classes  · Annotated Classes  · Modules  · Members  · Namespaces  · Related Pages
StatisticFunctions.h
Go to the documentation of this file.
1 // --------------------------------------------------------------------------
2 // OpenMS -- Open-Source Mass Spectrometry
3 // --------------------------------------------------------------------------
4 // Copyright The OpenMS Team -- Eberhard Karls University Tuebingen,
5 // ETH Zurich, and Freie Universitaet Berlin 2002-2013.
6 //
7 // This software is released under a three-clause BSD license:
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above copyright
11 // notice, this list of conditions and the following disclaimer in the
12 // documentation and/or other materials provided with the distribution.
13 // * Neither the name of any author or any participating institution
14 // may be used to endorse or promote products derived from this software
15 // without specific prior written permission.
16 // For a full list of authors, refer to the file AUTHORS.
17 // --------------------------------------------------------------------------
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 // ARE DISCLAIMED. IN NO EVENT SHALL ANY OF THE AUTHORS OR THE CONTRIBUTING
22 // INSTITUTIONS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 // OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 // WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 // OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 // ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 //
30 // --------------------------------------------------------------------------
31 // $Maintainer: Clemens Groepl $
32 // $Authors: Clemens Groepl, Johannes Junker, Mathias Walzer$
33 // --------------------------------------------------------------------------
34 
35 #include <numeric>
36 #include <algorithm>
37 #include <OpenMS/CONCEPT/Types.h>
38 #include <boost/lambda/lambda.hpp>
39 #include <boost/lambda/casts.hpp>
40 #include <boost/function/function_base.hpp>
41 
42 #ifndef OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
43 #define OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
44 
45 namespace OpenMS
46 {
47 
48  namespace Math
49  {
55  template <typename IteratorType>
56  static DoubleReal sum(IteratorType begin, IteratorType end)
57  {
58  return std::accumulate(begin, end, 0.0);
59  }
60 
68  template <typename IteratorType>
69  static DoubleReal mean(IteratorType begin, IteratorType end)
70  {
71  SignedSize size = std::distance(begin, end);
72  if (size <= 0)
73  {
74  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
75  }
76  return sum(begin, end) / size;
77  }
78 
90  template <typename IteratorType>
91  static DoubleReal median(IteratorType begin, IteratorType end, bool sorted = FALSE)
92  {
93  Size size = std::distance(begin, end);
94 
95  if (size == 0)
96  {
97  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
98  }
99 
100  if (!sorted)
101  {
102  std::sort(begin, end);
103  }
104 
105  if (size % 2 == 0) // even size => average two middle values
106  {
107  IteratorType it1 = begin;
108  std::advance(it1, size / 2 - 1);
109  IteratorType it2 = it1;
110  std::advance(it2, 1);
111  return (*it1 + *it2) / 2.0;
112  }
113  else
114  {
115  IteratorType it = begin;
116  std::advance(it, (size - 1) / 2);
117  return *it;
118  }
119  }
120 
132  template <typename IteratorType>
133  static DoubleReal quantile(IteratorType begin, IteratorType end, UInt quantile, bool sorted = FALSE)
134  {
135  Size size = std::distance(begin, end);
136 
137  if (size == 0)
138  {
139  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
140  }
141  if (quantile > 100 || quantile < 1) //TODO is 0 quantile a valid request?
142  {
143  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
144  }
145 
146  int l = floor( (double(quantile) * (double(size) / 100)) + 0.5); // will not be negative, so this is round nearest
147 
148  if (!sorted)
149  {
150  std::sort(begin, end);
151  }
152 
153  IteratorType it = begin;
154  std::advance(it, l - 1);
155  return *it;
156 
157  }
158 
168  template <typename IteratorType1, typename IteratorType2>
169  static DoubleReal meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
170  {
171  //no data or different lengths
172  SignedSize dist = std::distance(begin_a, end_a);
173  if (dist == 0 || dist != std::distance(begin_b, end_b))
174  {
175  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
176  }
177 
178  DoubleReal error = 0;
179  while (begin_a != end_a)
180  {
181  DoubleReal tmp(*begin_a - *begin_b);
182  error += tmp * tmp;
183  ++begin_a;
184  ++begin_b;
185  }
186 
187  return error / dist;
188  }
189 
199  template <typename IteratorType1, typename IteratorType2>
200  static DoubleReal classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
201  {
202  //no data or different lengths
203  SignedSize dist = std::distance(begin_a, end_a);
204  if (dist == 0 || dist != std::distance(begin_b, end_b))
205  {
206  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
207  }
208 
209  DoubleReal correct = (DoubleReal) dist;
210  while (begin_a != end_a)
211  {
212  if ((*begin_a < 0 && *begin_b >= 0) || (*begin_a >= 0 && *begin_b < 0))
213  {
214  --correct;
215  }
216  ++begin_a;
217  ++begin_b;
218  }
219 
220  return correct / dist;
221  }
222 
232  template <typename IteratorType1, typename IteratorType2>
233  static DoubleReal matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
234  {
235  //no data or different lengths
236  Int dist = std::distance(begin_a, end_a);
237  if (dist == 0 || dist != std::distance(begin_b, end_b))
238  {
239  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
240  }
241 
242  DoubleReal tp = 0;
243  DoubleReal fp = 0;
244  DoubleReal tn = 0;
245  DoubleReal fn = 0;
246 
247  while (begin_a != end_a)
248  {
249  if (*begin_a < 0 && *begin_b >= 0)
250  {
251  ++fn;
252  }
253  else if (*begin_a < 0 && *begin_b < 0)
254  {
255  ++tn;
256  }
257  else if (*begin_a >= 0 && *begin_b >= 0)
258  {
259  ++tp;
260  }
261  else if (*begin_a >= 0 && *begin_b < 0)
262  {
263  ++fp;
264  }
265 
266  ++begin_a;
267  ++begin_b;
268  }
269 
270  return (tp * tn - fp * fn) / sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn));
271  }
272 
284  template <typename IteratorType1, typename IteratorType2>
285  static DoubleReal pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
286  {
287  //no data or different lengths
288  SignedSize dist = std::distance(begin_a, end_a);
289  if (dist == 0 || dist != std::distance(begin_b, end_b))
290  {
291  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
292  }
293 
294  //calculate average
295  DoubleReal avg_a = std::accumulate(begin_a, end_a, 0.0) / dist;
296  DoubleReal avg_b = std::accumulate(begin_b, end_b, 0.0) / dist;
297 
298  DoubleReal numerator = 0;
299  DoubleReal denominator_a = 0;
300  DoubleReal denominator_b = 0;
301  while (begin_a != end_a)
302  {
303  DoubleReal temp_a = *begin_a - avg_a;
304  DoubleReal temp_b = *begin_b - avg_b;
305  numerator += (temp_a * temp_b);
306  denominator_a += (temp_a * temp_a);
307  denominator_b += (temp_b * temp_b);
308  ++begin_a;
309  ++begin_b;
310  }
311 
312  return numerator / sqrt(denominator_a * denominator_b);
313  }
314 
316  template <typename Value>
317  static void computeRank(std::vector<Value> & w)
318  {
319  Size i = 0; // main index
320  Size z = 0; // "secondary" index
321  Value rank = 0;
322  Size n = (w.size() - 1);
323  //store original indices for later
324  std::vector<std::pair<Size, Value> > w_idx;
325  for (Size j = 0; j < w.size(); ++j)
326  {
327  w_idx.push_back(std::make_pair(j, w[j]));
328  }
329  //sort
330  std::sort(w_idx.begin(), w_idx.end(),
331  boost::lambda::ret<bool>((&boost::lambda::_1->*& std::pair<Size, Value>::second) <
332  (&boost::lambda::_2->*& std::pair<Size, Value>::second)));
333  //replace pairs <orig_index, value> in w_idx by pairs <orig_index, rank>
334  while (i < n)
335  {
336  // test for equality with tolerance:
337  if (fabs(w_idx[i + 1].second - w_idx[i].second) > 0.0000001 * fabs(w_idx[i + 1].second)) // no tie
338  {
339  w_idx[i].second = Value(i + 1);
340  ++i;
341  }
342  else // tie, replace by mean rank
343  {
344  // count number of ties
345  for (z = i + 1; (z <= n) && fabs(w_idx[z].second - w_idx[i].second) <= 0.0000001 * fabs(w_idx[z].second); ++z)
346  {
347  }
348  // compute mean rank of tie
349  rank = 0.5 * (i + z + 1);
350  // replace intensities by rank
351  for (Size v = i; v <= z - 1; ++v)
352  {
353  w_idx[v].second = rank;
354  }
355  i = z;
356  }
357  }
358  if (i == n)
359  w_idx[n].second = Value(n + 1);
360  //restore original order and replace elements of w with their ranks
361  for (Size j = 0; j < w.size(); ++j)
362  {
363  w[w_idx[j].first] = w_idx[j].second;
364  }
365  }
366 
378  template <typename IteratorType1, typename IteratorType2>
379  static DoubleReal rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
380  {
381  //no data or different lengths
382  SignedSize dist = std::distance(begin_a, end_a);
383  if (dist == 0 || dist != std::distance(begin_b, end_b))
384  {
385  throw Exception::InvalidRange(__FILE__, __LINE__, __PRETTY_FUNCTION__);
386  }
387 
388  // store and sort intensities of model and data
389  std::vector<DoubleReal> ranks_data;
390  ranks_data.reserve(dist);
391  std::vector<DoubleReal> ranks_model;
392  ranks_model.reserve(dist);
393 
394  while (begin_a != end_a)
395  {
396  ranks_model.push_back(*begin_a);
397  ranks_data.push_back(*begin_b);
398  ++begin_a;
399  ++begin_b;
400  }
401 
402  // replace entries by their ranks
403  computeRank(ranks_data);
404  computeRank(ranks_model);
405 
406  DoubleReal mu = DoubleReal(ranks_data.size() + 1) / 2.; // mean of ranks
407  // Was the following, but I think the above is more correct ... (Clemens)
408  // DoubleReal mu = (ranks_data.size() + 1) / 2;
409 
410  DoubleReal sum_model_data = 0;
411  DoubleReal sqsum_data = 0;
412  DoubleReal sqsum_model = 0;
413 
414  for (Int i = 0; i < dist; ++i)
415  {
416  sum_model_data += (ranks_data[i] - mu) * (ranks_model[i] - mu);
417  sqsum_data += (ranks_data[i] - mu) * (ranks_data[i] - mu);
418  sqsum_model += (ranks_model[i] - mu) * (ranks_model[i] - mu);
419  }
420 
421  // check for division by zero
422  if (!sqsum_data || !sqsum_model)
423  return 0;
424 
425  return sum_model_data / (sqrt(sqsum_data) * sqrt(sqsum_model));
426  }
427 
428  } // namespace Math
429 } // namespace OpenMS
430 
431 #endif // OPENMS_MATH_STATISTICS_STATISTICFUNCTIONS_H
static DoubleReal median(IteratorType begin, IteratorType end, bool sorted=FALSE)
Calculates the median of a range of values.
Definition: StatisticFunctions.h:91
static DoubleReal matthewsCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Matthews correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:233
static DoubleReal sum(IteratorType begin, IteratorType end)
Calculates the sum of a range of values.
Definition: StatisticFunctions.h:56
static void computeRank(std::vector< Value > &w)
Replaces the elements in vector w by their ranks.
Definition: StatisticFunctions.h:317
static DoubleReal classificationRate(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the classification rate for the values in [begin_a, end_a) and [begin_b, end_b)
Definition: StatisticFunctions.h:200
ptrdiff_t SignedSize
Signed Size type e.g. used as pointer difference.
Definition: Types.h:151
static DoubleReal quantile(IteratorType begin, IteratorType end, UInt quantile, bool sorted=FALSE)
Calculates the quantile of a range of values.
Definition: StatisticFunctions.h:133
static DoubleReal pearsonCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the Pearson correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:285
static DoubleReal mean(IteratorType begin, IteratorType end)
Calculates the mean of a range of values.
Definition: StatisticFunctions.h:69
static DoubleReal rankCorrelationCoefficient(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
calculates the rank correlation coefficient for the values in [begin_a, end_a) and [begin_b...
Definition: StatisticFunctions.h:379
static DoubleReal meanSquareError(IteratorType1 begin_a, IteratorType1 end_a, IteratorType2 begin_b, IteratorType2 end_b)
Calculates the mean square error for the values in [begin_a, end_a) and [begin_b, end_b) ...
Definition: StatisticFunctions.h:169
Invalid range exception.
Definition: Exception.h:286
size_t Size
Size type e.g. used as variable which can hold result of size()
Definition: Types.h:144
int Int
Signed integer type.
Definition: Types.h:100
double DoubleReal
Double-precision real type.
Definition: Types.h:118

OpenMS / TOPP release 1.11.1 Documentation generated on Thu Nov 14 2013 11:19:21 using doxygen 1.8.5