yat  0.16.4pre
utility.h
1 #ifndef _theplu_yat_statistics_utility_
2 #define _theplu_yat_statistics_utility_
3 
4 // $Id: utility.h 3792 2019-04-12 07:15:09Z peter $
5 
6 /*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014, 2016, 2018 Peter Johansson
12 
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14 
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19 
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24 
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27 */
28 
29 #include "Percentiler.h"
30 
31 #include "yat/classifier/DataLookupWeighted1D.h"
32 #include "yat/classifier/Target.h"
33 #include "yat/normalizer/utility.h"
34 #include "yat/utility/concept_check.h"
35 #include "yat/utility/DataIterator.h"
36 #include "yat/utility/deprecate.h"
37 #include "yat/utility/iterator_traits.h"
38 #include "yat/utility/Matrix.h"
39 #include "yat/utility/sort_index.h"
40 #include "yat/utility/Vector.h"
41 #include "yat/utility/VectorBase.h"
42 #include "yat/utility/yat_assert.h"
43 
44 #include <boost/concept_check.hpp>
45 #include <boost/iterator/iterator_concepts.hpp>
46 #include <boost/iterator/iterator_traits.hpp>
47 #include <boost/iterator/permutation_iterator.hpp>
48 
49 #include <gsl/gsl_math.h>
50 #include <gsl/gsl_statistics_double.h>
51 
52 #include <algorithm>
53 #include <cmath>
54 #include <iterator>
55 #include <stdexcept>
56 #include <vector>
57 
58 namespace theplu {
59 namespace yat {
60 namespace statistics {
61 
71  template <typename T, typename ForwardIterator>
72  void add(T& o, ForwardIterator first, ForwardIterator last,
73  const classifier::Target& target);
74 
95  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
96  void benjamini_hochberg(BidirectionalIterator1 first,
97  BidirectionalIterator1 last,
98  BidirectionalIterator2 result);
99 
100 
120  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
121  void benjamini_hochberg_unsorted(RandomAccessIterator first,
122  RandomAccessIterator last,
123  MutableRandomAccessIterator result);
124 
125 
137  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
138  unsigned int n2, unsigned int t);
139 
147  utility::Matrix correlation(utility::Matrix x);
148 
149 
161  template<typename InputIterator>
162  double entropy(InputIterator first, InputIterator last);
163 
179  double pearson_p_value(double r, unsigned int n);
180 
188  double kurtosis(const utility::VectorBase&);
189 
190 
204  template <class RandomAccessIterator>
205  double mad(RandomAccessIterator first, RandomAccessIterator last,
206  bool sorted=false);
207 
208 
226  template <class RandomAccessIterator>
227  double median(RandomAccessIterator first, RandomAccessIterator last,
228  bool sorted=false);
229 
230 
252  template<class T>
253  double mutual_information(const T& A);
254 
280  template <class RandomAccessIterator>
281  double percentile(RandomAccessIterator first, RandomAccessIterator last,
282  double p, bool sorted=false) YAT_DEPRECATE;
283 
294  template <class RandomAccessIterator>
295  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
296  double p, bool sorted=false)
297  {
298  Percentiler percentiler(p, sorted);
299  return percentiler(first, last);
300  }
301 
308  double skewness(const utility::VectorBase&);
309 
310 
312 
313  template <typename T, typename ForwardIterator>
314  void add(T& o, ForwardIterator first, ForwardIterator last,
315  const classifier::Target& target)
316  {
317  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>));
318  BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<ForwardIterator>));
320  for (size_t i=0; first!=last; ++i, ++first)
321  o.add(traits.data(first), target.binary(i), traits.weight(first));
322  }
323 
324 
325  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
326  void benjamini_hochberg(BidirectionalIterator1 first,
327  BidirectionalIterator1 last,
328  BidirectionalIterator2 result)
329  {
330  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator1>));
331  BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator1>));
332 
333  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator2>));
334  BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<BidirectionalIterator2>));
335  BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator2>));
336 
337  typedef typename boost::iterator_difference<BidirectionalIterator1>::type
338  diff_type;
339 
340  diff_type n = std::distance(first, last);
341  if (!n)
342  return;
343  std::advance(result, n-1);
344  first = last;
345  --first;
346  diff_type rank = n;
347 
348  double x = 1.0;
349  while (rank) {
350  x = std::min(*first * n/static_cast<double>(rank), x);
351  *result = x;
352  --rank;
353  --first;
354  --result;
355  }
356  }
357 
358 
359  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
360  void benjamini_hochberg_unsorted(RandomAccessIterator first,
361  RandomAccessIterator last,
362  MutableRandomAccessIterator result)
363  {
364  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<RandomAccessIterator>));
365  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
366 
367  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<MutableRandomAccessIterator>));
368  BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<MutableRandomAccessIterator>));
369  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<MutableRandomAccessIterator>));
370 
371  std::vector<size_t> idx;
372  utility::sort_index(first, last, idx);
373  benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
374  boost::make_permutation_iterator(first, idx.end()),
375  boost::make_permutation_iterator(result, idx.begin()));
376  }
377 
378 
379  template<typename InputIterator>
380  double entropy(InputIterator first, InputIterator last)
381  {
382  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<InputIterator>));
383  BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<InputIterator>));
384  using boost::Convertible;
385  typedef typename boost::iterator_value<InputIterator>::type T;
386  BOOST_CONCEPT_ASSERT((Convertible<T,double>));
387  double sum = 0;
388  double N = 0;
389  for (; first != last; ++first) {
390  if (*first) {
391  N += *first;
392  sum += *first * std::log(static_cast<double>(*first));
393  }
394  }
395  return -sum / N + log(N);
396  }
397 
398 
399  template <class RandomAccessIterator>
400  double mad(RandomAccessIterator first, RandomAccessIterator last,
401  bool sorted)
402  {
404  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
405  double m = median(first, last, sorted);
406  typedef typename boost::iterator_value<RandomAccessIterator>::type T;
407  std::vector<T> ad(std::distance(first, last));
408  // copy weights if needed
409  normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
410  // assign data (absolute deviation from median)
413  while (first!=last) {
414  *first2 = std::abs(traits.data(first)-m);
415  ++first;
416  ++first2;
417  }
418  return median(ad.begin(), ad.end(), false);
419  }
420 
421 
422  template <class RandomAccessIterator>
423  double median(RandomAccessIterator first, RandomAccessIterator last,
424  bool sorted)
425  {
427  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
428  return percentile2(first, last, 50.0, sorted);
429  }
430 
431 
432  template<class T>
433  double mutual_information(const T& n)
434  {
435  BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
436  using boost::Convertible;
437  BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
438 
439  // p_x = \sum_y p_xy
440 
441  // Mutual Information is defined as
442  // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
443  // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
444  // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
445  // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
446  // - entropy_xy + entropy_x + entropy_y
447 
448  utility::Vector rowsum(n.columns(), 0);
449  for (size_t c = 0; c<n.columns(); ++c)
450  rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
451 
452  utility::Vector colsum(n.rows(), 0);
453  for (size_t r = 0; r<n.rows(); ++r)
454  colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
455 
456  double mi = - entropy(n.begin(), n.end());
457  mi += entropy(rowsum.begin(), rowsum.end());
458  mi += entropy(colsum.begin(), colsum.end());
459 
460  return mi/M_LN2;
461  }
462 
463 
464  template <class RandomAccessIterator>
465  double percentile(RandomAccessIterator first, RandomAccessIterator last,
466  double p, bool sorted)
467  {
468  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
470  // range is one value only is a special case
471  if (first+1 == last)
473  if (sorted) {
474  // have to take care of this special case
475  if (p>=100)
477  double j = p/100 * (std::distance(first,last)-1);
478  int i = static_cast<int>(j);
479  return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
480  }
481  using std::iterator_traits;
482  typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
483  std::vector<value_t> v_copy(first, last);
484  size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
485  if (i+2 < v_copy.size()) {
486  std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
487  }
488  else
489  std::sort(v_copy.begin(), v_copy.end());
490  return percentile(v_copy.begin(), v_copy.end(), p, true);
491  }
492 
493 
494 }}} // of namespace statistics, yat, and theplu
495 
496 #endif
void benjamini_hochberg(BidirectionalIterator1 first, BidirectionalIterator1 last, BidirectionalIterator2 result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:326
double mad(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Median absolute deviation from median.
Definition: utility.h:400
data_reference data(Iter iter) const
Definition: iterator_traits.h:440
Class for containing sample labels.
Definition: Target.h:47
double mutual_information(const T &A)
Calculates the mutual information of A.
Definition: utility.h:433
Concept check for Data Iterator.
Definition: concept_check.h:240
Definition: iterator_traits.h:412
The Department of Theoretical Physics namespace as we define it.
Concept check for Container2D.
Definition: concept_check.h:65
double skewness(const utility::VectorBase &)
Computes the skewness of the data in a vector.
double pearson_p_value(double r, unsigned int n)
one-sided p-value
double median(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Definition: utility.h:423
double entropy(InputIterator first, InputIterator last)
Definition: utility.h:380
DataIterator.
Definition: DataIterator.h:63
This is the yat interface to GSL vector.
Definition: Vector.h:59
This is the yat interface to GSL vector.
Definition: VectorBase.h:55
bool binary(size_t i) const
Default binary is set to false for all classes except class 0.
void sort_index(InputIterator first, InputIterator last, std::vector< size_t > &sort_index)
Definition: sort_index.h:146
weight_reference weight(Iter iter) const
Definition: iterator_traits.h:446
double kurtosis(const utility::VectorBase &)
Computes the kurtosis of the data in a vector.
void benjamini_hochberg_unsorted(RandomAccessIterator first, RandomAccessIterator last, MutableRandomAccessIterator result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:360
void add(T &o, ForwardIterator first, ForwardIterator last, const classifier::Target &target)
Definition: utility.h:314
utility::Matrix correlation(utility::Matrix x)
double cdf_hypergeometric_P(unsigned int k, unsigned int n1, unsigned int n2, unsigned int t)
double percentile2(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:295
double percentile(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:465
Functor to calculate percentile of a range.
Definition: Percentiler.h:51

Generated on Thu Dec 12 2019 03:12:08 for yat by  doxygen 1.8.11