yat  0.18.2pre
utility.h
1 #ifndef _theplu_yat_statistics_utility_
2 #define _theplu_yat_statistics_utility_
3 
4 // $Id: utility.h 3875 2020-03-05 01:40:57Z peter $
5 
6 /*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014, 2016, 2018, 2020 Peter Johansson
12 
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14 
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19 
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24 
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27 */
28 
29 #include "Percentiler.h"
30 
31 #include "yat/classifier/DataLookupWeighted1D.h"
32 #include "yat/classifier/Target.h"
33 #include "yat/normalizer/utility.h"
34 #include "yat/utility/concept_check.h"
35 #include "yat/utility/DataIterator.h"
36 #include "yat/utility/deprecate.h"
37 #include "yat/utility/iterator_traits.h"
38 #include "yat/utility/Matrix.h"
39 #include "yat/utility/sort_index.h"
40 #include "yat/utility/Vector.h"
41 #include "yat/utility/VectorBase.h"
42 #include "yat/utility/yat_assert.h"
43 
44 #include <boost/concept_check.hpp>
45 #include <boost/iterator/iterator_concepts.hpp>
46 #include <boost/iterator/iterator_traits.hpp>
47 #include <boost/iterator/permutation_iterator.hpp>
48 
49 #include <gsl/gsl_math.h>
50 #include <gsl/gsl_statistics_double.h>
51 
52 #include <algorithm>
53 #include <cmath>
54 #include <iterator>
55 #include <stdexcept>
56 #include <vector>
57 
58 namespace theplu {
59 namespace yat {
60 namespace statistics {
61 
71  template <typename T, typename ForwardIterator>
72  void add(T& o, ForwardIterator first, ForwardIterator last,
73  const classifier::Target& target);
74 
95  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
96  void benjamini_hochberg(BidirectionalIterator1 first,
97  BidirectionalIterator1 last,
98  BidirectionalIterator2 result);
99 
100 
120  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
121  void benjamini_hochberg_unsorted(RandomAccessIterator first,
122  RandomAccessIterator last,
123  MutableRandomAccessIterator result);
124 
125 
137  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
138  unsigned int n2, unsigned int t);
139 
147  utility::Matrix correlation(utility::Matrix x);
148 
149 
161  template<typename InputIterator>
162  double entropy(InputIterator first, InputIterator last);
163 
179  double pearson_p_value(double r, unsigned int n);
180 
188  double kurtosis(const utility::VectorBase&);
189 
190 
206  template <class RandomAccessIterator>
207  double mad(RandomAccessIterator first, RandomAccessIterator last,
208  bool sorted=false);
209 
210 
230  template <class RandomAccessIterator>
231  double median(RandomAccessIterator first, RandomAccessIterator last,
232  bool sorted=false);
233 
234 
256  template<class T>
257  double mutual_information(const T& A);
258 
284  template <class RandomAccessIterator>
285  double percentile(RandomAccessIterator first, RandomAccessIterator last,
286  double p, bool sorted=false) YAT_DEPRECATE;
287 
300  template <class RandomAccessIterator>
301  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
302  double p, bool sorted=false)
303  {
304  Percentiler percentiler(p, sorted);
305  return percentiler(first, last);
306  }
307 
314  double skewness(const utility::VectorBase&);
315 
316 
318 
319  template <typename T, typename ForwardIterator>
320  void add(T& o, ForwardIterator first, ForwardIterator last,
321  const classifier::Target& target)
322  {
323  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>));
324  BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<ForwardIterator>));
326  for (size_t i=0; first!=last; ++i, ++first)
327  o.add(traits.data(first), target.binary(i), traits.weight(first));
328  }
329 
330 
331  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
332  void benjamini_hochberg(BidirectionalIterator1 first,
333  BidirectionalIterator1 last,
334  BidirectionalIterator2 result)
335  {
336  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator1>));
337  BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator1>));
338 
339  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator2>));
340  BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<BidirectionalIterator2>));
341  BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator2>));
342 
343  typedef typename boost::iterator_difference<BidirectionalIterator1>::type
344  diff_type;
345 
346  diff_type n = std::distance(first, last);
347  if (!n)
348  return;
349  std::advance(result, n-1);
350  first = last;
351  --first;
352  diff_type rank = n;
353 
354  double x = 1.0;
355  while (rank) {
356  x = std::min(*first * n/static_cast<double>(rank), x);
357  *result = x;
358  --rank;
359  --first;
360  --result;
361  }
362  }
363 
364 
365  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
366  void benjamini_hochberg_unsorted(RandomAccessIterator first,
367  RandomAccessIterator last,
368  MutableRandomAccessIterator result)
369  {
370  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<RandomAccessIterator>));
371  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
372 
373  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<MutableRandomAccessIterator>));
374  BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<MutableRandomAccessIterator>));
375  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<MutableRandomAccessIterator>));
376 
377  std::vector<size_t> idx;
378  utility::sort_index(first, last, idx);
379  benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
380  boost::make_permutation_iterator(first, idx.end()),
381  boost::make_permutation_iterator(result, idx.begin()));
382  }
383 
384 
385  template<typename InputIterator>
386  double entropy(InputIterator first, InputIterator last)
387  {
388  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<InputIterator>));
389  BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<InputIterator>));
390  using boost::Convertible;
391  typedef typename boost::iterator_value<InputIterator>::type T;
392  BOOST_CONCEPT_ASSERT((Convertible<T,double>));
393  double sum = 0;
394  double N = 0;
395  for (; first != last; ++first) {
396  if (*first) {
397  N += *first;
398  sum += *first * std::log(static_cast<double>(*first));
399  }
400  }
401  return -sum / N + log(N);
402  }
403 
404 
405  template <class RandomAccessIterator>
406  double mad(RandomAccessIterator first, RandomAccessIterator last,
407  bool sorted)
408  {
410  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
411  double m = median(first, last, sorted);
412  typedef typename boost::iterator_value<RandomAccessIterator>::type T;
413  std::vector<T> ad(std::distance(first, last));
414  // copy weights if needed
415  normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
416  // assign data (absolute deviation from median)
419  while (first!=last) {
420  *first2 = std::abs(traits.data(first)-m);
421  ++first;
422  ++first2;
423  }
424  return median(ad.begin(), ad.end(), false);
425  }
426 
427 
428  template <class RandomAccessIterator>
429  double median(RandomAccessIterator first, RandomAccessIterator last,
430  bool sorted)
431  {
433  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
434  return percentile2(first, last, 50.0, sorted);
435  }
436 
437 
438  template<class T>
439  double mutual_information(const T& n)
440  {
441  BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
442  using boost::Convertible;
443  BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
444 
445  // p_x = \sum_y p_xy
446 
447  // Mutual Information is defined as
448  // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
449  // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
450  // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
451  // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
452  // - entropy_xy + entropy_x + entropy_y
453 
454  utility::Vector rowsum(n.columns(), 0);
455  for (size_t c = 0; c<n.columns(); ++c)
456  rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
457 
458  utility::Vector colsum(n.rows(), 0);
459  for (size_t r = 0; r<n.rows(); ++r)
460  colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
461 
462  double mi = - entropy(n.begin(), n.end());
463  mi += entropy(rowsum.begin(), rowsum.end());
464  mi += entropy(colsum.begin(), colsum.end());
465 
466  return mi/M_LN2;
467  }
468 
469 
470  template <class RandomAccessIterator>
471  double percentile(RandomAccessIterator first, RandomAccessIterator last,
472  double p, bool sorted)
473  {
474  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
476  // range is one value only is a special case
477  if (first+1 == last)
479  if (sorted) {
480  // have to take care of this special case
481  if (p>=100)
483  double j = p/100 * (std::distance(first,last)-1);
484  int i = static_cast<int>(j);
485  return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
486  }
487  using std::iterator_traits;
488  typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
489  std::vector<value_t> v_copy(first, last);
490  size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
491  if (i+2 < v_copy.size()) {
492  std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
493  }
494  else
495  std::sort(v_copy.begin(), v_copy.end());
496  return percentile(v_copy.begin(), v_copy.end(), p, true);
497  }
498 
499 
500 }}} // of namespace statistics, yat, and theplu
501 
502 #endif
void benjamini_hochberg(BidirectionalIterator1 first, BidirectionalIterator1 last, BidirectionalIterator2 result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:332
double mad(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Median absolute deviation from median.
Definition: utility.h:406
Class for containing sample labels.
Definition: Target.h:47
double mutual_information(const T &A)
Calculates the mutual information of A.
Definition: utility.h:439
Concept check for Data Iterator.
Definition: concept_check.h:240
data_reference data(Iter iter) const
Definition: iterator_traits.h:440
Definition: iterator_traits.h:412
The Department of Theoretical Physics namespace as we define it.
Concept check for Container2D.
Definition: concept_check.h:65
double skewness(const utility::VectorBase &)
Computes the skewness of the data in a vector.
double pearson_p_value(double r, unsigned int n)
one-sided p-value
bool binary(size_t i) const
Default binary is set to false for all classes except class 0.
double median(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Definition: utility.h:429
double entropy(InputIterator first, InputIterator last)
Definition: utility.h:386
DataIterator.
Definition: DataIterator.h:63
This is the yat interface to GSL vector.
Definition: Vector.h:59
This is the yat interface to GSL vector.
Definition: VectorBase.h:55
void sort_index(InputIterator first, InputIterator last, std::vector< size_t > &sort_index)
Definition: sort_index.h:146
double kurtosis(const utility::VectorBase &)
Computes the kurtosis of the data in a vector.
void benjamini_hochberg_unsorted(RandomAccessIterator first, RandomAccessIterator last, MutableRandomAccessIterator result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:366
void add(T &o, ForwardIterator first, ForwardIterator last, const classifier::Target &target)
Definition: utility.h:320
utility::Matrix correlation(utility::Matrix x)
weight_reference weight(Iter iter) const
Definition: iterator_traits.h:446
double cdf_hypergeometric_P(unsigned int k, unsigned int n1, unsigned int n2, unsigned int t)
double percentile2(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:301
double percentile(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:471
Functor to calculate percentile of a range.
Definition: Percentiler.h:51

Generated on Tue Sep 7 2021 17:32:32 for yat by  doxygen 1.8.14