yat  0.14.5pre
utility.h
1 #ifndef _theplu_yat_statistics_utility_
2 #define _theplu_yat_statistics_utility_
3 
4 // $Id: utility.h 3550 2017-01-03 05:41:02Z peter $
5 
6 /*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014, 2016 Peter Johansson
12 
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14 
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19 
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24 
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27 */
28 
29 #include "Percentiler.h"
30 
31 #include "yat/classifier/DataLookupWeighted1D.h"
32 #include "yat/classifier/Target.h"
33 #include "yat/normalizer/utility.h"
34 #include "yat/utility/concept_check.h"
35 #include "yat/utility/DataIterator.h"
36 #include "yat/utility/deprecate.h"
37 #include "yat/utility/iterator_traits.h"
38 #include "yat/utility/sort_index.h"
39 #include "yat/utility/Vector.h"
40 #include "yat/utility/VectorBase.h"
41 #include "yat/utility/yat_assert.h"
42 
43 #include <boost/concept_check.hpp>
44 #include <boost/iterator/iterator_concepts.hpp>
45 #include <boost/iterator/iterator_traits.hpp>
46 #include <boost/iterator/permutation_iterator.hpp>
47 
48 #include <gsl/gsl_math.h>
49 #include <gsl/gsl_statistics_double.h>
50 
51 #include <algorithm>
52 #include <cmath>
53 #include <iterator>
54 #include <stdexcept>
55 #include <vector>
56 
57 namespace theplu {
58 namespace yat {
59 namespace statistics {
60 
70  template <typename T, typename ForwardIterator>
71  void add(T& o, ForwardIterator first, ForwardIterator last,
72  const classifier::Target& target);
73 
94  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
95  void benjamini_hochberg(BidirectionalIterator1 first,
96  BidirectionalIterator1 last,
97  BidirectionalIterator2 result);
98 
99 
119  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
120  void benjamini_hochberg_unsorted(RandomAccessIterator first,
121  RandomAccessIterator last,
122  MutableRandomAccessIterator result);
123 
124 
136  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
137  unsigned int n2, unsigned int t);
138 
139 
151  template<typename InputIterator>
152  double entropy(InputIterator first, InputIterator last);
153 
169  double pearson_p_value(double r, unsigned int n);
170 
178  double kurtosis(const utility::VectorBase&);
179 
180 
194  template <class RandomAccessIterator>
195  double mad(RandomAccessIterator first, RandomAccessIterator last,
196  bool sorted=false);
197 
198 
216  template <class RandomAccessIterator>
217  double median(RandomAccessIterator first, RandomAccessIterator last,
218  bool sorted=false);
219 
220 
242  template<class T>
243  double mutual_information(const T& A);
244 
270  template <class RandomAccessIterator>
271  double percentile(RandomAccessIterator first, RandomAccessIterator last,
272  double p, bool sorted=false) YAT_DEPRECATE;
273 
284  template <class RandomAccessIterator>
285  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
286  double p, bool sorted=false)
287  {
288  Percentiler percentiler(p, sorted);
289  return percentiler(first, last);
290  }
291 
298  double skewness(const utility::VectorBase&);
299 
300 
302 
303  template <typename T, typename ForwardIterator>
304  void add(T& o, ForwardIterator first, ForwardIterator last,
305  const classifier::Target& target)
306  {
307  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>));
308  BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<ForwardIterator>));
310  for (size_t i=0; first!=last; ++i, ++first)
311  o.add(traits.data(first), target.binary(i), traits.weight(first));
312  }
313 
314 
315  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
316  void benjamini_hochberg(BidirectionalIterator1 first,
317  BidirectionalIterator1 last,
318  BidirectionalIterator2 result)
319  {
320  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator1>));
321  BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator1>));
322 
323  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator2>));
324  BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<BidirectionalIterator2>));
325  BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator2>));
326 
327  typedef typename boost::iterator_difference<BidirectionalIterator1>::type
328  diff_type;
329 
330  diff_type n = std::distance(first, last);
331  if (!n)
332  return;
333  std::advance(result, n-1);
334  first = last;
335  --first;
336  diff_type rank = n;
337 
338  double x = 1.0;
339  while (rank) {
340  x = std::min(*first * n/static_cast<double>(rank), x);
341  *result = x;
342  --rank;
343  --first;
344  --result;
345  }
346  }
347 
348 
349  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
350  void benjamini_hochberg_unsorted(RandomAccessIterator first,
351  RandomAccessIterator last,
352  MutableRandomAccessIterator result)
353  {
354  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<RandomAccessIterator>));
355  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
356 
357  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<MutableRandomAccessIterator>));
358  BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<MutableRandomAccessIterator>));
359  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<MutableRandomAccessIterator>));
360 
361  std::vector<size_t> idx;
362  utility::sort_index(first, last, idx);
363  benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
364  boost::make_permutation_iterator(first, idx.end()),
365  boost::make_permutation_iterator(result, idx.begin()));
366  }
367 
368 
369  template<typename InputIterator>
370  double entropy(InputIterator first, InputIterator last)
371  {
372  BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<InputIterator>));
373  BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<InputIterator>));
374  using boost::Convertible;
375  typedef typename boost::iterator_value<InputIterator>::type T;
376  BOOST_CONCEPT_ASSERT((Convertible<T,double>));
377  double sum = 0;
378  double N = 0;
379  for (; first != last; ++first) {
380  if (*first) {
381  N += *first;
382  sum += *first * std::log(static_cast<double>(*first));
383  }
384  }
385  return -sum / N + log(N);
386  }
387 
388 
389  template <class RandomAccessIterator>
390  double mad(RandomAccessIterator first, RandomAccessIterator last,
391  bool sorted)
392  {
394  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
395  double m = median(first, last, sorted);
396  typedef typename boost::iterator_value<RandomAccessIterator>::type T;
397  std::vector<T> ad(std::distance(first, last));
398  // copy weights if needed
399  normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
400  // assign data (absolute deviation from median)
403  while (first!=last) {
404  *first2 = std::abs(traits.data(first)-m);
405  ++first;
406  ++first2;
407  }
408  return median(ad.begin(), ad.end(), false);
409  }
410 
411 
412  template <class RandomAccessIterator>
413  double median(RandomAccessIterator first, RandomAccessIterator last,
414  bool sorted)
415  {
417  BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>));
418  return percentile2(first, last, 50.0, sorted);
419  }
420 
421 
422  template<class T>
423  double mutual_information(const T& n)
424  {
425  BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
426  using boost::Convertible;
427  BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
428 
429  // p_x = \sum_y p_xy
430 
431  // Mutual Information is defined as
432  // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
433  // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
434  // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
435  // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
436  // - entropy_xy + entropy_x + entropy_y
437 
438  utility::Vector rowsum(n.columns(), 0);
439  for (size_t c = 0; c<n.columns(); ++c)
440  rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
441 
442  utility::Vector colsum(n.rows(), 0);
443  for (size_t r = 0; r<n.rows(); ++r)
444  colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
445 
446  double mi = - entropy(n.begin(), n.end());
447  mi += entropy(rowsum.begin(), rowsum.end());
448  mi += entropy(colsum.begin(), colsum.end());
449 
450  return mi/M_LN2;
451  }
452 
453 
454  template <class RandomAccessIterator>
455  double percentile(RandomAccessIterator first, RandomAccessIterator last,
456  double p, bool sorted)
457  {
458  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
460  // range is one value only is a special case
461  if (first+1 == last)
463  if (sorted) {
464  // have to take care of this special case
465  if (p>=100)
467  double j = p/100 * (std::distance(first,last)-1);
468  int i = static_cast<int>(j);
469  return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
470  }
471  using std::iterator_traits;
472  typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
473  std::vector<value_t> v_copy(first, last);
474  size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
475  if (i+2 < v_copy.size()) {
476  std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
477  }
478  else
479  std::sort(v_copy.begin(), v_copy.end());
480  return percentile(v_copy.begin(), v_copy.end(), p, true);
481  }
482 
483 
484 }}} // of namespace statistics, yat, and theplu
485 
486 #endif
void benjamini_hochberg(BidirectionalIterator1 first, BidirectionalIterator1 last, BidirectionalIterator2 result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:316
double mad(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Median absolute deviation from median.
Definition: utility.h:390
data_reference data(Iter iter) const
Definition: iterator_traits.h:440
Class for containing sample labels.
Definition: Target.h:47
double mutual_information(const T &A)
Calculates the mutual information of A.
Definition: utility.h:423
Concept check for Data Iterator.
Definition: concept_check.h:228
Definition: iterator_traits.h:412
Concept check for Container2D.
Definition: concept_check.h:59
double skewness(const utility::VectorBase &)
Computes the skewness of the data in a vector.
double pearson_p_value(double r, unsigned int n)
one-sided p-value
double median(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Definition: utility.h:413
double entropy(InputIterator first, InputIterator last)
Definition: utility.h:370
DataIterator.
Definition: DataIterator.h:63
This is the yat interface to GSL vector.
Definition: Vector.h:57
This is the yat interface to GSL vector.
Definition: VectorBase.h:52
bool binary(size_t i) const
Default binary is set to false for all classes except class 0.
void sort_index(InputIterator first, InputIterator last, std::vector< size_t > &sort_index)
Definition: sort_index.h:146
weight_reference weight(Iter iter) const
Definition: iterator_traits.h:446
double kurtosis(const utility::VectorBase &)
Computes the kurtosis of the data in a vector.
void benjamini_hochberg_unsorted(RandomAccessIterator first, RandomAccessIterator last, MutableRandomAccessIterator result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:350
void add(T &o, ForwardIterator first, ForwardIterator last, const classifier::Target &target)
Definition: utility.h:304
double cdf_hypergeometric_P(unsigned int k, unsigned int n1, unsigned int n2, unsigned int t)
double percentile2(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:285
double percentile(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:455
Functor to calculate percentile of a range.
Definition: Percentiler.h:51

Generated on Tue Sep 26 2017 02:33:29 for yat by  doxygen 1.8.5