yat  0.13.2pre
utility.h
1 #ifndef _theplu_yat_statistics_utility_
2 #define _theplu_yat_statistics_utility_
3 
4 // $Id: utility.h 3245 2014-05-24 17:01:31Z peter $
5 
6 /*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013, 2014 Peter Johansson
12 
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14 
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19 
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24 
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27 */
28 
29 #include "Percentiler.h"
30 
31 #include "yat/classifier/DataLookupWeighted1D.h"
32 #include "yat/classifier/Target.h"
33 #include "yat/normalizer/utility.h"
34 #include "yat/utility/concept_check.h"
35 #include "yat/utility/DataIterator.h"
36 #include "yat/utility/deprecate.h"
37 #include "yat/utility/iterator_traits.h"
38 #include "yat/utility/sort_index.h"
39 #include "yat/utility/Vector.h"
40 #include "yat/utility/VectorBase.h"
41 #include "yat/utility/yat_assert.h"
42 
43 #include <boost/concept_check.hpp>
44 #include <boost/iterator/permutation_iterator.hpp>
45 
46 #include <gsl/gsl_math.h>
47 #include <gsl/gsl_statistics_double.h>
48 
49 #include <algorithm>
50 #include <cmath>
51 #include <iterator>
52 #include <stdexcept>
53 #include <vector>
54 
55 namespace theplu {
56 namespace yat {
57 namespace statistics {
58 
64  template <typename T, typename ForwardIterator>
65  void add(T& o, ForwardIterator first, ForwardIterator last,
66  const classifier::Target& target);
67 
85  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
86  void benjamini_hochberg(BidirectionalIterator1 first,
87  BidirectionalIterator1 last,
88  BidirectionalIterator2 result);
89 
90 
107  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
108  void benjamini_hochberg_unsorted(RandomAccessIterator first,
109  RandomAccessIterator last,
110  MutableRandomAccessIterator result);
111 
112 
124  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
125  unsigned int n2, unsigned int t);
126 
127 
138  template<typename InputIterator>
139  double entropy(InputIterator first, InputIterator last);
140 
156  double pearson_p_value(double r, unsigned int n);
157 
165  double kurtosis(const utility::VectorBase&);
166 
167 
179  template <class RandomAccessIterator>
180  double mad(RandomAccessIterator first, RandomAccessIterator last,
181  bool sorted=false);
182 
183 
199  template <class RandomAccessIterator>
200  double median(RandomAccessIterator first, RandomAccessIterator last,
201  bool sorted=false);
202 
203 
225  template<class T>
226  double mutual_information(const T& A);
227 
253  template <class RandomAccessIterator>
254  double percentile(RandomAccessIterator first, RandomAccessIterator last,
255  double p, bool sorted=false) YAT_DEPRECATE;
256 
262  template <class RandomAccessIterator>
263  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
264  double p, bool sorted=false)
265  {
266  Percentiler percentiler(p, sorted);
267  return percentiler(first, last);
268  }
269 
276  double skewness(const utility::VectorBase&);
277 
278 
280 
281  template <typename T, typename ForwardIterator>
282  void add(T& o, ForwardIterator first, ForwardIterator last,
283  const classifier::Target& target)
284  {
286  for (size_t i=0; first!=last; ++i, ++first)
287  o.add(traits.data(first), target.binary(i), traits.weight(first));
288  }
289 
290 
291  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
292  void benjamini_hochberg(BidirectionalIterator1 first,
293  BidirectionalIterator1 last,
294  BidirectionalIterator2 result)
295  {
296  using boost::Mutable_BidirectionalIterator;
297  using boost::BidirectionalIterator;
298  BOOST_CONCEPT_ASSERT((BidirectionalIterator<BidirectionalIterator1>));
299  BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
300  typedef typename std::iterator_traits<BidirectionalIterator1> traits;
301  typename traits::difference_type n = std::distance(first, last);
302  if (!n)
303  return;
304  std::advance(result, n-1);
305  first = last;
306  --first;
307  typename traits::difference_type rank = n;
308 
309  double prev = 1.0;
310  while (rank) {
311  *result = std::min(*first * n/static_cast<double>(rank), prev);
312  prev = *result;
313  --rank;
314  --first;
315  --result;
316  }
317  }
318 
319 
320  template<typename RandomAccessIterator, typename MutableRandomAccessIterator>
321  void benjamini_hochberg_unsorted(RandomAccessIterator first,
322  RandomAccessIterator last,
323  MutableRandomAccessIterator result)
324  {
325  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
326  using boost::Mutable_RandomAccessIterator;
327  BOOST_CONCEPT_ASSERT((Mutable_RandomAccessIterator<MutableRandomAccessIterator>));
328 
329  std::vector<size_t> idx;
330  utility::sort_index(first, last, idx);
331  benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()),
332  boost::make_permutation_iterator(first, idx.end()),
333  boost::make_permutation_iterator(result, idx.begin()));
334  }
335 
336 
337  template<typename InputIterator>
338  double entropy(InputIterator first, InputIterator last)
339  {
340  BOOST_CONCEPT_ASSERT((boost::InputIterator<InputIterator>));
341  using boost::Convertible;
342  typedef typename InputIterator::value_type T;
343  BOOST_CONCEPT_ASSERT((Convertible<T,double>));
344  double sum = 0;
345  double N = 0;
346  for (; first != last; ++first) {
347  if (*first) {
348  N += *first;
349  sum += *first * std::log(static_cast<double>(*first));
350  }
351  }
352  return -sum / N + log(N);
353  }
354 
355 
356  template <class RandomAccessIterator>
357  double mad(RandomAccessIterator first, RandomAccessIterator last,
358  bool sorted)
359  {
360  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
362  double m = median(first, last, sorted);
363  typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
364  std::vector<T> ad(std::distance(first, last));
365  // copy weights if needed
366  normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
367  // assign data (absolute deviation from median)
370  while (first!=last) {
371  *first2 = std::abs(traits.data(first)-m);
372  ++first;
373  ++first2;
374  }
375  return median(ad.begin(), ad.end(), false);
376  }
377 
378 
379  template <class RandomAccessIterator>
380  double median(RandomAccessIterator first, RandomAccessIterator last,
381  bool sorted)
382  {
383  return percentile2(first, last, 50.0, sorted);
384  }
385 
386 
387  template<class T>
388  double mutual_information(const T& n)
389  {
390  BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
391  using boost::Convertible;
392  BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
393 
394  // p_x = \sum_y p_xy
395 
396  // Mutual Information is defined as
397  // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
398  // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
399  // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
400  // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
401  // - entropy_xy + entropy_x + entropy_y
402 
403  utility::Vector rowsum(n.columns(), 0);
404  for (size_t c = 0; c<n.columns(); ++c)
405  rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
406 
407  utility::Vector colsum(n.rows(), 0);
408  for (size_t r = 0; r<n.rows(); ++r)
409  colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
410 
411  double mi = - entropy(n.begin(), n.end());
412  mi += entropy(rowsum.begin(), rowsum.end());
413  mi += entropy(colsum.begin(), colsum.end());
414 
415  return mi/M_LN2;
416  }
417 
418 
419  template <class RandomAccessIterator>
420  double percentile(RandomAccessIterator first, RandomAccessIterator last,
421  double p, bool sorted)
422  {
423  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
425  // range is one value only is a special case
426  if (first+1 == last)
428  if (sorted) {
429  // have to take care of this special case
430  if (p>=100)
432  double j = p/100 * (std::distance(first,last)-1);
433  int i = static_cast<int>(j);
434  return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
435  }
436  using std::iterator_traits;
437  typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
438  std::vector<value_t> v_copy(first, last);
439  size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
440  if (i+2 < v_copy.size()) {
441  std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
442  }
443  else
444  std::sort(v_copy.begin(), v_copy.end());
445  return percentile(v_copy.begin(), v_copy.end(), p, true);
446  }
447 
448 
449 }}} // of namespace statistics, yat, and theplu
450 
451 #endif
void benjamini_hochberg(BidirectionalIterator1 first, BidirectionalIterator1 last, BidirectionalIterator2 result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:292
double mad(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Median absolute deviation from median.
Definition: utility.h:357
data_reference data(Iter iter) const
Definition: iterator_traits.h:440
Class for containing sample labels.
Definition: Target.h:47
double mutual_information(const T &A)
Calculates the mutual information of A.
Definition: utility.h:388
Concept check for Data Iterator.
Definition: concept_check.h:226
Definition: iterator_traits.h:412
Concept check for Container2D.
Definition: concept_check.h:57
double skewness(const utility::VectorBase &)
Computes the skewness of the data in a vector.
double pearson_p_value(double r, unsigned int n)
one-sided p-value
double median(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Definition: utility.h:380
double entropy(InputIterator first, InputIterator last)
Definition: utility.h:338
DataIterator.
Definition: DataIterator.h:62
This is the yat interface to GSL vector.
Definition: Vector.h:57
This is the yat interface to GSL vector.
Definition: VectorBase.h:52
bool binary(size_t i) const
Default binary is set to false for all classes except class 0.
void sort_index(InputIterator first, InputIterator last, std::vector< size_t > &sort_index)
Definition: sort_index.h:147
weight_reference weight(Iter iter) const
Definition: iterator_traits.h:446
double kurtosis(const utility::VectorBase &)
Computes the kurtosis of the data in a vector.
void benjamini_hochberg_unsorted(RandomAccessIterator first, RandomAccessIterator last, MutableRandomAccessIterator result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:321
void add(T &o, ForwardIterator first, ForwardIterator last, const classifier::Target &target)
Definition: utility.h:282
double cdf_hypergeometric_P(unsigned int k, unsigned int n1, unsigned int n2, unsigned int t)
double percentile2(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:263
double percentile(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:420
Functor to calculate percentile of a range.
Definition: Percentiler.h:50

Generated on Wed Jan 4 2017 02:23:07 for yat by  doxygen 1.8.5