yat  0.12.3pre
utility.h
1 #ifndef _theplu_yat_statistics_utility_
2 #define _theplu_yat_statistics_utility_
3 
4 // $Id: utility.h 3137 2013-11-28 04:11:23Z peter $
5 
6 /*
7  Copyright (C) 2004 Jari Häkkinen, Peter Johansson
8  Copyright (C) 2005 Peter Johansson
9  Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
10  Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
11  Copyright (C) 2009, 2010, 2011, 2013 Peter Johansson
12 
13  This file is part of the yat library, http://dev.thep.lu.se/yat
14 
15  The yat library is free software; you can redistribute it and/or
16  modify it under the terms of the GNU General Public License as
17  published by the Free Software Foundation; either version 3 of the
18  License, or (at your option) any later version.
19 
20  The yat library is distributed in the hope that it will be useful,
21  but WITHOUT ANY WARRANTY; without even the implied warranty of
22  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23  General Public License for more details.
24 
25  You should have received a copy of the GNU General Public License
26  along with yat. If not, see <http://www.gnu.org/licenses/>.
27 */
28 
29 #include "Percentiler.h"
30 
31 #include "yat/classifier/DataLookupWeighted1D.h"
32 #include "yat/classifier/Target.h"
33 #include "yat/normalizer/utility.h"
34 #include "yat/utility/concept_check.h"
35 #include "yat/utility/DataIterator.h"
36 #include "yat/utility/deprecate.h"
37 #include "yat/utility/iterator_traits.h"
38 #include "yat/utility/Vector.h"
39 #include "yat/utility/VectorBase.h"
40 #include "yat/utility/yat_assert.h"
41 
42 #include <boost/concept_check.hpp>
43 #include <gsl/gsl_math.h>
44 #include <gsl/gsl_statistics_double.h>
45 
46 #include <algorithm>
47 #include <cmath>
48 #include <iterator>
49 #include <stdexcept>
50 #include <vector>
51 
52 namespace theplu {
53 namespace yat {
54 namespace statistics {
55 
61  template <typename T, typename ForwardIterator>
62  void add(T& o, ForwardIterator first, ForwardIterator last,
63  const classifier::Target& target);
64 
80  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
81  void benjamini_hochberg(BidirectionalIterator1 first,
82  BidirectionalIterator1 last,
83  BidirectionalIterator2 result);
84 
85 
97  double cdf_hypergeometric_P(unsigned int k, unsigned int n1,
98  unsigned int n2, unsigned int t);
99 
100 
111  template<typename InputIterator>
112  double entropy(InputIterator first, InputIterator last);
113 
129  double pearson_p_value(double r, unsigned int n);
130 
138  double kurtosis(const utility::VectorBase&);
139 
140 
152  template <class RandomAccessIterator>
153  double mad(RandomAccessIterator first, RandomAccessIterator last,
154  bool sorted=false);
155 
156 
172  template <class RandomAccessIterator>
173  double median(RandomAccessIterator first, RandomAccessIterator last,
174  bool sorted=false);
175 
176 
198  template<class T>
199  double mutual_information(const T& A);
200 
226  template <class RandomAccessIterator>
227  double percentile(RandomAccessIterator first, RandomAccessIterator last,
228  double p, bool sorted=false) YAT_DEPRECATE;
229 
235  template <class RandomAccessIterator>
236  double percentile2(RandomAccessIterator first, RandomAccessIterator last,
237  double p, bool sorted=false)
238  {
239  Percentiler percentiler(p, sorted);
240  return percentiler(first, last);
241  }
242 
249  double skewness(const utility::VectorBase&);
250 
251 
253 
254  template <typename T, typename ForwardIterator>
255  void add(T& o, ForwardIterator first, ForwardIterator last,
256  const classifier::Target& target)
257  {
259  for (size_t i=0; first!=last; ++i, ++first)
260  o.add(traits.data(first), target.binary(i), traits.weight(first));
261  }
262 
263 
264  template<typename BidirectionalIterator1, typename BidirectionalIterator2>
265  void benjamini_hochberg(BidirectionalIterator1 first,
266  BidirectionalIterator1 last,
267  BidirectionalIterator2 result)
268  {
269  using boost::Mutable_BidirectionalIterator;
270  BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>));
271  BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
272  size_t n = std::distance(first, last);
273  if (!n)
274  return;
275  std::advance(result, n-1);
276  first = last;
277  --first;
278  size_t rank = n;
279 
280  double prev = 1.0;
281  while (rank) {
282  *result = std::min(*first * n/static_cast<double>(rank), prev);
283  prev = *result;
284  --rank;
285  --first;
286  --result;
287  }
288  }
289 
290 
291  template<typename InputIterator>
292  double entropy(InputIterator first, InputIterator last)
293  {
294  BOOST_CONCEPT_ASSERT((boost::InputIterator<InputIterator>));
295  using boost::Convertible;
296  typedef typename InputIterator::value_type T;
297  BOOST_CONCEPT_ASSERT((Convertible<T,double>));
298  double sum = 0;
299  double N = 0;
300  for (; first != last; ++first) {
301  if (*first) {
302  N += *first;
303  sum += *first * std::log(static_cast<double>(*first));
304  }
305  }
306  return -sum / N + log(N);
307  }
308 
309 
310  template <class RandomAccessIterator>
311  double mad(RandomAccessIterator first, RandomAccessIterator last,
312  bool sorted)
313  {
314  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
316  double m = median(first, last, sorted);
317  typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
318  std::vector<T> ad(std::distance(first, last));
319  // copy weights if needed
320  normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
321  // assign data (absolute deviation from median)
324  while (first!=last) {
325  *first2 = std::abs(traits.data(first)-m);
326  ++first;
327  ++first2;
328  }
329  std::sort(ad.begin(), ad.end());
330  return median(ad.begin(), ad.end(), true);
331  }
332 
333 
334  template <class RandomAccessIterator>
335  double median(RandomAccessIterator first, RandomAccessIterator last,
336  bool sorted)
337  {
338  return percentile2(first, last, 50.0, sorted);
339  }
340 
341 
342  template<class T>
343  double mutual_information(const T& n)
344  {
345  BOOST_CONCEPT_ASSERT((utility::Container2D<T>));
346  using boost::Convertible;
347  BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>));
348 
349  // p_x = \sum_y p_xy
350 
351  // Mutual Information is defined as
352  // \sum_xy p_xy * log (p_xy / (p_x p_y)) =
353  // \sum_xy p_xy * [log p_xy - log p_x - log p_y]
354  // \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y
355  // \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y
356  // - entropy_xy + entropy_x + entropy_y
357 
358  utility::Vector rowsum(n.columns(), 0);
359  for (size_t c = 0; c<n.columns(); ++c)
360  rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0);
361 
362  utility::Vector colsum(n.rows(), 0);
363  for (size_t r = 0; r<n.rows(); ++r)
364  colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0);
365 
366  double mi = - entropy(n.begin(), n.end());
367  mi += entropy(rowsum.begin(), rowsum.end());
368  mi += entropy(colsum.begin(), colsum.end());
369 
370  return mi/M_LN2;
371  }
372 
373 
374  template <class RandomAccessIterator>
375  double percentile(RandomAccessIterator first, RandomAccessIterator last,
376  double p, bool sorted)
377  {
378  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
380  // range is one value only is a special case
381  if (first+1 == last)
383  if (sorted) {
384  // have to take care of this special case
385  if (p>=100)
387  double j = p/100 * (std::distance(first,last)-1);
388  int i = static_cast<int>(j);
389  return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
390  }
391  using std::iterator_traits;
392  typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
393  std::vector<value_t> v_copy;
394  v_copy.reserve(std::distance(first,last));
395  std::copy(first, last, std::back_inserter(v_copy));
396  size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
397  if (i+2 < v_copy.size()) {
398  std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
399  }
400  else
401  std::sort(v_copy.begin(), v_copy.end());
402  return percentile(v_copy.begin(), v_copy.end(), p, true);
403  }
404 
405 
406 }}} // of namespace statistics, yat, and theplu
407 
408 #endif
void benjamini_hochberg(BidirectionalIterator1 first, BidirectionalIterator1 last, BidirectionalIterator2 result)
Benjamini Hochberg multiple test correction.
Definition: utility.h:265
double mad(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Median absolute deviation from median.
Definition: utility.h:311
data_reference data(Iter iter) const
Definition: iterator_traits.h:434
Class for containing sample labels.
Definition: Target.h:47
double mutual_information(const T &A)
Calculates the mutual information of A.
Definition: utility.h:343
Concept check for Data Iterator.
Definition: concept_check.h:224
Definition: iterator_traits.h:406
Concept check for Container2D.
Definition: concept_check.h:56
double skewness(const utility::VectorBase &)
Computes the skewness of the data in a vector.
double pearson_p_value(double r, unsigned int n)
one-sided p-value
double median(RandomAccessIterator first, RandomAccessIterator last, bool sorted=false)
Definition: utility.h:335
double entropy(InputIterator first, InputIterator last)
Definition: utility.h:292
DataIterator.
Definition: DataIterator.h:61
This is the yat interface to GSL vector.
Definition: Vector.h:57
This is the yat interface to GSL vector.
Definition: VectorBase.h:52
bool binary(size_t i) const
Default binary is set to false for all classes except class 0.
weight_reference weight(Iter iter) const
Definition: iterator_traits.h:440
double kurtosis(const utility::VectorBase &)
Computes the kurtosis of the data in a vector.
void add(T &o, ForwardIterator first, ForwardIterator last, const classifier::Target &target)
Definition: utility.h:255
double cdf_hypergeometric_P(unsigned int k, unsigned int n1, unsigned int n2, unsigned int t)
double percentile2(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:236
double percentile(RandomAccessIterator first, RandomAccessIterator last, double p, bool sorted=false)
Definition: utility.h:375
Functor to calculate percentile of a range.
Definition: Percentiler.h:50

Generated on Mon Jun 1 2015 12:29:51 for yat by  doxygen 1.8.5