3135 |
27 Nov 13 |
peter |
1 |
#ifndef _theplu_yat_statistics_utility_ |
3135 |
27 Nov 13 |
peter |
2 |
#define _theplu_yat_statistics_utility_ |
115 |
19 Jul 04 |
peter |
3 |
|
616 |
31 Aug 06 |
jari |
// $Id$ |
500 |
28 Jan 06 |
peter |
5 |
|
675 |
10 Oct 06 |
jari |
6 |
/* |
2119 |
12 Dec 09 |
peter |
Copyright (C) 2004 Jari Häkkinen, Peter Johansson |
831 |
27 Mar 07 |
peter |
Copyright (C) 2005 Peter Johansson |
2119 |
12 Dec 09 |
peter |
Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér |
4359 |
23 Aug 23 |
peter |
Copyright (C) 2007 Peter Johansson |
4359 |
23 Aug 23 |
peter |
Copyright (C) 2008 Jari Häkkinen, Peter Johansson |
4359 |
23 Aug 23 |
peter |
Copyright (C) 2009, 2010, 2011, 2013, 2014, 2016, 2018, 2020, 2021, 2022, 2023 Peter Johansson |
616 |
31 Aug 06 |
jari |
13 |
|
1437 |
25 Aug 08 |
peter |
This file is part of the yat library, http://dev.thep.lu.se/yat |
675 |
10 Oct 06 |
jari |
15 |
|
675 |
10 Oct 06 |
jari |
The yat library is free software; you can redistribute it and/or |
675 |
10 Oct 06 |
jari |
modify it under the terms of the GNU General Public License as |
1486 |
09 Sep 08 |
jari |
published by the Free Software Foundation; either version 3 of the |
675 |
10 Oct 06 |
jari |
License, or (at your option) any later version. |
675 |
10 Oct 06 |
jari |
20 |
|
675 |
10 Oct 06 |
jari |
The yat library is distributed in the hope that it will be useful, |
675 |
10 Oct 06 |
jari |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
675 |
10 Oct 06 |
jari |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
675 |
10 Oct 06 |
jari |
General Public License for more details. |
675 |
10 Oct 06 |
jari |
25 |
|
675 |
10 Oct 06 |
jari |
You should have received a copy of the GNU General Public License |
1487 |
10 Sep 08 |
jari |
along with yat. If not, see <http://www.gnu.org/licenses/>. |
675 |
10 Oct 06 |
jari |
28 |
*/ |
675 |
10 Oct 06 |
jari |
29 |
|
1317 |
21 May 08 |
peter |
30 |
#include "Percentiler.h" |
1317 |
21 May 08 |
peter |
31 |
|
822 |
18 Mar 07 |
peter |
32 |
#include "yat/classifier/DataLookupWeighted1D.h" |
837 |
22 Apr 07 |
peter |
33 |
#include "yat/classifier/Target.h" |
2502 |
28 Jun 11 |
peter |
34 |
#include "yat/normalizer/utility.h" |
2263 |
26 May 10 |
peter |
35 |
#include "yat/utility/concept_check.h" |
2502 |
28 Jun 11 |
peter |
36 |
#include "yat/utility/DataIterator.h" |
1500 |
15 Sep 08 |
peter |
37 |
#include "yat/utility/deprecate.h" |
1835 |
25 Feb 09 |
peter |
38 |
#include "yat/utility/iterator_traits.h" |
3745 |
27 Jul 18 |
peter |
39 |
#include "yat/utility/Matrix.h" |
3236 |
23 May 14 |
peter |
40 |
#include "yat/utility/sort_index.h" |
3137 |
28 Nov 13 |
peter |
41 |
#include "yat/utility/Vector.h" |
1025 |
01 Feb 08 |
peter |
42 |
#include "yat/utility/VectorBase.h" |
932 |
05 Oct 07 |
peter |
43 |
#include "yat/utility/yat_assert.h" |
675 |
10 Oct 06 |
jari |
44 |
|
2263 |
26 May 10 |
peter |
45 |
#include <boost/concept_check.hpp> |
3511 |
22 Jul 16 |
peter |
46 |
#include <boost/iterator/iterator_concepts.hpp> |
3511 |
22 Jul 16 |
peter |
47 |
#include <boost/iterator/iterator_traits.hpp> |
3236 |
23 May 14 |
peter |
48 |
#include <boost/iterator/permutation_iterator.hpp> |
3236 |
23 May 14 |
peter |
49 |
|
3137 |
28 Nov 13 |
peter |
50 |
#include <gsl/gsl_math.h> |
3137 |
28 Nov 13 |
peter |
51 |
#include <gsl/gsl_statistics_double.h> |
2263 |
26 May 10 |
peter |
52 |
|
464 |
16 Dec 05 |
peter |
53 |
#include <algorithm> |
464 |
16 Dec 05 |
peter |
54 |
#include <cmath> |
2496 |
22 Jun 11 |
peter |
55 |
#include <iterator> |
936 |
05 Oct 07 |
peter |
56 |
#include <stdexcept> |
115 |
19 Jul 04 |
peter |
57 |
#include <vector> |
115 |
19 Jul 04 |
peter |
58 |
|
115 |
19 Jul 04 |
peter |
59 |
namespace theplu { |
680 |
11 Oct 06 |
jari |
60 |
namespace yat { |
3135 |
27 Nov 13 |
peter |
61 |
namespace statistics { |
115 |
19 Jul 04 |
peter |
62 |
|
1305 |
14 May 08 |
peter |
63 |
/** |
1145 |
25 Feb 08 |
peter |
Adding a range [\a first, \a last) into an object of type T. The |
1145 |
25 Feb 08 |
peter |
requirements for the type T is to have an add(double, bool, double) |
1145 |
25 Feb 08 |
peter |
function. |
3511 |
22 Jul 16 |
peter |
67 |
|
3511 |
22 Jul 16 |
peter |
Type Requirements: |
3511 |
22 Jul 16 |
peter |
- \c ForwardIterator is a \readable_iterator |
3511 |
22 Jul 16 |
peter |
- \c ForwardIterator is \single_pass_iterator |
822 |
18 Mar 07 |
peter |
71 |
*/ |
1145 |
25 Feb 08 |
peter |
72 |
template <typename T, typename ForwardIterator> |
1145 |
25 Feb 08 |
peter |
73 |
void add(T& o, ForwardIterator first, ForwardIterator last, |
1874 |
17 Mar 09 |
peter |
74 |
const classifier::Target& target); |
822 |
18 Mar 07 |
peter |
75 |
|
2476 |
14 Apr 11 |
peter |
76 |
/** |
2476 |
14 Apr 11 |
peter |
\brief Benjamini Hochberg multiple test correction |
2476 |
14 Apr 11 |
peter |
78 |
|
3136 |
28 Nov 13 |
peter |
Given a sorted range of p-values such that |
3136 |
28 Nov 13 |
peter |
\f$ p_1 \le p_2 \le ... \le p_N \f$ a Benjamnini-Hochberg corrected |
3136 |
28 Nov 13 |
peter |
p-value, \c q, is calculated recursively as |
3136 |
28 Nov 13 |
peter |
\f$ q_i = \f$ min \f$(p_i \frac{N}{i}, q_{i+1})\f$ with the anchor |
2476 |
14 Apr 11 |
peter |
constraint that \f$ q_m = p_m \f$. |
2476 |
14 Apr 11 |
peter |
84 |
|
3236 |
23 May 14 |
peter |
Type Requirements: |
3511 |
22 Jul 16 |
peter |
- \c BidirectionalIterator1 is a \readable_iterator |
3511 |
22 Jul 16 |
peter |
- \c BidirectionalIterator1 is a \bidirectional_traversal_iterator |
3236 |
23 May 14 |
peter |
- \c BidirectionalIterator1::value type is convertible to \c double |
3511 |
22 Jul 16 |
peter |
- \c BidirectionalIterator2 is a \readable_iterator |
3511 |
22 Jul 16 |
peter |
- \c BidirectionalIterator2 is a \writable_iterator |
3511 |
22 Jul 16 |
peter |
- \c BidirectionalIterator2 is a \bidirectional_traversal_iterator |
3236 |
23 May 14 |
peter |
- \c double is convertible to \c BidirectionalIterator2::value_type |
2476 |
14 Apr 11 |
peter |
93 |
|
2476 |
14 Apr 11 |
peter |
\since New in yat 0.8 |
2476 |
14 Apr 11 |
peter |
95 |
*/ |
2476 |
14 Apr 11 |
peter |
96 |
template<typename BidirectionalIterator1, typename BidirectionalIterator2> |
2476 |
14 Apr 11 |
peter |
97 |
void benjamini_hochberg(BidirectionalIterator1 first, |
2476 |
14 Apr 11 |
peter |
98 |
BidirectionalIterator1 last, |
2476 |
14 Apr 11 |
peter |
99 |
BidirectionalIterator2 result); |
2476 |
14 Apr 11 |
peter |
100 |
|
3136 |
28 Nov 13 |
peter |
101 |
|
3236 |
23 May 14 |
peter |
102 |
/** |
3236 |
23 May 14 |
peter |
\brief Benjamini Hochberg multiple test correction |
3236 |
23 May 14 |
peter |
104 |
|
3236 |
23 May 14 |
peter |
Similar to benjamini_hochberg() but does not assume that input |
3236 |
23 May 14 |
peter |
range, [first, last), is sorted. The resulting range is the same |
3236 |
23 May 14 |
peter |
as if sorting input range, call benjamini_hochberg, and unsort |
3236 |
23 May 14 |
peter |
the result range. |
3236 |
23 May 14 |
peter |
109 |
|
3236 |
23 May 14 |
peter |
Type Requirements: |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator is a \readable_iterator |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator is a \random_access_traversal_iterator |
3236 |
23 May 14 |
peter |
- \c RandomAccessIterator::value type is convertible to \c double |
3511 |
22 Jul 16 |
peter |
- \c MutableRandomAccessIterator is a \readable_iterator |
3511 |
22 Jul 16 |
peter |
- \c MutableRandomAccessIterator is a \writable_iterator |
3511 |
22 Jul 16 |
peter |
- \c MutableRandomAccessIterator is a \random_access_traversal_iterator |
3236 |
23 May 14 |
peter |
- \c double is convertible to \c MutableRandomAccessIterator::value_type |
3236 |
23 May 14 |
peter |
118 |
|
3236 |
23 May 14 |
peter |
\since New in yat 0.13 |
3236 |
23 May 14 |
peter |
120 |
*/ |
3236 |
23 May 14 |
peter |
121 |
template<typename RandomAccessIterator, typename MutableRandomAccessIterator> |
3236 |
23 May 14 |
peter |
122 |
void benjamini_hochberg_unsorted(RandomAccessIterator first, |
3236 |
23 May 14 |
peter |
123 |
RandomAccessIterator last, |
3236 |
23 May 14 |
peter |
124 |
MutableRandomAccessIterator result); |
3236 |
23 May 14 |
peter |
125 |
|
3236 |
23 May 14 |
peter |
126 |
|
464 |
16 Dec 05 |
peter |
127 |
/// |
588 |
21 Jun 06 |
markus |
/// Calculates the probability to get \a k or smaller from a |
464 |
16 Dec 05 |
peter |
/// hypergeometric distribution with parameters \a n1 \a n2 \a |
464 |
16 Dec 05 |
peter |
/// t. Hypergeomtric situation you get in the following situation: |
464 |
16 Dec 05 |
peter |
/// Let there be \a n1 ways for a "good" selection and \a n2 ways |
464 |
16 Dec 05 |
peter |
/// for a "bad" selection out of a total of possibilities. Take \a |
464 |
16 Dec 05 |
peter |
/// t samples without replacement and \a k of those are "good" |
464 |
16 Dec 05 |
peter |
/// samples. \a k will follow a hypergeomtric distribution. |
464 |
16 Dec 05 |
peter |
135 |
/// |
4336 |
14 Apr 23 |
peter |
/// @return cumulative hypergeometric distribution functions P(k). |
648 |
14 Sep 06 |
peter |
137 |
/// |
3236 |
23 May 14 |
peter |
138 |
double cdf_hypergeometric_P(unsigned int k, unsigned int n1, |
1271 |
09 Apr 08 |
peter |
139 |
unsigned int n2, unsigned int t); |
169 |
23 Sep 04 |
peter |
140 |
|
3745 |
27 Jul 18 |
peter |
141 |
/** |
3745 |
27 Jul 18 |
peter |
Calculates the Pearson correlation between each pair of columns. |
169 |
23 Sep 04 |
peter |
143 |
|
3745 |
27 Jul 18 |
peter |
\return a symmetric matrix with correlations |
3745 |
27 Jul 18 |
peter |
145 |
|
3745 |
27 Jul 18 |
peter |
\since New in yat 0.16 |
3745 |
27 Jul 18 |
peter |
147 |
*/ |
4125 |
14 Jan 22 |
peter |
148 |
utility::Matrix correlation(const utility::MatrixBase& x); |
3745 |
27 Jul 18 |
peter |
149 |
|
3745 |
27 Jul 18 |
peter |
150 |
|
1145 |
25 Feb 08 |
peter |
151 |
/** |
3136 |
28 Nov 13 |
peter |
The entropy is calculated as \f$ - \sum_i p_i \log p_i \f$ where |
3135 |
27 Nov 13 |
peter |
\f$p_i = \frac{n_i}{\sum_j n_j} \f$ |
3135 |
27 Nov 13 |
peter |
154 |
|
3136 |
28 Nov 13 |
peter |
Requirements: |
3511 |
22 Jul 16 |
peter |
- \c InputIterator must be a \readable_iterator |
3511 |
22 Jul 16 |
peter |
- \c InputIterator must be a \single_pass_iterator |
3136 |
28 Nov 13 |
peter |
- \c InputIterator::value_type must be convertible to \c double |
3136 |
28 Nov 13 |
peter |
159 |
|
3136 |
28 Nov 13 |
peter |
\since New in yat 0.12 |
3135 |
27 Nov 13 |
peter |
161 |
*/ |
3135 |
27 Nov 13 |
peter |
162 |
template<typename InputIterator> |
3135 |
27 Nov 13 |
peter |
163 |
double entropy(InputIterator first, InputIterator last); |
3135 |
27 Nov 13 |
peter |
164 |
|
3135 |
27 Nov 13 |
peter |
165 |
/** |
1145 |
25 Feb 08 |
peter |
\brief one-sided p-value |
1145 |
25 Feb 08 |
peter |
167 |
|
1145 |
25 Feb 08 |
peter |
This function uses the t-distribution to calculate the one-sided |
1145 |
25 Feb 08 |
peter |
p-value. Given that the true correlation is zero (Null |
1145 |
25 Feb 08 |
peter |
hypothesis) the estimated correlation, r, after a transformation |
1145 |
25 Feb 08 |
peter |
is t-distributed: |
1145 |
25 Feb 08 |
peter |
172 |
|
1145 |
25 Feb 08 |
peter |
\f$ \sqrt{(n-2)} \frac{r}{\sqrt{(1-r^2)}} \in t(n-2) \f$ |
1145 |
25 Feb 08 |
peter |
174 |
|
1145 |
25 Feb 08 |
peter |
\return Probability that correlation is larger than \a r by |
1553 |
07 Oct 08 |
peter |
chance when having \a n samples. For \a r larger or equal to 1.0, |
1553 |
07 Oct 08 |
peter |
0.0 is returned. For \a r smaller or equal to -1.0, 1.0 is |
1553 |
07 Oct 08 |
peter |
returned. |
1145 |
25 Feb 08 |
peter |
179 |
*/ |
1271 |
09 Apr 08 |
peter |
180 |
double pearson_p_value(double r, unsigned int n); |
1145 |
25 Feb 08 |
peter |
181 |
|
464 |
16 Dec 05 |
peter |
182 |
/// |
703 |
18 Dec 06 |
jari |
/// @brief Computes the kurtosis of the data in a vector. |
588 |
21 Jun 06 |
markus |
184 |
/// |
703 |
18 Dec 06 |
jari |
/// The kurtosis measures how sharply peaked a distribution is, |
703 |
18 Dec 06 |
jari |
/// relative to its width. The kurtosis is normalized to zero for a |
703 |
18 Dec 06 |
jari |
/// gaussian distribution. |
703 |
18 Dec 06 |
jari |
188 |
/// |
1025 |
01 Feb 08 |
peter |
189 |
double kurtosis(const utility::VectorBase&); |
588 |
21 Jun 06 |
markus |
190 |
|
588 |
21 Jun 06 |
markus |
191 |
|
3511 |
22 Jul 16 |
peter |
192 |
/** |
3511 |
22 Jul 16 |
peter |
\brief Median absolute deviation from median |
3511 |
22 Jul 16 |
peter |
194 |
|
3511 |
22 Jul 16 |
peter |
Function is non-mutable function |
3511 |
22 Jul 16 |
peter |
196 |
|
3511 |
22 Jul 16 |
peter |
Type Requirements: |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \ref |
3511 |
22 Jul 16 |
peter |
concept_data_iterator |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \random_access_traversal_iterator |
3870 |
24 Feb 20 |
peter |
- \c For unweighted iterator, RandomAccessIterator must be an |
3870 |
24 Feb 20 |
peter |
\lvalue_iterator as implied by \forward_iterator. |
3511 |
22 Jul 16 |
peter |
203 |
|
3511 |
22 Jul 16 |
peter |
Since 0.6 function also work with a \ref |
3511 |
22 Jul 16 |
peter |
concept_weighted_iterator |
3511 |
22 Jul 16 |
peter |
206 |
*/ |
2202 |
21 Feb 10 |
peter |
207 |
template <class RandomAccessIterator> |
3136 |
28 Nov 13 |
peter |
208 |
double mad(RandomAccessIterator first, RandomAccessIterator last, |
2202 |
21 Feb 10 |
peter |
209 |
bool sorted=false); |
1835 |
25 Feb 09 |
peter |
210 |
|
672 |
07 Oct 06 |
peter |
211 |
|
3511 |
22 Jul 16 |
peter |
212 |
/** |
3511 |
22 Jul 16 |
peter |
Median is defined to be value in the middle. If number of values |
3511 |
22 Jul 16 |
peter |
is even median is the average of the two middle values. the |
3511 |
22 Jul 16 |
peter |
median value is given by p equal to 50. If \a sorted is false |
3511 |
22 Jul 16 |
peter |
(default), the range is copied, the copy is sorted, and then |
3511 |
22 Jul 16 |
peter |
used to calculate the median. |
3511 |
22 Jul 16 |
peter |
218 |
|
3511 |
22 Jul 16 |
peter |
Function is a non-mutable function, i.e., \a first and \a last |
3511 |
22 Jul 16 |
peter |
can be const_iterators. |
3511 |
22 Jul 16 |
peter |
221 |
|
3511 |
22 Jul 16 |
peter |
Type Requirements: |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \ref |
3511 |
22 Jul 16 |
peter |
concept_data_iterator |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \random_access_traversal_iterator |
3870 |
24 Feb 20 |
peter |
- \c For unweighted iterator, RandomAccessIterator must be an |
3870 |
24 Feb 20 |
peter |
\lvalue_iterator as implied by \forward_iterator. |
3511 |
22 Jul 16 |
peter |
228 |
|
3511 |
22 Jul 16 |
peter |
\return median of range |
3511 |
22 Jul 16 |
peter |
230 |
*/ |
3136 |
28 Nov 13 |
peter |
231 |
template <class RandomAccessIterator> |
3136 |
28 Nov 13 |
peter |
232 |
double median(RandomAccessIterator first, RandomAccessIterator last, |
3136 |
28 Nov 13 |
peter |
233 |
bool sorted=false); |
511 |
18 Feb 06 |
peter |
234 |
|
1874 |
17 Mar 09 |
peter |
235 |
|
671 |
07 Oct 06 |
peter |
236 |
/** |
3137 |
28 Nov 13 |
peter |
\brief Calculates the mutual information of \a A. |
3137 |
28 Nov 13 |
peter |
238 |
|
3137 |
28 Nov 13 |
peter |
The elements in A are unnormalized probabilies of the joint |
3137 |
28 Nov 13 |
peter |
distribution. |
3137 |
28 Nov 13 |
peter |
241 |
|
3137 |
28 Nov 13 |
peter |
The mutual information is calculated as \f$ \sum \sum p(x,y) \log_2 |
3137 |
28 Nov 13 |
peter |
\frac {p(x,y)} {p(x)p(y)} \f$ where |
3137 |
28 Nov 13 |
peter |
\f$ p(x,y) = \frac {A_{xy}}{\sum_{x,y} A_{xy}} \f$; |
3137 |
28 Nov 13 |
peter |
\f$ p(x) = \sum_y A_{xy} / \sum_{x,y} A_{xy} \f$; |
3137 |
28 Nov 13 |
peter |
\f$ p(y) = \sum_x A_{xy} / \sum_{x,y} A_{xy} \f$ |
3137 |
28 Nov 13 |
peter |
247 |
|
3137 |
28 Nov 13 |
peter |
Requirements: |
3137 |
28 Nov 13 |
peter |
- \c T must be a model of \ref concept_container_2d |
3137 |
28 Nov 13 |
peter |
- \c T::value_type must be convertible to \c double |
3137 |
28 Nov 13 |
peter |
251 |
|
3137 |
28 Nov 13 |
peter |
\return mutual information in bits; if you want in natural base |
3137 |
28 Nov 13 |
peter |
multiply with \c M_LN2 (defined in \c gsl/gsl_math.h ) |
3137 |
28 Nov 13 |
peter |
254 |
|
3137 |
28 Nov 13 |
peter |
\since New in yat 0.12 |
3137 |
28 Nov 13 |
peter |
256 |
*/ |
3137 |
28 Nov 13 |
peter |
257 |
template<class T> |
3137 |
28 Nov 13 |
peter |
258 |
double mutual_information(const T& A); |
3137 |
28 Nov 13 |
peter |
259 |
|
3137 |
28 Nov 13 |
peter |
260 |
/** |
4336 |
14 Apr 23 |
peter |
We have \a n1 samples of type 1 and \a n2 samples of type 2. |
4336 |
14 Apr 23 |
peter |
Samples are drawn with replacement until \a t samles of type 2 |
4336 |
14 Apr 23 |
peter |
are drawn. Then \a k, number of drawn samples of type 1, follows |
4336 |
14 Apr 23 |
peter |
negative hypergeometric distribution. |
4336 |
14 Apr 23 |
peter |
265 |
|
4336 |
14 Apr 23 |
peter |
\return probability to get \c k from a negative hypergeometric |
4336 |
14 Apr 23 |
peter |
distribution with parameters \c n1, \c n2, and \c t. |
4336 |
14 Apr 23 |
peter |
268 |
|
4336 |
14 Apr 23 |
peter |
\since New in yat 0.21 |
4336 |
14 Apr 23 |
peter |
270 |
*/ |
4336 |
14 Apr 23 |
peter |
271 |
double negative_hypergeometric_pdf(unsigned k, unsigned n1, |
4336 |
14 Apr 23 |
peter |
272 |
unsigned n2, unsigned t); |
4336 |
14 Apr 23 |
peter |
273 |
|
4336 |
14 Apr 23 |
peter |
274 |
/** |
671 |
07 Oct 06 |
peter |
The percentile is determined by the \a p, a number between 0 and |
671 |
07 Oct 06 |
peter |
100. The percentile is found by interpolation, using the formula |
671 |
07 Oct 06 |
peter |
\f$ percentile = (1 - \delta) x_i + \delta x_{i+1} \f$ where \a |
671 |
07 Oct 06 |
peter |
p is floor\f$((n - 1)p/100)\f$ and \f$ \delta \f$ is \f$ |
671 |
07 Oct 06 |
peter |
(n-1)p/100 - i \f$.Thus the minimum value of the vector is given |
671 |
07 Oct 06 |
peter |
by p equal to zero, the maximum is given by p equal to 100 and |
671 |
07 Oct 06 |
peter |
the median value is given by p equal to 50. If @a sorted |
671 |
07 Oct 06 |
peter |
is false (default), the vector is copied, the copy is sorted, |
671 |
07 Oct 06 |
peter |
and then used to calculate the median. |
932 |
05 Oct 07 |
peter |
284 |
|
933 |
05 Oct 07 |
peter |
Function is a non-mutable function, i.e., \a first and \a last |
933 |
05 Oct 07 |
peter |
can be const_iterators. |
3136 |
28 Nov 13 |
peter |
287 |
|
1874 |
17 Mar 09 |
peter |
Requirements: RandomAccessIterator is an iterator over a range of |
1874 |
17 Mar 09 |
peter |
doubles (or any type being convertable to double). |
3136 |
28 Nov 13 |
peter |
290 |
|
932 |
05 Oct 07 |
peter |
@return \a p'th percentile of range |
1404 |
07 Aug 08 |
peter |
292 |
|
1404 |
07 Aug 08 |
peter |
\deprecated percentile2 will replace this function in the future |
1404 |
07 Aug 08 |
peter |
294 |
|
1404 |
07 Aug 08 |
peter |
\note the definition of percentile used here is not identical to |
1404 |
07 Aug 08 |
peter |
that one used in percentile2 and Percentile. The difference is |
1404 |
07 Aug 08 |
peter |
smaller for large ranges. |
671 |
07 Oct 06 |
peter |
298 |
*/ |
1404 |
07 Aug 08 |
peter |
299 |
template <class RandomAccessIterator> |
3136 |
28 Nov 13 |
peter |
300 |
double percentile(RandomAccessIterator first, RandomAccessIterator last, |
1874 |
17 Mar 09 |
peter |
301 |
double p, bool sorted=false) YAT_DEPRECATE; |
1874 |
17 Mar 09 |
peter |
302 |
|
1874 |
17 Mar 09 |
peter |
303 |
/** |
1874 |
17 Mar 09 |
peter |
\see Percentiler |
3136 |
28 Nov 13 |
peter |
305 |
|
3511 |
22 Jul 16 |
peter |
Type Requirements: |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \ref |
3511 |
22 Jul 16 |
peter |
concept_data_iterator |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \random_access_traversal_iterator |
3870 |
24 Feb 20 |
peter |
- \c For unweighted iterator, RandomAccessIterator must be an |
3870 |
24 Feb 20 |
peter |
\lvalue_iterator as implied by \forward_iterator. |
3511 |
22 Jul 16 |
peter |
312 |
|
1874 |
17 Mar 09 |
peter |
\since new in yat 0.5 |
1874 |
17 Mar 09 |
peter |
314 |
*/ |
1874 |
17 Mar 09 |
peter |
315 |
template <class RandomAccessIterator> |
3136 |
28 Nov 13 |
peter |
316 |
double percentile2(RandomAccessIterator first, RandomAccessIterator last, |
1874 |
17 Mar 09 |
peter |
317 |
double p, bool sorted=false) |
1874 |
17 Mar 09 |
peter |
318 |
{ |
1874 |
17 Mar 09 |
peter |
319 |
Percentiler percentiler(p, sorted); |
1874 |
17 Mar 09 |
peter |
320 |
return percentiler(first, last); |
1874 |
17 Mar 09 |
peter |
321 |
} |
1874 |
17 Mar 09 |
peter |
322 |
|
1874 |
17 Mar 09 |
peter |
323 |
/// |
1874 |
17 Mar 09 |
peter |
/// @brief Computes the skewness of the data in a vector. |
1874 |
17 Mar 09 |
peter |
325 |
/// |
1874 |
17 Mar 09 |
peter |
/// The skewness measures the asymmetry of the tails of a |
1874 |
17 Mar 09 |
peter |
/// distribution. |
1874 |
17 Mar 09 |
peter |
328 |
/// |
1874 |
17 Mar 09 |
peter |
329 |
double skewness(const utility::VectorBase&); |
1874 |
17 Mar 09 |
peter |
330 |
|
3236 |
23 May 14 |
peter |
331 |
|
1874 |
17 Mar 09 |
peter |
//////// template implementations /////////// |
1874 |
17 Mar 09 |
peter |
333 |
|
1874 |
17 Mar 09 |
peter |
334 |
template <typename T, typename ForwardIterator> |
1874 |
17 Mar 09 |
peter |
335 |
void add(T& o, ForwardIterator first, ForwardIterator last, |
1874 |
17 Mar 09 |
peter |
336 |
const classifier::Target& target) |
1874 |
17 Mar 09 |
peter |
337 |
{ |
3511 |
22 Jul 16 |
peter |
338 |
BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>)); |
3511 |
22 Jul 16 |
peter |
339 |
BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<ForwardIterator>)); |
2202 |
21 Feb 10 |
peter |
340 |
utility::iterator_traits<ForwardIterator> traits; |
1874 |
17 Mar 09 |
peter |
341 |
for (size_t i=0; first!=last; ++i, ++first) |
2202 |
21 Feb 10 |
peter |
342 |
o.add(traits.data(first), target.binary(i), traits.weight(first)); |
3236 |
23 May 14 |
peter |
343 |
} |
1874 |
17 Mar 09 |
peter |
344 |
|
1874 |
17 Mar 09 |
peter |
345 |
|
2476 |
14 Apr 11 |
peter |
346 |
template<typename BidirectionalIterator1, typename BidirectionalIterator2> |
2476 |
14 Apr 11 |
peter |
347 |
void benjamini_hochberg(BidirectionalIterator1 first, |
2476 |
14 Apr 11 |
peter |
348 |
BidirectionalIterator1 last, |
2476 |
14 Apr 11 |
peter |
349 |
BidirectionalIterator2 result) |
2476 |
14 Apr 11 |
peter |
350 |
{ |
3511 |
22 Jul 16 |
peter |
351 |
BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator1>)); |
3511 |
22 Jul 16 |
peter |
352 |
BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator1>)); |
3511 |
22 Jul 16 |
peter |
353 |
|
3511 |
22 Jul 16 |
peter |
354 |
BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<BidirectionalIterator2>)); |
3511 |
22 Jul 16 |
peter |
355 |
BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<BidirectionalIterator2>)); |
3511 |
22 Jul 16 |
peter |
356 |
BOOST_CONCEPT_ASSERT((boost_concepts::BidirectionalTraversal<BidirectionalIterator2>)); |
3511 |
22 Jul 16 |
peter |
357 |
|
3511 |
22 Jul 16 |
peter |
358 |
typedef typename boost::iterator_difference<BidirectionalIterator1>::type |
3511 |
22 Jul 16 |
peter |
359 |
diff_type; |
3511 |
22 Jul 16 |
peter |
360 |
|
3511 |
22 Jul 16 |
peter |
361 |
diff_type n = std::distance(first, last); |
2496 |
22 Jun 11 |
peter |
362 |
if (!n) |
2496 |
22 Jun 11 |
peter |
363 |
return; |
2496 |
22 Jun 11 |
peter |
364 |
std::advance(result, n-1); |
2496 |
22 Jun 11 |
peter |
365 |
first = last; |
2476 |
14 Apr 11 |
peter |
366 |
--first; |
3511 |
22 Jul 16 |
peter |
367 |
diff_type rank = n; |
3236 |
23 May 14 |
peter |
368 |
|
3511 |
22 Jul 16 |
peter |
369 |
double x = 1.0; |
2476 |
14 Apr 11 |
peter |
370 |
while (rank) { |
3511 |
22 Jul 16 |
peter |
371 |
x = std::min(*first * n/static_cast<double>(rank), x); |
3511 |
22 Jul 16 |
peter |
372 |
*result = x; |
2476 |
14 Apr 11 |
peter |
373 |
--rank; |
2476 |
14 Apr 11 |
peter |
374 |
--first; |
2476 |
14 Apr 11 |
peter |
375 |
--result; |
2476 |
14 Apr 11 |
peter |
376 |
} |
2476 |
14 Apr 11 |
peter |
377 |
} |
2476 |
14 Apr 11 |
peter |
378 |
|
2476 |
14 Apr 11 |
peter |
379 |
|
3236 |
23 May 14 |
peter |
380 |
template<typename RandomAccessIterator, typename MutableRandomAccessIterator> |
3236 |
23 May 14 |
peter |
381 |
void benjamini_hochberg_unsorted(RandomAccessIterator first, |
3236 |
23 May 14 |
peter |
382 |
RandomAccessIterator last, |
3236 |
23 May 14 |
peter |
383 |
MutableRandomAccessIterator result) |
3236 |
23 May 14 |
peter |
384 |
{ |
3511 |
22 Jul 16 |
peter |
385 |
BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<RandomAccessIterator>)); |
3511 |
22 Jul 16 |
peter |
386 |
BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>)); |
3236 |
23 May 14 |
peter |
387 |
|
3511 |
22 Jul 16 |
peter |
388 |
BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<MutableRandomAccessIterator>)); |
3511 |
22 Jul 16 |
peter |
389 |
BOOST_CONCEPT_ASSERT((boost_concepts::WritableIterator<MutableRandomAccessIterator>)); |
3511 |
22 Jul 16 |
peter |
390 |
BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<MutableRandomAccessIterator>)); |
3511 |
22 Jul 16 |
peter |
391 |
|
3236 |
23 May 14 |
peter |
392 |
std::vector<size_t> idx; |
3236 |
23 May 14 |
peter |
393 |
utility::sort_index(first, last, idx); |
3236 |
23 May 14 |
peter |
394 |
benjamini_hochberg(boost::make_permutation_iterator(first, idx.begin()), |
3236 |
23 May 14 |
peter |
395 |
boost::make_permutation_iterator(first, idx.end()), |
3236 |
23 May 14 |
peter |
396 |
boost::make_permutation_iterator(result, idx.begin())); |
3236 |
23 May 14 |
peter |
397 |
} |
3236 |
23 May 14 |
peter |
398 |
|
3236 |
23 May 14 |
peter |
399 |
|
3135 |
27 Nov 13 |
peter |
400 |
template<typename InputIterator> |
3135 |
27 Nov 13 |
peter |
401 |
double entropy(InputIterator first, InputIterator last) |
3135 |
27 Nov 13 |
peter |
402 |
{ |
3511 |
22 Jul 16 |
peter |
403 |
BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<InputIterator>)); |
3511 |
22 Jul 16 |
peter |
404 |
BOOST_CONCEPT_ASSERT((boost_concepts::SinglePassIterator<InputIterator>)); |
3135 |
27 Nov 13 |
peter |
405 |
using boost::Convertible; |
3526 |
11 Oct 16 |
peter |
406 |
typedef typename boost::iterator_value<InputIterator>::type T; |
3135 |
27 Nov 13 |
peter |
407 |
BOOST_CONCEPT_ASSERT((Convertible<T,double>)); |
3135 |
27 Nov 13 |
peter |
408 |
double sum = 0; |
3135 |
27 Nov 13 |
peter |
409 |
double N = 0; |
3135 |
27 Nov 13 |
peter |
410 |
for (; first != last; ++first) { |
3135 |
27 Nov 13 |
peter |
411 |
if (*first) { |
3135 |
27 Nov 13 |
peter |
412 |
N += *first; |
3135 |
27 Nov 13 |
peter |
413 |
sum += *first * std::log(static_cast<double>(*first)); |
3135 |
27 Nov 13 |
peter |
414 |
} |
3135 |
27 Nov 13 |
peter |
415 |
} |
3135 |
27 Nov 13 |
peter |
416 |
return -sum / N + log(N); |
3135 |
27 Nov 13 |
peter |
417 |
} |
3135 |
27 Nov 13 |
peter |
418 |
|
3135 |
27 Nov 13 |
peter |
419 |
|
2202 |
21 Feb 10 |
peter |
420 |
template <class RandomAccessIterator> |
3236 |
23 May 14 |
peter |
421 |
double mad(RandomAccessIterator first, RandomAccessIterator last, |
2202 |
21 Feb 10 |
peter |
422 |
bool sorted) |
1874 |
17 Mar 09 |
peter |
423 |
{ |
2263 |
26 May 10 |
peter |
424 |
BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>)); |
3511 |
22 Jul 16 |
peter |
425 |
BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>)); |
1874 |
17 Mar 09 |
peter |
426 |
double m = median(first, last, sorted); |
3526 |
11 Oct 16 |
peter |
427 |
typedef typename boost::iterator_value<RandomAccessIterator>::type T; |
2502 |
28 Jun 11 |
peter |
428 |
std::vector<T> ad(std::distance(first, last)); |
2502 |
28 Jun 11 |
peter |
// copy weights if needed |
2502 |
28 Jun 11 |
peter |
430 |
normalizer::detail::copy_weight_if_weighted(first, last, ad.begin()); |
2502 |
28 Jun 11 |
peter |
// assign data (absolute deviation from median) |
2502 |
28 Jun 11 |
peter |
432 |
utility::iterator_traits<RandomAccessIterator> traits; |
2502 |
28 Jun 11 |
peter |
433 |
utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin()); |
2502 |
28 Jun 11 |
peter |
434 |
while (first!=last) { |
2502 |
28 Jun 11 |
peter |
435 |
*first2 = std::abs(traits.data(first)-m); |
2502 |
28 Jun 11 |
peter |
436 |
++first; |
2502 |
28 Jun 11 |
peter |
437 |
++first2; |
1874 |
17 Mar 09 |
peter |
438 |
} |
3245 |
24 May 14 |
peter |
439 |
return median(ad.begin(), ad.end(), false); |
1874 |
17 Mar 09 |
peter |
440 |
} |
1874 |
17 Mar 09 |
peter |
441 |
|
3137 |
28 Nov 13 |
peter |
442 |
|
3137 |
28 Nov 13 |
peter |
443 |
template <class RandomAccessIterator> |
3137 |
28 Nov 13 |
peter |
444 |
double median(RandomAccessIterator first, RandomAccessIterator last, |
3137 |
28 Nov 13 |
peter |
445 |
bool sorted) |
3137 |
28 Nov 13 |
peter |
446 |
{ |
3511 |
22 Jul 16 |
peter |
447 |
BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>)); |
3511 |
22 Jul 16 |
peter |
448 |
BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>)); |
3137 |
28 Nov 13 |
peter |
449 |
return percentile2(first, last, 50.0, sorted); |
1874 |
17 Mar 09 |
peter |
450 |
} |
1874 |
17 Mar 09 |
peter |
451 |
|
1874 |
17 Mar 09 |
peter |
452 |
|
3137 |
28 Nov 13 |
peter |
453 |
template<class T> |
3137 |
28 Nov 13 |
peter |
454 |
double mutual_information(const T& n) |
3137 |
28 Nov 13 |
peter |
455 |
{ |
3137 |
28 Nov 13 |
peter |
456 |
BOOST_CONCEPT_ASSERT((utility::Container2D<T>)); |
3137 |
28 Nov 13 |
peter |
457 |
using boost::Convertible; |
3137 |
28 Nov 13 |
peter |
458 |
BOOST_CONCEPT_ASSERT((Convertible<typename T::value_type,double>)); |
3137 |
28 Nov 13 |
peter |
459 |
|
3137 |
28 Nov 13 |
peter |
// p_x = \sum_y p_xy |
3137 |
28 Nov 13 |
peter |
461 |
|
3137 |
28 Nov 13 |
peter |
// Mutual Information is defined as |
3137 |
28 Nov 13 |
peter |
// \sum_xy p_xy * log (p_xy / (p_x p_y)) = |
3137 |
28 Nov 13 |
peter |
// \sum_xy p_xy * [log p_xy - log p_x - log p_y] |
3137 |
28 Nov 13 |
peter |
// \sum_xy p_xy log p_xy - p_xy log p_x - p_xy log p_y |
3137 |
28 Nov 13 |
peter |
// \sum_xy p_xy log p_xy - \sum_x p_x log p_x - \sum_y p_y log p_y |
3137 |
28 Nov 13 |
peter |
// - entropy_xy + entropy_x + entropy_y |
3137 |
28 Nov 13 |
peter |
468 |
|
3137 |
28 Nov 13 |
peter |
469 |
utility::Vector rowsum(n.columns(), 0); |
3137 |
28 Nov 13 |
peter |
470 |
for (size_t c = 0; c<n.columns(); ++c) |
3137 |
28 Nov 13 |
peter |
471 |
rowsum(c) = std::accumulate(n.begin_column(c), n.end_column(c), 0); |
3137 |
28 Nov 13 |
peter |
472 |
|
3137 |
28 Nov 13 |
peter |
473 |
utility::Vector colsum(n.rows(), 0); |
3137 |
28 Nov 13 |
peter |
474 |
for (size_t r = 0; r<n.rows(); ++r) |
3137 |
28 Nov 13 |
peter |
475 |
colsum(r) = std::accumulate(n.begin_row(r), n.end_row(r), 0); |
3137 |
28 Nov 13 |
peter |
476 |
|
3137 |
28 Nov 13 |
peter |
477 |
double mi = - entropy(n.begin(), n.end()); |
3137 |
28 Nov 13 |
peter |
478 |
mi += entropy(rowsum.begin(), rowsum.end()); |
3137 |
28 Nov 13 |
peter |
479 |
mi += entropy(colsum.begin(), colsum.end()); |
3137 |
28 Nov 13 |
peter |
480 |
|
3137 |
28 Nov 13 |
peter |
481 |
return mi/M_LN2; |
3137 |
28 Nov 13 |
peter |
482 |
} |
3137 |
28 Nov 13 |
peter |
483 |
|
3137 |
28 Nov 13 |
peter |
484 |
|
1874 |
17 Mar 09 |
peter |
485 |
template <class RandomAccessIterator> |
3137 |
28 Nov 13 |
peter |
486 |
double percentile(RandomAccessIterator first, RandomAccessIterator last, |
2673 |
03 Dec 11 |
peter |
487 |
double p, bool sorted) |
464 |
16 Dec 05 |
peter |
488 |
{ |
2202 |
21 Feb 10 |
peter |
489 |
BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>)); |
2263 |
26 May 10 |
peter |
490 |
BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>)); |
1404 |
07 Aug 08 |
peter |
// range is one value only is a special case |
1404 |
07 Aug 08 |
peter |
492 |
if (first+1 == last) |
1404 |
07 Aug 08 |
peter |
493 |
return utility::iterator_traits<RandomAccessIterator>().data(first); |
1404 |
07 Aug 08 |
peter |
494 |
if (sorted) { |
1404 |
07 Aug 08 |
peter |
// have to take care of this special case |
1404 |
07 Aug 08 |
peter |
496 |
if (p>=100) |
1404 |
07 Aug 08 |
peter |
497 |
return utility::iterator_traits<RandomAccessIterator>().data(--last); |
1404 |
07 Aug 08 |
peter |
498 |
double j = p/100 * (std::distance(first,last)-1); |
1404 |
07 Aug 08 |
peter |
499 |
int i = static_cast<int>(j); |
1404 |
07 Aug 08 |
peter |
500 |
return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1]; |
1404 |
07 Aug 08 |
peter |
501 |
} |
2503 |
29 Jun 11 |
peter |
502 |
using std::iterator_traits; |
2503 |
29 Jun 11 |
peter |
503 |
typedef typename iterator_traits<RandomAccessIterator>::value_type value_t; |
3239 |
24 May 14 |
peter |
504 |
std::vector<value_t> v_copy(first, last); |
1404 |
07 Aug 08 |
peter |
505 |
size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1)); |
1404 |
07 Aug 08 |
peter |
506 |
if (i+2 < v_copy.size()) { |
1404 |
07 Aug 08 |
peter |
507 |
std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end()); |
1404 |
07 Aug 08 |
peter |
508 |
} |
1404 |
07 Aug 08 |
peter |
509 |
else |
1404 |
07 Aug 08 |
peter |
510 |
std::sort(v_copy.begin(), v_copy.end()); |
1404 |
07 Aug 08 |
peter |
511 |
return percentile(v_copy.begin(), v_copy.end(), p, true); |
1404 |
07 Aug 08 |
peter |
512 |
} |
1404 |
07 Aug 08 |
peter |
513 |
|
1404 |
07 Aug 08 |
peter |
514 |
|
683 |
11 Oct 06 |
jari |
515 |
}}} // of namespace statistics, yat, and theplu |
588 |
21 Jun 06 |
markus |
516 |
|
115 |
19 Jul 04 |
peter |
517 |
#endif |