2721 |
12 Apr 12 |
peter |
1 |
#ifndef _theplu_yat_statistics_kolmogorov_smirnov_ |
2721 |
12 Apr 12 |
peter |
2 |
#define _theplu_yat_statistics_kolmogorov_smirnov_ |
1003 |
18 Jan 08 |
peter |
3 |
|
1003 |
18 Jan 08 |
peter |
// $Id$ |
1003 |
18 Jan 08 |
peter |
5 |
|
1003 |
18 Jan 08 |
peter |
6 |
/* |
2119 |
12 Dec 09 |
peter |
Copyright (C) 2008 Jari Häkkinen, Peter Johansson |
4359 |
23 Aug 23 |
peter |
Copyright (C) 2009, 2010, 2011, 2012, 2013, 2016, 2022 Peter Johansson |
1003 |
18 Jan 08 |
peter |
9 |
|
1437 |
25 Aug 08 |
peter |
This file is part of the yat library, http://dev.thep.lu.se/yat |
1003 |
18 Jan 08 |
peter |
11 |
|
1003 |
18 Jan 08 |
peter |
The yat library is free software; you can redistribute it and/or |
1003 |
18 Jan 08 |
peter |
modify it under the terms of the GNU General Public License as |
1486 |
09 Sep 08 |
jari |
published by the Free Software Foundation; either version 3 of the |
1003 |
18 Jan 08 |
peter |
License, or (at your option) any later version. |
1003 |
18 Jan 08 |
peter |
16 |
|
1003 |
18 Jan 08 |
peter |
The yat library is distributed in the hope that it will be useful, |
1003 |
18 Jan 08 |
peter |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
1003 |
18 Jan 08 |
peter |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
1003 |
18 Jan 08 |
peter |
General Public License for more details. |
1003 |
18 Jan 08 |
peter |
21 |
|
1003 |
18 Jan 08 |
peter |
You should have received a copy of the GNU General Public License |
1487 |
10 Sep 08 |
jari |
along with yat. If not, see <http://www.gnu.org/licenses/>. |
1003 |
18 Jan 08 |
peter |
24 |
*/ |
1003 |
18 Jan 08 |
peter |
25 |
|
3523 |
10 Oct 16 |
peter |
26 |
#include "yat/utility/concept_check.h" |
3523 |
10 Oct 16 |
peter |
27 |
|
2202 |
21 Feb 10 |
peter |
28 |
#include <boost/concept_check.hpp> |
3523 |
10 Oct 16 |
peter |
29 |
#include <boost/iterator/iterator_concepts.hpp> |
2202 |
21 Feb 10 |
peter |
30 |
|
2055 |
08 Sep 09 |
peter |
31 |
#include <iosfwd> |
1003 |
18 Jan 08 |
peter |
32 |
#include <set> |
1003 |
18 Jan 08 |
peter |
33 |
#include <vector> |
1003 |
18 Jan 08 |
peter |
34 |
|
1003 |
18 Jan 08 |
peter |
35 |
namespace theplu { |
1003 |
18 Jan 08 |
peter |
36 |
namespace yat { |
2721 |
12 Apr 12 |
peter |
37 |
namespace statistics { |
1003 |
18 Jan 08 |
peter |
38 |
|
1003 |
18 Jan 08 |
peter |
39 |
/** |
2912 |
17 Dec 12 |
peter |
\brief Kolmogorov Smirnov Test |
1003 |
18 Jan 08 |
peter |
41 |
*/ |
1003 |
18 Jan 08 |
peter |
42 |
class KolmogorovSmirnov |
1003 |
18 Jan 08 |
peter |
43 |
{ |
1003 |
18 Jan 08 |
peter |
44 |
public: |
1701 |
07 Jan 09 |
peter |
45 |
/** |
2721 |
12 Apr 12 |
peter |
struct used to store data in KolmogorovSmirnov. |
2721 |
12 Apr 12 |
peter |
47 |
|
1701 |
07 Jan 09 |
peter |
This struct is public to make usage of the add range function |
1701 |
07 Jan 09 |
peter |
more convenient. |
1701 |
07 Jan 09 |
peter |
50 |
|
1701 |
07 Jan 09 |
peter |
\since New in yat 0.5 |
1701 |
07 Jan 09 |
peter |
52 |
*/ |
1701 |
07 Jan 09 |
peter |
53 |
struct Element |
1701 |
07 Jan 09 |
peter |
54 |
{ |
1701 |
07 Jan 09 |
peter |
55 |
/** |
1701 |
07 Jan 09 |
peter |
\brief default constructor |
1701 |
07 Jan 09 |
peter |
57 |
*/ |
2515 |
11 Jul 11 |
peter |
58 |
Element(void); |
1701 |
07 Jan 09 |
peter |
59 |
|
1701 |
07 Jan 09 |
peter |
60 |
/** |
1701 |
07 Jan 09 |
peter |
\brief Constructor |
1701 |
07 Jan 09 |
peter |
62 |
*/ |
2515 |
11 Jul 11 |
peter |
63 |
Element(double x, bool class_label, double w=1.0); |
1701 |
07 Jan 09 |
peter |
64 |
|
1701 |
07 Jan 09 |
peter |
65 |
/** |
1701 |
07 Jan 09 |
peter |
\brief data value |
1701 |
07 Jan 09 |
peter |
67 |
*/ |
1701 |
07 Jan 09 |
peter |
68 |
double value; |
2721 |
12 Apr 12 |
peter |
69 |
|
2721 |
12 Apr 12 |
peter |
70 |
/** |
2515 |
11 Jul 11 |
peter |
bool telling which group the data point belongs to |
1701 |
07 Jan 09 |
peter |
72 |
*/ |
1701 |
07 Jan 09 |
peter |
73 |
bool label; |
1701 |
07 Jan 09 |
peter |
74 |
|
1701 |
07 Jan 09 |
peter |
75 |
/** |
1701 |
07 Jan 09 |
peter |
weight for the data point |
1701 |
07 Jan 09 |
peter |
77 |
*/ |
1701 |
07 Jan 09 |
peter |
78 |
double weight; |
1701 |
07 Jan 09 |
peter |
79 |
|
1701 |
07 Jan 09 |
peter |
80 |
/** |
1701 |
07 Jan 09 |
peter |
\return true if value is less than rhs.value |
1701 |
07 Jan 09 |
peter |
82 |
*/ |
2064 |
16 Sep 09 |
peter |
83 |
bool operator<(const Element& rhs) const; |
1701 |
07 Jan 09 |
peter |
84 |
}; |
1701 |
07 Jan 09 |
peter |
85 |
|
2721 |
12 Apr 12 |
peter |
86 |
/** |
2721 |
12 Apr 12 |
peter |
\brief Constructor |
2721 |
12 Apr 12 |
peter |
88 |
*/ |
1003 |
18 Jan 08 |
peter |
89 |
KolmogorovSmirnov(void); |
1003 |
18 Jan 08 |
peter |
90 |
|
1003 |
18 Jan 08 |
peter |
91 |
/** |
1003 |
18 Jan 08 |
peter |
\brief add a value |
1003 |
18 Jan 08 |
peter |
93 |
*/ |
1003 |
18 Jan 08 |
peter |
94 |
void add(double value, bool class_label, double weight=1.0); |
1003 |
18 Jan 08 |
peter |
95 |
|
1003 |
18 Jan 08 |
peter |
96 |
/** |
1701 |
07 Jan 09 |
peter |
\brief add a range |
2721 |
12 Apr 12 |
peter |
98 |
|
3523 |
10 Oct 16 |
peter |
Type Requirements: |
3523 |
10 Oct 16 |
peter |
- \c ForwardIterator models \readable_iterator |
3523 |
10 Oct 16 |
peter |
- \c ForwardIterator models \forward_traversal_iterator |
3523 |
10 Oct 16 |
peter |
- \c value_type of \c ForwardIterator must be convertible to |
3523 |
10 Oct 16 |
peter |
\c KolmogorovSmirnov::Element |
2721 |
12 Apr 12 |
peter |
104 |
|
1701 |
07 Jan 09 |
peter |
Insertion takes typically N*log(N). However, if range is |
1701 |
07 Jan 09 |
peter |
sorted, insertion takes linear time. A common use case of this |
1701 |
07 Jan 09 |
peter |
function is when calculating several KS statistics on a data |
1701 |
07 Jan 09 |
peter |
set (values and weights) with different class labels. |
1701 |
07 Jan 09 |
peter |
109 |
|
1701 |
07 Jan 09 |
peter |
\since New in yat 0.5 |
1701 |
07 Jan 09 |
peter |
111 |
*/ |
1701 |
07 Jan 09 |
peter |
112 |
template <typename ForwardIterator> |
2721 |
12 Apr 12 |
peter |
113 |
void add(ForwardIterator first, ForwardIterator last); |
1701 |
07 Jan 09 |
peter |
114 |
|
1701 |
07 Jan 09 |
peter |
115 |
/** |
1593 |
21 Oct 08 |
peter |
\brief Large-Sample Approximation |
1593 |
21 Oct 08 |
peter |
117 |
|
1593 |
21 Oct 08 |
peter |
This analytical approximation of p-value can be used when all |
1593 |
21 Oct 08 |
peter |
weight equal unity and sample sizes \a n and \a m are |
1593 |
21 Oct 08 |
peter |
large. The p-value is calcuated as \f$ P = \displaystyle - 2 |
1593 |
21 Oct 08 |
peter |
\sum_{k=1}^{\infty} (-1)^ke^{-2k^2s^2}\f$, where s is the |
1593 |
21 Oct 08 |
peter |
scaled score: |
1593 |
21 Oct 08 |
peter |
123 |
|
2721 |
12 Apr 12 |
peter |
\f$ s = \sqrt\frac{nm}{n+m} \f$ score(). |
1593 |
21 Oct 08 |
peter |
125 |
|
1593 |
21 Oct 08 |
peter |
\since New in yat 0.5 |
1593 |
21 Oct 08 |
peter |
127 |
|
1593 |
21 Oct 08 |
peter |
Following Hollander and Wolfe |
1593 |
21 Oct 08 |
peter |
129 |
*/ |
1593 |
21 Oct 08 |
peter |
130 |
double p_value(void) const; |
1593 |
21 Oct 08 |
peter |
131 |
|
1593 |
21 Oct 08 |
peter |
132 |
/** |
1003 |
18 Jan 08 |
peter |
\brief p-value |
1003 |
18 Jan 08 |
peter |
134 |
|
1003 |
18 Jan 08 |
peter |
Performs a permutation test using \a perm label randomizations |
1003 |
18 Jan 08 |
peter |
and calculate how often a score equal or larger than score() is |
1003 |
18 Jan 08 |
peter |
obtained. |
1701 |
07 Jan 09 |
peter |
138 |
|
1701 |
07 Jan 09 |
peter |
\see shuffle |
1003 |
18 Jan 08 |
peter |
140 |
*/ |
1003 |
18 Jan 08 |
peter |
141 |
double p_value(size_t perm) const; |
1003 |
18 Jan 08 |
peter |
142 |
|
1003 |
18 Jan 08 |
peter |
143 |
/** |
2721 |
12 Apr 12 |
peter |
\brief Remove a data point |
2721 |
12 Apr 12 |
peter |
145 |
|
2721 |
12 Apr 12 |
peter |
\throw utility::runtime_error if no data point exist with \a |
2721 |
12 Apr 12 |
peter |
value, \a class_label, and \a weight. |
2721 |
12 Apr 12 |
peter |
148 |
|
3018 |
04 Apr 13 |
peter |
\since New in yat 0.9 |
2721 |
12 Apr 12 |
peter |
150 |
*/ |
2721 |
12 Apr 12 |
peter |
151 |
void remove(double value, bool class_label, double weight=1.0); |
2721 |
12 Apr 12 |
peter |
152 |
|
2721 |
12 Apr 12 |
peter |
153 |
/** |
1003 |
18 Jan 08 |
peter |
\brief resets everything to zero |
1003 |
18 Jan 08 |
peter |
155 |
*/ |
1003 |
18 Jan 08 |
peter |
156 |
void reset(void); |
1003 |
18 Jan 08 |
peter |
157 |
|
1003 |
18 Jan 08 |
peter |
158 |
/** |
1003 |
18 Jan 08 |
peter |
\brief Kolmogorov Smirnov statistic |
1003 |
18 Jan 08 |
peter |
160 |
|
4260 |
21 Dec 22 |
peter |
\return absolute value of signed_score() |
4260 |
21 Dec 22 |
peter |
162 |
|
1677 |
24 Dec 08 |
peter |
\f$ sup_x | F_{\textrm{True}}(x) - F_{\textrm{False}}(x) | \f$ where |
1677 |
24 Dec 08 |
peter |
\f$ F(x) = \frac{\sum_{i:x_i\le x}w_i}{ \sum w_i} \f$ |
4260 |
21 Dec 22 |
peter |
165 |
|
4260 |
21 Dec 22 |
peter |
\see scores() |
1003 |
18 Jan 08 |
peter |
167 |
*/ |
1003 |
18 Jan 08 |
peter |
168 |
double score(void) const; |
1003 |
18 Jan 08 |
peter |
169 |
|
1595 |
22 Oct 08 |
peter |
170 |
/** |
4260 |
21 Dec 22 |
peter |
In case of no ties and data points are sorted such that \f$ x_i |
4260 |
21 Dec 22 |
peter |
\le x_{i+1} \f$, the kth element is calculated as |
4260 |
21 Dec 22 |
peter |
173 |
|
4260 |
21 Dec 22 |
peter |
\f$ s[k] = \frac{\sum_i^k I(y_i=1) w_i}{ \sum I(y_i=1) w_i} - |
4260 |
21 Dec 22 |
peter |
\frac{\sum_i^k I(y_i=0) w_i}{ \sum I(y_i=0) w_i} \f$. |
4260 |
21 Dec 22 |
peter |
176 |
|
4260 |
21 Dec 22 |
peter |
In case \f$ x_k \f$ has tied values the sums include all |
4260 |
21 Dec 22 |
peter |
values tied with \f$ x_k \f$, i.e., \f$ i : x_i \le x_k \f$. |
4260 |
21 Dec 22 |
peter |
179 |
|
4259 |
20 Dec 22 |
peter |
\since Part of public interface since yat 0.21 |
4259 |
20 Dec 22 |
peter |
181 |
*/ |
4260 |
21 Dec 22 |
peter |
// \f$ F_{\textrm{True}}(x) - F_{\textrm{False}}(x) | \f$ where |
4260 |
21 Dec 22 |
peter |
// \f$ F(x) = \frac{\sum_{i:x_i\le x}w_i}{ \sum w_i} \f$ |
4259 |
20 Dec 22 |
peter |
184 |
void scores(std::vector<double>&) const; |
4259 |
20 Dec 22 |
peter |
185 |
|
4259 |
20 Dec 22 |
peter |
186 |
/** |
1701 |
07 Jan 09 |
peter |
\brief shuffle class labels |
1701 |
07 Jan 09 |
peter |
188 |
|
1701 |
07 Jan 09 |
peter |
This is equivalent to reset and re-add values with shuffled |
1701 |
07 Jan 09 |
peter |
class labels. |
1701 |
07 Jan 09 |
peter |
191 |
|
1701 |
07 Jan 09 |
peter |
\since New in yat 0.5 |
1701 |
07 Jan 09 |
peter |
193 |
*/ |
1701 |
07 Jan 09 |
peter |
194 |
void shuffle(void); |
1701 |
07 Jan 09 |
peter |
195 |
|
1701 |
07 Jan 09 |
peter |
196 |
/** |
2721 |
12 Apr 12 |
peter |
Same as score() but keeping the sign, in other words, |
1595 |
22 Oct 08 |
peter |
abs(signed_score())==score() |
1595 |
22 Oct 08 |
peter |
199 |
|
4260 |
21 Dec 22 |
peter |
Of the values calculated in scores(), return the one with the |
4260 |
21 Dec 22 |
peter |
greatest deviation from zero. |
4260 |
21 Dec 22 |
peter |
202 |
|
1874 |
17 Mar 09 |
peter |
A positive score implies that values in class \c true \em on |
1874 |
17 Mar 09 |
peter |
\em average are smaller than values in class \c false. |
1874 |
17 Mar 09 |
peter |
205 |
|
1595 |
22 Oct 08 |
peter |
\since New in yat 0.5 |
4260 |
21 Dec 22 |
peter |
207 |
|
4260 |
21 Dec 22 |
peter |
\see scores() |
1595 |
22 Oct 08 |
peter |
209 |
*/ |
1595 |
22 Oct 08 |
peter |
210 |
double signed_score(void) const; |
1595 |
22 Oct 08 |
peter |
211 |
|
1003 |
18 Jan 08 |
peter |
212 |
private: |
1701 |
07 Jan 09 |
peter |
// add weights to sum_w1 and sum_w2 respectively depending on |
1701 |
07 Jan 09 |
peter |
// label in element. |
1701 |
07 Jan 09 |
peter |
215 |
template <typename ForwardIterator> |
1701 |
07 Jan 09 |
peter |
216 |
void add_sum_w(ForwardIterator first, ForwardIterator last); |
2721 |
12 Apr 12 |
peter |
217 |
|
1003 |
18 Jan 08 |
peter |
218 |
mutable bool cached_; |
1003 |
18 Jan 08 |
peter |
219 |
mutable double score_; |
1701 |
07 Jan 09 |
peter |
220 |
typedef std::multiset<Element> data_w; |
1003 |
18 Jan 08 |
peter |
221 |
data_w data_; |
1003 |
18 Jan 08 |
peter |
222 |
double sum_w1_; |
1003 |
18 Jan 08 |
peter |
223 |
double sum_w2_; |
1003 |
18 Jan 08 |
peter |
224 |
|
1701 |
07 Jan 09 |
peter |
// using compiler generated copy and assignment |
1701 |
07 Jan 09 |
peter |
//KolmogorovSmirnov(const KolmogorovSmirnov&); |
1701 |
07 Jan 09 |
peter |
//KolmogorovSmirnov& operator=(const KolmogorovSmirnov&); |
1003 |
18 Jan 08 |
peter |
228 |
}; |
1003 |
18 Jan 08 |
peter |
229 |
|
1125 |
22 Feb 08 |
peter |
230 |
/** |
1125 |
22 Feb 08 |
peter |
\brief output operator |
1886 |
31 Mar 09 |
peter |
232 |
|
1886 |
31 Mar 09 |
peter |
\relates KolmogorovSmirnov |
1125 |
22 Feb 08 |
peter |
234 |
*/ |
1003 |
18 Jan 08 |
peter |
235 |
std::ostream& operator<<(std::ostream&, const KolmogorovSmirnov&); |
1003 |
18 Jan 08 |
peter |
236 |
|
1003 |
18 Jan 08 |
peter |
237 |
|
1701 |
07 Jan 09 |
peter |
// template implementations |
1701 |
07 Jan 09 |
peter |
239 |
|
1701 |
07 Jan 09 |
peter |
240 |
template <typename ForwardIterator> |
1701 |
07 Jan 09 |
peter |
241 |
void KolmogorovSmirnov::add(ForwardIterator first, ForwardIterator last) |
1701 |
07 Jan 09 |
peter |
242 |
{ |
3523 |
10 Oct 16 |
peter |
243 |
BOOST_CONCEPT_ASSERT((boost_concepts::ForwardTraversal<ForwardIterator>)); |
3523 |
10 Oct 16 |
peter |
244 |
BOOST_CONCEPT_ASSERT((boost_concepts::ReadableIterator<ForwardIterator>)); |
1701 |
07 Jan 09 |
peter |
245 |
ForwardIterator iter(first); |
1701 |
07 Jan 09 |
peter |
246 |
typename data_w::const_iterator hint(data_.begin()); |
2721 |
12 Apr 12 |
peter |
247 |
for ( ; iter!=last; ++iter) |
2202 |
21 Feb 10 |
peter |
248 |
if ((*iter).weight) // ignore data points with zero weight |
1701 |
07 Jan 09 |
peter |
249 |
hint = data_.insert(hint, *iter); |
1701 |
07 Jan 09 |
peter |
250 |
add_sum_w(first, last); |
1701 |
07 Jan 09 |
peter |
251 |
cached_=false; |
1701 |
07 Jan 09 |
peter |
252 |
} |
1701 |
07 Jan 09 |
peter |
253 |
|
1701 |
07 Jan 09 |
peter |
254 |
|
1701 |
07 Jan 09 |
peter |
255 |
template <typename ForwardIterator> |
2721 |
12 Apr 12 |
peter |
256 |
void KolmogorovSmirnov::add_sum_w(ForwardIterator first, |
1701 |
07 Jan 09 |
peter |
257 |
ForwardIterator last) |
1701 |
07 Jan 09 |
peter |
258 |
{ |
1701 |
07 Jan 09 |
peter |
259 |
while (first!=last) { |
2202 |
21 Feb 10 |
peter |
260 |
if ((*first).label) |
2202 |
21 Feb 10 |
peter |
261 |
sum_w1_ += (*first).weight; |
1701 |
07 Jan 09 |
peter |
262 |
else |
2202 |
21 Feb 10 |
peter |
263 |
sum_w2_ += (*first).weight; |
1701 |
07 Jan 09 |
peter |
264 |
++first; |
1701 |
07 Jan 09 |
peter |
265 |
} |
1701 |
07 Jan 09 |
peter |
266 |
} |
1701 |
07 Jan 09 |
peter |
267 |
|
1003 |
18 Jan 08 |
peter |
268 |
}}} // of namespace theplu yat statistics |
1003 |
18 Jan 08 |
peter |
269 |
|
1003 |
18 Jan 08 |
peter |
270 |
#endif |