1003 |
18 Jan 08 |
peter |
// $Id$ |
1003 |
18 Jan 08 |
peter |
2 |
|
1004 |
23 Jan 08 |
peter |
3 |
/* |
2119 |
12 Dec 09 |
peter |
Copyright (C) 2008 Jari Häkkinen, Peter Johansson |
4359 |
23 Aug 23 |
peter |
Copyright (C) 2009, 2011, 2012, 2013, 2022 Peter Johansson |
1004 |
23 Jan 08 |
peter |
6 |
|
1437 |
25 Aug 08 |
peter |
This file is part of the yat library, http://dev.thep.lu.se/yat |
1004 |
23 Jan 08 |
peter |
8 |
|
1004 |
23 Jan 08 |
peter |
The yat library is free software; you can redistribute it and/or |
1004 |
23 Jan 08 |
peter |
modify it under the terms of the GNU General Public License as |
1486 |
09 Sep 08 |
jari |
published by the Free Software Foundation; either version 3 of the |
1004 |
23 Jan 08 |
peter |
License, or (at your option) any later version. |
1004 |
23 Jan 08 |
peter |
13 |
|
1004 |
23 Jan 08 |
peter |
The yat library is distributed in the hope that it will be useful, |
1004 |
23 Jan 08 |
peter |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
1004 |
23 Jan 08 |
peter |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
1004 |
23 Jan 08 |
peter |
General Public License for more details. |
1004 |
23 Jan 08 |
peter |
18 |
|
1004 |
23 Jan 08 |
peter |
You should have received a copy of the GNU General Public License |
1487 |
10 Sep 08 |
jari |
along with yat. If not, see <http://www.gnu.org/licenses/>. |
1004 |
23 Jan 08 |
peter |
21 |
*/ |
1004 |
23 Jan 08 |
peter |
22 |
|
1701 |
07 Jan 09 |
peter |
23 |
#include <config.h> |
1701 |
07 Jan 09 |
peter |
24 |
|
1003 |
18 Jan 08 |
peter |
25 |
#include "KolmogorovSmirnov.h" |
1003 |
18 Jan 08 |
peter |
26 |
#include "yat/random/random.h" |
2721 |
12 Apr 12 |
peter |
27 |
#include "yat/utility/Exception.h" |
1003 |
18 Jan 08 |
peter |
28 |
#include "yat/utility/stl_utility.h" |
1003 |
18 Jan 08 |
peter |
29 |
|
1003 |
18 Jan 08 |
peter |
30 |
#include <algorithm> |
1003 |
18 Jan 08 |
peter |
31 |
#include <cassert> |
1003 |
18 Jan 08 |
peter |
32 |
#include <cmath> |
1003 |
18 Jan 08 |
peter |
33 |
#include <deque> |
1003 |
18 Jan 08 |
peter |
34 |
#include <functional> |
1626 |
15 Nov 08 |
peter |
35 |
#include <limits> |
3018 |
04 Apr 13 |
peter |
36 |
#include <sstream> |
1703 |
08 Jan 09 |
peter |
37 |
#include <vector> |
1003 |
18 Jan 08 |
peter |
38 |
|
1003 |
18 Jan 08 |
peter |
39 |
namespace theplu { |
1003 |
18 Jan 08 |
peter |
40 |
namespace yat { |
1003 |
18 Jan 08 |
peter |
41 |
namespace statistics { |
1003 |
18 Jan 08 |
peter |
42 |
|
1003 |
18 Jan 08 |
peter |
43 |
|
2515 |
11 Jul 11 |
peter |
44 |
KolmogorovSmirnov::Element::Element(void) |
2515 |
11 Jul 11 |
peter |
45 |
{} |
2515 |
11 Jul 11 |
peter |
46 |
|
2515 |
11 Jul 11 |
peter |
47 |
|
2515 |
11 Jul 11 |
peter |
48 |
KolmogorovSmirnov::Element::Element(double x, bool class_label, double w) |
2721 |
12 Apr 12 |
peter |
49 |
: value(x), label(class_label), weight(w) |
2515 |
11 Jul 11 |
peter |
50 |
{} |
2515 |
11 Jul 11 |
peter |
51 |
|
2515 |
11 Jul 11 |
peter |
52 |
|
1003 |
18 Jan 08 |
peter |
53 |
KolmogorovSmirnov::KolmogorovSmirnov(void) |
1003 |
18 Jan 08 |
peter |
54 |
: cached_(true), score_(0.0), sum_w1_(0.0), sum_w2_(0.0) |
1003 |
18 Jan 08 |
peter |
55 |
{ |
1003 |
18 Jan 08 |
peter |
56 |
} |
1003 |
18 Jan 08 |
peter |
57 |
|
1003 |
18 Jan 08 |
peter |
58 |
|
1003 |
18 Jan 08 |
peter |
59 |
void KolmogorovSmirnov::add(double x, bool target, double w) |
1003 |
18 Jan 08 |
peter |
60 |
{ |
1003 |
18 Jan 08 |
peter |
61 |
if (w==0) // ignoring zero weight data |
1003 |
18 Jan 08 |
peter |
62 |
return; |
1003 |
18 Jan 08 |
peter |
63 |
|
2014 |
31 Jul 09 |
peter |
64 |
assert(!std::isnan(x)); |
1701 |
07 Jan 09 |
peter |
65 |
data_.insert(Element(x,target,w)); |
1003 |
18 Jan 08 |
peter |
66 |
if (target){ |
1003 |
18 Jan 08 |
peter |
67 |
sum_w1_+=w; |
1003 |
18 Jan 08 |
peter |
68 |
} |
1003 |
18 Jan 08 |
peter |
69 |
else { |
1003 |
18 Jan 08 |
peter |
70 |
sum_w2_+=w; |
1003 |
18 Jan 08 |
peter |
71 |
} |
1003 |
18 Jan 08 |
peter |
72 |
cached_=false; |
1003 |
18 Jan 08 |
peter |
73 |
} |
1003 |
18 Jan 08 |
peter |
74 |
|
1003 |
18 Jan 08 |
peter |
75 |
|
1593 |
21 Oct 08 |
peter |
76 |
double KolmogorovSmirnov::p_value(void) const |
1593 |
21 Oct 08 |
peter |
77 |
{ |
1593 |
21 Oct 08 |
peter |
78 |
double res=0; |
1593 |
21 Oct 08 |
peter |
79 |
double res2=0; |
1600 |
27 Oct 08 |
peter |
80 |
double s2 = score() * score() * sum_w1_*sum_w2_/(sum_w1_+sum_w2_); |
1593 |
21 Oct 08 |
peter |
81 |
int sign = 1; |
1593 |
21 Oct 08 |
peter |
82 |
for (size_t k = 1; k<100; ++k) { |
1600 |
27 Oct 08 |
peter |
83 |
res += sign * 2 * exp(-2.0 * k * k * s2); |
1593 |
21 Oct 08 |
peter |
84 |
sign *= -1; |
1600 |
27 Oct 08 |
peter |
// ignore remaining terms as they are small |
1593 |
21 Oct 08 |
peter |
86 |
if (res==res2) |
1593 |
21 Oct 08 |
peter |
87 |
return res; |
1593 |
21 Oct 08 |
peter |
88 |
res2 = res; |
1593 |
21 Oct 08 |
peter |
89 |
} |
1593 |
21 Oct 08 |
peter |
90 |
|
1593 |
21 Oct 08 |
peter |
91 |
return res; |
1593 |
21 Oct 08 |
peter |
92 |
} |
1593 |
21 Oct 08 |
peter |
93 |
|
1593 |
21 Oct 08 |
peter |
94 |
|
1003 |
18 Jan 08 |
peter |
95 |
double KolmogorovSmirnov::p_value(size_t perm) const |
1003 |
18 Jan 08 |
peter |
96 |
{ |
1003 |
18 Jan 08 |
peter |
97 |
size_t count=0; |
1626 |
15 Nov 08 |
peter |
98 |
double score_threshold = score()-10*std::numeric_limits<double>().epsilon(); |
1701 |
07 Jan 09 |
peter |
99 |
KolmogorovSmirnov ks(*this); |
1626 |
15 Nov 08 |
peter |
100 |
|
1003 |
18 Jan 08 |
peter |
101 |
for (size_t i=0; i<perm; ++i){ |
1701 |
07 Jan 09 |
peter |
102 |
ks.shuffle(); |
1626 |
15 Nov 08 |
peter |
103 |
if (ks.score()>=score_threshold) |
1003 |
18 Jan 08 |
peter |
104 |
++count; |
1003 |
18 Jan 08 |
peter |
105 |
} |
1003 |
18 Jan 08 |
peter |
106 |
return static_cast<double>(count)/perm; |
1003 |
18 Jan 08 |
peter |
107 |
} |
1003 |
18 Jan 08 |
peter |
108 |
|
1003 |
18 Jan 08 |
peter |
109 |
|
2721 |
12 Apr 12 |
peter |
110 |
void KolmogorovSmirnov::remove(double value, bool class_label, double weight) |
2721 |
12 Apr 12 |
peter |
111 |
{ |
2721 |
12 Apr 12 |
peter |
112 |
if (!weight) |
2721 |
12 Apr 12 |
peter |
113 |
return; |
2721 |
12 Apr 12 |
peter |
114 |
Element e(value, class_label, weight); |
3018 |
04 Apr 13 |
peter |
115 |
typedef std::multiset<Element>::iterator iterator; |
2721 |
12 Apr 12 |
peter |
116 |
std::pair<iterator, iterator> iter = data_.equal_range(e); |
2721 |
12 Apr 12 |
peter |
117 |
while (iter.first!=iter.second) { |
2721 |
12 Apr 12 |
peter |
118 |
if (iter.first->label==class_label && iter.first->weight==weight) { |
2721 |
12 Apr 12 |
peter |
119 |
if (class_label) |
2721 |
12 Apr 12 |
peter |
120 |
sum_w1_-=weight; |
2721 |
12 Apr 12 |
peter |
121 |
else |
2721 |
12 Apr 12 |
peter |
122 |
sum_w2_-=weight; |
3018 |
04 Apr 13 |
peter |
123 |
data_.erase(iter.first); |
2721 |
12 Apr 12 |
peter |
124 |
cached_=false; |
2721 |
12 Apr 12 |
peter |
125 |
return; |
2721 |
12 Apr 12 |
peter |
126 |
} |
2721 |
12 Apr 12 |
peter |
127 |
++iter.first; |
2721 |
12 Apr 12 |
peter |
128 |
} |
3018 |
04 Apr 13 |
peter |
129 |
std::ostringstream ss; |
3018 |
04 Apr 13 |
peter |
130 |
ss << "KolmogorovSmirnov::remove: " << value << " " << class_label |
3018 |
04 Apr 13 |
peter |
131 |
<< " " << weight; |
3018 |
04 Apr 13 |
peter |
132 |
throw utility::runtime_error(ss.str()); |
2721 |
12 Apr 12 |
peter |
133 |
} |
2721 |
12 Apr 12 |
peter |
134 |
|
2721 |
12 Apr 12 |
peter |
135 |
|
1612 |
04 Nov 08 |
peter |
136 |
void KolmogorovSmirnov::reset(void) |
1612 |
04 Nov 08 |
peter |
137 |
{ |
1612 |
04 Nov 08 |
peter |
138 |
cached_=false; |
1612 |
04 Nov 08 |
peter |
139 |
data_.clear(); |
1612 |
04 Nov 08 |
peter |
140 |
sum_w1_=0; |
1612 |
04 Nov 08 |
peter |
141 |
sum_w2_=0; |
1612 |
04 Nov 08 |
peter |
142 |
} |
1612 |
04 Nov 08 |
peter |
143 |
|
1612 |
04 Nov 08 |
peter |
144 |
|
1003 |
18 Jan 08 |
peter |
145 |
double KolmogorovSmirnov::score(void) const |
1003 |
18 Jan 08 |
peter |
146 |
{ |
1595 |
22 Oct 08 |
peter |
147 |
return std::abs(signed_score()); |
1595 |
22 Oct 08 |
peter |
148 |
} |
1595 |
22 Oct 08 |
peter |
149 |
|
1595 |
22 Oct 08 |
peter |
150 |
|
1003 |
18 Jan 08 |
peter |
151 |
void KolmogorovSmirnov::scores(std::vector<double>& res) const |
1003 |
18 Jan 08 |
peter |
152 |
{ |
1003 |
18 Jan 08 |
peter |
153 |
res.clear(); |
1003 |
18 Jan 08 |
peter |
154 |
res.reserve(data_.size()); |
1003 |
18 Jan 08 |
peter |
155 |
data_w::const_iterator iter(data_.begin()); |
1003 |
18 Jan 08 |
peter |
156 |
double f1=0; |
1003 |
18 Jan 08 |
peter |
157 |
double f2=0; |
1003 |
18 Jan 08 |
peter |
158 |
while(iter!=data_.end()){ |
1687 |
30 Dec 08 |
peter |
159 |
size_t count=0; |
1701 |
07 Jan 09 |
peter |
160 |
double value = iter->value; |
1701 |
07 Jan 09 |
peter |
161 |
while (iter!=data_.end() && iter->value==value) { |
1701 |
07 Jan 09 |
peter |
162 |
if (iter->label) { |
1701 |
07 Jan 09 |
peter |
163 |
f1 += iter->weight; |
1687 |
30 Dec 08 |
peter |
164 |
} |
1687 |
30 Dec 08 |
peter |
165 |
else { |
1701 |
07 Jan 09 |
peter |
166 |
f2 += iter->weight; |
1687 |
30 Dec 08 |
peter |
167 |
} |
1687 |
30 Dec 08 |
peter |
168 |
++count; |
1687 |
30 Dec 08 |
peter |
169 |
++iter; |
1687 |
30 Dec 08 |
peter |
170 |
} |
1687 |
30 Dec 08 |
peter |
171 |
res.resize(res.size()+count, f1/sum_w1_-f2/sum_w2_); |
1003 |
18 Jan 08 |
peter |
172 |
} |
1003 |
18 Jan 08 |
peter |
173 |
} |
1003 |
18 Jan 08 |
peter |
174 |
|
1003 |
18 Jan 08 |
peter |
175 |
|
1701 |
07 Jan 09 |
peter |
176 |
void KolmogorovSmirnov::shuffle(void) |
1701 |
07 Jan 09 |
peter |
177 |
{ |
4256 |
20 Dec 22 |
peter |
// avoid using vector<bool> |
4256 |
20 Dec 22 |
peter |
179 |
std::vector<char> labels; |
4256 |
20 Dec 22 |
peter |
180 |
labels.reserve(data_.size()); |
2721 |
12 Apr 12 |
peter |
181 |
for (data_w::const_iterator iter=data_.begin(); |
1701 |
07 Jan 09 |
peter |
182 |
iter!=data_.end(); ++iter) { |
1701 |
07 Jan 09 |
peter |
183 |
labels.push_back(iter->label); |
1701 |
07 Jan 09 |
peter |
184 |
} |
1701 |
07 Jan 09 |
peter |
185 |
random::random_shuffle(labels.begin(), labels.end()); |
4256 |
20 Dec 22 |
peter |
186 |
auto label = labels.cbegin(); |
2721 |
12 Apr 12 |
peter |
187 |
for (data_w::iterator iter=data_.begin(); |
1701 |
07 Jan 09 |
peter |
188 |
iter!=data_.end(); ++iter, ++label) { |
1701 |
07 Jan 09 |
peter |
// label does not affect sorting of set, so modifying label is safe |
1701 |
07 Jan 09 |
peter |
190 |
const_cast<Element&>(*iter).label = *label; |
1701 |
07 Jan 09 |
peter |
191 |
} |
1701 |
07 Jan 09 |
peter |
192 |
sum_w1_ = 0; |
1701 |
07 Jan 09 |
peter |
193 |
sum_w2_ = 0; |
1701 |
07 Jan 09 |
peter |
194 |
add_sum_w(data_.begin(), data_.end()); |
1701 |
07 Jan 09 |
peter |
195 |
cached_ = false; |
1701 |
07 Jan 09 |
peter |
196 |
} |
1701 |
07 Jan 09 |
peter |
197 |
|
1701 |
07 Jan 09 |
peter |
198 |
|
1701 |
07 Jan 09 |
peter |
199 |
double KolmogorovSmirnov::signed_score(void) const |
1701 |
07 Jan 09 |
peter |
200 |
{ |
1701 |
07 Jan 09 |
peter |
201 |
if (cached_) |
1701 |
07 Jan 09 |
peter |
202 |
return score_; |
1701 |
07 Jan 09 |
peter |
203 |
if (!sum_w1_ || !sum_w2_) |
1701 |
07 Jan 09 |
peter |
204 |
return 0.0; |
1701 |
07 Jan 09 |
peter |
205 |
score_=0; |
1701 |
07 Jan 09 |
peter |
206 |
std::vector<double> v; |
1701 |
07 Jan 09 |
peter |
207 |
scores(v); |
1701 |
07 Jan 09 |
peter |
208 |
std::less<double> l; |
1701 |
07 Jan 09 |
peter |
209 |
utility::abs<double> a; |
1701 |
07 Jan 09 |
peter |
210 |
using utility::make_compose_f_gx_hy; |
2721 |
12 Apr 12 |
peter |
211 |
score_ = *std::max_element(v.begin(), v.end(), |
1701 |
07 Jan 09 |
peter |
212 |
make_compose_f_gx_hy(l, a, a)); |
1701 |
07 Jan 09 |
peter |
213 |
cached_=true; |
1701 |
07 Jan 09 |
peter |
214 |
return score_; |
1701 |
07 Jan 09 |
peter |
215 |
} |
1701 |
07 Jan 09 |
peter |
216 |
|
1701 |
07 Jan 09 |
peter |
217 |
|
1003 |
18 Jan 08 |
peter |
218 |
std::ostream& operator<<(std::ostream& os, const KolmogorovSmirnov& ks) |
1003 |
18 Jan 08 |
peter |
219 |
{ |
1003 |
18 Jan 08 |
peter |
220 |
std::vector<double> s; |
1003 |
18 Jan 08 |
peter |
221 |
ks.scores(s); |
1003 |
18 Jan 08 |
peter |
222 |
for (size_t i=0; i<s.size(); ++i) |
1003 |
18 Jan 08 |
peter |
223 |
os << i << "\t" << s[i] << "\n"; |
1003 |
18 Jan 08 |
peter |
224 |
return os; |
1003 |
18 Jan 08 |
peter |
225 |
} |
1003 |
18 Jan 08 |
peter |
226 |
|
1003 |
18 Jan 08 |
peter |
227 |
|
1701 |
07 Jan 09 |
peter |
228 |
bool KolmogorovSmirnov::Element::operator< |
2064 |
16 Sep 09 |
peter |
229 |
(const KolmogorovSmirnov::Element& rhs) const |
1701 |
07 Jan 09 |
peter |
230 |
{ |
1701 |
07 Jan 09 |
peter |
231 |
return value<rhs.value; |
1701 |
07 Jan 09 |
peter |
232 |
} |
1701 |
07 Jan 09 |
peter |
233 |
|
1003 |
18 Jan 08 |
peter |
234 |
}}} // of namespace theplu yat statistics |