69 |
29 Apr 04 |
peter |
// $Id$ |
69 |
29 Apr 04 |
peter |
2 |
|
675 |
10 Oct 06 |
jari |
3 |
/* |
831 |
27 Mar 07 |
peter |
Copyright (C) 2004, 2005 Peter Johansson |
2119 |
12 Dec 09 |
peter |
Copyright (C) 2006, 2007, 2008 Jari Häkkinen, Peter Johansson |
3439 |
20 Nov 15 |
peter |
Copyright (C) 2011, 2012, 2013, 2015 Peter Johansson |
295 |
29 Apr 05 |
peter |
7 |
|
1437 |
25 Aug 08 |
peter |
This file is part of the yat library, http://dev.thep.lu.se/yat |
623 |
05 Sep 06 |
peter |
9 |
|
675 |
10 Oct 06 |
jari |
The yat library is free software; you can redistribute it and/or |
675 |
10 Oct 06 |
jari |
modify it under the terms of the GNU General Public License as |
1486 |
09 Sep 08 |
jari |
published by the Free Software Foundation; either version 3 of the |
675 |
10 Oct 06 |
jari |
License, or (at your option) any later version. |
675 |
10 Oct 06 |
jari |
14 |
|
675 |
10 Oct 06 |
jari |
The yat library is distributed in the hope that it will be useful, |
675 |
10 Oct 06 |
jari |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
675 |
10 Oct 06 |
jari |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
675 |
10 Oct 06 |
jari |
General Public License for more details. |
675 |
10 Oct 06 |
jari |
19 |
|
675 |
10 Oct 06 |
jari |
You should have received a copy of the GNU General Public License |
1487 |
10 Sep 08 |
jari |
along with yat. If not, see <http://www.gnu.org/licenses/>. |
675 |
10 Oct 06 |
jari |
22 |
*/ |
675 |
10 Oct 06 |
jari |
23 |
|
2881 |
18 Nov 12 |
peter |
24 |
#include <config.h> |
2881 |
18 Nov 12 |
peter |
25 |
|
680 |
11 Oct 06 |
jari |
26 |
#include "ROC.h" |
821 |
18 Mar 07 |
peter |
27 |
#include "AUC.h" |
675 |
10 Oct 06 |
jari |
28 |
|
2720 |
12 Apr 12 |
peter |
29 |
#include "yat/utility/Exception.h" |
2720 |
12 Apr 12 |
peter |
30 |
|
295 |
29 Apr 05 |
peter |
31 |
#include <gsl/gsl_cdf.h> |
295 |
29 Apr 05 |
peter |
32 |
|
2595 |
30 Oct 11 |
peter |
33 |
#include <algorithm> |
2556 |
20 Aug 11 |
peter |
34 |
#include <cassert> |
140 |
20 Aug 04 |
peter |
35 |
#include <cmath> |
2601 |
30 Oct 11 |
peter |
36 |
#include <limits> |
2720 |
12 Apr 12 |
peter |
37 |
#include <sstream> |
112 |
07 Jul 04 |
peter |
38 |
#include <utility> |
69 |
29 Apr 04 |
peter |
39 |
|
69 |
29 Apr 04 |
peter |
40 |
namespace theplu { |
680 |
11 Oct 06 |
jari |
41 |
namespace yat { |
2696 |
28 Feb 12 |
peter |
42 |
namespace statistics { |
69 |
29 Apr 04 |
peter |
43 |
|
2696 |
28 Feb 12 |
peter |
44 |
ROC::ROC(void) |
821 |
18 Mar 07 |
peter |
45 |
:minimum_size_(10) |
69 |
29 Apr 04 |
peter |
46 |
{ |
821 |
18 Mar 07 |
peter |
47 |
reset(); |
102 |
15 Jun 04 |
peter |
48 |
} |
102 |
15 Jun 04 |
peter |
49 |
|
703 |
18 Dec 06 |
jari |
50 |
|
821 |
18 Mar 07 |
peter |
51 |
void ROC::add(double x, bool target, double w) |
69 |
29 Apr 04 |
peter |
52 |
{ |
821 |
18 Mar 07 |
peter |
53 |
if (!w) |
821 |
18 Mar 07 |
peter |
54 |
return; |
2585 |
25 Oct 11 |
peter |
55 |
ROC::Map::value_type element(x, std::make_pair(target, w)); |
2585 |
25 Oct 11 |
peter |
56 |
ROC::Map::iterator lower = multimap_.lower_bound(x); |
2556 |
20 Aug 11 |
peter |
57 |
if (lower!=multimap_.end() && lower->first == x) |
2556 |
20 Aug 11 |
peter |
58 |
has_ties_ = true; |
2556 |
20 Aug 11 |
peter |
59 |
multimap_.insert(lower, element); |
821 |
18 Mar 07 |
peter |
60 |
if (target) |
2709 |
15 Mar 12 |
peter |
61 |
pos_weights_.add(w); |
821 |
18 Mar 07 |
peter |
62 |
else |
2709 |
15 Mar 12 |
peter |
63 |
neg_weights_.add(w); |
2697 |
28 Feb 12 |
peter |
64 |
area_ = std::numeric_limits<double>::quiet_NaN(); |
821 |
18 Mar 07 |
peter |
65 |
} |
821 |
18 Mar 07 |
peter |
66 |
|
821 |
18 Mar 07 |
peter |
67 |
|
3023 |
06 Apr 13 |
peter |
68 |
double ROC::area(void) const |
821 |
18 Mar 07 |
peter |
69 |
{ |
2697 |
28 Feb 12 |
peter |
70 |
if (std::isnan(area_)){ |
821 |
18 Mar 07 |
peter |
71 |
AUC auc(false); |
821 |
18 Mar 07 |
peter |
72 |
area_=auc.score(multimap_); |
821 |
18 Mar 07 |
peter |
73 |
} |
821 |
18 Mar 07 |
peter |
74 |
return area_; |
821 |
18 Mar 07 |
peter |
75 |
} |
821 |
18 Mar 07 |
peter |
76 |
|
821 |
18 Mar 07 |
peter |
77 |
|
821 |
18 Mar 07 |
peter |
78 |
double ROC::get_p_approx(double x) const |
821 |
18 Mar 07 |
peter |
79 |
{ |
2709 |
15 Mar 12 |
peter |
80 |
size_t n_pos = nof_points(pos_weights_); |
2709 |
15 Mar 12 |
peter |
81 |
size_t n_neg = nof_points(neg_weights_); |
2709 |
15 Mar 12 |
peter |
82 |
size_t nof_samples = n_pos + n_neg; |
821 |
18 Mar 07 |
peter |
// make x standard normal |
821 |
18 Mar 07 |
peter |
84 |
x -= 0.5; |
78 |
04 May 04 |
peter |
// Not integrating from the middle of the bin, but from the inner edge. |
78 |
04 May 04 |
peter |
86 |
if (x>0) |
2709 |
15 Mar 12 |
peter |
87 |
x -= 0.5/(n_pos*n_neg); |
78 |
04 May 04 |
peter |
88 |
else if(x<0) |
2709 |
15 Mar 12 |
peter |
89 |
x += 0.5/(n_pos*n_neg); |
2554 |
17 Aug 11 |
peter |
90 |
else |
821 |
18 Mar 07 |
peter |
91 |
return 0.5; |
2709 |
15 Mar 12 |
peter |
92 |
double var = 1.0+nof_samples; |
2556 |
20 Aug 11 |
peter |
93 |
if (has_ties_) { |
2556 |
20 Aug 11 |
peter |
94 |
double correction = 0; |
2556 |
20 Aug 11 |
peter |
95 |
Map::const_iterator first = multimap_.begin(); |
2556 |
20 Aug 11 |
peter |
96 |
Map::const_iterator last = multimap_.begin(); |
2556 |
20 Aug 11 |
peter |
97 |
while (first!=multimap_.end()) { |
2556 |
20 Aug 11 |
peter |
98 |
size_t n = 0; |
2556 |
20 Aug 11 |
peter |
99 |
while (first->first == last->first) { |
2556 |
20 Aug 11 |
peter |
100 |
++n; |
2556 |
20 Aug 11 |
peter |
101 |
++last; |
2556 |
20 Aug 11 |
peter |
102 |
} |
2556 |
20 Aug 11 |
peter |
103 |
correction += n * (n-1) * (n+1); |
2556 |
20 Aug 11 |
peter |
104 |
first = last; |
2556 |
20 Aug 11 |
peter |
105 |
} |
2556 |
20 Aug 11 |
peter |
106 |
/* |
2556 |
20 Aug 11 |
peter |
mn(N+1)/12-[mn/(12N(N-1)) * sum(t(t-1)(t+1))] = |
2556 |
20 Aug 11 |
peter |
mn/12 [ N+1 - 1/(N(N-1)) * sum(t(t-1)(t+1)) ] |
2556 |
20 Aug 11 |
peter |
109 |
*/ |
2709 |
15 Mar 12 |
peter |
110 |
var -= correction/(nof_samples * (nof_samples-1)); |
2556 |
20 Aug 11 |
peter |
111 |
} |
2709 |
15 Mar 12 |
peter |
112 |
var = var / (12*n_pos*n_neg); |
2556 |
20 Aug 11 |
peter |
113 |
return gsl_cdf_gaussian_Q(x, std::sqrt(var)); |
69 |
29 Apr 04 |
peter |
114 |
} |
69 |
29 Apr 04 |
peter |
115 |
|
69 |
29 Apr 04 |
peter |
116 |
|
2718 |
12 Apr 12 |
peter |
117 |
bool ROC::is_weighted(void) const |
2718 |
12 Apr 12 |
peter |
118 |
{ |
2718 |
12 Apr 12 |
peter |
119 |
return pos_weights_.variance() || neg_weights_.variance() |
2718 |
12 Apr 12 |
peter |
120 |
|| pos_weights_.mean() != neg_weights_.mean(); |
2718 |
12 Apr 12 |
peter |
121 |
} |
2718 |
12 Apr 12 |
peter |
122 |
|
3006 |
01 Apr 13 |
peter |
123 |
|
1271 |
09 Apr 08 |
peter |
124 |
unsigned int& ROC::minimum_size(void) |
718 |
26 Dec 06 |
jari |
125 |
{ |
718 |
26 Dec 06 |
jari |
126 |
return minimum_size_; |
718 |
26 Dec 06 |
jari |
127 |
} |
718 |
26 Dec 06 |
jari |
128 |
|
718 |
26 Dec 06 |
jari |
129 |
|
1271 |
09 Apr 08 |
peter |
130 |
const unsigned int& ROC::minimum_size(void) const |
718 |
26 Dec 06 |
jari |
131 |
{ |
821 |
18 Mar 07 |
peter |
132 |
return minimum_size_; |
718 |
26 Dec 06 |
jari |
133 |
} |
718 |
26 Dec 06 |
jari |
134 |
|
821 |
18 Mar 07 |
peter |
135 |
|
848 |
28 Apr 07 |
jari |
136 |
double ROC::n(void) const |
103 |
15 Jun 04 |
peter |
137 |
{ |
821 |
18 Mar 07 |
peter |
138 |
return n_pos()+n_neg(); |
103 |
15 Jun 04 |
peter |
139 |
} |
103 |
15 Jun 04 |
peter |
140 |
|
821 |
18 Mar 07 |
peter |
141 |
|
848 |
28 Apr 07 |
jari |
142 |
double ROC::n_neg(void) const |
103 |
15 Jun 04 |
peter |
143 |
{ |
2709 |
15 Mar 12 |
peter |
144 |
return neg_weights_.sum_x(); |
103 |
15 Jun 04 |
peter |
145 |
} |
103 |
15 Jun 04 |
peter |
146 |
|
623 |
05 Sep 06 |
peter |
147 |
|
848 |
28 Apr 07 |
jari |
148 |
double ROC::n_pos(void) const |
623 |
05 Sep 06 |
peter |
149 |
{ |
2709 |
15 Mar 12 |
peter |
150 |
return pos_weights_.sum_x(); |
623 |
05 Sep 06 |
peter |
151 |
} |
623 |
05 Sep 06 |
peter |
152 |
|
623 |
05 Sep 06 |
peter |
153 |
|
2709 |
15 Mar 12 |
peter |
154 |
size_t ROC::nof_points(const Averager& a) const |
2709 |
15 Mar 12 |
peter |
155 |
{ |
2838 |
16 Sep 12 |
peter |
156 |
return static_cast<size_t>(a.sum_x()*a.sum_x()/a.sum_xx()); |
2709 |
15 Mar 12 |
peter |
157 |
} |
2709 |
15 Mar 12 |
peter |
158 |
|
2709 |
15 Mar 12 |
peter |
159 |
|
2718 |
12 Apr 12 |
peter |
160 |
double ROC::p_exact_left(double area) const |
2585 |
25 Oct 11 |
peter |
161 |
{ |
2718 |
12 Apr 12 |
peter |
162 |
if (is_weighted()) |
2718 |
12 Apr 12 |
peter |
163 |
return p_left_weighted(area); |
2718 |
12 Apr 12 |
peter |
164 |
return p_exact_with_ties(multimap_.rbegin(), multimap_.rend(), |
2718 |
12 Apr 12 |
peter |
165 |
(1-area)*pos_weights_.n()*neg_weights_.n(), |
2718 |
12 Apr 12 |
peter |
166 |
pos_weights_.n(), neg_weights_.n()); |
2718 |
12 Apr 12 |
peter |
167 |
} |
2718 |
12 Apr 12 |
peter |
168 |
|
2718 |
12 Apr 12 |
peter |
169 |
|
2718 |
12 Apr 12 |
peter |
170 |
double ROC::p_exact_right(double area) const |
2718 |
12 Apr 12 |
peter |
171 |
{ |
2718 |
12 Apr 12 |
peter |
172 |
if (is_weighted()) |
2718 |
12 Apr 12 |
peter |
173 |
return p_right_weighted(area); |
2595 |
30 Oct 11 |
peter |
174 |
return p_exact_with_ties(multimap_.begin(), multimap_.end(), |
2710 |
16 Mar 12 |
peter |
175 |
area*pos_weights_.n()*neg_weights_.n(), |
2710 |
16 Mar 12 |
peter |
176 |
pos_weights_.n(), neg_weights_.n()); |
2585 |
25 Oct 11 |
peter |
177 |
} |
2585 |
25 Oct 11 |
peter |
178 |
|
2585 |
25 Oct 11 |
peter |
179 |
|
2718 |
12 Apr 12 |
peter |
180 |
double ROC::p_left_weighted(double area) const |
2718 |
12 Apr 12 |
peter |
181 |
{ |
2718 |
12 Apr 12 |
peter |
182 |
return count(utility::pair_first_iterator(multimap_.begin()), |
2718 |
12 Apr 12 |
peter |
183 |
utility::pair_first_iterator(multimap_.end()), 1-area); |
2718 |
12 Apr 12 |
peter |
184 |
} |
2718 |
12 Apr 12 |
peter |
185 |
|
2718 |
12 Apr 12 |
peter |
186 |
|
2718 |
12 Apr 12 |
peter |
187 |
double ROC::p_right_weighted(double area) const |
2718 |
12 Apr 12 |
peter |
188 |
{ |
2718 |
12 Apr 12 |
peter |
189 |
return count(utility::pair_first_iterator(multimap_.rbegin()), |
2718 |
12 Apr 12 |
peter |
190 |
utility::pair_first_iterator(multimap_.rend()), area); |
2718 |
12 Apr 12 |
peter |
191 |
} |
2718 |
12 Apr 12 |
peter |
192 |
|
2718 |
12 Apr 12 |
peter |
193 |
|
3006 |
01 Apr 13 |
peter |
194 |
double ROC::p_left() const |
3006 |
01 Apr 13 |
peter |
195 |
{ |
3023 |
06 Apr 13 |
peter |
196 |
if (std::isnan(area())) |
3006 |
01 Apr 13 |
peter |
197 |
return std::numeric_limits<double>::quiet_NaN(); |
3006 |
01 Apr 13 |
peter |
198 |
if (use_exact_method()) |
3023 |
06 Apr 13 |
peter |
199 |
return p_exact_left(area()); |
3023 |
06 Apr 13 |
peter |
200 |
return get_p_approx(1-area()); |
3006 |
01 Apr 13 |
peter |
201 |
} |
3006 |
01 Apr 13 |
peter |
202 |
|
3006 |
01 Apr 13 |
peter |
203 |
|
3006 |
01 Apr 13 |
peter |
204 |
double ROC::p_right() const |
3006 |
01 Apr 13 |
peter |
205 |
{ |
3023 |
06 Apr 13 |
peter |
206 |
if (std::isnan(area())) |
3006 |
01 Apr 13 |
peter |
207 |
return std::numeric_limits<double>::quiet_NaN(); |
3006 |
01 Apr 13 |
peter |
208 |
if (use_exact_method()) |
3023 |
06 Apr 13 |
peter |
209 |
return p_exact_right(area()); |
3023 |
06 Apr 13 |
peter |
210 |
return get_p_approx(area()); |
3006 |
01 Apr 13 |
peter |
211 |
} |
3006 |
01 Apr 13 |
peter |
212 |
|
3006 |
01 Apr 13 |
peter |
213 |
|
821 |
18 Mar 07 |
peter |
214 |
double ROC::p_value() const |
103 |
15 Jun 04 |
peter |
215 |
{ |
3023 |
06 Apr 13 |
peter |
216 |
if (std::isnan(area())) |
2601 |
30 Oct 11 |
peter |
217 |
return std::numeric_limits<double>::quiet_NaN(); |
2585 |
25 Oct 11 |
peter |
218 |
if (use_exact_method()) { |
2585 |
25 Oct 11 |
peter |
219 |
double p = 0; |
3023 |
06 Apr 13 |
peter |
220 |
double abs_area = std::max(area(), 1-area()); |
2718 |
12 Apr 12 |
peter |
221 |
p = p_exact_right(abs_area); |
2585 |
25 Oct 11 |
peter |
222 |
if (has_ties_) { |
2718 |
12 Apr 12 |
peter |
223 |
p += p_exact_left(1.0 - abs_area); |
2585 |
25 Oct 11 |
peter |
224 |
} |
2585 |
25 Oct 11 |
peter |
225 |
else |
2585 |
25 Oct 11 |
peter |
226 |
p *= 2.0; |
2585 |
25 Oct 11 |
peter |
// avoid double counting when area is 0.5 |
2585 |
25 Oct 11 |
peter |
228 |
return std::min(p, 1.0); |
2585 |
25 Oct 11 |
peter |
229 |
} |
3439 |
20 Nov 15 |
peter |
230 |
return 2*get_p_approx(std::max(area(), 1-area())); |
821 |
18 Mar 07 |
peter |
231 |
} |
475 |
22 Dec 05 |
peter |
232 |
|
488 |
04 Jan 06 |
peter |
233 |
|
821 |
18 Mar 07 |
peter |
234 |
double ROC::p_value_one_sided() const |
103 |
15 Jun 04 |
peter |
235 |
{ |
3006 |
01 Apr 13 |
peter |
236 |
return p_right(); |
103 |
15 Jun 04 |
peter |
237 |
} |
103 |
15 Jun 04 |
peter |
238 |
|
821 |
18 Mar 07 |
peter |
239 |
|
2720 |
12 Apr 12 |
peter |
240 |
void ROC::remove(double value, bool target, double weight) |
2720 |
12 Apr 12 |
peter |
241 |
{ |
2722 |
12 Apr 12 |
peter |
242 |
if (!weight) |
2722 |
12 Apr 12 |
peter |
243 |
return; |
2720 |
12 Apr 12 |
peter |
244 |
std::pair<Map::iterator, Map::iterator> iter = multimap_.equal_range(value); |
2720 |
12 Apr 12 |
peter |
245 |
while (iter.first!=iter.second) { |
2720 |
12 Apr 12 |
peter |
246 |
if (iter.first->second.first==target && iter.first->second.first==target){ |
2720 |
12 Apr 12 |
peter |
247 |
multimap_.erase(iter.first); |
2722 |
12 Apr 12 |
peter |
248 |
if (target) |
2722 |
12 Apr 12 |
peter |
249 |
pos_weights_.add(weight, -1); |
2722 |
12 Apr 12 |
peter |
250 |
else |
2722 |
12 Apr 12 |
peter |
251 |
neg_weights_.add(weight, -1); |
2722 |
12 Apr 12 |
peter |
252 |
area_ = std::numeric_limits<double>::quiet_NaN(); |
2720 |
12 Apr 12 |
peter |
253 |
return; |
2720 |
12 Apr 12 |
peter |
254 |
} |
2720 |
12 Apr 12 |
peter |
255 |
++iter.first; |
2720 |
12 Apr 12 |
peter |
256 |
} |
2720 |
12 Apr 12 |
peter |
257 |
std::stringstream ss; |
2720 |
12 Apr 12 |
peter |
258 |
ss << "ROC::remove(" << value << "," << target << "," << weight << "): " |
2720 |
12 Apr 12 |
peter |
259 |
<< "no such element"; |
2720 |
12 Apr 12 |
peter |
260 |
throw utility::runtime_error(ss.str()); |
2720 |
12 Apr 12 |
peter |
261 |
} |
2720 |
12 Apr 12 |
peter |
262 |
|
2720 |
12 Apr 12 |
peter |
263 |
|
821 |
18 Mar 07 |
peter |
264 |
void ROC::reset(void) |
103 |
15 Jun 04 |
peter |
265 |
{ |
2697 |
28 Feb 12 |
peter |
266 |
area_ = std::numeric_limits<double>::quiet_NaN(); |
2556 |
20 Aug 11 |
peter |
267 |
has_ties_ = false; |
2709 |
15 Mar 12 |
peter |
268 |
neg_weights_.reset(); |
2709 |
15 Mar 12 |
peter |
269 |
pos_weights_.reset(); |
821 |
18 Mar 07 |
peter |
270 |
multimap_.clear(); |
103 |
15 Jun 04 |
peter |
271 |
} |
103 |
15 Jun 04 |
peter |
272 |
|
2585 |
25 Oct 11 |
peter |
273 |
|
2585 |
25 Oct 11 |
peter |
274 |
bool ROC::use_exact_method(void) const |
2585 |
25 Oct 11 |
peter |
275 |
{ |
2612 |
04 Nov 11 |
peter |
276 |
return (n_pos() < minimum_size_) || (n_neg() < minimum_size_); |
2585 |
25 Oct 11 |
peter |
277 |
} |
2585 |
25 Oct 11 |
peter |
278 |
|
2718 |
12 Apr 12 |
peter |
279 |
|
2718 |
12 Apr 12 |
peter |
280 |
ROC::Weights::Weights(void) |
2718 |
12 Apr 12 |
peter |
281 |
: small_pos(0), small_neg(0), tied_pos(0), tied_neg(0) |
2718 |
12 Apr 12 |
peter |
282 |
{} |
2718 |
12 Apr 12 |
peter |
283 |
|
683 |
11 Oct 06 |
jari |
284 |
}}} // of namespace statistics, yat, and theplu |