4200 |
19 Aug 22 |
peter |
1 |
#ifndef _theplu_yat_statistics_percentiler_ |
4200 |
19 Aug 22 |
peter |
2 |
#define _theplu_yat_statistics_percentiler_ |
1317 |
21 May 08 |
peter |
3 |
|
1317 |
21 May 08 |
peter |
// $Id$ |
1317 |
21 May 08 |
peter |
5 |
|
1317 |
21 May 08 |
peter |
6 |
/* |
2121 |
13 Dec 09 |
peter |
Copyright (C) 2008, 2009 Jari Häkkinen, Peter Johansson |
4359 |
23 Aug 23 |
peter |
Copyright (C) 2010, 2011, 2014, 2016, 2020 Peter Johansson |
1317 |
21 May 08 |
peter |
9 |
|
1469 |
02 Sep 08 |
peter |
This file is part of the yat library, http://dev.thep.lu.se/yat |
1317 |
21 May 08 |
peter |
11 |
|
1317 |
21 May 08 |
peter |
The yat library is free software; you can redistribute it and/or |
1317 |
21 May 08 |
peter |
modify it under the terms of the GNU General Public License as |
1486 |
09 Sep 08 |
jari |
published by the Free Software Foundation; either version 3 of the |
1317 |
21 May 08 |
peter |
License, or (at your option) any later version. |
1317 |
21 May 08 |
peter |
16 |
|
1317 |
21 May 08 |
peter |
The yat library is distributed in the hope that it will be useful, |
1317 |
21 May 08 |
peter |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
1317 |
21 May 08 |
peter |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
1317 |
21 May 08 |
peter |
General Public License for more details. |
1317 |
21 May 08 |
peter |
21 |
|
1317 |
21 May 08 |
peter |
You should have received a copy of the GNU General Public License |
1487 |
10 Sep 08 |
jari |
along with yat. If not, see <http://www.gnu.org/licenses/>. |
1317 |
21 May 08 |
peter |
24 |
*/ |
1317 |
21 May 08 |
peter |
25 |
|
2263 |
26 May 10 |
peter |
26 |
#include "yat/utility/concept_check.h" |
1404 |
07 Aug 08 |
peter |
27 |
#include "yat/utility/DataWeight.h" |
1317 |
21 May 08 |
peter |
28 |
#include "yat/utility/iterator_traits.h" |
1404 |
07 Aug 08 |
peter |
29 |
#include "yat/utility/yat_assert.h" |
1404 |
07 Aug 08 |
peter |
30 |
#include "yat/utility/WeightIterator.h" |
1317 |
21 May 08 |
peter |
31 |
|
2202 |
21 Feb 10 |
peter |
32 |
#include <boost/concept_check.hpp> |
3511 |
22 Jul 16 |
peter |
33 |
#include <boost/iterator/iterator_concepts.hpp> |
2202 |
21 Feb 10 |
peter |
34 |
|
1317 |
21 May 08 |
peter |
35 |
#include <algorithm> |
1317 |
21 May 08 |
peter |
36 |
#include <cmath> |
2470 |
12 Apr 11 |
peter |
37 |
#include <limits> |
1404 |
07 Aug 08 |
peter |
38 |
#include <numeric> |
1404 |
07 Aug 08 |
peter |
39 |
#include <stdexcept> |
1317 |
21 May 08 |
peter |
40 |
#include <vector> |
1317 |
21 May 08 |
peter |
41 |
|
1317 |
21 May 08 |
peter |
42 |
namespace theplu { |
1317 |
21 May 08 |
peter |
43 |
namespace yat { |
3240 |
24 May 14 |
peter |
44 |
namespace statistics { |
1317 |
21 May 08 |
peter |
45 |
|
1317 |
21 May 08 |
peter |
46 |
/** |
1317 |
21 May 08 |
peter |
\brief Functor to calculate percentile of a range |
1339 |
06 Jun 08 |
peter |
48 |
|
1339 |
06 Jun 08 |
peter |
\since New in yat 0.5 |
1317 |
21 May 08 |
peter |
50 |
*/ |
1317 |
21 May 08 |
peter |
51 |
class Percentiler |
1317 |
21 May 08 |
peter |
52 |
{ |
1317 |
21 May 08 |
peter |
53 |
public: |
1317 |
21 May 08 |
peter |
54 |
/** |
1422 |
20 Aug 08 |
peter |
\param perc percentile to calculate [0,100]. Default value is |
1317 |
21 May 08 |
peter |
50, which implies class will calculate median. |
1404 |
07 Aug 08 |
peter |
\param sorted if true class assumes that ranges are already |
1317 |
21 May 08 |
peter |
sorted, if false the range will copied to a new range which is |
1317 |
21 May 08 |
peter |
sorted. |
1317 |
21 May 08 |
peter |
60 |
*/ |
1317 |
21 May 08 |
peter |
61 |
Percentiler(double perc=50, bool sorted=false); |
1317 |
21 May 08 |
peter |
62 |
|
1317 |
21 May 08 |
peter |
63 |
/** |
1317 |
21 May 08 |
peter |
Function is a non-mutable function, i.e., \a first and \a last |
1317 |
21 May 08 |
peter |
can be const_iterators. |
1317 |
21 May 08 |
peter |
66 |
|
1422 |
20 Aug 08 |
peter |
The \a Nth percentile is defined such that, for example, when |
1422 |
20 Aug 08 |
peter |
having four numbers \f$ 0.69 < 1.41 < 3.14 < 28 \f$, the \a Nth |
1422 |
20 Aug 08 |
peter |
percentile is: |
1422 |
20 Aug 08 |
peter |
70 |
|
1422 |
20 Aug 08 |
peter |
- \f$ 0.69 \f$ if \f$ N < 25 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ (0.69+1.41)/2 \f$ if \f$ N=25 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ 1.41 \f$ if \f$ 25 < N < 50 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ (1.41+3.14)/2 \f$ if \f$ N=50 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ 3.14 \f$ if \f$ 50 < N < 75 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ (3.14+28)/2 \f$ if \f$ N=75 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ 28 \f$ if \f$ 75 < N \f$ |
1422 |
20 Aug 08 |
peter |
78 |
|
1422 |
20 Aug 08 |
peter |
Similarily, if we have a weighted range \f$ x_0=0.69, w_0=1 ; |
1422 |
20 Aug 08 |
peter |
x_1=1.41, w_1=0 ; x_2=3.14, w_2=0.5 ; x_3=28, w_3=0.5 \f$, we |
1422 |
20 Aug 08 |
peter |
calculate the accumulated normalized weight \f$ S_k = \frac |
1422 |
20 Aug 08 |
peter |
{100}{\sum w_i}\sum_{i=0}^k w_i \f$ and the percentile is |
1422 |
20 Aug 08 |
peter |
83 |
|
1422 |
20 Aug 08 |
peter |
- \f$ 0.69 \f$ if \f$ N < S_0 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ (0.69+3.14)/2 \f$ if \f$ N=S_0 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ 3.14 \f$ if \f$ S_0 < N < S_2 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ (3.14+28)/2 \f$ if \f$ N=S_2 \f$ |
1422 |
20 Aug 08 |
peter |
- \f$ 28 \f$ if \f$ S_2 < N \f$ |
1422 |
20 Aug 08 |
peter |
89 |
|
3240 |
24 May 14 |
peter |
Note, that data point with weight zero is completely ignored. |
3240 |
24 May 14 |
peter |
91 |
|
3511 |
22 Jul 16 |
peter |
Type Requirements: |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \ref |
3511 |
22 Jul 16 |
peter |
concept_data_iterator |
3511 |
22 Jul 16 |
peter |
- \c RandomAccessIterator must be a \random_access_traversal_iterator |
3870 |
24 Feb 20 |
peter |
- \c For unweighted iterator, RandomAccessIterator must be an |
3870 |
24 Feb 20 |
peter |
\lvalue_iterator as implied by \forward_iterator. |
2202 |
21 Feb 10 |
peter |
98 |
|
1317 |
21 May 08 |
peter |
\return percentile of range |
1317 |
21 May 08 |
peter |
100 |
*/ |
1323 |
24 May 08 |
peter |
101 |
template<typename RandomAccessIterator> |
3240 |
24 May 14 |
peter |
102 |
double operator()(RandomAccessIterator first, |
1404 |
07 Aug 08 |
peter |
103 |
RandomAccessIterator last) const |
1317 |
21 May 08 |
peter |
104 |
{ |
2263 |
26 May 10 |
peter |
105 |
BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>)); |
3511 |
22 Jul 16 |
peter |
106 |
BOOST_CONCEPT_ASSERT((boost_concepts::RandomAccessTraversal<RandomAccessIterator>)); |
3870 |
24 Feb 20 |
peter |
107 |
// |
2470 |
12 Apr 11 |
peter |
108 |
if (first==last) |
2470 |
12 Apr 11 |
peter |
109 |
return std::numeric_limits<double>::quiet_NaN(); |
3243 |
24 May 14 |
peter |
// just to avoid long line |
3243 |
24 May 14 |
peter |
111 |
using utility::weighted_iterator_traits; |
3243 |
24 May 14 |
peter |
112 |
typedef typename weighted_iterator_traits<RandomAccessIterator>::type tag; |
3243 |
24 May 14 |
peter |
113 |
return calculate(first, last, sorted_, tag()); |
1317 |
21 May 08 |
peter |
114 |
} |
1317 |
21 May 08 |
peter |
115 |
private: |
1317 |
21 May 08 |
peter |
116 |
double perc_; |
1317 |
21 May 08 |
peter |
117 |
bool sorted_; |
1317 |
21 May 08 |
peter |
118 |
|
1404 |
07 Aug 08 |
peter |
// unweighted version |
1323 |
24 May 08 |
peter |
120 |
template<typename RandomAccessIterator> |
1323 |
24 May 08 |
peter |
121 |
double calculate(RandomAccessIterator first, RandomAccessIterator last, |
1404 |
07 Aug 08 |
peter |
122 |
bool sorted, utility::unweighted_iterator_tag tag) const; |
1317 |
21 May 08 |
peter |
123 |
|
1404 |
07 Aug 08 |
peter |
// weighted version |
1323 |
24 May 08 |
peter |
125 |
template<typename RandomAccessIterator> |
1323 |
24 May 08 |
peter |
126 |
double calculate(RandomAccessIterator first, RandomAccessIterator last, |
1404 |
07 Aug 08 |
peter |
127 |
bool sorted, utility::weighted_iterator_tag tag) const; |
1317 |
21 May 08 |
peter |
128 |
|
1317 |
21 May 08 |
peter |
// using compiler generated copy |
1317 |
21 May 08 |
peter |
//Percentiler(const Percentiler&); |
1317 |
21 May 08 |
peter |
//Percentiler& operator=(const Percentiler&); |
1317 |
21 May 08 |
peter |
132 |
|
1317 |
21 May 08 |
peter |
133 |
}; |
1317 |
21 May 08 |
peter |
134 |
|
1317 |
21 May 08 |
peter |
// template implementations |
1317 |
21 May 08 |
peter |
136 |
// ///////////////////////// |
1317 |
21 May 08 |
peter |
137 |
|
1317 |
21 May 08 |
peter |
// unweighted version |
1323 |
24 May 08 |
peter |
139 |
template<typename RandomAccessIterator> |
3240 |
24 May 14 |
peter |
140 |
double |
3240 |
24 May 14 |
peter |
141 |
Percentiler::calculate(RandomAccessIterator first, |
1404 |
07 Aug 08 |
peter |
142 |
RandomAccessIterator last, |
1404 |
07 Aug 08 |
peter |
143 |
bool sorted, |
1404 |
07 Aug 08 |
peter |
144 |
utility::unweighted_iterator_tag tag) const |
1317 |
21 May 08 |
peter |
145 |
{ |
3870 |
24 Feb 20 |
peter |
146 |
BOOST_CONCEPT_ASSERT((boost_concepts::LvalueIterator<RandomAccessIterator>)); |
3243 |
24 May 14 |
peter |
147 |
size_t n = last - first; |
1323 |
24 May 08 |
peter |
// range is one value only is a special case |
3243 |
24 May 14 |
peter |
149 |
if (n == 1) |
1418 |
19 Aug 08 |
peter |
150 |
*first; |
3243 |
24 May 14 |
peter |
151 |
|
3243 |
24 May 14 |
peter |
152 |
double j = n * perc_ / 100.0; |
3243 |
24 May 14 |
peter |
153 |
|
3243 |
24 May 14 |
peter |
// Some special cases |
3243 |
24 May 14 |
peter |
155 |
if (j > n-1) { |
3243 |
24 May 14 |
peter |
156 |
if (sorted) |
1317 |
21 May 08 |
peter |
157 |
return *(--last); |
3243 |
24 May 14 |
peter |
158 |
return *std::max_element(first, last); |
3243 |
24 May 14 |
peter |
159 |
} |
3243 |
24 May 14 |
peter |
160 |
if (j < 1.0) { |
3243 |
24 May 14 |
peter |
161 |
if (sorted) |
1404 |
07 Aug 08 |
peter |
162 |
return *first; |
3243 |
24 May 14 |
peter |
163 |
return *std::min_element(first, last); |
1317 |
21 May 08 |
peter |
164 |
} |
1317 |
21 May 08 |
peter |
165 |
|
3243 |
24 May 14 |
peter |
166 |
size_t i = static_cast<size_t>(j); |
3243 |
24 May 14 |
peter |
// for border case (j integer) we take the average of adjacent bins |
3243 |
24 May 14 |
peter |
168 |
size_t k = (i==j) ? i-1 : i; |
3243 |
24 May 14 |
peter |
169 |
|
3243 |
24 May 14 |
peter |
170 |
if (sorted) { |
3243 |
24 May 14 |
peter |
171 |
if (i==k) |
3243 |
24 May 14 |
peter |
172 |
return first[i]; |
3243 |
24 May 14 |
peter |
173 |
return (first[i]+first[k])/2; |
3243 |
24 May 14 |
peter |
174 |
} |
3243 |
24 May 14 |
peter |
175 |
|
3243 |
24 May 14 |
peter |
176 |
std::vector<double> vec(first, last); |
3243 |
24 May 14 |
peter |
// find the ith element |
3243 |
24 May 14 |
peter |
178 |
std::nth_element(vec.begin(), vec.begin()+i, vec.end()); |
3243 |
24 May 14 |
peter |
// if i==k simply return vec[i] |
3243 |
24 May 14 |
peter |
180 |
if (i==k) |
3243 |
24 May 14 |
peter |
181 |
return vec[i]; |
3243 |
24 May 14 |
peter |
182 |
YAT_ASSERT(k==i-1); |
3243 |
24 May 14 |
peter |
// otherwise return average of vec[i] and vec[k]. |
3243 |
24 May 14 |
peter |
184 |
|
3243 |
24 May 14 |
peter |
// We need to find the kth element. Since we have called |
3243 |
24 May 14 |
peter |
// nth_element above, we know that all elements in (begin+i, end) |
3243 |
24 May 14 |
peter |
// are >= vec[i] and all elements in [begin, begin+i) are <= |
3243 |
24 May 14 |
peter |
// vec[i]. Consequently, kth (k=i-1) element is the max element in |
3243 |
24 May 14 |
peter |
// range [begin, begin+i). |
3243 |
24 May 14 |
peter |
190 |
return (vec[i] + *std::max_element(vec.begin(), vec.begin()+i))/2; |
1317 |
21 May 08 |
peter |
191 |
} |
1323 |
24 May 08 |
peter |
192 |
|
1323 |
24 May 08 |
peter |
193 |
|
1317 |
21 May 08 |
peter |
// weighted version |
1323 |
24 May 08 |
peter |
195 |
template<typename RandomAccessIterator> |
3240 |
24 May 14 |
peter |
196 |
double Percentiler::calculate(RandomAccessIterator first, |
1323 |
24 May 08 |
peter |
197 |
RandomAccessIterator last, |
1404 |
07 Aug 08 |
peter |
198 |
bool sorted, |
1317 |
21 May 08 |
peter |
199 |
utility::weighted_iterator_tag tag) const |
1317 |
21 May 08 |
peter |
200 |
{ |
1404 |
07 Aug 08 |
peter |
201 |
if (sorted) { |
1404 |
07 Aug 08 |
peter |
202 |
utility::iterator_traits<RandomAccessIterator> trait; |
1404 |
07 Aug 08 |
peter |
203 |
std::vector<double> accum_w; |
1404 |
07 Aug 08 |
peter |
204 |
accum_w.reserve(last-first); |
1404 |
07 Aug 08 |
peter |
205 |
std::partial_sum(weight_iterator(first), |
1404 |
07 Aug 08 |
peter |
206 |
weight_iterator(last), |
1404 |
07 Aug 08 |
peter |
207 |
std::back_inserter(accum_w)); |
1404 |
07 Aug 08 |
peter |
208 |
|
1404 |
07 Aug 08 |
peter |
209 |
double w_bound=perc_/100.0*accum_w.back(); |
1418 |
19 Aug 08 |
peter |
210 |
std::vector<double>::const_iterator upper(accum_w.begin()); |
1420 |
20 Aug 08 |
peter |
211 |
double margin=1e-10; |
2470 |
12 Apr 11 |
peter |
212 |
while (upper!=accum_w.end() && *upper <= w_bound+margin) |
1404 |
07 Aug 08 |
peter |
213 |
++upper; |
2470 |
12 Apr 11 |
peter |
214 |
while (upper!=accum_w.begin() && |
3240 |
24 May 14 |
peter |
215 |
(upper==accum_w.end() || |
2470 |
12 Apr 11 |
peter |
216 |
trait.weight(first+(upper-accum_w.begin()))==0.0)) |
1420 |
20 Aug 08 |
peter |
217 |
--upper; |
1418 |
19 Aug 08 |
peter |
218 |
std::vector<double>::const_iterator lower(upper); |
1420 |
20 Aug 08 |
peter |
219 |
while ( *(lower-1)>=w_bound-margin && lower>accum_w.begin()) |
1404 |
07 Aug 08 |
peter |
220 |
--lower; |
3240 |
24 May 14 |
peter |
221 |
|
1418 |
19 Aug 08 |
peter |
222 |
return (trait.data(first+(upper-accum_w.begin()))+ |
1420 |
20 Aug 08 |
peter |
223 |
trait.data(first+(lower-accum_w.begin())))/2; |
1404 |
07 Aug 08 |
peter |
224 |
} |
1404 |
07 Aug 08 |
peter |
225 |
|
3244 |
24 May 14 |
peter |
226 |
std::vector<utility::DataWeight> v_copy(first, last); |
1404 |
07 Aug 08 |
peter |
227 |
std::sort(v_copy.begin(), v_copy.end()); |
1404 |
07 Aug 08 |
peter |
228 |
return calculate(v_copy.begin(), v_copy.end(), true, tag); |
1317 |
21 May 08 |
peter |
229 |
} |
1317 |
21 May 08 |
peter |
230 |
|
1317 |
21 May 08 |
peter |
231 |
|
1317 |
21 May 08 |
peter |
232 |
}}} // of namespace statistics, yat, and theplu |
1317 |
21 May 08 |
peter |
233 |
|
1317 |
21 May 08 |
peter |
234 |
#endif |