yat  0.12.3pre
ROC.h
1 #ifndef _theplu_yat_statistics_roc_
2 #define _theplu_yat_statistics_roc_
3 
4 // $Id: ROC.h 3023 2013-04-06 02:35:36Z peter $
5 
6 /*
7  Copyright (C) 2004 Peter Johansson
8  Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2011, 2012, 2013 Peter Johansson
10 
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12 
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17 
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22 
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25 */
26 
27 #include "Averager.h"
28 #include "yat/utility/deprecate.h"
30 #include "yat/utility/yat_assert.h"
31 
32 #include <gsl/gsl_randist.h>
33 
34 #include <algorithm>
35 #include <map>
36 #include <utility>
37 #include <vector>
38 
39 namespace theplu {
40 namespace yat {
41 namespace statistics {
42 
52  class ROC
53  {
54 
55  public:
59  ROC(void);
60 
74  void add(double value, bool target, double weight=1.0);
75 
83  double area(void) const;
84 
92  unsigned int& minimum_size(void);
93 
107  const unsigned int& minimum_size(void) const;
108 
114  double n(void) const;
115 
121  double n_neg(void) const;
122 
128  double n_pos(void) const;
129 
130 
136  double p_left(void) const;
137 
183  double p_right(void) const;
184 
189  double p_value_one_sided(void) const YAT_DEPRECATE;
190 
208  double p_value(void) const;
209 
219  void remove(double value, bool target, double weight=1.0);
220 
224  void reset(void);
225 
226  private:
227  typedef std::multimap<double, std::pair<bool, double> > Map;
228  typedef std::vector<std::pair<bool, Map::mapped_type> > Vector;
229 
230  // struct used in count functions
231  struct Weights
232  {
233  Weights(void);
234  double small_pos;
235  double small_neg;
236  double tied_pos;
237  double tied_neg;
238  };
239 
241  double get_p_approx(double) const;
242 
246  bool is_weighted(void) const;
247 
251  size_t nof_points(const Averager& a) const;
252 
253  /*
254  Calculate probability to get an area equal (smaller) than \a
255  area given the distribution of weights and ties in multimap_
256  */
257  double p_left_weighted(double area) const;
258 
259  /*
260  Calculate probability to get an area equal (greater) than \a
261  area given the distribution of weights and ties in multimap_
262  */
263  double p_right_weighted(double area) const;
264 
265  /*
266  Count number of combinations (of N!) that gives weight sum equal
267  or larger than \a threshold.
268 
269  Range [first, last) is used to check for ties. If, e.g., *first
270  and *(first+1) are equal implies that the two largest values are
271  equal.
272  */
273  template <typename Iterator>
274  double count(Iterator first, Iterator last, double threshold) const;
275 
276  /*
277  Loop over all elements in \a weights and call count(7)
278  */
279  template <typename Iterator>
280  double count(Vector& weights, Iterator iter, Iterator last,
281  double threshold, double sum, const Weights& weight) const;
282 
283  /*
284  Count number of combinations in which sum>=threshold given
285  classes and weights in \a weight. Range [iter, last) is used to
286  handle ties.
287  */
288  template <typename Iterator>
289  double count(Vector& weights, Iterator iter, Iterator last,
290  double threshold, double sum, Weights weight,
291  const std::pair<bool, double>& entry) const;
292 
293  /*
294  Calculates probability to get \a block number of pairs correctly
295  sorted when having \a pos positive samples and \a neg negative
296  samples given the distribution of ties as in [first, last).
297  */
298  template<typename ForwardIterator>
299  double p_exact_with_ties(ForwardIterator first, ForwardIterator last,
300  double block, unsigned int pos,
301  unsigned int neg) const;
302 
306  double p_exact_right(double area) const;
307 
311  double p_exact_left(double area) const;
312 
313  bool use_exact_method(void) const;
314 
315  mutable double area_;
316  bool has_ties_;
317  unsigned int minimum_size_;
318  Averager neg_weights_;
319  Averager pos_weights_;
320  Map multimap_;
321  };
322 
323  template<typename ForwardIterator>
324  double
325  ROC::p_exact_with_ties(ForwardIterator begin, ForwardIterator end,
326  double block, unsigned int pos,unsigned int neg) const
327  {
328  if (block <= 0)
329  return 1.0;
330  if (block > pos*neg)
331  return 0.0;
332 
333  ForwardIterator iter(begin);
334  unsigned int n=0;
335  while (iter!=end && iter->first == begin->first) {
336  ++iter;
337  ++n;
338  }
339  double result = 0;
340  /*
341  pos1 neg1 | n
342  pos2 neg2 |
343  ---- ---- ----
344  pos neg
345  */
346 
347  // ensure pos1 and neg2 are non-negative
348  unsigned int pos1 = n - std::min(n, neg);
349  // ensure pos2 and neg1 are non-negative
350  unsigned int max = std::min(n, pos);
351  YAT_ASSERT(pos1<=max);
352  for ( ; pos1<=max; ++pos1) {
353  unsigned int neg1 = n-pos1;
354  YAT_ASSERT(neg1<=n);
355  unsigned int pos2 = pos-pos1;
356  YAT_ASSERT(pos2<=pos);
357  unsigned int neg2 = neg-neg1;
358  YAT_ASSERT(neg2<=neg);
359  result += gsl_ran_hypergeometric_pdf(pos1, static_cast<unsigned int>(pos),
360  static_cast<unsigned int>(neg), n)
361  * p_exact_with_ties(iter, end,
362  block - pos2*neg1 - 0.5*pos1*neg1,
363  pos2, neg2);
364  }
365  return result;
366  }
367 
368 
369  template <typename Iterator>
370  double ROC::count(Iterator first, Iterator last, double threshold) const
371  {
372  Vector vec;
373  vec.reserve(multimap_.size());
374  // copy values from multimap_ to v
375  for (Map::const_iterator i = multimap_.begin(); i!=multimap_.end(); ++i)
376  vec.push_back(std::make_pair(false, i->second));
377 
378  ROC::Weights w;
379  w.small_pos = pos_weights_.sum_x();
380  w.small_neg = neg_weights_.sum_x();
381  return count(vec, first, last, threshold*w.small_pos*w.small_neg, 0, w);
382  }
383 
384 
385 
386  template <typename Iterator>
387  double ROC::count(ROC::Vector& v, Iterator iter, Iterator last,
388  double threshold, double sum, const Weights& w) const
389  {
390  double result = 0.0;
391  // loop over all elements
392  int nof_elements = 0;
393  for (ROC::Vector::iterator i=v.begin(); i!=v.end(); ++i) {
394  if (i->first)
395  continue;
396  i->first = true;
397  result += count(v, iter, last, threshold, sum, w, i->second);
398  i->first = false;
399  ++nof_elements;
400  }
401  YAT_ASSERT(nof_elements);
402  return result/nof_elements;
403  }
404 
405 
406  template <typename Iterator>
407  double ROC::count(Vector& weights, Iterator iter, Iterator last,
408  double threshold, double sum, Weights w,
409  const std::pair<bool, double>& entry) const
410  {
411  double tiny = 10e-10;
412 
413  Iterator next(iter);
414  YAT_ASSERT(next!=last);
415  ++next;
416 
417  // update weights
418  if (entry.first) {
419  w.tied_pos += entry.second;
420  w.small_pos -= entry.second;
421  }
422  else {
423  w.tied_neg += entry.second;
424  w.small_neg -= entry.second;
425  }
426 
427  // last entry in equal range
428  if (next==last || *next!=*iter) {
429  sum += 0.5*w.tied_pos*w.tied_neg + w.tied_pos * w.small_neg;
430  w.tied_pos=0;
431  w.tied_neg=0;
432  }
433 
434  // max sum happens if all pos values belong to current equal range
435  // and none of the remaining neg values
436  double max_sum = sum + 0.5*(w.tied_pos+w.small_pos)*w.tied_neg +
437  (w.tied_pos+w.small_pos)*w.small_neg;
438 
439  if (max_sum<threshold-tiny)
440  return 0.0;
441  if (sum + 0.5*w.tied_pos*(w.tied_neg+w.small_neg) >= threshold-tiny)
442  return 1.0;
443 
444  if (next!=last)
445  return count(weights, next, last, threshold, sum, w);
446  return 0.0;
447  }
448 
449 }}} // of namespace statistics, yat, and theplu
450 #endif
double area(void) const
Area Under Curve, AUC.
ROC(void)
Default constructor.
unsigned int & minimum_size(void)
threshold for p_value calculation
Class to calculate simple (first and second moments) averages.
Definition: Averager.h:45
double n_neg(void) const
number of negative samples
double p_value(void) const
Two-sided p-value.
void add(double value, bool target, double weight=1.0)
Add a data value.
double p_right(void) const
One-sided P-value.
T max(const T &a, const T &b, const T &c)
Definition: stl_utility.h:638
double p_value_one_sided(void) const
double n_pos(void) const
number of positive samples
double p_left(void) const
double n(void) const
number of samples
Reciever Operating Characteristic.
Definition: ROC.h:52
double sum_x(void) const
Definition: averager_base.h:128
void reset(void)
Set everything to zero.

Generated on Mon Jun 1 2015 12:29:52 for yat by  doxygen 1.8.5