yat  0.10.4pre
ROC.h
1 #ifndef _theplu_yat_statistics_roc_
2 #define _theplu_yat_statistics_roc_
3 
4 // $Id: ROC.h 2736 2012-05-29 10:06:01Z peter $
5 
6 /*
7  Copyright (C) 2004 Peter Johansson
8  Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
9  Copyright (C) 2011, 2012 Peter Johansson
10 
11  This file is part of the yat library, http://dev.thep.lu.se/yat
12 
13  The yat library is free software; you can redistribute it and/or
14  modify it under the terms of the GNU General Public License as
15  published by the Free Software Foundation; either version 3 of the
16  License, or (at your option) any later version.
17 
18  The yat library is distributed in the hope that it will be useful,
19  but WITHOUT ANY WARRANTY; without even the implied warranty of
20  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21  General Public License for more details.
22 
23  You should have received a copy of the GNU General Public License
24  along with yat. If not, see <http://www.gnu.org/licenses/>.
25 */
26 
27 #include "Averager.h"
29 #include "yat/utility/yat_assert.h"
30 
31 #include <gsl/gsl_randist.h>
32 
33 #include <algorithm>
34 #include <map>
35 #include <utility>
36 #include <vector>
37 
38 namespace theplu {
39 namespace yat {
40 namespace statistics {
41 
51  class ROC
52  {
53 
54  public:
58  ROC(void);
59 
73  void add(double value, bool target, double weight=1.0);
74 
82  double area(void);
83 
91  unsigned int& minimum_size(void);
92 
106  const unsigned int& minimum_size(void) const;
107 
113  double n(void) const;
114 
120  double n_neg(void) const;
121 
127  double n_pos(void) const;
128 
174  double p_value_one_sided(void) const;
175 
193  double p_value(void) const;
194 
204  void remove(double value, bool target, double weight=1.0);
205 
209  void reset(void);
210 
211  private:
212  typedef std::multimap<double, std::pair<bool, double> > Map;
213  typedef std::vector<std::pair<bool, Map::mapped_type> > Vector;
214 
215  // struct used in count functions
216  struct Weights
217  {
218  Weights(void);
219  double small_pos;
220  double small_neg;
221  double tied_pos;
222  double tied_neg;
223  };
224 
226  double get_p_approx(double) const;
227 
231  bool is_weighted(void) const;
232 
236  size_t nof_points(const Averager& a) const;
237 
238  /*
239  Calculate probability to get an area equal (smaller) than \a
240  area given the distribution of weights and ties in multimap_
241  */
242  double p_left_weighted(double area) const;
243 
244  /*
245  Calculate probability to get an area equal (greater) than \a
246  area given the distribution of weights and ties in multimap_
247  */
248  double p_right_weighted(double area) const;
249 
250  /*
251  Count number of combinations (of N!) that gives weight sum equal
252  or larger than \a threshold.
253 
254  Range [first, last) is used to check for ties. If, e.g., *first
255  and *(first+1) are equal implies that the two largest values are
256  equal.
257  */
258  template <typename Iterator>
259  double count(Iterator first, Iterator last, double threshold) const;
260 
261  /*
262  Loop over all elements in \a weights and call count(7)
263  */
264  template <typename Iterator>
265  double count(Vector& weights, Iterator iter, Iterator last,
266  double threshold, double sum, const Weights& weight) const;
267 
268  /*
269  Count number of combinations in which sum>=threshold given
270  classes and weights in \a weight. Range [iter, last) is used to
271  handle ties.
272  */
273  template <typename Iterator>
274  double count(Vector& weights, Iterator iter, Iterator last,
275  double threshold, double sum, Weights weight,
276  const std::pair<bool, double>& entry) const;
277 
278  /*
279  Calculates probability to get \a block number of pairs correctly
280  sorted when having \a pos positive samples and \a neg negative
281  samples given the distribution of ties as in [first, last).
282  */
283  template<typename ForwardIterator>
284  double p_exact_with_ties(ForwardIterator first, ForwardIterator last,
285  double block, unsigned int pos,
286  unsigned int neg) const;
287 
291  double p_exact_right(double area) const;
292 
296  double p_exact_left(double area) const;
297 
298  bool use_exact_method(void) const;
299 
300  double area_;
301  bool has_ties_;
302  unsigned int minimum_size_;
303  Averager neg_weights_;
304  Averager pos_weights_;
305  Map multimap_;
306  };
307 
308  template<typename ForwardIterator>
309  double
310  ROC::p_exact_with_ties(ForwardIterator begin, ForwardIterator end,
311  double block, unsigned int pos,unsigned int neg) const
312  {
313  if (block <= 0)
314  return 1.0;
315  if (block > pos*neg)
316  return 0.0;
317 
318  ForwardIterator iter(begin);
319  unsigned int n=0;
320  while (iter!=end && iter->first == begin->first) {
321  ++iter;
322  ++n;
323  }
324  double result = 0;
325  /*
326  pos1 neg1 | n
327  pos2 neg2 |
328  ---- ---- ----
329  pos neg
330  */
331 
332  // ensure pos1 and neg2 are non-negative
333  unsigned int pos1 = n - std::min(n, neg);
334  // ensure pos2 and neg1 are non-negative
335  unsigned int max = std::min(n, pos);
336  YAT_ASSERT(pos1<=max);
337  for ( ; pos1<=max; ++pos1) {
338  unsigned int neg1 = n-pos1;
339  YAT_ASSERT(neg1<=n);
340  unsigned int pos2 = pos-pos1;
341  YAT_ASSERT(pos2<=pos);
342  unsigned int neg2 = neg-neg1;
343  YAT_ASSERT(neg2<=neg);
344  result += gsl_ran_hypergeometric_pdf(pos1, static_cast<unsigned int>(pos),
345  static_cast<unsigned int>(neg), n)
346  * p_exact_with_ties(iter, end,
347  block - pos2*neg1 - 0.5*pos1*neg1,
348  pos2, neg2);
349  }
350  return result;
351  }
352 
353 
354  template <typename Iterator>
355  double ROC::count(Iterator first, Iterator last, double threshold) const
356  {
357  Vector vec;
358  vec.reserve(multimap_.size());
359  // copy values from multimap_ to v
360  for (Map::const_iterator i = multimap_.begin(); i!=multimap_.end(); ++i)
361  vec.push_back(std::make_pair(false, i->second));
362 
363  ROC::Weights w;
364  w.small_pos = pos_weights_.sum_x();
365  w.small_neg = neg_weights_.sum_x();
366  return count(vec, first, last, threshold*w.small_pos*w.small_neg, 0, w);
367  }
368 
369 
370 
371  template <typename Iterator>
372  double ROC::count(ROC::Vector& v, Iterator iter, Iterator last,
373  double threshold, double sum, const Weights& w) const
374  {
375  double result = 0.0;
376  // loop over all elements
377  int nof_elements = 0;
378  for (ROC::Vector::iterator i=v.begin(); i!=v.end(); ++i) {
379  if (i->first)
380  continue;
381  i->first = true;
382  result += count(v, iter, last, threshold, sum, w, i->second);
383  i->first = false;
384  ++nof_elements;
385  }
386  YAT_ASSERT(nof_elements);
387  return result/nof_elements;
388  }
389 
390 
391  template <typename Iterator>
392  double ROC::count(Vector& weights, Iterator iter, Iterator last,
393  double threshold, double sum, Weights w,
394  const std::pair<bool, double>& entry) const
395  {
396  double tiny = 10e-10;
397 
398  Iterator next(iter);
399  YAT_ASSERT(next!=last);
400  ++next;
401 
402  // update weights
403  if (entry.first) {
404  w.tied_pos += entry.second;
405  w.small_pos -= entry.second;
406  }
407  else {
408  w.tied_neg += entry.second;
409  w.small_neg -= entry.second;
410  }
411 
412  // last entry in equal range
413  if (next==last || *next!=*iter) {
414  sum += 0.5*w.tied_pos*w.tied_neg + w.tied_pos * w.small_neg;
415  w.tied_pos=0;
416  w.tied_neg=0;
417  }
418 
419  // max sum happens if all pos values belong to current equal range
420  // and none of the remaining neg values
421  double max_sum = sum + 0.5*(w.tied_pos+w.small_pos)*w.tied_neg +
422  (w.tied_pos+w.small_pos)*w.small_neg;
423 
424  if (max_sum<threshold-tiny)
425  return 0.0;
426  if (sum + 0.5*w.tied_pos*(w.tied_neg+w.small_neg) >= threshold-tiny)
427  return 1.0;
428 
429  if (next!=last)
430  return count(weights, next, last, threshold, sum, w);
431  return 0.0;
432  }
433 
434 }}} // of namespace statistics, yat, and theplu
435 #endif

Generated on Mon Nov 11 2013 09:41:44 for yat by  doxygen 1.8.1