// yat: /scratch/bob/jari/tmp/pristine/yat-0.10.x/yat/statistics/ROC.h Source File

 #ifndef _theplu_yat_statistics_roc_
 #define _theplu_yat_statistics_roc_
 
 // $Id: ROC.h 2736 2012-05-29 10:06:01Z peter $
 
 /*
   Copyright (C) 2004 Peter Johansson
   Copyright (C) 2005, 2006, 2007, 2008 Jari Häkkinen, Peter Johansson
   Copyright (C) 2011, 2012 Peter Johansson
 
   This file is part of the yat library, http://dev.thep.lu.se/yat
 
   The yat library is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 3 of the
   License, or (at your option) any later version.
 
   The yat library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   General Public License for more details.
 
   You should have received a copy of the GNU General Public License
   along with yat. If not, see <http://www.gnu.org/licenses/>.
 */
 
 #include "Averager.h"
 #include "yat/utility/stl_utility.h"
 #include "yat/utility/yat_assert.h"
 
 #include <gsl/gsl_randist.h>
 
 #include <algorithm>
 #include <map>
 #include <utility>
 #include <vector>
 
 namespace theplu {
 namespace yat {
 namespace statistics {
 
   /*
     Declares the ROC class: a container of weighted, two-class
     observations together with accessors for the ROC area and its
     p-values.

     Only the template members further down in this header are defined
     here; all other members are implemented elsewhere, so notes marked
     NOTE(review) below are assumptions to be confirmed against that
     implementation.
    */
   class ROC
   {
 
   public:
     // Constructor; not defined in this header.
     ROC(void);
 
     // Register one observation.
     // NOTE(review): presumably \a target == true marks the positive
     // class and the entry is stored in multimap_ keyed on \a value --
     // confirm against the implementation.
     void add(double value, bool target, double weight=1.0);
 
     // Returns the ROC area. Note that this member is not const.
     // NOTE(review): presumably it (re)calculates and caches the result
     // in area_ -- confirm.
     double area(void);
 
     // Read/write access to an unsigned int setting.
     // NOTE(review): presumably a reference to minimum_size_, the sample
     // size limit consulted by use_exact_method() -- confirm.
     unsigned int& minimum_size(void);
 
     // Read-only counterpart of the accessor above.
     const unsigned int& minimum_size(void) const;
 
     // NOTE(review): presumably the total weight of all observations --
     // confirm.
     double n(void) const;
 
     // NOTE(review): presumably the total weight of the negative
     // observations -- confirm.
     double n_neg(void) const;
 
     // NOTE(review): presumably the total weight of the positive
     // observations -- confirm.
     double n_pos(void) const;
 
     // NOTE(review): presumably the one-sided p-value of the area --
     // confirm.
     double p_value_one_sided(void) const;
 
     // NOTE(review): presumably the two-sided p-value of the area --
     // confirm.
     double p_value(void) const;
 
     // Same signature as add().
     // NOTE(review): presumably withdraws a previously added
     // observation -- confirm.
     void remove(double value, bool target, double weight=1.0);
 
     // NOTE(review): presumably returns the object to its freshly
     // constructed state -- confirm.
     void reset(void);
 
   private:
     // Storage of the observations. In the count functions the mapped
     // pair is read as (class flag, weight): first==true is treated as a
     // positive sample and second is its weight.
     // NOTE(review): the key is presumably the observed value -- confirm.
     typedef std::multimap<double, std::pair<bool, double> > Map;
     // Working copy used by the count functions: the outer bool flags an
     // entry as already placed in the current permutation, the inner
     // pair is the Map::mapped_type described above.
     typedef std::vector<std::pair<bool, Map::mapped_type> > Vector;
 
     // struct used in count functions
     struct Weights
     {
       // NOTE(review): presumably sets all four members to zero; the
       // count functions rely on tied_pos/tied_neg starting at zero --
       // confirm.
       Weights(void);
       // weight of the positive entries not yet placed
       double small_pos;
       // weight of the negative entries not yet placed
       double small_neg;
       // weight of the positive entries in the current range of ties
       double tied_pos;
       // weight of the negative entries in the current range of ties
       double tied_neg;
     };
 
     // NOTE(review): presumably an approximate (large sample) p-value
     // for the given area -- confirm.
     double get_p_approx(double) const;
 
     // NOTE(review): presumably true if any observation has a weight
     // different from 1.0 -- confirm.
     bool is_weighted(void) const;
 
     // NOTE(review): presumably the number of data points collected in
     // \a a -- confirm.
     size_t nof_points(const Averager& a) const;
 
     /*
       Calculate the probability to get an area equal to or smaller than
       \a area given the distribution of weights and ties in multimap_
      */
     double p_left_weighted(double area) const;
 
     /*
       Calculate the probability to get an area equal to or greater than
       \a area given the distribution of weights and ties in multimap_
      */
     double p_right_weighted(double area) const;
 
     /*
       Entry point of the permutation counting. Copies the (class,
       weight) entries of multimap_ into a working vector, scales
       \a threshold with the product of the total positive and total
       negative weight, and returns the fraction of the N! orderings
       that give a weight sum equal to or larger than that scaled
       threshold.
 
       Range [first, last) is used to check for ties. If, e.g., *first
       and *(first+1) are equal, the two largest values are tied.
      */
     template <typename Iterator>
     double count(Iterator first, Iterator last, double threshold) const;
 
     /*
       Loop over all elements in \a weights that are not yet flagged as
       used, call the seven-argument count overload with each of them
       placed at position \a iter, and return the average of the results.
      */
     template <typename Iterator>
     double count(Vector& weights, Iterator iter, Iterator last,
                  double threshold, double sum, const Weights& weight) const;
 
     /*
       Calculate the fraction of combinations in which sum>=threshold
       given that \a entry is placed at position \a iter, and given the
       classes and weights in \a weight. Range [iter, last) is used to
       handle ties.
      */
     template <typename Iterator>
     double count(Vector& weights, Iterator iter, Iterator last,
                  double threshold, double sum, Weights weight,
                  const std::pair<bool, double>& entry) const;
 
     /*
       Calculates the probability to get at least \a block pairs
       correctly sorted when having \a pos positive samples and \a neg
       negative samples given the distribution of ties as in
       [first, last).
      */
     template<typename ForwardIterator>
     double p_exact_with_ties(ForwardIterator first, ForwardIterator last,
                              double block, unsigned int pos,
                              unsigned int neg) const;
 
     // NOTE(review): presumably the exact probability to get an area
     // equal to or greater than \a area -- confirm.
     double p_exact_right(double area) const;
 
     // NOTE(review): presumably the exact probability to get an area
     // equal to or smaller than \a area -- confirm.
     double p_exact_left(double area) const;
 
     // NOTE(review): presumably decides between the exact and the
     // approximate p-value calculation -- confirm.
     bool use_exact_method(void) const;
 
     // NOTE(review): presumably the most recently calculated area --
     // confirm.
     double area_;
     // NOTE(review): presumably set when multimap_ holds equal values --
     // confirm.
     bool has_ties_;
     unsigned int minimum_size_;
     // Averager of the negative samples; its sum_x() is used as the
     // total negative weight in count.
     Averager neg_weights_;
     // Averager of the positive samples; its sum_x() is used as the
     // total positive weight in count.
     Averager pos_weights_;
     // all observations, see typedef Map
     Map multimap_;
   };
 
   template<typename ForwardIterator>
   double
   ROC::p_exact_with_ties(ForwardIterator begin, ForwardIterator end,
                          double block, unsigned int pos,unsigned int neg) const
   {
     if (block <= 0)
       return 1.0;
     if (block > pos*neg)
       return 0.0;
 
     ForwardIterator iter(begin);
     unsigned int n=0;
     while (iter!=end && iter->first == begin->first) {
       ++iter;
       ++n;
     }
     double result = 0;
     /*
       pos1  neg1  |  n
       pos2  neg2  |
       ----  ----   ----
       pos   neg
      */
 
     // ensure pos1 and neg2 are non-negative
     unsigned int pos1 = n - std::min(n, neg);
     // ensure pos2 and neg1 are non-negative
     unsigned int max = std::min(n, pos);
     YAT_ASSERT(pos1<=max);
     for ( ; pos1<=max; ++pos1) {
       unsigned int neg1 = n-pos1;
       YAT_ASSERT(neg1<=n);
       unsigned int pos2 = pos-pos1;
       YAT_ASSERT(pos2<=pos);
       unsigned int neg2 = neg-neg1;
       YAT_ASSERT(neg2<=neg);
       result += gsl_ran_hypergeometric_pdf(pos1, static_cast<unsigned int>(pos),
                                            static_cast<unsigned int>(neg), n)
         * p_exact_with_ties(iter, end,
                             block - pos2*neg1 - 0.5*pos1*neg1,
                             pos2, neg2);
     }
     return result;
   }
 
 
   template <typename Iterator>
   double ROC::count(Iterator first, Iterator last, double threshold) const
   {
     Vector vec;
     vec.reserve(multimap_.size());
     // copy values from multimap_ to v
     for (Map::const_iterator i = multimap_.begin(); i!=multimap_.end(); ++i)
       vec.push_back(std::make_pair(false, i->second));
 
     ROC::Weights w;
     w.small_pos = pos_weights_.sum_x();
     w.small_neg = neg_weights_.sum_x();
     return count(vec, first, last, threshold*w.small_pos*w.small_neg, 0, w);
   }
 
 
 
   template <typename Iterator>
   double ROC::count(ROC::Vector& v, Iterator iter, Iterator last,
                     double threshold, double sum, const Weights& w) const
   {
     double result = 0.0;
     // loop over all elements
     int nof_elements = 0;
     for (ROC::Vector::iterator i=v.begin(); i!=v.end(); ++i) {
       if (i->first)
         continue;
       i->first = true;
       result += count(v, iter, last, threshold, sum, w, i->second);
       i->first = false;
       ++nof_elements;
     }
     YAT_ASSERT(nof_elements);
     return result/nof_elements;
   }
 
 
   template <typename Iterator>
   double ROC::count(Vector& weights, Iterator iter, Iterator last,
                     double threshold, double sum, Weights w,
                     const std::pair<bool, double>& entry) const
   {
     double tiny = 10e-10;
 
     Iterator next(iter);
     YAT_ASSERT(next!=last);
     ++next;
 
     // update weights
     if (entry.first) {
       w.tied_pos += entry.second;
       w.small_pos -= entry.second;
     }
     else {
       w.tied_neg += entry.second;
       w.small_neg -= entry.second;
     }
 
     // last entry in equal range
     if (next==last || *next!=*iter) {
       sum += 0.5*w.tied_pos*w.tied_neg + w.tied_pos * w.small_neg;
       w.tied_pos=0;
       w.tied_neg=0;
     }
 
     // max sum happens if all pos values belong to current equal range
     // and none of the remaining neg values
     double max_sum = sum + 0.5*(w.tied_pos+w.small_pos)*w.tied_neg +
       (w.tied_pos+w.small_pos)*w.small_neg;
 
     if (max_sum<threshold-tiny)
       return 0.0;
     if (sum + 0.5*w.tied_pos*(w.tied_neg+w.small_neg) >= threshold-tiny)
       return 1.0;
 
     if (next!=last)
       return count(weights, next, last, threshold, sum, w);
     return 0.0;
   }
 
 }}} // of namespace statistics, yat, and theplu
 #endif