yat  0.8.3pre
utility.h
00001 #ifndef _theplu_yat_statistics_utility_ 
00002 #define _theplu_yat_statistics_utility_ 
00003 
00004 // $Id: utility.h 2625 2011-11-08 00:07:24Z peter $
00005 
00006 /*
00007   Copyright (C) 2004 Jari Häkkinen, Peter Johansson
00008   Copyright (C) 2005 Peter Johansson
00009   Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér
00010   Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson
00011   Copyright (C) 2009, 2010, 2011 Peter Johansson
00012 
00013   This file is part of the yat library, http://dev.thep.lu.se/yat
00014 
00015   The yat library is free software; you can redistribute it and/or
00016   modify it under the terms of the GNU General Public License as
00017   published by the Free Software Foundation; either version 3 of the
00018   License, or (at your option) any later version.
00019 
00020   The yat library is distributed in the hope that it will be useful,
00021   but WITHOUT ANY WARRANTY; without even the implied warranty of
00022   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
00023   General Public License for more details.
00024 
00025   You should have received a copy of the GNU General Public License
00026   along with yat. If not, see <http://www.gnu.org/licenses/>.
00027 */
00028 
00029 #include "Percentiler.h"
00030 
00031 #include "yat/classifier/DataLookupWeighted1D.h"
00032 #include "yat/classifier/Target.h"
00033 #include "yat/normalizer/utility.h"
00034 #include "yat/utility/concept_check.h"
00035 #include "yat/utility/DataIterator.h"
00036 #include "yat/utility/deprecate.h"
00037 #include "yat/utility/iterator_traits.h"
00038 #include "yat/utility/VectorBase.h"
00039 #include "yat/utility/yat_assert.h"
00040 
00041 #include <boost/concept_check.hpp>
00042 
00043 #include <algorithm>
00044 #include <cmath>
00045 #include <iterator>
00046 #include <stdexcept>
00047 #include <vector>
00048 
00049 #include <gsl/gsl_statistics_double.h>
00050 
00051 namespace theplu {
00052 namespace yat {
00053 namespace statistics {  
00054 
00060   template <typename T, typename ForwardIterator>
00061   void add(T& o, ForwardIterator first, ForwardIterator last,
00062            const classifier::Target& target);
00063 
00079   template<typename BidirectionalIterator1, typename BidirectionalIterator2>
00080   void benjamini_hochberg(BidirectionalIterator1 first,
00081                           BidirectionalIterator1 last,
00082                           BidirectionalIterator2 result);
00083                           
00084 
00096   double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 
00097                               unsigned int n2, unsigned int t);
00098 
00099 
00115   double pearson_p_value(double r, unsigned int n);
00116 
00124   double kurtosis(const utility::VectorBase&);
00125 
00126 
00138   template <class RandomAccessIterator>
00139   double mad(RandomAccessIterator first, RandomAccessIterator last, 
00140              bool sorted=false);
00141 
00142 
00158   template <class RandomAccessIterator> 
00159   double median(RandomAccessIterator first, RandomAccessIterator last, 
00160                 bool sorted=false); 
00161 
00162 
00188   template <class RandomAccessIterator>
00189   double percentile(RandomAccessIterator first, RandomAccessIterator last, 
00190                     double p, bool sorted=false) YAT_DEPRECATE;
00191 
00197   template <class RandomAccessIterator>
00198   double percentile2(RandomAccessIterator first, RandomAccessIterator last, 
00199                      double p, bool sorted=false)
00200   {
00201     Percentiler percentiler(p, sorted);
00202     return percentiler(first, last);
00203   }
00204 
00211   double skewness(const utility::VectorBase&);
00212   
00213 
00215 
00216   template <typename T, typename ForwardIterator>
00217   void add(T& o, ForwardIterator first, ForwardIterator last,
00218            const classifier::Target& target)
00219   {
00220     utility::iterator_traits<ForwardIterator> traits;
00221     for (size_t i=0; first!=last; ++i, ++first)
00222       o.add(traits.data(first), target.binary(i), traits.weight(first));
00223   } 
00224 
00225 
00226   template<typename BidirectionalIterator1, typename BidirectionalIterator2>
00227   void benjamini_hochberg(BidirectionalIterator1 first,
00228                           BidirectionalIterator1 last,
00229                           BidirectionalIterator2 result)
00230   {
00231     using boost::Mutable_BidirectionalIterator;
00232     BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>));
00233     BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>));
00234     size_t n = std::distance(first, last);
00235     if (!n)
00236       return;
00237     std::advance(result, n-1);
00238     first = last;
00239     --first;
00240     size_t rank = n;
00241     
00242     double prev = 1.0;
00243     while (rank) {
00244       *result = std::min(*first * n/static_cast<double>(rank), prev);
00245       prev = *result;
00246       --rank;
00247       --first;
00248       --result;
00249     }
00250   }
00251 
00252 
00253   template <class RandomAccessIterator>
00254   double mad(RandomAccessIterator first, RandomAccessIterator last, 
00255              bool sorted)
00256   {
00257     BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
00258     BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
00259     double m = median(first, last, sorted);
00260     typedef typename std::iterator_traits<RandomAccessIterator>::value_type T;
00261     std::vector<T> ad(std::distance(first, last));
00262     // copy weights if needed
00263     normalizer::detail::copy_weight_if_weighted(first, last, ad.begin());
00264     // assign data (absolute deviation from median)
00265     utility::iterator_traits<RandomAccessIterator> traits;
00266     utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin());
00267     while (first!=last) {
00268       *first2 = std::abs(traits.data(first)-m);
00269       ++first;
00270       ++first2;
00271     }
00272     std::sort(ad.begin(), ad.end());
00273     return median(ad.begin(), ad.end(), true);
00274   }
00275   
00276 
00277   template <class RandomAccessIterator> 
00278   double median(RandomAccessIterator first, RandomAccessIterator last, 
00279                 bool sorted) 
00280   { 
00281     return percentile2(first, last, 50.0, sorted); 
00282   }
00283 
00284 
00285   template <class RandomAccessIterator>
00286   double percentile(RandomAccessIterator first, RandomAccessIterator last, 
00287                     double p, bool sorted)
00288   {
00289     BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>));
00290     BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>));
00291     // range is one value only is a special case
00292     if (first+1 == last)
00293       return utility::iterator_traits<RandomAccessIterator>().data(first);
00294     if (sorted) {
00295       // have to take care of this special case
00296       if (p>=100)
00297         return utility::iterator_traits<RandomAccessIterator>().data(--last);
00298       double j = p/100 * (std::distance(first,last)-1);
00299       int i = static_cast<int>(j);
00300       return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1];
00301     }
00302     using std::iterator_traits;
00303     typedef typename iterator_traits<RandomAccessIterator>::value_type value_t;
00304     std::vector<value_t> v_copy;
00305     v_copy.reserve(std::distance(first,last));
00306     std::copy(first, last, std::back_inserter(v_copy));
00307     size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1));
00308     if (i+2 < v_copy.size()) {
00309       std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end());
00310     }
00311     else
00312       std::sort(v_copy.begin(), v_copy.end());
00313     return percentile(v_copy.begin(), v_copy.end(), p, true);
00314   }
00315 
00316 
00317 }}} // of namespace statistics, yat, and theplu
00318 
00319 #endif

Generated on Thu Dec 20 2012 03:12:57 for yat by  doxygen 1.8.0-20120409