yat
0.8.3pre
|
00001 #ifndef _theplu_yat_statistics_utility_ 00002 #define _theplu_yat_statistics_utility_ 00003 00004 // $Id: utility.h 2625 2011-11-08 00:07:24Z peter $ 00005 00006 /* 00007 Copyright (C) 2004 Jari Häkkinen, Peter Johansson 00008 Copyright (C) 2005 Peter Johansson 00009 Copyright (C) 2006 Jari Häkkinen, Peter Johansson, Markus Ringnér 00010 Copyright (C) 2007, 2008 Jari Häkkinen, Peter Johansson 00011 Copyright (C) 2009, 2010, 2011 Peter Johansson 00012 00013 This file is part of the yat library, http://dev.thep.lu.se/yat 00014 00015 The yat library is free software; you can redistribute it and/or 00016 modify it under the terms of the GNU General Public License as 00017 published by the Free Software Foundation; either version 3 of the 00018 License, or (at your option) any later version. 00019 00020 The yat library is distributed in the hope that it will be useful, 00021 but WITHOUT ANY WARRANTY; without even the implied warranty of 00022 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00023 General Public License for more details. 00024 00025 You should have received a copy of the GNU General Public License 00026 along with yat. If not, see <http://www.gnu.org/licenses/>. 00027 */ 00028 00029 #include "Percentiler.h" 00030 00031 #include "yat/classifier/DataLookupWeighted1D.h" 00032 #include "yat/classifier/Target.h" 00033 #include "yat/normalizer/utility.h" 00034 #include "yat/utility/concept_check.h" 00035 #include "yat/utility/DataIterator.h" 00036 #include "yat/utility/deprecate.h" 00037 #include "yat/utility/iterator_traits.h" 00038 #include "yat/utility/VectorBase.h" 00039 #include "yat/utility/yat_assert.h" 00040 00041 #include <boost/concept_check.hpp> 00042 00043 #include <algorithm> 00044 #include <cmath> 00045 #include <iterator> 00046 #include <stdexcept> 00047 #include <vector> 00048 00049 #include <gsl/gsl_statistics_double.h> 00050 00051 namespace theplu { 00052 namespace yat { 00053 namespace statistics { 00054 00060 template <typename T, typename ForwardIterator> 00061 void add(T& o, ForwardIterator first, ForwardIterator last, 00062 const classifier::Target& target); 00063 00079 template<typename BidirectionalIterator1, typename BidirectionalIterator2> 00080 void benjamini_hochberg(BidirectionalIterator1 first, 00081 BidirectionalIterator1 last, 00082 BidirectionalIterator2 result); 00083 00084 00096 double cdf_hypergeometric_P(unsigned int k, unsigned int n1, 00097 unsigned int n2, unsigned int t); 00098 00099 00115 double pearson_p_value(double r, unsigned int n); 00116 00124 double kurtosis(const utility::VectorBase&); 00125 00126 00138 template <class RandomAccessIterator> 00139 double mad(RandomAccessIterator first, RandomAccessIterator last, 00140 bool sorted=false); 00141 00142 00158 template <class RandomAccessIterator> 00159 double median(RandomAccessIterator first, RandomAccessIterator last, 00160 bool sorted=false); 00161 00162 00188 template <class RandomAccessIterator> 00189 double percentile(RandomAccessIterator first, RandomAccessIterator last, 00190 double p, bool sorted=false) YAT_DEPRECATE; 00191 00197 template <class RandomAccessIterator> 00198 double percentile2(RandomAccessIterator first, RandomAccessIterator last, 00199 double p, bool sorted=false) 00200 { 00201 Percentiler percentiler(p, sorted); 00202 return percentiler(first, last); 00203 } 00204 00211 double skewness(const utility::VectorBase&); 00212 00213 00215 00216 template <typename T, typename ForwardIterator> 00217 void add(T& o, ForwardIterator first, ForwardIterator last, 00218 const classifier::Target& target) 00219 { 00220 utility::iterator_traits<ForwardIterator> traits; 00221 for (size_t i=0; first!=last; ++i, ++first) 00222 o.add(traits.data(first), target.binary(i), traits.weight(first)); 00223 } 00224 00225 00226 template<typename BidirectionalIterator1, typename BidirectionalIterator2> 00227 void benjamini_hochberg(BidirectionalIterator1 first, 00228 BidirectionalIterator1 last, 00229 BidirectionalIterator2 result) 00230 { 00231 using boost::Mutable_BidirectionalIterator; 00232 BOOST_CONCEPT_ASSERT((boost::BidirectionalIterator<BidirectionalIterator1>)); 00233 BOOST_CONCEPT_ASSERT((Mutable_BidirectionalIterator<BidirectionalIterator2>)); 00234 size_t n = std::distance(first, last); 00235 if (!n) 00236 return; 00237 std::advance(result, n-1); 00238 first = last; 00239 --first; 00240 size_t rank = n; 00241 00242 double prev = 1.0; 00243 while (rank) { 00244 *result = std::min(*first * n/static_cast<double>(rank), prev); 00245 prev = *result; 00246 --rank; 00247 --first; 00248 --result; 00249 } 00250 } 00251 00252 00253 template <class RandomAccessIterator> 00254 double mad(RandomAccessIterator first, RandomAccessIterator last, 00255 bool sorted) 00256 { 00257 BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>)); 00258 BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>)); 00259 double m = median(first, last, sorted); 00260 typedef typename std::iterator_traits<RandomAccessIterator>::value_type T; 00261 std::vector<T> ad(std::distance(first, last)); 00262 // copy weights if needed 00263 normalizer::detail::copy_weight_if_weighted(first, last, ad.begin()); 00264 // assign data (absolute deviation from median) 00265 utility::iterator_traits<RandomAccessIterator> traits; 00266 utility::DataIterator<typename std::vector<T>::iterator> first2(ad.begin()); 00267 while (first!=last) { 00268 *first2 = std::abs(traits.data(first)-m); 00269 ++first; 00270 ++first2; 00271 } 00272 std::sort(ad.begin(), ad.end()); 00273 return median(ad.begin(), ad.end(), true); 00274 } 00275 00276 00277 template <class RandomAccessIterator> 00278 double median(RandomAccessIterator first, RandomAccessIterator last, 00279 bool sorted) 00280 { 00281 return percentile2(first, last, 50.0, sorted); 00282 } 00283 00284 00285 template <class RandomAccessIterator> 00286 double percentile(RandomAccessIterator first, RandomAccessIterator last, 00287 double p, bool sorted) 00288 { 00289 BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator>)); 00290 BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator>)); 00291 // range is one value only is a special case 00292 if (first+1 == last) 00293 return utility::iterator_traits<RandomAccessIterator>().data(first); 00294 if (sorted) { 00295 // have to take care of this special case 00296 if (p>=100) 00297 return utility::iterator_traits<RandomAccessIterator>().data(--last); 00298 double j = p/100 * (std::distance(first,last)-1); 00299 int i = static_cast<int>(j); 00300 return (1-j+floor(j))*first[i] + (j-floor(j))*first[i+1]; 00301 } 00302 using std::iterator_traits; 00303 typedef typename iterator_traits<RandomAccessIterator>::value_type value_t; 00304 std::vector<value_t> v_copy; 00305 v_copy.reserve(std::distance(first,last)); 00306 std::copy(first, last, std::back_inserter(v_copy)); 00307 size_t i = static_cast<size_t>(p/100 * (v_copy.size()-1)); 00308 if (i+2 < v_copy.size()) { 00309 std::partial_sort(v_copy.begin(), v_copy.begin()+i+2, v_copy.end()); 00310 } 00311 else 00312 std::sort(v_copy.begin(), v_copy.end()); 00313 return percentile(v_copy.begin(), v_copy.end(), p, true); 00314 } 00315 00316 00317 }}} // of namespace statistics, yat, and theplu 00318 00319 #endif