yat
0.8.3pre
|
#ifndef _theplu_yat_normalizer_qquantile_normalizer_
#define _theplu_yat_normalizer_qquantile_normalizer_

/*
  Copyright (C) 2009 Jari Häkkinen, Peter Johansson
  Copyright (C) 2010 Peter Johansson

  This file is part of the yat library, http://dev.thep.lu.se/yat

  The yat library is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License as
  published by the Free Software Foundation; either version 3 of the
  License, or (at your option) any later version.

  The yat library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with yat. If not, see <http://www.gnu.org/licenses/>.
*/

#include "utility.h"

#include "yat/regression/CSplineInterpolation.h"
#include "yat/utility/concept_check.h"
#include "yat/utility/DataIterator.h"
#include "yat/utility/DataWeight.h"
#include "yat/utility/Exception.h"
#include "yat/utility/iterator_traits.h"
#include "yat/utility/sort_index.h"
#include "yat/utility/Vector.h"
#include "yat/utility/WeightIterator.h"
#include "yat/utility/yat_assert.h"

#include <boost/concept_check.hpp>

#include <algorithm>
#include <cmath>
#include <iterator>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <vector>

namespace theplu {
namespace yat {
namespace utility {
  // forward declaration; full definition only needed in the .cc file
  class VectorBase;
}
namespace normalizer {

/**
   \brief Q-quantile normalization against a target distribution.

   The target range given to the constructor and each source range
   given to operator() are split into Q parts of (approximately)
   equal size (see nested class Partitioner). For each part the
   difference between the source part average and the target part
   average is computed, and every source data point is shifted by
   that difference: a cubic spline (regression::CSplineInterpolation)
   interpolates the correction between part mid-points, while points
   below the first and above the last mid-point get the constant
   first/last difference, respectively.

   Both weighted and unweighted iterator ranges are supported; the
   choice of algorithm is made by tag dispatch at compile time.
*/
class qQuantileNormalizer
{
public:
  /**
     \brief Constructor documenting the target distribution.

     \param first beginning of target range
     \param last  end of target range
     \param Q     number of parts the range is divided into; must be
                  greater than 2 (enforced with YAT_ASSERT in the
                  implementation).
  */
  template<typename ForwardIterator>
  qQuantileNormalizer(ForwardIterator first,
                      ForwardIterator last,
                      unsigned int Q);

  /**
     \brief Normalize a source range against the stored target.

     \param first  beginning of source range
     \param last   end of source range
     \param result beginning of output range; receives the corrected
                   values (and, for weighted iterators, a copy of the
                   input weights)

     The source range is partitioned into the same number of parts as
     the target. In the unweighted case the range must contain at
     least as many elements as there are parts (YAT_ASSERT in the
     implementation).
  */
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                  RandomAccessIterator2 result) const;

private:

  /**
     Splits a data range into N parts of (approximately) equal size
     and stores, for each part, its average value and the quantile
     (relative rank) associated with the part. The heavy lifting is
     delegated to the non-template init() overloads, which operate on
     sorted data prepared by build().
  */
  class Partitioner
  {
  public:
    /// Create a partition of [first, last) into N parts. Weighted
    /// and unweighted iterators are dispatched to different build()
    /// overloads.
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last,
                unsigned int N);

    /// \return vector of the per-part averages (N elements)
    const utility::Vector& averages(void) const;

    /// \return vector of the per-part quantiles (relative ranks);
    /// used as the x-coordinates (knots) of the correction spline
    const utility::Vector& quantiles(void) const;

    /// \return number of parts, N
    size_t size(void) const;

  private:
    // unweighted "constructor": copy, sort, then delegate to init()
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::unweighted_iterator_tag);
    // weighted "constructor": copy data/weight pairs, sort, delegate
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::weighted_iterator_tag);
    // non-template workers (defined elsewhere, not in this header)
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;   // per-part averages
    utility::Vector quantiles_; // per-part quantile positions
  };

  // the target distribution every source range is normalized against
  Partitioner target_;

  // unweighted version
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void normalize(const Partitioner& source,RandomAccessIterator1 first,
                 RandomAccessIterator1 last, RandomAccessIterator2 result,
                 utility::unweighted_iterator_tag tag) const;

  // weighted version
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void normalize(const Partitioner& source,RandomAccessIterator1 first,
                 RandomAccessIterator1 last, RandomAccessIterator2 result,
                 utility::weighted_iterator_tag tag) const;
};


// template implementations

template<typename ForwardIterator>
qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
                                         ForwardIterator last,
                                         unsigned int Q)
  : target_(Partitioner(first, last, Q))
{
  // the correction spline needs more than two knots to be meaningful
  // NOTE(review): asserted after target_ has already been built
  YAT_ASSERT(Q>2);
}


template<typename RandomAccessIterator1, typename RandomAccessIterator2>
void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
                                     RandomAccessIterator1 last,
                                     RandomAccessIterator2 result) const
{
  // partition the source range into as many parts as the target has
  Partitioner source(first, last, target_.size());
  // tag dispatch: weighted algorithm if either iterator is weighted
  typename utility::weighted_if_any2<RandomAccessIterator1, RandomAccessIterator2>::type tag;
  normalize(source, first, last, result, tag);
}


// Unweighted normalization: correct each element by the difference
// between source and target part averages, spline-interpolated over
// rank position (i+0.5)/N and linearly (constant) extrapolated in
// the first and last parts. Output is written in the original
// (unsorted) element order via sorted_index.
template<typename RandomAccessIterator1, typename RandomAccessIterator2>
void
qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                               RandomAccessIterator1 first,
                               RandomAccessIterator1 last,
                               RandomAccessIterator2 result,
                               utility::unweighted_iterator_tag tag) const
{
  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
  BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
  BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator1>));
  BOOST_CONCEPT_ASSERT((utility::DataIteratorConcept<RandomAccessIterator2>));
  utility::check_iterator_is_unweighted(first);
  utility::check_iterator_is_unweighted(result);
  size_t N = last-first;
  // need at least one element per target part
  YAT_ASSERT(N >= target_.size());

  // rank -> original position mapping of the source data
  std::vector<size_t> sorted_index(last-first);
  utility::sort_index(first, last, sorted_index);

  // per-part correction: source part average minus target part average
  utility::Vector diff(source.averages());
  diff-=target_.averages();
  // spline knots: the target's per-part quantile positions
  const utility::Vector& idx=target_.quantiles();
  regression::CSplineInterpolation cspline(idx,diff);

  // linear extrapolation for first part, i.e., use first diff for
  // all points in the first part.
  size_t start=0;
  // first rank whose position (i+0.5)/N lies beyond the first knot;
  // inverse of the rank->quantile mapping used in the spline loop
  size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));
  // take care of limiting case number of parts approximately equal
  // to the number of elements in range.
  if (end==0)
    ++end;
  for (size_t i=start; i<end; ++i) {
    size_t si = sorted_index[i];
    result[si] = first[si] - diff(0);
  }

  // NOTE(review): this using-declaration looks unused -- assertions
  // below go through the YAT_ASSERT macro
  using utility::yat_assert;

  // cspline interpolation for all data between the mid points of
  // the first and last part
  start=end;
  end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
  if (end>=N)
    end = N-1;
  for ( size_t i=start; i<end; ++i) {
    size_t si = sorted_index[i];

    // evaluation point must lie inside the spline's knot range
    YAT_ASSERT((i+0.5)/N>idx(0));
    result[si] = first[si] - cspline.evaluate((i+0.5)/N);
  }

  // linear extrapolation for last part, i.e., use last diff for
  // all points in the last part.
  for (size_t i=end ; i<N; ++i) {
    size_t si = sorted_index[i];
    result[si] = first[si] - diff(diff.size()-1);
  }
}


// Weighted normalization: same correction scheme as the unweighted
// overload, but an element's position in the distribution is its
// cumulative-weight mid-point (sum of preceding weights plus half
// its own weight, normalized by the total weight) instead of its
// rank.
template<typename RandomAccessIterator1, typename RandomAccessIterator2>
void qQuantileNormalizer::normalize(const Partitioner& source,
                                    RandomAccessIterator1 first,
                                    RandomAccessIterator1 last,
                                    RandomAccessIterator2 result,
                                    utility::weighted_iterator_tag tag) const
{
  BOOST_CONCEPT_ASSERT((boost::RandomAccessIterator<RandomAccessIterator1>));
  BOOST_CONCEPT_ASSERT((boost::Mutable_RandomAccessIterator<RandomAccessIterator2>));
  // NOTE(review): the unweighted overload asserts DataIteratorConcept
  // here, not DataIterator -- confirm which is intended
  BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator1>));
  BOOST_CONCEPT_ASSERT((utility::DataIterator<RandomAccessIterator2>));
  // copy the weights
  detail::copy_weight_if_weighted(first, last, result);

  // total weight, used to turn cumulative weights into quantiles
  double total_w = std::accumulate(utility::weight_iterator(first),
                                   utility::weight_iterator(last), 0.0);

  std::vector<size_t> sorted_index(last-first);
  // Code to avoid problems with NaN (ticket:535)
  // utility::sort_index(utility::data_iterator(first),
  //                     utility::data_iterator(last), sorted_index);
  // ... above statement replaced below code block
  {
    using namespace utility;
    // NaN breaks strict weak ordering of operator<, so sort a copy
    // in which NaN is replaced by +inf (sorts last, deterministic)
    std::vector<double> vec;
    vec.reserve(std::distance(first, last));
    std::copy(utility::data_iterator(first), utility::data_iterator(last),
              std::back_inserter(vec));
    for (std::vector<double>::iterator i(vec.begin()); i!=vec.end(); ++i)
      if (std::isnan(*i))
        *i = std::numeric_limits<double>::infinity();
    utility::sort_index(vec.begin(), vec.end(), sorted_index);
  }
  // end Code to avoid problems with NaN (ticket:535)

  // per-part correction: source part average minus target part average
  utility::Vector diff(source.averages());
  diff-=target_.averages();
  // spline knots: the target's per-part quantile positions
  const utility::Vector& idx=target_.quantiles();
  regression::CSplineInterpolation cspline(idx,diff);

  double sum_w=0;
  utility::iterator_traits<RandomAccessIterator1> traits1;
  utility::iterator_traits<RandomAccessIterator2> traits2;
  for (size_t i=0; i<sorted_index.size(); ++i) {
    size_t si = sorted_index[i];
    // weighted quantile of element si: cumulative weight up to its
    // mid-point, normalized by the total weight
    double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
    double correction = 0;
    if (w <= idx(0)) {
      // linear extrapolation for first part, i.e., use first diff for
      // all points in the first part.
      correction = diff(0);
    }
    else if (w < idx(idx.size()-1) ) {
      // cspline interpolation for all data between the mid points of
      // the first and last part
      correction = cspline.evaluate(w);
    }
    else {
      // linear extrapolation for last part, i.e., use last diff for
      // all points in the last part.
      correction = diff(diff.size()-1);
    }
    traits2.data(result+si) = traits1.data(first+si) - correction;
    sum_w += traits1.weight(first+si);
  }
}


// Partition a range into N parts: dispatch to the weighted or
// unweighted build() depending on the iterator's weight traits.
template<typename ForwardIterator>
qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
                                              ForwardIterator last,
                                              unsigned int N)
  : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
{
  BOOST_CONCEPT_ASSERT((boost::ForwardIterator<ForwardIterator>));
  BOOST_CONCEPT_ASSERT((utility::DataIterator<ForwardIterator>));
  build(first, last, N,
        typename utility::weighted_iterator_traits<ForwardIterator>::type());
}


// Unweighted build: copy the data, sort it, and let init() compute
// the per-part averages and quantiles.
template<typename ForwardIterator>
void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                             ForwardIterator last,
                                             unsigned int N,
                                             utility::unweighted_iterator_tag)
{
  utility::Vector vec(std::distance(first, last));
  std::copy(first, last, vec.begin());
  std::sort(vec.begin(), vec.end());
  init(vec, N);
}


// Weighted build: copy (data, weight) pairs, sort them (DataWeight
// ordering), and let init() compute per-part averages and quantiles.
template<typename ForwardIterator>
void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
                                             ForwardIterator last,
                                             unsigned int N,
                                             utility::weighted_iterator_tag)
{
  using namespace utility;
  std::vector<DataWeight> vec;
  vec.reserve(std::distance(first, last));
  std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
  std::copy(first, last, inserter);
  std::sort(vec.begin(), vec.end());
  init(vec, N);
}


}}} // end of namespace normalizer, yat and thep

#endif