00001 #ifndef _theplu_yat_normalizer_qquantile_normalizer_
00002 #define _theplu_yat_normalizer_qquantile_normalizer_
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "yat/regression/CSplineInterpolation.h"
00024 #include "yat/utility/DataIterator.h"
00025 #include "yat/utility/DataWeight.h"
00026 #include "yat/utility/iterator_traits.h"
00027 #include "yat/utility/sort_index.h"
00028 #include "yat/utility/Vector.h"
00029 #include "yat/utility/WeightIterator.h"
00030 #include "yat/utility/yat_assert.h"
00031
#include <algorithm>
#include <cmath>
#include <iterator>
#include <limits>
#include <numeric>
#include <stdexcept>
#include <vector>
00038
00039 namespace theplu {
00040 namespace yat {
00041 namespace utility {
00042 class VectorBase;
00043 }
00044 namespace normalizer {
00045
/**
   \brief Normalize data against a target distribution.

   A target distribution is given at construction; operator() then
   shifts each data point of an input range by the difference between
   the part averages of the input and of the target (interpolated with
   a cubic spline in the interior of the distribution), moving the
   input towards the target distribution.
*/
class qQuantileNormalizer
{
public:
  /**
     \brief Constructor

     Builds the target distribution from the range [\a first, \a
     last). \a Q is the number of parts the sorted target data are
     divided into and must be larger than 2 (checked with yat_assert,
     which throws std::runtime_error when assertions are enabled).
  */
  template<typename ForwardIterator>
  qQuantileNormalizer(ForwardIterator first, ForwardIterator last,
                      unsigned int Q);

  /**
     \brief Normalize the range [\a first, \a last) into \a result.

     The input is partitioned into the same number of parts as the
     target; the implementation dispatches to a weighted or unweighted
     version depending on whether \a result is a weighted iterator.
  */
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void operator()(RandomAccessIterator1 first, RandomAccessIterator1 last,
                  RandomAccessIterator2 result) const;

private:

  /**
     Helper that divides a sorted data range into N parts and stores,
     for each part, the average of its data (averages()) and its
     quantile position (quantiles()); the positions are used as the
     x-axis of the correction spline.
  */
  class Partitioner
  {
  public:
    /// Create a partitioning of [\a first, \a last) into \a N parts.
    template<typename ForwardIterator>
    Partitioner(ForwardIterator first, ForwardIterator last,
                unsigned int N);

    /// \return vector with the average of each part
    const utility::Vector& averages(void) const;

    /// \return vector with the quantile position of each part
    /// (values compared against ranks (i+0.5)/N, so presumably in (0,1))
    const utility::Vector& quantiles(void) const;

    /// \return number of parts, i.e., the N given at construction
    size_t size(void) const;

  private:

    /// Unweighted build: copy the data, sort, and init.
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::unweighted_iterator_tag);

    /// Weighted build: NaN data are mapped to +infinity before
    /// sorting so they end up last in the sorted range.
    template<typename ForwardIterator>
    void build(ForwardIterator first, ForwardIterator last, unsigned int N,
               utility::weighted_iterator_tag);
    void init(const utility::VectorBase&, unsigned int N);
    void init(const std::vector<utility::DataWeight>&, unsigned int N);

    utility::Vector average_;   // per-part averages
    utility::Vector quantiles_; // per-part quantile positions
  };

  // the target distribution that input data are normalized against
  Partitioner target_;


  /// Unweighted implementation of operator().
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void normalize(const Partitioner& source,RandomAccessIterator1 first,
                 RandomAccessIterator1 last, RandomAccessIterator2 result,
                 utility::unweighted_iterator_tag tag) const;


  /// Weighted implementation of operator().
  template<typename RandomAccessIterator1, typename RandomAccessIterator2>
  void normalize(const Partitioner& source,RandomAccessIterator1 first,
                 RandomAccessIterator1 last, RandomAccessIterator2 result,
                 utility::weighted_iterator_tag tag) const;
};
00222
00223
00224
00225
00226 template<typename ForwardIterator>
00227 qQuantileNormalizer::qQuantileNormalizer(ForwardIterator first,
00228 ForwardIterator last,
00229 unsigned int Q)
00230 : target_(Partitioner(first, last, Q))
00231 {
00232 utility::yat_assert<std::runtime_error>(Q>2,
00233 "qQuantileNormalizer: Q too small");
00234 }
00235
00236
00237 template<typename RandomAccessIterator1, typename RandomAccessIterator2>
00238 void qQuantileNormalizer::operator()(RandomAccessIterator1 first,
00239 RandomAccessIterator1 last,
00240 RandomAccessIterator2 result) const
00241 {
00242 Partitioner source(first, last, target_.size());
00243 typename utility::weighted_iterator_traits<RandomAccessIterator2>::type tag;
00244 normalize(source, first, last, result, tag);
00245 }
00246
00247
// Unweighted normalization: each data point is shifted by the
// difference between the source and target part averages, evaluated
// at the point's rank (i+0.5)/N. Points below the first quantile or
// at/above the last quantile get the constant boundary correction;
// interior points get a cubic-spline interpolated correction.
template<typename RandomAccessIterator1, typename RandomAccessIterator2>
void
qQuantileNormalizer::normalize(const qQuantileNormalizer::Partitioner& source,
                               RandomAccessIterator1 first,
                               RandomAccessIterator1 last,
                               RandomAccessIterator2 result,
                               utility::unweighted_iterator_tag tag) const
{
  utility::check_iterator_is_unweighted(first);
  utility::check_iterator_is_unweighted(result);
  size_t N = last-first;
  // need at least as many data points as parts
  utility::yat_assert<std::runtime_error>
    (N >= target_.size(), "qQuantileNormalizer: Input range too small");

  // sorted_index[i] is the position in [first,last) of the i:th
  // smallest element; output is written back in original order.
  std::vector<size_t> sorted_index(last-first);
  utility::sort_index(first, last, sorted_index);

  // diff(q) = source part average - target part average; subtracting
  // it moves a data point towards the target distribution.
  utility::Vector diff(source.averages());
  diff-=target_.averages();
  const utility::Vector& idx=target_.quantiles();
  regression::CSplineInterpolation cspline(idx,diff);

  // lower tail: ranks below the first quantile idx(0) use the
  // constant correction diff(0)
  size_t start=0;
  size_t end = static_cast<size_t>(std::ceil(N*idx(0) - 0.5));

  // make sure at least one point is handled here, so the spline below
  // is never evaluated at or below its left boundary
  if (end==0)
    ++end;
  for (size_t i=start; i<end; ++i) {
    size_t si = sorted_index[i];
    result[si] = first[si] - diff(0);
  }

  using utility::yat_assert;

  // interior: ranks in (idx(0), idx(last)) get a spline-interpolated
  // correction evaluated at the rank (i+0.5)/N
  start=end;
  end = static_cast<size_t>(std::ceil(N*idx(idx.size()-1) - 0.5));
  if (end>=N)
    end = N-1;
  for ( size_t i=start; i<end; ++i) {
    size_t si = sorted_index[i];

    // guard: spline must not be evaluated outside its support
    yat_assert<std::runtime_error>((i+0.5)/N>idx(0),
                                   "qQuantileNormalizer: invalid input to cspline");
    result[si] = first[si] - cspline.evaluate((i+0.5)/N);
  }

  // upper tail: remaining points use the constant correction of the
  // last part
  for (size_t i=end ; i<N; ++i) {
    size_t si = sorted_index[i];
    result[si] = first[si] - diff(diff.size()-1);
  }
}
00306
00307
// Weighted normalization: analogous to the unweighted version, but a
// data point's rank is its cumulative weight midpoint
// (sum_w + w_i/2)/total_w rather than (i+0.5)/N.
template<typename RandomAccessIterator1, typename RandomAccessIterator2>
void qQuantileNormalizer::normalize(const Partitioner& source,
                                    RandomAccessIterator1 first,
                                    RandomAccessIterator1 last,
                                    RandomAccessIterator2 result,
                                    utility::weighted_iterator_tag tag) const
{
  // weights pass through unchanged; only data values are normalized
  std::copy(utility::weight_iterator(first),
            utility::weight_iterator(last),
            utility::weight_iterator(result));

  double total_w = std::accumulate(utility::weight_iterator(first),
                                   utility::weight_iterator(last), 0.0);

  std::vector<size_t> sorted_index(last-first);

  // Sort indices by data value. NaN data are first mapped (in a local
  // copy) to +infinity so they compare largest and end up last in the
  // sorted order rather than making the ordering undefined.
  {
    using namespace utility;
    std::vector<DataWeight> vec;
    vec.reserve(std::distance(first, last));
    std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
    std::copy(first, last, inserter);
    for (std::vector<DataWeight>::iterator i(vec.begin()); i!=vec.end(); ++i)
      if (std::isnan((i->data())))
        i->data() = std::numeric_limits<double>::infinity();
    utility::sort_index(utility::data_iterator(vec.begin()),
                        utility::data_iterator(vec.end()), sorted_index);
  }

  // diff(q) = source part average - target part average
  utility::Vector diff(source.averages());
  diff-=target_.averages();
  const utility::Vector& idx=target_.quantiles();
  regression::CSplineInterpolation cspline(idx,diff);

  // walk the data in sorted order, accumulating weight to obtain each
  // point's weighted rank w in [0,1]
  double sum_w=0;
  utility::iterator_traits<RandomAccessIterator1> traits1;
  utility::iterator_traits<RandomAccessIterator2> traits2;
  for (size_t i=0; i<sorted_index.size(); ++i) {
    size_t si = sorted_index[i];
    double w = (sum_w + 0.5*traits1.weight(first+si))/total_w;
    double correction = 0;
    if (w <= idx(0)) {
      // below the first quantile: constant boundary correction
      correction = diff(0);
    }
    else if (w < idx(idx.size()-1) ) {
      // interior: spline-interpolated correction at rank w
      correction = cspline.evaluate(w);
    }
    else {
      // at/above the last quantile: constant boundary correction
      correction = diff(diff.size()-1);
    }
    traits2.data(result+si) = traits1.data(first+si) - correction;
    sum_w += traits1.weight(first+si);
  }
}
00373
00374
00375 template<typename ForwardIterator>
00376 qQuantileNormalizer::Partitioner::Partitioner(ForwardIterator first,
00377 ForwardIterator last,
00378 unsigned int N)
00379 : average_(utility::Vector(N)), quantiles_(utility::Vector(N))
00380 {
00381 build(first, last, N,
00382 typename utility::weighted_iterator_traits<ForwardIterator>::type());
00383 }
00384
00385
00386 template<typename ForwardIterator>
00387 void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
00388 ForwardIterator last,
00389 unsigned int N,
00390 utility::unweighted_iterator_tag)
00391 {
00392 utility::Vector vec(std::distance(first, last));
00393 std::copy(first, last, vec.begin());
00394 std::sort(vec.begin(), vec.end());
00395 init(vec, N);
00396 }
00397
00398
00399 template<typename ForwardIterator>
00400 void qQuantileNormalizer::Partitioner::build(ForwardIterator first,
00401 ForwardIterator last,
00402 unsigned int N,
00403 utility::weighted_iterator_tag)
00404 {
00405 using namespace utility;
00406 std::vector<DataWeight> vec;
00407 vec.reserve(std::distance(first, last));
00408 std::back_insert_iterator<std::vector<DataWeight> > inserter(vec);
00409 std::copy(first, last, inserter);
00410
00411 for (std::vector<DataWeight>::iterator i(vec.begin()); i!=vec.end(); ++i)
00412 if (std::isnan((i->data())))
00413 i->data() = std::numeric_limits<double>::infinity();
00414
00415 std::sort(vec.begin(), vec.end());
00416 init(vec, N);
00417 }
00418
00419
00420 }}}
00421
00422 #endif