yat  0.18.2pre
VCF.h
1 #ifndef theplu_yat_omic_vcf
2 #define theplu_yat_omic_vcf
3 
4 // $Id: VCF.h 3999 2020-10-08 23:22:32Z peter $
5 
6 /*
7  Copyright (C) 2018, 2019, 2020 Peter Johansson
8 
9  This file is part of the yat library, http://dev.thep.lu.se/yat
10 
11  The yat library is free software; you can redistribute it and/or
12  modify it under the terms of the GNU General Public License as
13  published by the Free Software Foundation; either version 3 of the
14  License, or (at your option) any later version.
15 
16  The yat library is distributed in the hope that it will be useful,
17  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19  General Public License for more details.
20 
21  You should have received a copy of the GNU General Public License
22  along with yat. If not, see <http://www.gnu.org/licenses/>.
23 */
24 
26 
27 #include "yat/utility/Exception.h"
28 #include "yat/utility/split.h"
29 #include "yat/utility/utility.h"
30 #include "yat/utility/yat_assert.h"
31 
32 #include <boost/cstdint.hpp>
33 
34 #include <algorithm>
35 #include <vector>
36 #include <string>
37 
38 namespace theplu {
39 namespace yat {
40 namespace omic {
41 
/// One entry of a VCF file.
///
/// The columns are stored as strings in vec_ (vec_[4] is the ALT
/// column); the INFO column is handled by the nested class Info and the
/// FORMAT and per-sample columns by the nested class Data.
class VCF
{
public:
  /// Default constructor.
  VCF();

  /// Construct from a string.
  /// NOTE(review): presumably one line of a VCF file; confirm in VCF.cc.
  explicit VCF(const std::string& line);

  /// Construct from an input stream.
  /// NOTE(review): presumably reads one entry from \a is; confirm in VCF.cc.
  explicit VCF(std::istream& is);

  /// Set the chromosome.
  void chromosome(const std::string& chr);

  /// \return the chromosome
  const std::string& chromosome() const;

  /// Set the position.
  void pos(int32_t p);

  /// \return the position
  const int32_t& pos() const;

  /// Set the ID.
  void id(const std::string& id);

  /// \return the ID
  const std::string& id() const;

  /// Set the reference allele from a string.
  void ref(const std::string& r);

  /// Set the reference allele from a single character.
  void ref(char r);

  /// \return the reference allele
  const std::string& ref() const;

  /// Set the ALT column from a string.
  void alt(const std::string& a);

  /// Set the ALT column from a single character.
  void alt(char a);

  /// \return the ALT column as one string
  const std::string& alt() const;

  /// \return the alternative alleles as a vector
  const std::vector<std::string>& alts() const;

  /// Set the alternative alleles (copying \a a).
  void alts(const std::vector<std::string>& a);

  /// Set the alternative alleles (moving from \a a).
  void alts(std::vector<std::string>&& a);

  /// \return number of alleles
  /// NOTE(review): whether the reference allele is counted is not
  /// visible in this header; confirm in VCF.cc.
  unsigned int n_alleles() const;

  /// \return phred-scaled probability that all samples are wildtype
  const std::string& qual() const;

  /// Set the QUAL column from a string.
  void qual(const std::string& q);

  /// Set the QUAL column from an unsigned integer.
  void qual(unsigned int q);

  /// \return the FILTER column as one string
  const std::string& filter() const;

  /// \return the filters as a vector
  const std::vector<std::string>& filters() const;

  /// Append to the list of filters.
  void add_filter(const std::string& name);

  /// Set the FILTER column.
  void filter(const std::string& value);

  /// Class handling the INFO field in a VCF entry.
  class Info
  {
  public:
    /// Add \a key without a value.
    void add(const std::string& key);

    /// Add \a key followed by '=' and \a value. T is a string, a
    /// numeric type, or a vector of those; vector elements are
    /// separated by ','.
    template<typename T>
    void add(const std::string& key, const T& value);

    /// Clear the content.
    void clear();

    /// \return number of occurrences of \a key
    /// NOTE(review): exact semantics are defined in VCF.cc.
    size_t count(const std::string& key) const;

    /// Convert the value stored for \a key into \a result. If no
    /// "key=" field is found, an empty string is converted instead.
    template<typename T>
    void get(T& result, const std::string& key) const;

    /// Remove \a key.
    void remove(const std::string& key);

    /// Replace the content with \a s (copying).
    void set(const std::string& s);

    /// Replace the content with \a s (moving).
    void set(std::string&& s);

    /// \return the INFO field as a string
    const std::string& str() const;

  private:
    friend std::ostream& operator<<(std::ostream&, const VCF::Info&);
    friend class VCF;

    // Locate range [begin, end) such that
    // 1) [begin, end) == key
    // 2) begin equals str_.begin() or begin[-1] is ';'
    // 3) end equals str_.end() or end[0] is '=' or ';'
    std::string::iterator find(const std::string& key);
    // Same as find (above), but returns a const_iterator.
    std::string::const_iterator cfind(const std::string& key) const;

    void init(std::string& str);
    void init(std::string&& str);

    // For now the data are stored as a raw string (as stored in the
    // file). This has no overhead and is thus the fastest when the
    // info field is neither accessed nor modified. If it is heavily
    // accessed and modified, it might be preferable to also store the
    // data in a map<string, string>, with a lazy implementation
    // switching between the storages depending on usage (i.e. avoid
    // building the map until it is needed).
    std::string str_;

    // only call if initialised
    void validate() const;
  }; // end class Info


  /// \return modifiable access to the INFO field
  Info& info();

  /// \return read-only access to the INFO field
  const Info& info() const;

  /// Class holding data stored in the FORMAT and data fields.
  class Data
  {
  public:
    /// Default constructor.
    Data();

    /// Add \a key to the format and append \a data[i] to sample i.
    /// T is a string, a numeric type, or a vector of those.
    template<typename T>
    void add(const std::string& key, const std::vector<T>& data);

    /// Clear the content.
    void clear();

    /// \return number of occurrences of \a key
    /// NOTE(review): exact semantics are defined in VCF.cc.
    size_t count(const std::string& key) const;

    /// \return the keys of the FORMAT field
    const std::vector<std::string>& format() const;

    /// Resize \a data to one element per sample and fill it with the
    /// converted values stored for \a key. T is a string, a numeric
    /// type, or a vector of those.
    template<typename T>
    void get(const std::string& key, std::vector<T>& data) const;

  private:
    friend std::ostream& operator<<(std::ostream&, const VCF::Data&);
    friend class VCF;

    // return true if class has been initialised
    bool initialised() const;
    void init(std::vector<std::string>::const_iterator,
              std::vector<std::string>::const_iterator);
    size_t index(const std::string& key) const;
    bool initialised_;
    std::vector<std::string> format_;
    // indexed as data_[sample][format index]
    std::vector<std::vector<std::string>> data_;

    // only call if initialised
    void validate() const;
  }; // end class Data

  /// \return read-only access to the data
  const Data& data() const;

  /// \return modifiable access to the data
  Data& data();

  /// \return number of samples
  unsigned int n_samples() const;

  /// Set the number of samples.
  void n_samples(int n);

private:
  friend std::ostream& operator<<(std::ostream&, const VCF&);
  std::vector<std::string> vec_;
  int32_t position_;

  Info info_;
  Data data_;

  void init(std::vector<std::string>& vec);
  // update vec_[4] from alts_
  void update_alt();
  void print(std::ostream& os) const;

  // no-op if NDEBUG is defined,
  // otherwise throws if not valid
  void validate() const;

  // private lazy variables
  mutable std::vector<std::string> alts_;
  mutable std::vector<std::string> filters_;

  // Copy and move are the compiler-generated ones:
  // VCF(const VCF&)
  // VCF& operator=(const VCF&)
  // VCF(VCF&&)
  // VCF& operator=(VCF&&)
};
387 
394  std::ostream& operator<<(std::ostream& os, const VCF& vcf);
395 
406  std::istream& operator>>(std::istream& is, VCF& vcf);
407 
414  std::ostream& operator<<(std::ostream& os, const VCF::Info& info);
415 
422  std::ostream& operator<<(std::ostream& os, const VCF::Data& data);
423 
430  bool is_indel(const VCF&);
431 
439  bool is_snv(const VCF&);
440 
448  bool is_dnv(const VCF&);
449 
457  bool is_mnv(const VCF&);
458 
459 
460  // We would prefer to hide this privately with VCF, but explicit
461  // template specialization must be in namespace scope.
462  //
464  namespace detail {
465 
466  // class that append a string from T; T might be string, numerical
467  // or a vector of above.
468  template <typename T>
469  struct Appender
470  {
471  Appender(char delim) : delim_(delim) {}
472  void operator()(std::string& val, const T& x) const
473  {
474  val += utility::convert(x);
475  }
476  private:
477  char delim_;
478  };
479 
480 
481  template <>
482  struct Appender<std::string>
483  {
484  Appender(char delim) : delim_(delim) {}
485  void operator()(std::string& val, const std::string& x) const
486  {
487  val += x;
488  }
489  private:
490  char delim_;
491  };
492 
493 
494  template <>
495  struct Appender<std::vector<std::string> >
496  {
497  Appender(char delim) : delim_(delim) {}
498  void operator()(std::string& val,const std::vector<std::string>& x) const
499  {
500  for (size_t i=0; i<x.size(); ++i) {
501  if (i)
502  val += delim_;
503  val += x[i];
504  }
505  }
506  private:
507  char delim_;
508  };
509 
510 
511  template <typename T>
512  struct Appender<std::vector<T> >
513  {
514  Appender(char delim) : delim_(delim) {}
515  void operator()(std::string& val, const std::vector<T>& x) const
516  {
517  for (size_t i=0; i<x.size(); ++i) {
518  if (i)
519  val += delim_;
520  val += utility::convert(x[i]);
521  }
522  }
523  private:
524  char delim_;
525  };
526 
527 
528  // Class that converts a string to T
529  template<typename T>
530  struct Converter
531  {
532  void operator()(const std::string& x, T& result) const
533  {
534  result = utility::convert<T>(x);
535  }
536  };
537 
538 
539  template<>
540  struct Converter<std::string>
541  {
542  void operator()(const std::string& x, std::string& result) const
543  {
544  result = x;
545  }
546  };
547 
548 
549  template<typename T>
550  struct Converter<std::vector<T> >
551  {
552  void operator()(const std::string& x, std::vector<T>& result) const
553  {
554  std::vector<std::string> vec;
555  utility::split(vec, x, ',');
556  result.clear();
557  result.reserve(vec.size());
558  for (size_t i=0; i<vec.size(); ++i)
559  result.push_back(utility::convert<T>(vec[i]));
560  }
561  };
562 
563 
564  template<>
565  struct Converter<std::vector<std::string> >
566  {
567  void
568  operator()(const std::string& x, std::vector<std::string>& result) const
569  {
570  result.clear();
571  utility::split(result, x, ',');
572  }
573  };
574 
575  }
576 
578 
579 
580  // template implementations
581 
582  // VCF::Info
583 
584  template<typename T>
585  void VCF::Info::add(const std::string& key, const T& value)
586  {
587  add(key);
588  str_ += '=';
589  detail::Appender<T> append(',');
590  append(str_, value);
591  }
592 
593 
594  template<typename T>
595  void VCF::Info::get(T& result, const std::string& key) const
596  {
597  detail::Converter<T> converter;
598  std::string value;
599  std::string k = key + "=";
600  size_t begin = str_.find(k);
601 
602  if (begin == std::string::npos) {
603  converter(value, result);
604  return;
605  }
606 
607  if (begin && str_[begin-1]!=';') {
608  k = ";" + key + "=";
609  begin = str_.find(k, begin+key.size());
610  if (begin == std::string::npos) {
611  converter(value, result);
612  return;
613  }
614  }
615 
616  begin += k.size();
617  int end = str_.find(";", begin);
618  value = str_.substr(begin, end-begin);
619  converter(value, result);
620  }
621 
622 
623  // VCF::Data
624 
625  template<typename T>
626  void VCF::Data::add(const std::string& key, const std::vector<T>& input)
627  {
628  YAT_ASSERT(key != "");
629  format_.push_back(key);
630  detail::Appender<T> append(',');
631  for (size_t i=0; i<input.size(); ++i) {
632  data_[i].push_back("");
633  append(data_[i].back(), input[i]);
634  }
635  }
636 
637 
638  template<typename T>
639  void VCF::Data::get(const std::string& key, std::vector<T>& data) const
640  {
641  size_t idx = index(key);
642  detail::Converter<T> converter;
643  data.resize(data_.size());
644  for (size_t i=0; i<data.size(); ++i)
645  converter(data_[i][idx], data[i]);
646  }
647 
648 
649  /*
650  template<typename T>
651  void VCF::add_data(const std::string& key, const std::vector<T>& val)
652  {
653  if (format() != "")
654  format() += ":";
655  format() += key;
656 
657 
658  YAT_ASSERT(val.size()+9 == vec_.size());
659  for (size_t i=0; i<val.size(); ++i) {
660  std::string& data = vec_[i+9];
661  if (data != "")
662  data += ":";
663  append(data, val[i]);
664  }
665  }
666  */
667 }}}
668 #endif
const std::string & chromosome(void) const
const std::string & id(void) const
bool is(const std::string &s)
check if string is convertible to (numerical) type T
Definition: utility.h:589
Definition: VCF.h:48
const std::vector< std::string > & alts(void) const
The Department of Theoretical Physics namespace as we define it.
const int32_t & pos(void) const
Some useful functions are placed here.
const std::vector< std::string > & format(void) const
void add(const std::string &key)
bool is_snv(const VCF &)
Definition: stl_utility.h:64
VCF(void)
default constructor
const std::string & alt(void) const
void get(const std::string &key, std::vector< T > &data) const
T is a string, a numeric type, or a vector of strings or numerics.
Definition: VCF.h:639
const std::string & qual(void) const
phred-scaled probability that all samples are wildtype
unsigned int n_samples(void) const
VCF::Info & info(void)
set info
unsigned int n_alleles(void) const
const std::string & filter(void) const
const std::vector< std::string > & filters(void) const
size_t count(const std::string &key) const
class holding data stored in FORMAT and data fields
Definition: VCF.h:290
const Data & data(void) const
access data
void split(std::vector< std::string > &result, const std::string &str, char delim)
split a string into several substrings
const std::string & str(void) const
const std::string & ref(void) const
Data(void)
Default constructor.
class handling the INFO field in a VCF entry
Definition: VCF.h:195
bool is_indel(const VCF &)
bool is_dnv(const VCF &)
std::string convert(T input)
convert T to a string
Definition: utility.h:569
void get(T &result, const std::string &key) const
Definition: VCF.h:595
void add_filter(const std::string &)
append list of filters
bool is_mnv(const VCF &)
void add(const std::string &key, const std::vector< T > &data)
T is a string, a numeric type, or a vector of strings or numerics.
Definition: VCF.h:626
size_t count(const std::string &key) const

Generated on Tue Sep 7 2021 17:32:32 for yat by  doxygen 1.8.14