3475 |
08 Mar 16 |
peter |
// $Id$ |
3475 |
08 Mar 16 |
peter |
2 |
|
4346 |
24 Apr 23 |
peter |
3 |
/* |
4359 |
23 Aug 23 |
peter |
Copyright (C) 2016, 2018, 2020, 2021, 2023 Peter Johansson |
4346 |
24 Apr 23 |
peter |
5 |
|
4346 |
24 Apr 23 |
peter |
This file is part of the yat library, https://dev.thep.lu.se/yat |
4346 |
24 Apr 23 |
peter |
7 |
|
4346 |
24 Apr 23 |
peter |
The yat library is free software; you can redistribute it and/or |
4346 |
24 Apr 23 |
peter |
modify it under the terms of the GNU General Public License as |
4346 |
24 Apr 23 |
peter |
published by the Free Software Foundation; either version 3 of the |
4346 |
24 Apr 23 |
peter |
License, or (at your option) any later version. |
4346 |
24 Apr 23 |
peter |
12 |
|
4346 |
24 Apr 23 |
peter |
The yat library is distributed in the hope that it will be useful, |
4346 |
24 Apr 23 |
peter |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
4346 |
24 Apr 23 |
peter |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4346 |
24 Apr 23 |
peter |
General Public License for more details. |
4346 |
24 Apr 23 |
peter |
17 |
|
4346 |
24 Apr 23 |
peter |
You should have received a copy of the GNU General Public License |
4346 |
24 Apr 23 |
peter |
along with yat. If not, see <https://www.gnu.org/licenses/>. |
4346 |
24 Apr 23 |
peter |
20 |
*/ |
4346 |
24 Apr 23 |
peter |
21 |
|
3475 |
08 Mar 16 |
peter |
22 |
#include <config.h> |
3475 |
08 Mar 16 |
peter |
23 |
|
3475 |
08 Mar 16 |
peter |
24 |
#include "Fasta.h" |
3475 |
08 Mar 16 |
peter |
25 |
#include "algorithm.h" |
3475 |
08 Mar 16 |
peter |
26 |
|
3475 |
08 Mar 16 |
peter |
27 |
#include <yat/utility/Exception.h> |
3475 |
08 Mar 16 |
peter |
28 |
#include <yat/utility/utility.h> |
3475 |
08 Mar 16 |
peter |
29 |
#include <yat/utility/split.h> |
3475 |
08 Mar 16 |
peter |
30 |
|
3727 |
09 Feb 18 |
peter |
31 |
#include <boost/iterator/iterator_traits.hpp> |
3475 |
08 Mar 16 |
peter |
32 |
#include <boost/scoped_array.hpp> |
3475 |
08 Mar 16 |
peter |
33 |
|
3475 |
08 Mar 16 |
peter |
34 |
#include <algorithm> |
3475 |
08 Mar 16 |
peter |
35 |
#include <cstring> |
3475 |
08 Mar 16 |
peter |
36 |
#include <fstream> |
3727 |
09 Feb 18 |
peter |
37 |
#include <iterator> |
3475 |
08 Mar 16 |
peter |
38 |
#include <limits> |
3727 |
09 Feb 18 |
peter |
39 |
#include <ostream> |
3475 |
08 Mar 16 |
peter |
40 |
#include <sstream> |
3475 |
08 Mar 16 |
peter |
41 |
|
3475 |
08 Mar 16 |
peter |
42 |
namespace theplu { |
3475 |
08 Mar 16 |
peter |
43 |
namespace yat { |
3475 |
08 Mar 16 |
peter |
44 |
namespace omic { |
3475 |
08 Mar 16 |
peter |
45 |
|
3475 |
08 Mar 16 |
peter |
46 |
Fasta::Fasta(const std::string& fn) |
3475 |
08 Mar 16 |
peter |
47 |
: faidx_(fai_load(fn.c_str()), fai_destroy) |
3475 |
08 Mar 16 |
peter |
48 |
{ |
3475 |
08 Mar 16 |
peter |
49 |
if (faidx_.get()==NULL) { |
3475 |
08 Mar 16 |
peter |
50 |
std::ostringstream ss; |
3475 |
08 Mar 16 |
peter |
51 |
ss << "cannot open index file for '" << fn << "'"; |
3475 |
08 Mar 16 |
peter |
52 |
throw utility::runtime_error(ss.str()); |
3475 |
08 Mar 16 |
peter |
53 |
} |
3475 |
08 Mar 16 |
peter |
54 |
} |
3475 |
08 Mar 16 |
peter |
55 |
|
3475 |
08 Mar 16 |
peter |
56 |
|
3475 |
08 Mar 16 |
peter |
57 |
void Fasta::fetch(Fasta::Sequence& res,const std::string& chr, |
3475 |
08 Mar 16 |
peter |
58 |
int begin, int end) const |
3475 |
08 Mar 16 |
peter |
59 |
{ |
3475 |
08 Mar 16 |
peter |
// faidx_fetch_seq returns char* allocated the malloc; tell smart |
3475 |
08 Mar 16 |
peter |
// pointer to deallocate with free() rather than delete (default). |
3883 |
24 Mar 20 |
peter |
62 |
res.seq_.reset(faidx_fetch_seq(faidx_.get(), chr.c_str(), begin, end, |
3883 |
24 Mar 20 |
peter |
63 |
&res.size_), free); |
3887 |
25 Mar 20 |
peter |
// fetch failed |
4200 |
19 Aug 22 |
peter |
65 |
if (!res.seq_) { |
3887 |
25 Mar 20 |
peter |
// htslib documents that they communicate type of error with |
3887 |
25 Mar 20 |
peter |
// setting size_ to either -1 or -2 (unknown chromosome). |
3887 |
25 Mar 20 |
peter |
68 |
if (res.size_ == -2) |
3887 |
25 Mar 20 |
peter |
69 |
throw_unknown_chr(chr); // throw |
3887 |
25 Mar 20 |
peter |
70 |
std::ostringstream msg; |
3887 |
25 Mar 20 |
peter |
71 |
msg << "failed fetching sequence for '" << chr << ":" << begin << "-" |
3887 |
25 Mar 20 |
peter |
72 |
<< end << "'"; |
3887 |
25 Mar 20 |
peter |
73 |
throw utility::runtime_error(msg.str()); |
3887 |
25 Mar 20 |
peter |
74 |
} |
3475 |
08 Mar 16 |
peter |
75 |
} |
3475 |
08 Mar 16 |
peter |
76 |
|
4200 |
19 Aug 22 |
peter |
77 |
|
3475 |
08 Mar 16 |
peter |
78 |
std::string Fasta::name(size_t i) const |
3475 |
08 Mar 16 |
peter |
79 |
{ |
3475 |
08 Mar 16 |
peter |
80 |
assert(static_cast<int>(i) < nseq()); |
3475 |
08 Mar 16 |
peter |
81 |
return faidx_iseq(faidx_.get(), i); |
3475 |
08 Mar 16 |
peter |
82 |
} |
3475 |
08 Mar 16 |
peter |
83 |
|
3475 |
08 Mar 16 |
peter |
84 |
|
3475 |
08 Mar 16 |
peter |
85 |
int Fasta::nseq(void) const |
3475 |
08 Mar 16 |
peter |
86 |
{ |
3475 |
08 Mar 16 |
peter |
87 |
return faidx_nseq(faidx_.get()); |
3475 |
08 Mar 16 |
peter |
88 |
} |
3475 |
08 Mar 16 |
peter |
89 |
|
3475 |
08 Mar 16 |
peter |
90 |
|
3475 |
08 Mar 16 |
peter |
91 |
bool Fasta::present(const std::string& name) const |
3475 |
08 Mar 16 |
peter |
92 |
{ |
3475 |
08 Mar 16 |
peter |
93 |
return faidx_has_seq(faidx_.get(), name.c_str()); |
3475 |
08 Mar 16 |
peter |
94 |
} |
3475 |
08 Mar 16 |
peter |
95 |
|
3475 |
08 Mar 16 |
peter |
96 |
|
3475 |
08 Mar 16 |
peter |
97 |
Fasta::Sequence Fasta::sequence(const std::string& chr) const |
3475 |
08 Mar 16 |
peter |
98 |
{ |
3475 |
08 Mar 16 |
peter |
99 |
Fasta::Sequence res; |
3887 |
25 Mar 20 |
peter |
100 |
try { |
3887 |
25 Mar 20 |
peter |
101 |
fetch(res, chr, 0, std::numeric_limits<int>::max()); |
3475 |
08 Mar 16 |
peter |
102 |
} |
3887 |
25 Mar 20 |
peter |
103 |
catch (utility::runtime_error& e) { |
3887 |
25 Mar 20 |
peter |
104 |
if (!res.seq_.get() && res.size_==-2) { |
3887 |
25 Mar 20 |
peter |
105 |
std::ostringstream msg; |
3887 |
25 Mar 20 |
peter |
106 |
msg << "failed fetching sequence for '" << chr << "'"; |
4060 |
10 May 21 |
peter |
107 |
std::throw_with_nested(utility::runtime_error(msg.str())); |
3887 |
25 Mar 20 |
peter |
108 |
} |
3887 |
25 Mar 20 |
peter |
109 |
throw; // retrow e |
3887 |
25 Mar 20 |
peter |
110 |
} |
3475 |
08 Mar 16 |
peter |
111 |
return res; |
3475 |
08 Mar 16 |
peter |
112 |
} |
3475 |
08 Mar 16 |
peter |
113 |
|
3475 |
08 Mar 16 |
peter |
114 |
|
3475 |
08 Mar 16 |
peter |
115 |
Fasta::Sequence |
3475 |
08 Mar 16 |
peter |
116 |
Fasta::sequence(const std::string& chr, int begin, int end) const |
3475 |
08 Mar 16 |
peter |
117 |
{ |
3475 |
08 Mar 16 |
peter |
118 |
Fasta::Sequence res; |
3475 |
08 Mar 16 |
peter |
119 |
fetch(res, chr, begin, end); |
3887 |
25 Mar 20 |
peter |
120 |
assert(res.seq_); |
3475 |
08 Mar 16 |
peter |
121 |
return res; |
3475 |
08 Mar 16 |
peter |
122 |
} |
3475 |
08 Mar 16 |
peter |
123 |
|
3475 |
08 Mar 16 |
peter |
124 |
|
3475 |
08 Mar 16 |
peter |
125 |
int Fasta::sequence_length(const std::string& name) const |
3475 |
08 Mar 16 |
peter |
126 |
{ |
3475 |
08 Mar 16 |
peter |
127 |
int res = faidx_seq_len(faidx_.get(), name.c_str()); |
3475 |
08 Mar 16 |
peter |
128 |
if (res == -1) |
3475 |
08 Mar 16 |
peter |
129 |
throw_unknown_chr(name); |
3475 |
08 Mar 16 |
peter |
130 |
return res; |
3475 |
08 Mar 16 |
peter |
131 |
} |
3475 |
08 Mar 16 |
peter |
132 |
|
3475 |
08 Mar 16 |
peter |
133 |
|
3475 |
08 Mar 16 |
peter |
134 |
void Fasta::throw_unknown_chr(const std::string& name) const |
3475 |
08 Mar 16 |
peter |
135 |
{ |
3475 |
08 Mar 16 |
peter |
136 |
std::ostringstream msg; |
3475 |
08 Mar 16 |
peter |
137 |
msg << "unknown sequence name: " << name << " in fasta"; |
3475 |
08 Mar 16 |
peter |
138 |
throw utility::runtime_error(msg.str()); |
3475 |
08 Mar 16 |
peter |
139 |
} |
3475 |
08 Mar 16 |
peter |
140 |
|
3475 |
08 Mar 16 |
peter |
/// implementation of Fasta::Sequence |
3475 |
08 Mar 16 |
peter |
142 |
|
3475 |
08 Mar 16 |
peter |
143 |
Fasta::Sequence::const_iterator Fasta::Sequence::begin(void) const |
3475 |
08 Mar 16 |
peter |
144 |
{ |
3475 |
08 Mar 16 |
peter |
145 |
assert(seq_.get()); |
3475 |
08 Mar 16 |
peter |
146 |
return seq_.get(); |
3475 |
08 Mar 16 |
peter |
147 |
} |
3475 |
08 Mar 16 |
peter |
148 |
|
3475 |
08 Mar 16 |
peter |
149 |
|
3475 |
08 Mar 16 |
peter |
150 |
Fasta::Sequence::const_iterator Fasta::Sequence::end(void) const |
3475 |
08 Mar 16 |
peter |
151 |
{ |
3475 |
08 Mar 16 |
peter |
152 |
return begin() + size_; |
3475 |
08 Mar 16 |
peter |
153 |
} |
3475 |
08 Mar 16 |
peter |
154 |
|
3475 |
08 Mar 16 |
peter |
155 |
|
3475 |
08 Mar 16 |
peter |
156 |
int Fasta::Sequence::size(void) const |
3475 |
08 Mar 16 |
peter |
157 |
{ |
3475 |
08 Mar 16 |
peter |
158 |
return size_; |
3475 |
08 Mar 16 |
peter |
159 |
} |
3475 |
08 Mar 16 |
peter |
160 |
|
3475 |
08 Mar 16 |
peter |
161 |
|
3475 |
08 Mar 16 |
peter |
162 |
char Fasta::Sequence::operator[](size_t i) const |
3475 |
08 Mar 16 |
peter |
163 |
{ |
3475 |
08 Mar 16 |
peter |
164 |
assert(static_cast<int>(i) < size()); |
3475 |
08 Mar 16 |
peter |
165 |
return begin()[i]; |
3475 |
08 Mar 16 |
peter |
166 |
} |
3475 |
08 Mar 16 |
peter |
167 |
|
3475 |
08 Mar 16 |
peter |
168 |
|
3475 |
08 Mar 16 |
peter |
169 |
Fasta::Sequence reverse_complement(const Fasta::Sequence& seq) |
3475 |
08 Mar 16 |
peter |
170 |
{ |
3475 |
08 Mar 16 |
peter |
// allocate a new char* |
4019 |
06 Nov 20 |
peter |
172 |
std::shared_ptr<char> ptr(new char[seq.size()+1], |
4019 |
06 Nov 20 |
peter |
173 |
std::default_delete<char[]>()); |
3475 |
08 Mar 16 |
peter |
174 |
std::copy(seq.seq_.get(), seq.seq_.get()+seq.size(), ptr.get()); |
3475 |
08 Mar 16 |
peter |
// calculate reverse complement |
3475 |
08 Mar 16 |
peter |
176 |
dna_reverse_complement(ptr.get(), ptr.get()+seq.size()); |
3475 |
08 Mar 16 |
peter |
// assign into result |
3475 |
08 Mar 16 |
peter |
178 |
Fasta::Sequence res; |
3475 |
08 Mar 16 |
peter |
179 |
res.seq_ = ptr; |
3475 |
08 Mar 16 |
peter |
180 |
res.size_ = seq.size(); |
3475 |
08 Mar 16 |
peter |
181 |
return res; |
3475 |
08 Mar 16 |
peter |
182 |
} |
3475 |
08 Mar 16 |
peter |
183 |
|
3475 |
08 Mar 16 |
peter |
184 |
|
3727 |
09 Feb 18 |
peter |
185 |
std::ostream& operator<<(std::ostream& os, const Fasta::Sequence& seq) |
3727 |
09 Feb 18 |
peter |
186 |
{ |
3727 |
09 Feb 18 |
peter |
187 |
typedef |
3727 |
09 Feb 18 |
peter |
188 |
boost::iterator_value<Fasta::Sequence::const_iterator>::type value_type; |
3727 |
09 Feb 18 |
peter |
189 |
std::copy(seq.begin(), seq.end(), std::ostream_iterator<value_type>(os)); |
3727 |
09 Feb 18 |
peter |
190 |
return os; |
3727 |
09 Feb 18 |
peter |
191 |
} |
3727 |
09 Feb 18 |
peter |
192 |
|
3475 |
08 Mar 16 |
peter |
193 |
}}} // of namespace theplu, yat, and omic |