4340 |
16 Apr 23 |
peter |
1 |
#ifndef _theplu_yat_utility_maligner_ |
4340 |
16 Apr 23 |
peter |
2 |
#define _theplu_yat_utility_maligner_ |
4340 |
16 Apr 23 |
peter |
3 |
|
4356 |
23 Aug 23 |
peter |
// $Id$ |
4356 |
23 Aug 23 |
peter |
5 |
|
4356 |
23 Aug 23 |
peter |
6 |
/* |
4356 |
23 Aug 23 |
peter |
Copyright (C) 2023 Peter Johansson |
4356 |
23 Aug 23 |
peter |
8 |
|
4356 |
23 Aug 23 |
peter |
This file is part of the yat library, https://dev.thep.lu.se/yat |
4356 |
23 Aug 23 |
peter |
10 |
|
4356 |
23 Aug 23 |
peter |
The yat library is free software; you can redistribute it and/or |
4356 |
23 Aug 23 |
peter |
modify it under the terms of the GNU General Public License as |
4356 |
23 Aug 23 |
peter |
published by the Free Software Foundation; either version 3 of the |
4356 |
23 Aug 23 |
peter |
License, or (at your option) any later version. |
4356 |
23 Aug 23 |
peter |
15 |
|
4356 |
23 Aug 23 |
peter |
The yat library is distributed in the hope that it will be useful, |
4356 |
23 Aug 23 |
peter |
but WITHOUT ANY WARRANTY; without even the implied warranty of |
4356 |
23 Aug 23 |
peter |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
4356 |
23 Aug 23 |
peter |
General Public License for more details. |
4356 |
23 Aug 23 |
peter |
20 |
|
4356 |
23 Aug 23 |
peter |
You should have received a copy of the GNU General Public License |
4356 |
23 Aug 23 |
peter |
along with yat. If not, see <https://www.gnu.org/licenses/>. |
4356 |
23 Aug 23 |
peter |
23 |
*/ |
4356 |
23 Aug 23 |
peter |
24 |
|
4353 |
23 Aug 23 |
peter |
25 |
#include "Aligner.h" |
4353 |
23 Aug 23 |
peter |
26 |
#include "mAlignerBase.h" |
4353 |
23 Aug 23 |
peter |
27 |
#include "MatrixBase.h" |
4353 |
23 Aug 23 |
peter |
28 |
#include "yat_assert.h" |
4340 |
16 Apr 23 |
peter |
29 |
|
4340 |
16 Apr 23 |
peter |
30 |
#include <algorithm> |
4353 |
23 Aug 23 |
peter |
31 |
#include <iterator> |
4340 |
16 Apr 23 |
peter |
32 |
#include <vector> |
4340 |
16 Apr 23 |
peter |
33 |
|
4340 |
16 Apr 23 |
peter |
34 |
namespace theplu { |
4340 |
16 Apr 23 |
peter |
35 |
namespace yat { |
4340 |
16 Apr 23 |
peter |
36 |
namespace utility { |
4340 |
16 Apr 23 |
peter |
37 |
|
4344 |
17 Apr 23 |
peter |
38 |
/** |
4353 |
23 Aug 23 |
peter |
Base class for memory-efficient aligners, implemented as a |
4353 |
23 Aug 23 |
peter |
CRTP. An inheriting class passes itself as template parameter. |
4353 |
23 Aug 23 |
peter |
\code |
4353 |
23 Aug 23 |
peter |
class Derived : public mAligner<Derived> |
4353 |
23 Aug 23 |
peter |
\endcode |
4340 |
16 Apr 23 |
peter |
44 |
|
4353 |
23 Aug 23 |
peter |
The derived class must implement the following functions: |
4353 |
23 Aug 23 |
peter |
- double init_row(size_t q) const; |
4353 |
23 Aug 23 |
peter |
- double init_col(size_t r) const; |
4353 |
23 Aug 23 |
peter |
- double init(size_t r, size_t q) const; |
4353 |
23 Aug 23 |
peter |
- void update(const std::vector<Alignment>& a, size_t r, size_t rsize); |
4344 |
17 Apr 23 |
peter |
50 |
|
4353 |
23 Aug 23 |
peter |
Typically, these functions are declared private and mAligner is |
4353 |
23 Aug 23 |
peter |
declared as friend. |
4344 |
17 Apr 23 |
peter |
53 |
|
4353 |
23 Aug 23 |
peter |
The class aligns the query and reference using dynamic |
4353 |
23 Aug 23 |
peter |
programming with a dot-matrix and allows inherited classes to |
4353 |
23 Aug 23 |
peter |
tailor the behaviour through the functions above. The dot-matrix |
4353 |
23 Aug 23 |
peter |
has the dimensions rsize+1 x qsize+1 and dot(r,q) corresponds to |
4353 |
23 Aug 23 |
peter |
the best alignment score when aligning the r first elements of |
4353 |
23 Aug 23 |
peter |
the reference and the q first elements of the query. The element |
4353 |
23 Aug 23 |
peter |
in the top-left corner is set to zero, other elements in the |
4353 |
23 Aug 23 |
peter |
first column is set via init_col(row) function (row>0), other |
4353 |
23 Aug 23 |
peter |
elements in the first row is set to init_col(column) function |
4353 |
23 Aug 23 |
peter |
(column>0), and other elements are initialised via the init(row, |
4353 |
23 Aug 23 |
peter |
column) function. The algorithm calculates the optimal path |
4353 |
23 Aug 23 |
peter |
through the dot-matrix by for each element compare the four |
4353 |
23 Aug 23 |
peter |
alternatives and calculating scores as appropriate given passsed |
4353 |
23 Aug 23 |
peter |
values for insertions, deletion and the passed kernel |
4353 |
23 Aug 23 |
peter |
function. The algorithm is traversing through the dot-matrix |
4353 |
23 Aug 23 |
peter |
row-by-row storing the best alignments for the row in a qsize+1 |
4353 |
23 Aug 23 |
peter |
sized vector<Alignment> and when the rth row of the dot-matrix |
4353 |
23 Aug 23 |
peter |
has been completed, this vector is passed to the function |
4353 |
23 Aug 23 |
peter |
update(alignments, r, row_size), where row_size is the number of |
4353 |
23 Aug 23 |
peter |
rows in the dot-matrix or number of elements in the reference |
4353 |
23 Aug 23 |
peter |
plus one. |
4344 |
17 Apr 23 |
peter |
75 |
|
4353 |
23 Aug 23 |
peter |
To optimize the memory usage, the whole dot-matrix is not kept in |
4353 |
23 Aug 23 |
peter |
memory, but only what it is needed, which means the memory usage |
4353 |
23 Aug 23 |
peter |
scales linearly with the query size but is constant with respect |
4353 |
23 Aug 23 |
peter |
to the reference size. |
4344 |
17 Apr 23 |
peter |
80 |
|
4353 |
23 Aug 23 |
peter |
The computational cost scales rsize * qsize. |
4344 |
17 Apr 23 |
peter |
82 |
|
4353 |
23 Aug 23 |
peter |
\since New in yat 0.21 |
4353 |
23 Aug 23 |
peter |
84 |
*/ |
4353 |
23 Aug 23 |
peter |
85 |
template<class Derived> |
4353 |
23 Aug 23 |
peter |
86 |
class mAligner : public mAlignerBase |
4353 |
23 Aug 23 |
peter |
87 |
{ |
4353 |
23 Aug 23 |
peter |
88 |
public: |
4344 |
17 Apr 23 |
peter |
89 |
/** |
4344 |
17 Apr 23 |
peter |
Align reference range [reference_begin, reference_end) and |
4344 |
17 Apr 23 |
peter |
query [query_begin, query_end). |
4344 |
17 Apr 23 |
peter |
92 |
|
4344 |
17 Apr 23 |
peter |
KernelFunction has |
4344 |
17 Apr 23 |
peter |
operator()(RandomAccessIterator1 r, RandomAccessIterator2 q) |
4344 |
17 Apr 23 |
peter |
that describes the alignment between r and q. |
4344 |
17 Apr 23 |
peter |
96 |
|
4344 |
17 Apr 23 |
peter |
\return alignment score |
4344 |
17 Apr 23 |
peter |
98 |
*/ |
4353 |
23 Aug 23 |
peter |
99 |
template<typename Iterator1, typename Iterator2, class KernelFunction> |
4340 |
16 Apr 23 |
peter |
100 |
double operator()(Iterator1 reference_begin, |
4340 |
16 Apr 23 |
peter |
101 |
Iterator1 reference_end, |
4340 |
16 Apr 23 |
peter |
102 |
Iterator2 query_begin, |
4340 |
16 Apr 23 |
peter |
103 |
Iterator2 query_end, |
4353 |
23 Aug 23 |
peter |
104 |
const KernelFunction& kernel) |
4340 |
16 Apr 23 |
peter |
105 |
{ |
4353 |
23 Aug 23 |
peter |
// We don't allow alignment of empty ranges |
4353 |
23 Aug 23 |
peter |
107 |
YAT_ASSERT(reference_begin != reference_end); |
4353 |
23 Aug 23 |
peter |
108 |
YAT_ASSERT(query_begin != query_end); |
4340 |
16 Apr 23 |
peter |
109 |
|
4353 |
23 Aug 23 |
peter |
110 |
best_ = Alignment(); |
4353 |
23 Aug 23 |
peter |
111 |
best_.score() = -std::numeric_limits<double>::infinity(); |
4353 |
23 Aug 23 |
peter |
112 |
size_t qsize = std::distance(query_begin, query_end); |
4353 |
23 Aug 23 |
peter |
113 |
size_t rsize = std::distance(reference_begin, reference_end); |
4340 |
16 Apr 23 |
peter |
114 |
|
4353 |
23 Aug 23 |
peter |
// init first row |
4353 |
23 Aug 23 |
peter |
116 |
std::vector<Alignment> alignments(qsize+1); |
4353 |
23 Aug 23 |
peter |
117 |
for (size_t q=0; q<=qsize; ++q) { |
4353 |
23 Aug 23 |
peter |
118 |
alignments[q].score() = static_cast<Derived*>(this)->init_row(q); |
4353 |
23 Aug 23 |
peter |
119 |
alignments[q].cigar().push_back(BAM_CSOFT_CLIP, q); |
4340 |
16 Apr 23 |
peter |
120 |
} |
4353 |
23 Aug 23 |
peter |
121 |
static_cast<Derived*>(this)->update(alignments, 0, 1+rsize); |
4353 |
23 Aug 23 |
peter |
122 |
std::vector<Alignment> prev(alignments.size()); |
4340 |
16 Apr 23 |
peter |
123 |
|
4353 |
23 Aug 23 |
peter |
// loop over reference |
4353 |
23 Aug 23 |
peter |
125 |
for (size_t r=0; r<rsize; ++r) { |
4353 |
23 Aug 23 |
peter |
126 |
std::swap(alignments, prev); |
4353 |
23 Aug 23 |
peter |
// special case for 1st column |
4353 |
23 Aug 23 |
peter |
128 |
alignments[0].score() = static_cast<Derived*>(this)->init_col(1+r); |
4353 |
23 Aug 23 |
peter |
129 |
alignments[0].position() = 1+r; |
4340 |
16 Apr 23 |
peter |
130 |
|
4353 |
23 Aug 23 |
peter |
// loop over query |
4353 |
23 Aug 23 |
peter |
132 |
for (size_t q=0; q<qsize; ++q) { |
4353 |
23 Aug 23 |
peter |
133 |
uint8_t cig = 255; |
4353 |
23 Aug 23 |
peter |
134 |
double local_score = static_cast<Derived*>(this)->init(r, q); |
4340 |
16 Apr 23 |
peter |
135 |
|
4353 |
23 Aug 23 |
peter |
// match |
4353 |
23 Aug 23 |
peter |
137 |
double s = prev[q].score() + kernel(reference_begin+r, query_begin+q); |
4353 |
23 Aug 23 |
peter |
138 |
if (s > local_score) { |
4353 |
23 Aug 23 |
peter |
139 |
local_score = s; |
4353 |
23 Aug 23 |
peter |
140 |
cig = BAM_CMATCH; |
4353 |
23 Aug 23 |
peter |
141 |
} |
4340 |
16 Apr 23 |
peter |
142 |
|
4353 |
23 Aug 23 |
peter |
// insertion, corresponds to a right-step in dot matrix |
4353 |
23 Aug 23 |
peter |
144 |
if (alignments[q].cigar().size() && |
4353 |
23 Aug 23 |
peter |
145 |
alignments[q].cigar().op(alignments[q].cigar().size()-1)==BAM_CINS) |
4353 |
23 Aug 23 |
peter |
// extending insertion |
4353 |
23 Aug 23 |
peter |
147 |
s = alignments[q].score() - insertion_.penalty(); |
4353 |
23 Aug 23 |
peter |
148 |
else |
4353 |
23 Aug 23 |
peter |
149 |
s = alignments[q].score() - insertion_(1); |
4353 |
23 Aug 23 |
peter |
150 |
if (s > local_score) { |
4353 |
23 Aug 23 |
peter |
151 |
local_score = s; |
4353 |
23 Aug 23 |
peter |
152 |
cig = BAM_CINS; |
4353 |
23 Aug 23 |
peter |
153 |
} |
4340 |
16 Apr 23 |
peter |
154 |
|
4353 |
23 Aug 23 |
peter |
// deletion - vertical step in dot matrix |
4353 |
23 Aug 23 |
peter |
156 |
if (prev[q+1].cigar().size() && |
4353 |
23 Aug 23 |
peter |
157 |
prev[q+1].cigar().op(prev[q+1].cigar().size()-1) == BAM_CDEL) |
4353 |
23 Aug 23 |
peter |
158 |
s = prev[q+1].score() - deletion_.penalty(); |
4353 |
23 Aug 23 |
peter |
159 |
else |
4353 |
23 Aug 23 |
peter |
160 |
s = prev[q+1].score() - deletion_(1); |
4353 |
23 Aug 23 |
peter |
161 |
if (s > local_score) { |
4353 |
23 Aug 23 |
peter |
162 |
local_score = s; |
4353 |
23 Aug 23 |
peter |
163 |
cig = BAM_CDEL; |
4340 |
16 Apr 23 |
peter |
164 |
} |
4340 |
16 Apr 23 |
peter |
165 |
|
4353 |
23 Aug 23 |
peter |
166 |
YAT_ASSERT(q+1 < alignments.size()); |
4353 |
23 Aug 23 |
peter |
167 |
switch (cig) { |
4353 |
23 Aug 23 |
peter |
168 |
case (BAM_CMATCH): |
4353 |
23 Aug 23 |
peter |
169 |
alignments[q+1] = prev[q]; |
4353 |
23 Aug 23 |
peter |
170 |
break; |
4353 |
23 Aug 23 |
peter |
171 |
case (BAM_CINS): |
4353 |
23 Aug 23 |
peter |
172 |
alignments[q+1] = alignments[q]; |
4353 |
23 Aug 23 |
peter |
173 |
break; |
4353 |
23 Aug 23 |
peter |
174 |
case (BAM_CDEL): |
4353 |
23 Aug 23 |
peter |
175 |
alignments[q+1] = prev[q+1]; |
4353 |
23 Aug 23 |
peter |
176 |
break; |
4340 |
16 Apr 23 |
peter |
177 |
} |
4353 |
23 Aug 23 |
peter |
178 |
alignments[q+1].score() = local_score; |
4353 |
23 Aug 23 |
peter |
179 |
if (cig == 255) { |
4353 |
23 Aug 23 |
peter |
180 |
alignments[q+1].cigar().clear(); |
4353 |
23 Aug 23 |
peter |
181 |
alignments[q+1].cigar().push_back(BAM_CSOFT_CLIP, q+1); |
4353 |
23 Aug 23 |
peter |
182 |
alignments[q+1].position() = r+1; |
4353 |
23 Aug 23 |
peter |
183 |
} |
4353 |
23 Aug 23 |
peter |
184 |
else |
4353 |
23 Aug 23 |
peter |
185 |
alignments[q+1].cigar().push_back(cig); |
4340 |
16 Apr 23 |
peter |
186 |
|
4353 |
23 Aug 23 |
peter |
187 |
} // end of loop over query |
4353 |
23 Aug 23 |
peter |
188 |
static_cast<Derived*>(this)->update(alignments, 1+r, 1+rsize); |
4353 |
23 Aug 23 |
peter |
189 |
} // end of loop over reference |
4340 |
16 Apr 23 |
peter |
190 |
|
4353 |
23 Aug 23 |
peter |
191 |
cigar2_ = |
4353 |
23 Aug 23 |
peter |
192 |
utility::cigar2(reference_begin+best_.position(), reference_end, |
4353 |
23 Aug 23 |
peter |
193 |
query_begin, query_end, kernel, best_.cigar()); |
4353 |
23 Aug 23 |
peter |
194 |
return score(); |
4340 |
16 Apr 23 |
peter |
195 |
} |
4340 |
16 Apr 23 |
peter |
196 |
|
4353 |
23 Aug 23 |
peter |
197 |
protected: |
4353 |
23 Aug 23 |
peter |
198 |
/** |
4353 |
23 Aug 23 |
peter |
\param insertion penalty for an insertion |
4340 |
16 Apr 23 |
peter |
200 |
|
4353 |
23 Aug 23 |
peter |
\param deletion penalty for a deletion |
4353 |
23 Aug 23 |
peter |
202 |
*/ |
4353 |
23 Aug 23 |
peter |
203 |
mAligner(const Affine& insertion, const Affine& deletion) |
4353 |
23 Aug 23 |
peter |
204 |
: mAlignerBase(insertion, deletion) |
4353 |
23 Aug 23 |
peter |
205 |
{} |
4340 |
16 Apr 23 |
peter |
206 |
|
4353 |
23 Aug 23 |
peter |
207 |
}; // end of mAligner |
4340 |
16 Apr 23 |
peter |
208 |
|
4340 |
16 Apr 23 |
peter |
209 |
}}} |
4340 |
16 Apr 23 |
peter |
210 |
#endif |