5834 |
24 Feb 20 |
nicklas |
# $Id$ |
5834 |
24 Feb 20 |
nicklas |
# Helper functions for MIPS alignment. This script can't be used by itself. |
5834 |
24 Feb 20 |
nicklas |
# It has been designed to be included in a main script that also need to |
5834 |
24 Feb 20 |
nicklas |
# set several global variables. For details see each function definition. |
5834 |
24 Feb 20 |
nicklas |
5 |
# |
5834 |
24 Feb 20 |
nicklas |
# Copy FASTQ and RG files from/to given folders |
5834 |
24 Feb 20 |
nicklas |
# Parameters: |
5834 |
24 Feb 20 |
nicklas |
# $1: Folder with FASTQ and RG files |
5834 |
24 Feb 20 |
nicklas |
# $2: Folder to copy the files to |
5834 |
24 Feb 20 |
nicklas |
10 |
function copy_fastq_and_rg { |
5834 |
24 Feb 20 |
nicklas |
11 |
local srcFolder=$1 |
5834 |
24 Feb 20 |
nicklas |
12 |
local destFolder=$2 |
5834 |
24 Feb 20 |
nicklas |
13 |
if [ ! -d "${srcFolder}" ]; then |
5834 |
24 Feb 20 |
nicklas |
14 |
echo "Can't find FASTQ data folder ${srcFolder}" 1>&2 |
5834 |
24 Feb 20 |
nicklas |
15 |
exit 1 |
5834 |
24 Feb 20 |
nicklas |
16 |
fi |
5834 |
24 Feb 20 |
nicklas |
17 |
cp ${srcFolder}/*.fastq.gz $2 |
5834 |
24 Feb 20 |
nicklas |
18 |
cp ${srcFolder}/*.rg $2 |
5834 |
24 Feb 20 |
nicklas |
19 |
} |
5834 |
24 Feb 20 |
nicklas |
20 |
|
5834 |
24 Feb 20 |
nicklas |
# Adapter sequences |
5834 |
24 Feb 20 |
nicklas |
# Trimmomatic adapter sequences: |
5834 |
24 Feb 20 |
nicklas |
# "For 'Palindrome' clipping, a matched pair of adapter sequences must be provided. |
5834 |
24 Feb 20 |
nicklas |
# The sequence names should both start with 'Prefix', and end in '/1' for the |
5834 |
24 Feb 20 |
nicklas |
# forward adapter and '/2' for the reverse adapter. The part of the name between |
5834 |
24 Feb 20 |
nicklas |
# 'Prefix' and '/1' or '/2' must match exactly within each pair." |
5834 |
24 Feb 20 |
nicklas |
# Checking the adapter fasta files provided by Trimmomatic, it seems as the |
5834 |
24 Feb 20 |
nicklas |
# Prefix/1 adapter is the i5 adapter read from the flowcell surface (i.e. NOT |
5834 |
24 Feb 20 |
nicklas |
# as it will appear in read2 when reading through a short fragment) |
5834 |
24 Feb 20 |
nicklas |
# Prefix/2 adapter is the i7 adapter, also as read from the flowcell surface |
5834 |
24 Feb 20 |
nicklas |
# MIPS BDC i5 adapter sequence after the index for trimmomatic: CATACGAGATCCGTAATCGGGAAGCTGAAG |
5834 |
24 Feb 20 |
nicklas |
# MIPS BDC i7 adapter sequence after the index for trimmomatic: ACACGCACGATCCGACGGTAGTGT |
5834 |
24 Feb 20 |
nicklas |
33 |
adapter1=CATACGAGATCCGTAATCGGGAAGCTGAAG |
5834 |
24 Feb 20 |
nicklas |
34 |
adapter2=ACACGCACGATCCGACGGTAGTGT |
5834 |
24 Feb 20 |
nicklas |
# Picard CollectAlignmentSummaryMetrics adapter sequences: |
5834 |
24 Feb 20 |
nicklas |
# I haven't figured out in what direction it wants the adapters, |
5834 |
24 Feb 20 |
nicklas |
# for simplicity, I use both directions |
5834 |
24 Feb 20 |
nicklas |
# Reverse complement of MIPS BDC i5 adapter: CTTCAGCTTCCCGATTACGGATCTCGTATG |
5834 |
24 Feb 20 |
nicklas |
# Reverse complement of MIPS BDC i7 adapter: ACACTACCGTCGGATCGTGCGTGT |
5834 |
24 Feb 20 |
nicklas |
40 |
adapter1rc=CTTCAGCTTCCCGATTACGGATCTCGTATG |
5834 |
24 Feb 20 |
nicklas |
41 |
adapter2rc=ACACTACCGTCGGATCGTGCGTGT |
5834 |
24 Feb 20 |
nicklas |
# Novoalign adapter sequences: |
5834 |
24 Feb 20 |
nicklas |
# Adapter1 is the i7 adapter sequence as it appears in read1 when reading |
5834 |
24 Feb 20 |
nicklas |
# through a short fragment |
5834 |
24 Feb 20 |
nicklas |
# Adapter2 is the i5 adapter sequence as it appears in read2 when reading |
5834 |
24 Feb 20 |
nicklas |
# through as short fragment |
5834 |
24 Feb 20 |
nicklas |
# MIPS BDC read1 (i7) adapter (before i7 index): ACACTACCGTCGGATCGTGCGTGT |
5834 |
24 Feb 20 |
nicklas |
# MIPS BDC read2 (i5) adapter (before i5 index): CTTCAGCTTCCCGATTACGGATCTCGTATG |
5834 |
24 Feb 20 |
nicklas |
# Use 12 first bases of each for novoalign (if using -a) |
5834 |
24 Feb 20 |
nicklas |
50 |
novo_adapter1=ACACTACCGTCG |
5834 |
24 Feb 20 |
nicklas |
51 |
novo_adapter2=CTTCAGCTTCCC |
5834 |
24 Feb 20 |
nicklas |
52 |
|
5834 |
24 Feb 20 |
nicklas |
# Create an adapter file that can be used with Trimmomatic for adapter trimming |
5834 |
24 Feb 20 |
nicklas |
# Parameters: |
5834 |
24 Feb 20 |
nicklas |
# $1: Name of the file to create |
5834 |
24 Feb 20 |
nicklas |
56 |
function create_adapter_fa { |
5834 |
24 Feb 20 |
nicklas |
57 |
local faFile=$1 |
5834 |
24 Feb 20 |
nicklas |
58 |
echo -e ">PrefixPE/1\n${adapter1}\n>PrefixPE/2\n${adapter2}" > ${faFile} |
5834 |
24 Feb 20 |
nicklas |
59 |
} |
5834 |
24 Feb 20 |
nicklas |
60 |
|
5834 |
24 Feb 20 |
nicklas |
# Run two Trimmomatic steps. First trim adapters and then quality |
5834 |
24 Feb 20 |
nicklas |
# Parameters: |
5836 |
24 Feb 20 |
nicklas |
# $1: Prefix to FASTQ files "_R?.fastq.gz" is added automatically |
5836 |
24 Feb 20 |
nicklas |
# $2: Prefix for generated files (counter) |
5834 |
24 Feb 20 |
nicklas |
# Global parameters: |
5834 |
24 Feb 20 |
nicklas |
# $JAVA: Path to Java exectable |
5834 |
24 Feb 20 |
nicklas |
# $TrimmomaticJAR: Path to Trimmomatic JAR file |
5834 |
24 Feb 20 |
nicklas |
# $NumThreads: Number of threads to use |
5834 |
24 Feb 20 |
nicklas |
# $TrimOptionsAdapter: Options for adapter trimming |
5834 |
24 Feb 20 |
nicklas |
# $TrimOptionsQual: Options for quality trimming |
5834 |
24 Feb 20 |
nicklas |
# $nAfterTrim: Counter for number of FASTQ files with data remaining after trimming |
5834 |
24 Feb 20 |
nicklas |
72 |
function run_trimmomatic { |
5834 |
24 Feb 20 |
nicklas |
73 |
local prefix=$1 |
5834 |
24 Feb 20 |
nicklas |
74 |
local n=$2 |
5834 |
24 Feb 20 |
nicklas |
75 |
|
5834 |
24 Feb 20 |
nicklas |
## Adapter trimming |
5834 |
24 Feb 20 |
nicklas |
77 |
./stdwrap.sh ${JAVA} -jar ${TrimmomaticJAR} PE \ |
5834 |
24 Feb 20 |
nicklas |
78 |
-threads ${NumThreads} \ |
5834 |
24 Feb 20 |
nicklas |
79 |
-phred33 \ |
5834 |
24 Feb 20 |
nicklas |
80 |
-trimlog tmp/${n}.adaptrim.log \ |
5834 |
24 Feb 20 |
nicklas |
81 |
fastq/${prefix}_R1.fastq.gz fastq/${prefix}_R2.fastq.gz \ |
5834 |
24 Feb 20 |
nicklas |
82 |
tmp/${n}.adaptrim.1.fastq tmp/${n}.adaptrim.u.1.fastq tmp/${n}.adaptrim.2.fastq tmp/${n}.adaptrim.u.2.fastq \ |
5836 |
24 Feb 20 |
nicklas |
83 |
${TrimOptionsAdapter} \ |
5834 |
24 Feb 20 |
nicklas |
84 |
>> trimmomatic.out |
5834 |
24 Feb 20 |
nicklas |
85 |
|
5834 |
24 Feb 20 |
nicklas |
86 |
if [ ! -s "tmp/${n}.adaptrim.1.fastq" ]; then |
5834 |
24 Feb 20 |
nicklas |
87 |
rm -f tmp/${n}.adaptrim.*.fastq |
5834 |
24 Feb 20 |
nicklas |
88 |
echo "${prefix}: No paired end reads surviving adapter trimming" |
5834 |
24 Feb 20 |
nicklas |
89 |
return |
5834 |
24 Feb 20 |
nicklas |
90 |
fi |
5834 |
24 Feb 20 |
nicklas |
91 |
|
5834 |
24 Feb 20 |
nicklas |
## Quality trimming |
5834 |
24 Feb 20 |
nicklas |
93 |
./stdwrap.sh ${JAVA} -jar ${TrimmomaticJAR} PE \ |
5834 |
24 Feb 20 |
nicklas |
94 |
-threads ${NumThreads} \ |
5834 |
24 Feb 20 |
nicklas |
95 |
-phred33 \ |
5834 |
24 Feb 20 |
nicklas |
96 |
-trimlog tmp/${n}.qualtrim.log \ |
5834 |
24 Feb 20 |
nicklas |
97 |
tmp/${n}.adaptrim.1.fastq tmp/${n}.adaptrim.2.fastq \ |
5834 |
24 Feb 20 |
nicklas |
98 |
tmp/${n}.qualtrim.1.fastq tmp/${n}.qualtrim.u.1.fastq tmp/${n}.qualtrim.2.fastq tmp/${n}.qualtrim.u.2.fastq \ |
5836 |
24 Feb 20 |
nicklas |
99 |
${TrimOptionsQual} \ |
5834 |
24 Feb 20 |
nicklas |
100 |
>> trimmomatic.out |
5834 |
24 Feb 20 |
nicklas |
101 |
|
5834 |
24 Feb 20 |
nicklas |
102 |
if [ ! -s "tmp/${n}.qualtrim.1.fastq" ]; then |
5834 |
24 Feb 20 |
nicklas |
103 |
rm -f tmp/${n}.qualtrim.*.fastq |
5834 |
24 Feb 20 |
nicklas |
104 |
echo "${prefix}: No paired end reads surviving quality trimming" |
5834 |
24 Feb 20 |
nicklas |
105 |
return |
5834 |
24 Feb 20 |
nicklas |
106 |
fi |
5834 |
24 Feb 20 |
nicklas |
107 |
|
5834 |
24 Feb 20 |
nicklas |
108 |
nAfterTrim=$((nAfterTrim + 1)) |
5834 |
24 Feb 20 |
nicklas |
109 |
} |
5834 |
24 Feb 20 |
nicklas |
110 |
|
5834 |
24 Feb 20 |
nicklas |
# Generate statistics from Trimmomatic log files |
5834 |
24 Feb 20 |
nicklas |
# Parameters: |
5834 |
24 Feb 20 |
nicklas |
# $1: Input file name pattern (*.log files from Trimmomatic) |
5834 |
24 Feb 20 |
nicklas |
# $2: Output file name |
5834 |
24 Feb 20 |
nicklas |
115 |
function trimlog_stats { |
5834 |
24 Feb 20 |
nicklas |
116 |
local input=$1 |
5834 |
24 Feb 20 |
nicklas |
117 |
local output=$2 |
5834 |
24 Feb 20 |
nicklas |
118 |
|
5834 |
24 Feb 20 |
nicklas |
119 |
echo -e "Trim\tCount" > ${output} |
5834 |
24 Feb 20 |
nicklas |
120 |
if ls $input > /dev/null 2>&1; then |
5834 |
24 Feb 20 |
nicklas |
121 |
awk '{print $ (NF)}' ${input} | sort -n | uniq -c | awk -v OFS="\t" '{print $2,$1}' >> ${output} |
5834 |
24 Feb 20 |
nicklas |
122 |
fi |
5834 |
24 Feb 20 |
nicklas |
123 |
} |
5834 |
24 Feb 20 |
nicklas |
124 |
|
5836 |
24 Feb 20 |
nicklas |
# Read RG file and convert to Picard2 syntax. |
5836 |
24 Feb 20 |
nicklas |
# Parameters: |
5836 |
24 Feb 20 |
nicklas |
# $1: Path to RG file |
5836 |
24 Feb 20 |
nicklas |
# Returns: |
5836 |
24 Feb 20 |
nicklas |
# Converted RG value |
5836 |
24 Feb 20 |
nicklas |
130 |
function read_and_fix_rg { |
5836 |
24 Feb 20 |
nicklas |
131 |
local rg=$1 |
5836 |
24 Feb 20 |
nicklas |
132 |
|
5836 |
24 Feb 20 |
nicklas |
133 |
read -r DATA < ${rg} |
5836 |
24 Feb 20 |
nicklas |
134 |
|
5836 |
24 Feb 20 |
nicklas |
135 |
DATA=${DATA/RG=/-READ_GROUP_NAME } |
5836 |
24 Feb 20 |
nicklas |
136 |
DATA=${DATA/SM=/-SAMPLE_NAME } |
5836 |
24 Feb 20 |
nicklas |
137 |
DATA=${DATA/LB=/-LIBRARY_NAME } |
5836 |
24 Feb 20 |
nicklas |
138 |
DATA=${DATA/PU=/-PLATFORM_UNIT } |
5836 |
24 Feb 20 |
nicklas |
139 |
DATA=${DATA/DT=/-RUN_DATE } |
5836 |
24 Feb 20 |
nicklas |
140 |
DATA=${DATA/PL=/-PLATFORM } |
5836 |
24 Feb 20 |
nicklas |
141 |
DATA=${DATA/CN=/-SEQUENCING_CENTER } |
5836 |
24 Feb 20 |
nicklas |
142 |
|
5836 |
24 Feb 20 |
nicklas |
143 |
echo ${DATA} |
5836 |
24 Feb 20 |
nicklas |
144 |
} |
5834 |
24 Feb 20 |
nicklas |
145 |
|
5836 |
24 Feb 20 |
nicklas |
# Run Picard to convert FASTQ files to BAM file |
5836 |
24 Feb 20 |
nicklas |
# Parameters: |
5836 |
24 Feb 20 |
nicklas |
# $1: Prefix to RG file ".rg" is added automatically |
5836 |
24 Feb 20 |
nicklas |
# $2: Prefix for generated files (counter) |
5836 |
24 Feb 20 |
nicklas |
150 |
function fastq_to_bam { |
5836 |
24 Feb 20 |
nicklas |
151 |
local prefix=$1 |
5836 |
24 Feb 20 |
nicklas |
152 |
local n=$2 |
5836 |
24 Feb 20 |
nicklas |
153 |
|
5836 |
24 Feb 20 |
nicklas |
154 |
if [ ! -f tmp/${n}.qualtrim.1.fastq ]; then |
5836 |
24 Feb 20 |
nicklas |
155 |
return |
5836 |
24 Feb 20 |
nicklas |
156 |
fi |
5836 |
24 Feb 20 |
nicklas |
157 |
|
5836 |
24 Feb 20 |
nicklas |
158 |
local rg=$(read_and_fix_rg "fastq/${prefix}.rg") |
5836 |
24 Feb 20 |
nicklas |
159 |
|
5836 |
24 Feb 20 |
nicklas |
160 |
./stdwrap.sh ./picard2 FastqToSam \ |
5836 |
24 Feb 20 |
nicklas |
161 |
-SORT_ORDER queryname \ |
5836 |
24 Feb 20 |
nicklas |
162 |
-QUALITY_FORMAT Standard \ |
5836 |
24 Feb 20 |
nicklas |
163 |
${rg} \ |
5836 |
24 Feb 20 |
nicklas |
164 |
-FASTQ tmp/${n}.qualtrim.1.fastq \ |
5836 |
24 Feb 20 |
nicklas |
165 |
-FASTQ2 tmp/${n}.qualtrim.2.fastq \ |
5836 |
24 Feb 20 |
nicklas |
166 |
-OUTPUT tmp/${n}.u.bam \ |
5836 |
24 Feb 20 |
nicklas |
167 |
>> fastqtosam.out |
5836 |
24 Feb 20 |
nicklas |
168 |
} |
5836 |
24 Feb 20 |
nicklas |
169 |
|
5836 |
24 Feb 20 |
nicklas |
# Run FgBio to annotate the BAM file with UMI information |
5836 |
24 Feb 20 |
nicklas |
# Parameters: |
5836 |
24 Feb 20 |
nicklas |
# $1: Prefix to UMI file "_UMI.fastq.gz" is added automatically |
5836 |
24 Feb 20 |
nicklas |
# $2: Prefix for generated files (counter) |
5836 |
24 Feb 20 |
nicklas |
# Global parameters: |
5836 |
24 Feb 20 |
nicklas |
# $JAVA: Path to Java exectable |
5836 |
24 Feb 20 |
nicklas |
# $FgBioJAR: Path to FgBio JAR file |
5836 |
24 Feb 20 |
nicklas |
177 |
function annotate_umi { |
5836 |
24 Feb 20 |
nicklas |
178 |
local prefix=$1 |
5836 |
24 Feb 20 |
nicklas |
179 |
local n=$2 |
5836 |
24 Feb 20 |
nicklas |
180 |
|
5836 |
24 Feb 20 |
nicklas |
181 |
if [ ! -f tmp/${n}.u.bam ]; then |
5836 |
24 Feb 20 |
nicklas |
182 |
return |
5836 |
24 Feb 20 |
nicklas |
183 |
fi |
5836 |
24 Feb 20 |
nicklas |
184 |
|
5836 |
24 Feb 20 |
nicklas |
185 |
./stdwrap.sh ${JAVA} -jar ${FgBioJAR} AnnotateBamWithUmis \ |
5836 |
24 Feb 20 |
nicklas |
186 |
--fail-fast true -t RX \ |
5836 |
24 Feb 20 |
nicklas |
187 |
-i tmp/${n}.u.bam \ |
5836 |
24 Feb 20 |
nicklas |
188 |
-f fastq/${prefix}_UMI.fastq.gz \ |
5836 |
24 Feb 20 |
nicklas |
189 |
-o tmp/${n}.umi.bam \ |
5836 |
24 Feb 20 |
nicklas |
190 |
>> annotatebam.out |
5836 |
24 Feb 20 |
nicklas |
191 |
|
5836 |
24 Feb 20 |
nicklas |
192 |
} |
5836 |
24 Feb 20 |
nicklas |
193 |
|
5842 |
25 Feb 20 |
nicklas |
# Get min_insert and max_insert from the Amplicons BED file |
5842 |
25 Feb 20 |
nicklas |
# The values are needed by novoalign and collecting alignment metrics |
5842 |
25 Feb 20 |
nicklas |
# Global parameters: |
5842 |
25 Feb 20 |
nicklas |
# $AmpliconsBed: Path to amplicons BED file |
5842 |
25 Feb 20 |
nicklas |
# $min_insert: Updated with value from the BED file |
5842 |
25 Feb 20 |
nicklas |
# $max_insert: Updated with value from the BED file |
5842 |
25 Feb 20 |
nicklas |
200 |
function get_min_max_insert { |
5842 |
25 Feb 20 |
nicklas |
201 |
min_insert=$(awk 'NR == 1 || $3 - $2 < min {min = $3 - $2}END{print min - 1}' "${AmpliconsBed}") |
5842 |
25 Feb 20 |
nicklas |
202 |
max_insert=$(awk 'NR == 1 || $3 - $2 > max {max = $3 - $2}END{print max + 1}' "${AmpliconsBed}") |
5842 |
25 Feb 20 |
nicklas |
203 |
} |
5842 |
25 Feb 20 |
nicklas |
204 |
|
5842 |
25 Feb 20 |
nicklas |
205 |
|
5838 |
24 Feb 20 |
nicklas |
# Align with novoalign |
5838 |
24 Feb 20 |
nicklas |
# Parameters: |
5838 |
24 Feb 20 |
nicklas |
# $1: Prefix of sample (used in output files) |
5838 |
24 Feb 20 |
nicklas |
# $2: Prefix for generated files (counter) |
5838 |
24 Feb 20 |
nicklas |
# Global parameters: |
5838 |
24 Feb 20 |
nicklas |
# $NovoAlign: Path to novoalign exectable |
5838 |
24 Feb 20 |
nicklas |
# $NovoAlignOptions: Options for novoalign |
5838 |
24 Feb 20 |
nicklas |
# $NovoIndex: Path to index file |
5838 |
24 Feb 20 |
nicklas |
# $AmpliconsBed: Path to amplicons BED file |
5838 |
24 Feb 20 |
nicklas |
# $NovoSort: Path to novosort executable |
5838 |
24 Feb 20 |
nicklas |
# $NumThreads: Number of threads to use |
5842 |
25 Feb 20 |
nicklas |
# $min_insert: Calculated by get_min_max_insert |
5842 |
25 Feb 20 |
nicklas |
# $max_insert: Calculated by get_min_max_insert |
5838 |
24 Feb 20 |
nicklas |
# $AlignedBams: For returning paths to aligned BAM files (used for input in the merge step) |
5838 |
24 Feb 20 |
nicklas |
220 |
function novoalign { |
5838 |
24 Feb 20 |
nicklas |
221 |
local prefix=$1 |
5838 |
24 Feb 20 |
nicklas |
222 |
local n=$2 |
5836 |
24 Feb 20 |
nicklas |
223 |
|
5838 |
24 Feb 20 |
nicklas |
224 |
if [ ! -f tmp/${n}.umi.bam ]; then |
5838 |
24 Feb 20 |
nicklas |
225 |
return |
5838 |
24 Feb 20 |
nicklas |
226 |
fi |
5838 |
24 Feb 20 |
nicklas |
227 |
|
5838 |
24 Feb 20 |
nicklas |
228 |
./stderrwrap.sh ${NovoAlign} \ |
5838 |
24 Feb 20 |
nicklas |
229 |
-c ${NumThreads} \ |
5838 |
24 Feb 20 |
nicklas |
230 |
${NovoAlignOptions} \ |
5838 |
24 Feb 20 |
nicklas |
231 |
-d ${NovoIndex} \ |
5838 |
24 Feb 20 |
nicklas |
232 |
--tags MC,ZP \ |
5838 |
24 Feb 20 |
nicklas |
233 |
-i PE ${min_insert}-${max_insert} \ |
5838 |
24 Feb 20 |
nicklas |
234 |
--amplicons ${AmpliconsBed} \ |
5842 |
25 Feb 20 |
nicklas |
235 |
1 out/${n}.novo_coverage.bed \ |
5838 |
24 Feb 20 |
nicklas |
236 |
-f tmp/${n}.umi.bam \ |
5838 |
24 Feb 20 |
nicklas |
237 |
> tmp/${n}.nosort.bam \ |
5842 |
25 Feb 20 |
nicklas |
238 |
3> out/${n}.novo.log |
5838 |
24 Feb 20 |
nicklas |
239 |
|
5838 |
24 Feb 20 |
nicklas |
240 |
./stderrwrap.sh ${NovoSort} \ |
5838 |
24 Feb 20 |
nicklas |
241 |
-c ${NumThreads} -t . -i \ |
5838 |
24 Feb 20 |
nicklas |
242 |
-o tmp/${n}.novo.bam \ |
5838 |
24 Feb 20 |
nicklas |
243 |
tmp/${n}.nosort.bam \ |
5838 |
24 Feb 20 |
nicklas |
244 |
3> tmp/${n}.sort.log |
5838 |
24 Feb 20 |
nicklas |
245 |
|
5838 |
24 Feb 20 |
nicklas |
246 |
AlignedBams="${AlignedBams} -INPUT tmp/${n}.novo.bam" |
5838 |
24 Feb 20 |
nicklas |
247 |
} |
5838 |
24 Feb 20 |
nicklas |
248 |
|
5838 |
24 Feb 20 |
nicklas |
# Merge BAM files |
5838 |
24 Feb 20 |
nicklas |
# Global parameters: |
5838 |
24 Feb 20 |
nicklas |
# $AlignedBams: BAM files to merge (created by 'novoalign') |
5838 |
24 Feb 20 |
nicklas |
252 |
function merge_bam { |
5838 |
24 Feb 20 |
nicklas |
253 |
|
5838 |
24 Feb 20 |
nicklas |
254 |
./stdwrap.sh ./picard2 MergeSamFiles \ |
5838 |
24 Feb 20 |
nicklas |
255 |
-SORT_ORDER coordinate -ASSUME_SORTED true \ |
5838 |
24 Feb 20 |
nicklas |
256 |
${AlignedBams} \ |
5838 |
24 Feb 20 |
nicklas |
257 |
-OUTPUT tmp/novo.bam \ |
5838 |
24 Feb 20 |
nicklas |
258 |
> mergebam.out |
5838 |
24 Feb 20 |
nicklas |
259 |
} |
5838 |
24 Feb 20 |
nicklas |
260 |
|
5840 |
25 Feb 20 |
nicklas |
# Re-header the BAM file |
5840 |
25 Feb 20 |
nicklas |
# Replace SQ entries in the novo.bam header with reference genome dict |
5840 |
25 Feb 20 |
nicklas |
# - The novoalign genome index was built from a genome with ambiguity codes |
5840 |
25 Feb 20 |
nicklas |
# and therefore the md5 checksums don't match the standard reference |
5840 |
25 Feb 20 |
nicklas |
# genome. |
5840 |
25 Feb 20 |
nicklas |
# - Mismatching md5 values in the SQ entries will cause both picard and |
5840 |
25 Feb 20 |
nicklas |
# GATK tools to crash downstream |
5840 |
25 Feb 20 |
nicklas |
# Global parameters: |
5840 |
25 Feb 20 |
nicklas |
# $samtools: Path to samtools executable |
5840 |
25 Feb 20 |
nicklas |
# $GenomeDict: Path to genome .dict file |
5840 |
25 Feb 20 |
nicklas |
271 |
function reheader_bam { |
5840 |
25 Feb 20 |
nicklas |
272 |
${samtools} view -H tmp/novo.bam | grep -v "^@SQ" > tmp/header.sam |
5840 |
25 Feb 20 |
nicklas |
273 |
grep "^@SQ" ${GenomeDict} >> tmp/header.sam |
5840 |
25 Feb 20 |
nicklas |
274 |
${samtools} reheader -P tmp/header.sam tmp/novo.bam > tmp/novo_reheaded.bam |
5840 |
25 Feb 20 |
nicklas |
275 |
} |
5840 |
25 Feb 20 |
nicklas |
276 |
|
5840 |
25 Feb 20 |
nicklas |
# Split novo_reheader.bam into seperate files for concordant, discordant and unmapped read pairs |
5840 |
25 Feb 20 |
nicklas |
# Global parameters: |
5840 |
25 Feb 20 |
nicklas |
# $samtools: Path to samtools executable |
5840 |
25 Feb 20 |
nicklas |
280 |
function split_bam { |
5840 |
25 Feb 20 |
nicklas |
281 |
${samtools} view -b -h -@ 2 -f 3 tmp/novo_reheaded.bam > tmp/concordant.bam |
5840 |
25 Feb 20 |
nicklas |
282 |
${samtools} view -b -h -@ 2 -G 12 -F 2 tmp/novo_reheaded.bam > tmp/discordant.bam |
5840 |
25 Feb 20 |
nicklas |
283 |
${samtools} view -b -h -@ 2 -f 12 tmp/novo_reheaded.bam > out/unmapped.bam |
5840 |
25 Feb 20 |
nicklas |
284 |
} |
5840 |
25 Feb 20 |
nicklas |
285 |
|
5840 |
25 Feb 20 |
nicklas |
# Mark duplicates with Picard that is aware of UMIs |
5840 |
25 Feb 20 |
nicklas |
# Parameters: |
5840 |
25 Feb 20 |
nicklas |
# $1: Prefix to .bam file to process |
5840 |
25 Feb 20 |
nicklas |
# Global parameters: |
5840 |
25 Feb 20 |
nicklas |
# $MarkDuplicatesOptions: Options to Picard |
5840 |
25 Feb 20 |
nicklas |
291 |
function mark_duplicates { |
5840 |
25 Feb 20 |
nicklas |
292 |
local prefix=$1 |
5840 |
25 Feb 20 |
nicklas |
293 |
|
5840 |
25 Feb 20 |
nicklas |
294 |
./stdwrap.sh ./picard2 UmiAwareMarkDuplicatesWithMateCigar \ |
5840 |
25 Feb 20 |
nicklas |
295 |
${MarkDuplicatesOptions} \ |
5840 |
25 Feb 20 |
nicklas |
296 |
-UMI_METRICS out/${prefix}.umi_metrics.txt \ |
5840 |
25 Feb 20 |
nicklas |
297 |
-METRICS_FILE out/${prefix}.dedup_metrics.txt \ |
5840 |
25 Feb 20 |
nicklas |
298 |
-INPUT tmp/${prefix}.bam \ |
5840 |
25 Feb 20 |
nicklas |
299 |
-OUTPUT out/${prefix}.bam \ |
5840 |
25 Feb 20 |
nicklas |
300 |
>> markduplicates.out |
5840 |
25 Feb 20 |
nicklas |
301 |
} |
5840 |
25 Feb 20 |
nicklas |
302 |
|
5842 |
25 Feb 20 |
nicklas |
# Get PCR metrics for the concordant.bam file |
5842 |
25 Feb 20 |
nicklas |
# Global parameters: |
5842 |
25 Feb 20 |
nicklas |
# $AmpliconsBed: Path to amplicons BED file |
5842 |
25 Feb 20 |
nicklas |
# $GenomeDict: Path to genome .dict file |
5842 |
25 Feb 20 |
nicklas |
# $GenomeFasta: Path to the genome fasta file |
5842 |
25 Feb 20 |
nicklas |
# $PcrMetricsOptions: Options for Picard |
5842 |
25 Feb 20 |
nicklas |
309 |
function collect_pcr_metrics { |
5842 |
25 Feb 20 |
nicklas |
310 |
|
5842 |
25 Feb 20 |
nicklas |
311 |
local amplicon_set_name=$(basename ${AmpliconsBed} ".bed") |
5842 |
25 Feb 20 |
nicklas |
312 |
awk -F "\t" -v OFS="\t" '{print $1,$7,$8,$4,$5,$6}' ${AmpliconsBed} > amplicon_inserts.bed |
5842 |
25 Feb 20 |
nicklas |
313 |
|
5842 |
25 Feb 20 |
nicklas |
314 |
./stdwrap.sh ./picard2 BedToIntervalList \ |
5842 |
25 Feb 20 |
nicklas |
315 |
-SEQUENCE_DICTIONARY ${GenomeDict} \ |
5842 |
25 Feb 20 |
nicklas |
316 |
-INPUT amplicon_inserts.bed \ |
5842 |
25 Feb 20 |
nicklas |
317 |
-OUTPUT amplicon_inserts.intervals \ |
5842 |
25 Feb 20 |
nicklas |
318 |
>> pcr_metrics.out |
5842 |
25 Feb 20 |
nicklas |
319 |
|
5842 |
25 Feb 20 |
nicklas |
320 |
./stdwrap.sh ./picard2 CollectTargetedPcrMetrics \ |
5842 |
25 Feb 20 |
nicklas |
321 |
-INPUT out/concordant.bam \ |
5842 |
25 Feb 20 |
nicklas |
322 |
-CUSTOM_AMPLICON_SET_NAME ${amplicon_set_name} \ |
5842 |
25 Feb 20 |
nicklas |
323 |
-AMPLICON_INTERVALS amplicon_inserts.intervals \ |
5842 |
25 Feb 20 |
nicklas |
324 |
-TARGET_INTERVALS amplicon_inserts.intervals \ |
5842 |
25 Feb 20 |
nicklas |
325 |
-R ${GenomeFasta} \ |
5842 |
25 Feb 20 |
nicklas |
326 |
${PcrMetricsOptions} \ |
5842 |
25 Feb 20 |
nicklas |
327 |
-PER_TARGET_COVERAGE out/concordant.per_amplicon_cov.txt \ |
5842 |
25 Feb 20 |
nicklas |
328 |
-PER_BASE_COVERAGE out/concordant.per_base_cov.txt.gz \ |
5842 |
25 Feb 20 |
nicklas |
329 |
-OUTPUT out/concordant.pcr_metrics.txt \ |
5842 |
25 Feb 20 |
nicklas |
330 |
>> pcr_metrics.out |
5842 |
25 Feb 20 |
nicklas |
331 |
} |
5842 |
25 Feb 20 |
nicklas |
332 |
|
5842 |
25 Feb 20 |
nicklas |
# Get alignment metrics for the concordant.bam file |
5842 |
25 Feb 20 |
nicklas |
# Global parameters: |
5842 |
25 Feb 20 |
nicklas |
# $GenomeFasta: Path to the genome fasta file |
5842 |
25 Feb 20 |
nicklas |
# $max_insert: Calculated by get_min_max_insert |
5842 |
25 Feb 20 |
nicklas |
# $adapterN: Multiple adapter sequences defined earlier in this file |
5842 |
25 Feb 20 |
nicklas |
338 |
function collect_alignment_metrics { |
5842 |
25 Feb 20 |
nicklas |
339 |
|
5842 |
25 Feb 20 |
nicklas |
340 |
./stdwrap.sh ./picard2 CollectAlignmentSummaryMetrics \ |
5842 |
25 Feb 20 |
nicklas |
341 |
-INPUT out/concordant.bam \ |
5842 |
25 Feb 20 |
nicklas |
342 |
-R ${GenomeFasta} \ |
5842 |
25 Feb 20 |
nicklas |
343 |
-METRIC_ACCUMULATION_LEVEL null -METRIC_ACCUMULATION_LEVEL LIBRARY -METRIC_ACCUMULATION_LEVEL READ_GROUP \ |
5842 |
25 Feb 20 |
nicklas |
344 |
-MAX_INSERT_SIZE ${max_insert} \ |
5842 |
25 Feb 20 |
nicklas |
345 |
-ADAPTER_SEQUENCE null -ADAPTER_SEQUENCE ${adapter1} -ADAPTER_SEQUENCE ${adapter2} \ |
5842 |
25 Feb 20 |
nicklas |
346 |
-ADAPTER_SEQUENCE ${adapter1rc} -ADAPTER_SEQUENCE ${adapter2rc} \ |
5842 |
25 Feb 20 |
nicklas |
347 |
-OUTPUT out/concordant.alignment_summary_metrics.txt \ |
5842 |
25 Feb 20 |
nicklas |
348 |
>> alignment_metrics.out |
5842 |
25 Feb 20 |
nicklas |
349 |
|
5842 |
25 Feb 20 |
nicklas |
350 |
} |