6657 |
30 Mar 22 |
nicklas |
#!/bin/bash |
6657 |
30 Mar 22 |
nicklas |
2 |
## |
6657 |
30 Mar 22 |
nicklas |
## Pipeline script for the Targeted Genotyping analysis. |
6657 |
30 Mar 22 |
nicklas |
## Environment variables that should be defined (in job.sh) before calling this script |
6657 |
30 Mar 22 |
nicklas |
## -BamFile: Path to alignment BAM file |
6657 |
30 Mar 22 |
nicklas |
## -BaseDir: Path to directory with variant call databases |
6657 |
30 Mar 22 |
nicklas |
## -JAVA: Path to java |
6657 |
30 Mar 22 |
nicklas |
## -GATK: Path to the GATK JAR file |
6657 |
30 Mar 22 |
nicklas |
## -VCFANNO: Path to the vcfanno program |
6657 |
30 Mar 22 |
nicklas |
## -SNPEFF: Path to the SnpEff JAR file |
6657 |
30 Mar 22 |
nicklas |
## -Genome: Path to the reference genome |
6657 |
30 Mar 22 |
nicklas |
## -HaplotypeCallerOptions: Options to HaplotypeCaller |
6657 |
30 Mar 22 |
nicklas |
## -VcfannoOptions: Options to the vcfanno program |
6657 |
30 Mar 22 |
nicklas |
## -SnpEffOptions: Options to the SnpSift program |
6657 |
30 Mar 22 |
nicklas |
## -VcfFolder: Path to folder where result VCF files should be saved |
6657 |
30 Mar 22 |
nicklas |
## -TMPDIR: Path to a local temporary working directory |
6657 |
30 Mar 22 |
nicklas |
17 |
## |
6657 |
30 Mar 22 |
nicklas |
18 |
|
6657 |
30 Mar 22 |
nicklas |
19 |
set -eo pipefail |
6657 |
30 Mar 22 |
nicklas |
20 |
|
6657 |
30 Mar 22 |
nicklas |
## Import utility functions |
6657 |
30 Mar 22 |
nicklas |
22 |
source ./reggie-utils.sh |
6657 |
30 Mar 22 |
nicklas |
23 |
source ./variantcall-utils.sh |
6657 |
30 Mar 22 |
nicklas |
24 |
|
6657 |
30 Mar 22 |
nicklas |
## Verify settings in options |
6657 |
30 Mar 22 |
nicklas |
26 |
rg_var_isdir "TMPDIR" "WD" "VcfFolder" "BaseDir" |
6657 |
30 Mar 22 |
nicklas |
27 |
rg_var_isfile "JAVA" "GATK" "VCFANNO" "SNPEFF" "Genome" "BamFile" |
6657 |
30 Mar 22 |
nicklas |
28 |
rg_var_isset "HaplotypeCallerOptions" "VcfannoOptions" "SnpEffOptions" |
6657 |
30 Mar 22 |
nicklas |
29 |
rg_file_exists "${WD}/target_info.txt" |
6657 |
30 Mar 22 |
nicklas |
30 |
|
6657 |
30 Mar 22 |
nicklas |
## Move to the temporary working directory |
6657 |
30 Mar 22 |
nicklas |
32 |
cd ${TMPDIR} |
6669 |
06 Apr 22 |
nicklas |
33 |
mkdir -p tmp |
6669 |
06 Apr 22 |
nicklas |
34 |
mkdir -p gatk_tmp |
6669 |
06 Apr 22 |
nicklas |
35 |
mkdir -p vcf |
6669 |
06 Apr 22 |
nicklas |
36 |
mkdir -p done |
6657 |
30 Mar 22 |
nicklas |
37 |
|
6657 |
30 Mar 22 |
nicklas |
38 |
NUM_LINES=`wc -l < ${WD}/target_info.txt` |
6657 |
30 Mar 22 |
nicklas |
39 |
CURRENT_LINE=0 |
6657 |
30 Mar 22 |
nicklas |
40 |
while IFS=$',' read TARGET_NAME TARGET_VCF; do |
6657 |
30 Mar 22 |
nicklas |
41 |
CURRENT_LINE=$(( ${CURRENT_LINE}+1 )) |
6657 |
30 Mar 22 |
nicklas |
42 |
PROGRESS=$(( 10+${CURRENT_LINE}*80/${NUM_LINES} )) |
6657 |
30 Mar 22 |
nicklas |
43 |
rg_progress ${PROGRESS} "Genotyping target: ${TARGET_NAME}" |
6657 |
30 Mar 22 |
nicklas |
44 |
TargetVcfPath="${BaseDir}/targeted/${TARGET_VCF}" |
6657 |
30 Mar 22 |
nicklas |
45 |
rg_file_exists "${TargetVcfPath}" |
6657 |
30 Mar 22 |
nicklas |
46 |
|
6657 |
30 Mar 22 |
nicklas |
# Run HaplotypeCaller to genotype the targeted locations |
6669 |
06 Apr 22 |
nicklas |
48 |
if [ ! -f "done/${TARGET_NAME}.haplotypecaller.done" ]; then |
6669 |
06 Apr 22 |
nicklas |
49 |
${WD}/stdwrap.sh ${JAVA} ${JavaOptions} \ |
6669 |
06 Apr 22 |
nicklas |
50 |
-jar ${GATK} HaplotypeCaller \ |
6669 |
06 Apr 22 |
nicklas |
51 |
-R ${Genome} \ |
6669 |
06 Apr 22 |
nicklas |
52 |
-L ${TargetVcfPath} \ |
6669 |
06 Apr 22 |
nicklas |
53 |
--alleles ${TargetVcfPath} \ |
6669 |
06 Apr 22 |
nicklas |
54 |
--tmp-dir ${TMPDIR}/gatk_tmp \ |
6669 |
06 Apr 22 |
nicklas |
55 |
${HaplotypeCallerOptions} \ |
6669 |
06 Apr 22 |
nicklas |
56 |
-I ${BamFile} \ |
6669 |
06 Apr 22 |
nicklas |
57 |
-O tmp/${TARGET_VCF}.1.hc \ |
6669 |
06 Apr 22 |
nicklas |
58 |
>> haplotypecaller.out |
6669 |
06 Apr 22 |
nicklas |
59 |
touch "done/${TARGET_NAME}.haplotypecaller.done" |
6669 |
06 Apr 22 |
nicklas |
60 |
fi |
6657 |
30 Mar 22 |
nicklas |
61 |
|
6657 |
30 Mar 22 |
nicklas |
# Use 'LeftAlignAndTrim' to split variants on the same genomic location |
6657 |
30 Mar 22 |
nicklas |
# otherwise we can't annotate properly since different alternate alleles |
6657 |
30 Mar 22 |
nicklas |
# result in different amino acid changes |
6669 |
06 Apr 22 |
nicklas |
65 |
if [ ! -f "done/${TARGET_NAME}.split.done" ]; then |
6669 |
06 Apr 22 |
nicklas |
66 |
${WD}/stdwrap.sh ${JAVA} ${JavaOptions} \ |
6669 |
06 Apr 22 |
nicklas |
67 |
-jar ${GATK} LeftAlignAndTrimVariants \ |
6669 |
06 Apr 22 |
nicklas |
68 |
-R ${Genome} \ |
6669 |
06 Apr 22 |
nicklas |
69 |
--split-multi-allelics true \ |
6669 |
06 Apr 22 |
nicklas |
70 |
--add-output-vcf-command-line false \ |
6669 |
06 Apr 22 |
nicklas |
71 |
-V tmp/${TARGET_VCF}.1.hc \ |
6669 |
06 Apr 22 |
nicklas |
72 |
-O tmp/${TARGET_VCF}.2.split \ |
6669 |
06 Apr 22 |
nicklas |
73 |
>> split.out |
6669 |
06 Apr 22 |
nicklas |
74 |
touch "done/${TARGET_NAME}.split.done" |
6669 |
06 Apr 22 |
nicklas |
75 |
fi |
6657 |
30 Mar 22 |
nicklas |
76 |
|
6657 |
30 Mar 22 |
nicklas |
# Copy TYPE annotation from that target VCF file |
6657 |
30 Mar 22 |
nicklas |
# We need a compressed and indexed version of the target VCF |
6657 |
30 Mar 22 |
nicklas |
# and a .toml file that define the fields to copy |
6669 |
06 Apr 22 |
nicklas |
80 |
if [ ! -f "done/${TARGET_NAME}.type.done" ]; then |
6669 |
06 Apr 22 |
nicklas |
81 |
bgzip -c ${TargetVcfPath} > tmp/${TARGET_VCF}.gz |
6669 |
06 Apr 22 |
nicklas |
82 |
tabix tmp/${TARGET_VCF}.gz |
6669 |
06 Apr 22 |
nicklas |
83 |
echo '[[annotation]]' >> tmp/${TARGET_VCF}.toml |
6669 |
06 Apr 22 |
nicklas |
84 |
echo "file=\"tmp/${TARGET_VCF}.gz\"" >> tmp/${TARGET_VCF}.toml |
6669 |
06 Apr 22 |
nicklas |
85 |
echo 'fields=["TYPE"]' >> tmp/${TARGET_VCF}.toml |
6669 |
06 Apr 22 |
nicklas |
86 |
echo 'names=["TYPE"]' >> tmp/${TARGET_VCF}.toml |
6669 |
06 Apr 22 |
nicklas |
87 |
echo 'ops=["self"]' >> tmp/${TARGET_VCF}.toml |
6669 |
06 Apr 22 |
nicklas |
88 |
|
6669 |
06 Apr 22 |
nicklas |
89 |
vcf_annotate tmp/${TARGET_VCF}.2.split tmp/${TARGET_VCF}.toml tmp/${TARGET_VCF}.3.type |
6669 |
06 Apr 22 |
nicklas |
90 |
|
6669 |
06 Apr 22 |
nicklas |
# Remove lines without a TYPE annotation |
6669 |
06 Apr 22 |
nicklas |
# They are extra results that HaplotypeCaller creates |
6669 |
06 Apr 22 |
nicklas |
# by combining nearby SNP variants. |
6669 |
06 Apr 22 |
nicklas |
94 |
grep "\(^#\|TYPE\=\)" tmp/${TARGET_VCF}.3.type > tmp/${TARGET_VCF}.4.grep |
6669 |
06 Apr 22 |
nicklas |
95 |
|
6669 |
06 Apr 22 |
nicklas |
96 |
touch "done/${TARGET_NAME}.type.done" |
6669 |
06 Apr 22 |
nicklas |
97 |
fi |
6657 |
30 Mar 22 |
nicklas |
98 |
|
6657 |
30 Mar 22 |
nicklas |
# Annotate with vcfanno |
6669 |
06 Apr 22 |
nicklas |
100 |
if [ ! -f "done/${TARGET_NAME}.vcfanno.done" ]; then |
6669 |
06 Apr 22 |
nicklas |
101 |
vcf_annotate tmp/${TARGET_VCF}.4.grep "${VcfannoOptions}" tmp/${TARGET_VCF}.5.ann |
6669 |
06 Apr 22 |
nicklas |
102 |
touch "done/${TARGET_NAME}.vcfanno.done" |
6669 |
06 Apr 22 |
nicklas |
103 |
fi |
6657 |
30 Mar 22 |
nicklas |
104 |
|
6657 |
30 Mar 22 |
nicklas |
# Annotate with snpEff |
6669 |
06 Apr 22 |
nicklas |
106 |
if [ ! -f "done/${TARGET_NAME}.snpeff.done" ]; then |
6669 |
06 Apr 22 |
nicklas |
107 |
snpeff_annotate tmp/${TARGET_VCF}.5.ann "${SnpEffOptions}" tmp/${TARGET_VCF}.6.eff |
6669 |
06 Apr 22 |
nicklas |
108 |
touch "done/${TARGET_NAME}.snpeff.done" |
6669 |
06 Apr 22 |
nicklas |
109 |
fi |
6657 |
30 Mar 22 |
nicklas |
110 |
|
6657 |
30 Mar 22 |
nicklas |
# Remove '##contig' entries from the final VCF since they are not really needed |
6669 |
06 Apr 22 |
nicklas |
112 |
if [ ! -f "done/${TARGET_NAME}.done" ]; then |
6669 |
06 Apr 22 |
nicklas |
113 |
sed '/##contig/d' tmp/${TARGET_VCF}.6.eff > vcf/genotype_${TARGET_VCF} |
6669 |
06 Apr 22 |
nicklas |
114 |
echo "[${TARGET_NAME}]" >> ${WD}/files.out |
6669 |
06 Apr 22 |
nicklas |
115 |
echo "genotype_${TARGET_VCF}" >> ${WD}/files.out |
6669 |
06 Apr 22 |
nicklas |
116 |
touch "done/${TARGET_NAME}.done" |
6669 |
06 Apr 22 |
nicklas |
117 |
fi |
6657 |
30 Mar 22 |
nicklas |
118 |
|
6657 |
30 Mar 22 |
nicklas |
119 |
done < ${WD}/target_info.txt |
6657 |
30 Mar 22 |
nicklas |
120 |
|
6657 |
30 Mar 22 |
nicklas |
121 |
rg_progress 90 "Copying result files to project archive" |
6657 |
30 Mar 22 |
nicklas |
122 |
\cp vcf/* ${VcfFolder} |
6657 |
30 Mar 22 |
nicklas |
123 |
|
6657 |
30 Mar 22 |
nicklas |
124 |
rg_progress 99 "Analysis completed, cleaning up..." |