7395 |
06 Nov 23 |
nicklas |
#!/bin/bash |
7395 |
06 Nov 23 |
nicklas |
2 |
## |
7395 |
06 Nov 23 |
nicklas |
## Pipeline script for building Panel-of-normals for WGS variant calling. |
7395 |
06 Nov 23 |
nicklas |
4 |
## |
7395 |
06 Nov 23 |
nicklas |
5 |
## |
7395 |
06 Nov 23 |
nicklas |
## Environment variables that should be defined (in job.sh) before calling this script |
7395 |
06 Nov 23 |
nicklas |
## -PonArchive: Base folder for VCF files with results from variant calling of normal samples |
7395 |
06 Nov 23 |
nicklas |
## -GATK: Path to the gatk program |
7395 |
06 Nov 23 |
nicklas |
## -REF_FASTA: Path to the reference FASTA file |
7395 |
06 Nov 23 |
nicklas |
## -TMPDIR: Path to a local temporary working directory |
7395 |
06 Nov 23 |
nicklas |
11 |
## |
7395 |
06 Nov 23 |
nicklas |
12 |
|
7395 |
06 Nov 23 |
nicklas |
13 |
set -eo pipefail |
7395 |
06 Nov 23 |
nicklas |
14 |
|
7395 |
06 Nov 23 |
nicklas |
## Import utility functions |
7395 |
06 Nov 23 |
nicklas |
16 |
source ./reggie-utils.sh |
7395 |
06 Nov 23 |
nicklas |
17 |
|
7395 |
06 Nov 23 |
nicklas |
## Verify settings in options |
7395 |
06 Nov 23 |
nicklas |
19 |
rg_var_isdir "PonArchive" "TMPDIR" "WD" |
7395 |
06 Nov 23 |
nicklas |
20 |
rg_var_isfile "GATK" "REF_FASTA" |
7395 |
06 Nov 23 |
nicklas |
21 |
rg_file_exists "${WD}/ponlist.txt" |
7395 |
06 Nov 23 |
nicklas |
22 |
|
7395 |
06 Nov 23 |
nicklas |
23 |
|
7395 |
06 Nov 23 |
nicklas |
24 |
build_pon() |
7395 |
06 Nov 23 |
nicklas |
25 |
{ |
7395 |
06 Nov 23 |
nicklas |
26 |
local CHR=$1 |
7395 |
06 Nov 23 |
nicklas |
27 |
local VCF_OUT=vcf/${CHR}.vcf.gz |
7395 |
06 Nov 23 |
nicklas |
28 |
|
7395 |
06 Nov 23 |
nicklas |
29 |
if [ ! -f "done/import_${CHR}.done" ]; then |
7395 |
06 Nov 23 |
nicklas |
30 |
${WD}/stdwrap.sh ${GATK} GenomicsDBImport \ |
7395 |
06 Nov 23 |
nicklas |
31 |
-R ${REF_FASTA} \ |
7395 |
06 Nov 23 |
nicklas |
32 |
-L ${CHR} \ |
7395 |
06 Nov 23 |
nicklas |
33 |
--genomicsdb-workspace-path gendb/${CHR} \ |
7395 |
06 Nov 23 |
nicklas |
34 |
${GenomicsDBImportOptions} \ |
7409 |
08 Nov 23 |
nicklas |
35 |
${CommonGATKOptions} \ |
7395 |
06 Nov 23 |
nicklas |
36 |
${ALL_FILES} \ |
7395 |
06 Nov 23 |
nicklas |
37 |
> db_import_${CHR}.out |
7395 |
06 Nov 23 |
nicklas |
38 |
touch "done/import_${CHR}.done" |
7395 |
06 Nov 23 |
nicklas |
39 |
fi |
7395 |
06 Nov 23 |
nicklas |
40 |
|
7395 |
06 Nov 23 |
nicklas |
41 |
if [ ! -f "done/build_${CHR}.done" ]; then |
7395 |
06 Nov 23 |
nicklas |
42 |
${WD}/stdwrap.sh ${GATK} CreateSomaticPanelOfNormals \ |
7395 |
06 Nov 23 |
nicklas |
43 |
-R ${REF_FASTA} \ |
7395 |
06 Nov 23 |
nicklas |
44 |
-L ${CHR} \ |
7395 |
06 Nov 23 |
nicklas |
45 |
-V gendb://gendb/${CHR} \ |
7395 |
06 Nov 23 |
nicklas |
46 |
-O ${VCF_OUT} \ |
7395 |
06 Nov 23 |
nicklas |
47 |
${CreateSomaticPanelOfNormalsOptions} \ |
7409 |
08 Nov 23 |
nicklas |
48 |
${CommonGATKOptions} \ |
7395 |
06 Nov 23 |
nicklas |
49 |
> build_${CHR}.out |
7395 |
06 Nov 23 |
nicklas |
50 |
echo "${CHR}.variants: `zcat ${VCF_OUT} | grep -c -v '^#'`" >> ${WD}/stats.out |
7395 |
06 Nov 23 |
nicklas |
51 |
touch "done/build_${CHR}.done" |
7395 |
06 Nov 23 |
nicklas |
52 |
fi |
7395 |
06 Nov 23 |
nicklas |
53 |
} |
7395 |
06 Nov 23 |
nicklas |
54 |
|
7395 |
06 Nov 23 |
nicklas |
## Move to the temporary working directory |
7395 |
06 Nov 23 |
nicklas |
56 |
cd ${TMPDIR} |
7395 |
06 Nov 23 |
nicklas |
57 |
mkdir -p done |
7395 |
06 Nov 23 |
nicklas |
58 |
mkdir -p gendb |
7395 |
06 Nov 23 |
nicklas |
59 |
mkdir -p vcf |
7395 |
06 Nov 23 |
nicklas |
60 |
|
7395 |
06 Nov 23 |
nicklas |
61 |
ALL_FILES="" |
7395 |
06 Nov 23 |
nicklas |
62 |
while read PON_NAME PON_FOLDER; do |
7395 |
06 Nov 23 |
nicklas |
63 |
ALL_FILES="${ALL_FILES} -V ${PonArchive}${PON_FOLDER}/all.vcf.gz" |
7395 |
06 Nov 23 |
nicklas |
64 |
done < ${WD}/ponlist.txt |
7395 |
06 Nov 23 |
nicklas |
65 |
|
7395 |
06 Nov 23 |
nicklas |
66 |
if [ ! -f "done/import.done" ]; then |
7395 |
06 Nov 23 |
nicklas |
67 |
CHRS=(X 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 Y) |
7395 |
06 Nov 23 |
nicklas |
68 |
CURRENT_STEP=0 |
7395 |
06 Nov 23 |
nicklas |
69 |
COMPLETED_STEPS=0 |
7395 |
06 Nov 23 |
nicklas |
70 |
NUM_STEPS=${#CHRS[@]} |
7395 |
06 Nov 23 |
nicklas |
71 |
JOBS_LIMIT=$(( (${NumThreads}+2)/3 )) |
7395 |
06 Nov 23 |
nicklas |
72 |
JOBS_PIDS=() |
7395 |
06 Nov 23 |
nicklas |
73 |
for chr in "${CHRS[@]}" |
7395 |
06 Nov 23 |
nicklas |
74 |
do |
7395 |
06 Nov 23 |
nicklas |
75 |
if (( ${#JOBS_PIDS[@]} >= ${JOBS_LIMIT} )) |
7395 |
06 Nov 23 |
nicklas |
76 |
then |
7395 |
06 Nov 23 |
nicklas |
## Wait for at least one job to complete; errors will be trapped |
7395 |
06 Nov 23 |
nicklas |
78 |
wait -n -p PID ${!JOBS_PIDS[@]} |
7395 |
06 Nov 23 |
nicklas |
79 |
unset JOBS_PIDS[$PID] |
7395 |
06 Nov 23 |
nicklas |
80 |
COMPLETED_STEPS=$(( ${COMPLETED_STEPS}+1 )) |
7395 |
06 Nov 23 |
nicklas |
81 |
fi |
7395 |
06 Nov 23 |
nicklas |
82 |
|
7395 |
06 Nov 23 |
nicklas |
83 |
build_pon "chr${chr}" & |
7395 |
06 Nov 23 |
nicklas |
84 |
JOBS_PIDS[$!]="chr${chr}" |
7395 |
06 Nov 23 |
nicklas |
85 |
|
7395 |
06 Nov 23 |
nicklas |
86 |
CURRENT_STEP=$(( ${CURRENT_STEP}+1 )) |
7395 |
06 Nov 23 |
nicklas |
87 |
PROGRESS=$(( 20+(${CURRENT_STEP}+${COMPLETED_STEPS})*35/${NUM_STEPS} )) |
7395 |
06 Nov 23 |
nicklas |
## We want to report the active jobs as comma-separated list |
7395 |
06 Nov 23 |
nicklas |
89 |
ACTIVE_JOBS="${JOBS_PIDS[*]}" |
7395 |
06 Nov 23 |
nicklas |
90 |
rg_progress ${PROGRESS} "Building panel: ${COMPLETED_STEPS} of ${NUM_STEPS} completed, working on ${ACTIVE_JOBS// /, }" |
7395 |
06 Nov 23 |
nicklas |
91 |
|
7395 |
06 Nov 23 |
nicklas |
92 |
done |
7395 |
06 Nov 23 |
nicklas |
93 |
|
7395 |
06 Nov 23 |
nicklas |
## Wait for the remaining jobs to complete |
7395 |
06 Nov 23 |
nicklas |
95 |
while (( ${#JOBS_PIDS[@]} > 0 )) |
7395 |
06 Nov 23 |
nicklas |
96 |
do |
7395 |
06 Nov 23 |
nicklas |
97 |
wait -n -p PID ${!JOBS_PIDS[@]} |
7395 |
06 Nov 23 |
nicklas |
98 |
unset JOBS_PIDS[$PID] |
7395 |
06 Nov 23 |
nicklas |
99 |
|
7395 |
06 Nov 23 |
nicklas |
100 |
COMPLETED_STEPS=$(( ${COMPLETED_STEPS}+1 )) |
7395 |
06 Nov 23 |
nicklas |
101 |
PROGRESS=$(( 15+(${CURRENT_STEP}+${COMPLETED_STEPS})*35/${NUM_STEPS} )) |
7395 |
06 Nov 23 |
nicklas |
102 |
ACTIVE_JOBS="${JOBS_PIDS[*]}" |
7395 |
06 Nov 23 |
nicklas |
103 |
rg_progress ${PROGRESS} "Building panel: ${COMPLETED_STEPS} of ${NUM_STEPS} completed, working on ${ACTIVE_JOBS// /, }" |
7395 |
06 Nov 23 |
nicklas |
104 |
done |
7395 |
06 Nov 23 |
nicklas |
105 |
|
7395 |
06 Nov 23 |
nicklas |
106 |
touch "done/import.done" |
7395 |
06 Nov 23 |
nicklas |
107 |
fi |
7395 |
06 Nov 23 |
nicklas |
108 |
|
7395 |
06 Nov 23 |
nicklas |
109 |
|
7395 |
06 Nov 23 |
nicklas |
110 |
rg_progress 95 "Copying result files to project archive" |
7395 |
06 Nov 23 |
nicklas |
111 |
|
7395 |
06 Nov 23 |
nicklas |
## Merge the VCF files and save to the final location |
7395 |
06 Nov 23 |
nicklas |
113 |
ls -1 vcf/*.vcf.gz > all.list |
7415 |
13 Nov 23 |
nicklas |
114 |
${WD}/stdwrap.sh ${GATK} MergeVcfs -I all.list -O ${WD}/panel.vcf.gz --QUIET --VERBOSITY WARNING |
7395 |
06 Nov 23 |
nicklas |
115 |
|
7395 |
06 Nov 23 |
nicklas |
116 |
rg_progress 99 "Analysis completed, cleaning up..." |
7395 |
06 Nov 23 |
nicklas |
117 |
|