7084 |
28 Mar 23 |
nicklas |
1 |
package net.sf.basedb.reggie.grid; |
7084 |
28 Mar 23 |
nicklas |
2 |
|
7091 |
04 Apr 23 |
nicklas |
3 |
import java.io.InputStream; |
7091 |
04 Apr 23 |
nicklas |
4 |
import java.io.OutputStream; |
7087 |
03 Apr 23 |
nicklas |
5 |
import java.io.StringWriter; |
7086 |
31 Mar 23 |
nicklas |
6 |
import java.util.ArrayList; |
7084 |
28 Mar 23 |
nicklas |
7 |
import java.util.List; |
7100 |
06 Apr 23 |
nicklas |
8 |
import java.util.Map; |
7100 |
06 Apr 23 |
nicklas |
9 |
import java.util.TreeMap; |
7089 |
04 Apr 23 |
nicklas |
10 |
import java.util.regex.Matcher; |
7089 |
04 Apr 23 |
nicklas |
11 |
import java.util.regex.Pattern; |
7084 |
28 Mar 23 |
nicklas |
12 |
|
7084 |
28 Mar 23 |
nicklas |
13 |
import org.slf4j.LoggerFactory; |
7084 |
28 Mar 23 |
nicklas |
14 |
|
7089 |
04 Apr 23 |
nicklas |
15 |
import net.sf.basedb.core.AnyToAny; |
7091 |
04 Apr 23 |
nicklas |
16 |
import net.sf.basedb.core.BaseException; |
7089 |
04 Apr 23 |
nicklas |
17 |
import net.sf.basedb.core.DataFileType; |
7084 |
28 Mar 23 |
nicklas |
18 |
import net.sf.basedb.core.DbControl; |
7084 |
28 Mar 23 |
nicklas |
19 |
import net.sf.basedb.core.DerivedBioAssay; |
7089 |
04 Apr 23 |
nicklas |
20 |
import net.sf.basedb.core.Directory; |
7087 |
03 Apr 23 |
nicklas |
21 |
import net.sf.basedb.core.File; |
7089 |
04 Apr 23 |
nicklas |
22 |
import net.sf.basedb.core.FileServer; |
7089 |
04 Apr 23 |
nicklas |
23 |
import net.sf.basedb.core.FileSetMember; |
7087 |
03 Apr 23 |
nicklas |
24 |
import net.sf.basedb.core.InvalidDataException; |
7086 |
31 Mar 23 |
nicklas |
25 |
import net.sf.basedb.core.ItemList; |
7084 |
28 Mar 23 |
nicklas |
26 |
import net.sf.basedb.core.ItemNotFoundException; |
7086 |
31 Mar 23 |
nicklas |
27 |
import net.sf.basedb.core.ItemSubtype; |
7084 |
28 Mar 23 |
nicklas |
28 |
import net.sf.basedb.core.Job; |
7089 |
04 Apr 23 |
nicklas |
29 |
import net.sf.basedb.core.Path; |
7084 |
28 Mar 23 |
nicklas |
30 |
import net.sf.basedb.core.Protocol; |
7086 |
31 Mar 23 |
nicklas |
31 |
import net.sf.basedb.core.Sample; |
7084 |
28 Mar 23 |
nicklas |
32 |
import net.sf.basedb.core.SessionControl; |
7084 |
28 Mar 23 |
nicklas |
33 |
import net.sf.basedb.core.Software; |
7086 |
31 Mar 23 |
nicklas |
34 |
import net.sf.basedb.core.StringParameterType; |
7084 |
28 Mar 23 |
nicklas |
35 |
import net.sf.basedb.opengrid.JobDefinition; |
7084 |
28 Mar 23 |
nicklas |
36 |
import net.sf.basedb.opengrid.JobStatus; |
7084 |
28 Mar 23 |
nicklas |
37 |
import net.sf.basedb.opengrid.OpenGridCluster; |
7084 |
28 Mar 23 |
nicklas |
38 |
import net.sf.basedb.opengrid.OpenGridSession; |
7086 |
31 Mar 23 |
nicklas |
39 |
import net.sf.basedb.opengrid.ScriptBuilder; |
7084 |
28 Mar 23 |
nicklas |
40 |
import net.sf.basedb.opengrid.config.ClusterConfig; |
7086 |
31 Mar 23 |
nicklas |
41 |
import net.sf.basedb.opengrid.config.JobConfig; |
7087 |
03 Apr 23 |
nicklas |
42 |
import net.sf.basedb.opengrid.filetransfer.StringUploadSource; |
7084 |
28 Mar 23 |
nicklas |
43 |
import net.sf.basedb.opengrid.service.JobCompletionHandler; |
7084 |
28 Mar 23 |
nicklas |
44 |
import net.sf.basedb.reggie.Reggie; |
7084 |
28 Mar 23 |
nicklas |
45 |
import net.sf.basedb.reggie.XmlConfig; |
7089 |
04 Apr 23 |
nicklas |
46 |
import net.sf.basedb.reggie.dao.AlignedSequences; |
7084 |
28 Mar 23 |
nicklas |
47 |
import net.sf.basedb.reggie.dao.Annotationtype; |
7086 |
31 Mar 23 |
nicklas |
48 |
import net.sf.basedb.reggie.dao.BiomaterialList; |
7087 |
03 Apr 23 |
nicklas |
49 |
import net.sf.basedb.reggie.dao.Datafiletype; |
7086 |
31 Mar 23 |
nicklas |
50 |
import net.sf.basedb.reggie.dao.DoNotUse; |
7089 |
04 Apr 23 |
nicklas |
51 |
import net.sf.basedb.reggie.dao.Fileserver; |
7086 |
31 Mar 23 |
nicklas |
52 |
import net.sf.basedb.reggie.dao.Library; |
7084 |
28 Mar 23 |
nicklas |
53 |
import net.sf.basedb.reggie.dao.MergedSequences; |
7086 |
31 Mar 23 |
nicklas |
54 |
import net.sf.basedb.reggie.dao.Pipeline; |
7086 |
31 Mar 23 |
nicklas |
55 |
import net.sf.basedb.reggie.dao.Subtype; |
7089 |
04 Apr 23 |
nicklas |
56 |
import net.sf.basedb.reggie.vcf.VcfData; |
7091 |
04 Apr 23 |
nicklas |
57 |
import net.sf.basedb.reggie.vcf.VcfParser; |
7091 |
04 Apr 23 |
nicklas |
58 |
import net.sf.basedb.util.FileUtil; |
7091 |
04 Apr 23 |
nicklas |
59 |
import net.sf.basedb.util.InputStreamSplitter; |
7086 |
31 Mar 23 |
nicklas |
60 |
import net.sf.basedb.util.Values; |
7087 |
03 Apr 23 |
nicklas |
61 |
import net.sf.basedb.util.export.TableWriter; |
7084 |
28 Mar 23 |
nicklas |
62 |
import net.sf.basedb.util.extensions.logging.ExtensionsLog; |
7084 |
28 Mar 23 |
nicklas |
63 |
import net.sf.basedb.util.extensions.logging.ExtensionsLogger; |
7084 |
28 Mar 23 |
nicklas |
64 |
|
7084 |
28 Mar 23 |
nicklas |
65 |
/** |
7084 |
28 Mar 23 |
nicklas |
Helper class for creating items needed for executing Bwa-mem2 as |
7084 |
28 Mar 23 |
nicklas |
well as generating the script and send it to the cluster for |
7084 |
28 Mar 23 |
nicklas |
execution. |
7084 |
28 Mar 23 |
nicklas |
69 |
|
7084 |
28 Mar 23 |
nicklas |
@author nicklas |
7084 |
28 Mar 23 |
nicklas |
@since 4.46 |
7084 |
28 Mar 23 |
nicklas |
72 |
*/ |
7084 |
28 Mar 23 |
nicklas |
73 |
public class BwaMem2AlignJobCreator |
7084 |
28 Mar 23 |
nicklas |
74 |
extends AbstractJobCreator |
7084 |
28 Mar 23 |
nicklas |
75 |
{ |
7084 |
28 Mar 23 |
nicklas |
76 |
|
7084 |
28 Mar 23 |
nicklas |
77 |
private Software alignSoftware; |
7084 |
28 Mar 23 |
nicklas |
78 |
private Protocol alignProtocol; |
7084 |
28 Mar 23 |
nicklas |
79 |
|
7084 |
28 Mar 23 |
nicklas |
80 |
public BwaMem2AlignJobCreator() |
7084 |
28 Mar 23 |
nicklas |
81 |
{} |
7084 |
28 Mar 23 |
nicklas |
82 |
|
7084 |
28 Mar 23 |
nicklas |
83 |
/** |
7084 |
28 Mar 23 |
nicklas |
Set the software item to set on created AlignedSequences. |
7084 |
28 Mar 23 |
nicklas |
@see DerivedBioAssay#setSoftware(Software) |
7084 |
28 Mar 23 |
nicklas |
86 |
*/ |
7084 |
28 Mar 23 |
nicklas |
87 |
public void setAlignSoftware(Software software) |
7084 |
28 Mar 23 |
nicklas |
88 |
{ |
7084 |
28 Mar 23 |
nicklas |
89 |
this.alignSoftware = software; |
7084 |
28 Mar 23 |
nicklas |
90 |
} |
7084 |
28 Mar 23 |
nicklas |
91 |
|
7084 |
28 Mar 23 |
nicklas |
92 |
/** |
7084 |
28 Mar 23 |
nicklas |
Set the protocol item to set on created AlignedSequences. |
7084 |
28 Mar 23 |
nicklas |
@see DerivedBioAssay#setProtocol(Protocol) |
7084 |
28 Mar 23 |
nicklas |
95 |
*/ |
7084 |
28 Mar 23 |
nicklas |
96 |
public void setAlignProtocol(Protocol protocol) |
7084 |
28 Mar 23 |
nicklas |
97 |
{ |
7084 |
28 Mar 23 |
nicklas |
98 |
this.alignProtocol = protocol; |
7084 |
28 Mar 23 |
nicklas |
99 |
} |
7084 |
28 Mar 23 |
nicklas |
100 |
|
7084 |
28 Mar 23 |
nicklas |
101 |
/** |
7084 |
28 Mar 23 |
nicklas |
Create a child bioassays for all given merged sequences and schedule |
7084 |
28 Mar 23 |
nicklas |
jobs on the given cluster for running Hisat alignment. |
7084 |
28 Mar 23 |
nicklas |
@return A list with the corresponding jobs in BASE |
7084 |
28 Mar 23 |
nicklas |
105 |
*/ |
7084 |
28 Mar 23 |
nicklas |
106 |
public List<JobDefinition> createBwaMem2AlignJobs(DbControl dc, OpenGridCluster cluster, List<MergedSequences> mergedSequences) |
7084 |
28 Mar 23 |
nicklas |
107 |
{ |
7084 |
28 Mar 23 |
nicklas |
108 |
SessionControl sc = dc.getSessionControl(); |
7084 |
28 Mar 23 |
nicklas |
109 |
|
7084 |
28 Mar 23 |
nicklas |
110 |
ClusterConfig clusterCfg = cluster.getConfig(); |
7084 |
28 Mar 23 |
nicklas |
111 |
XmlConfig cfg = Reggie.getConfig(cluster.getId()); |
7084 |
28 Mar 23 |
nicklas |
112 |
if (cfg == null) |
7084 |
28 Mar 23 |
nicklas |
113 |
{ |
7084 |
28 Mar 23 |
nicklas |
114 |
throw new ItemNotFoundException("No configuration in reggie-config.xml for cluster: " + cluster.getId()); |
7084 |
28 Mar 23 |
nicklas |
115 |
} |
7084 |
28 Mar 23 |
nicklas |
116 |
|
7084 |
28 Mar 23 |
nicklas |
117 |
String alignParameterSet = (String)Annotationtype.PARAMETER_SET.getAnnotationValue(dc, alignSoftware); |
7084 |
28 Mar 23 |
nicklas |
118 |
|
7084 |
28 Mar 23 |
nicklas |
// Get global options |
7084 |
28 Mar 23 |
nicklas |
120 |
String global_env = ScriptUtil.multilineIndent(cfg.getConfig("global-env")); |
7086 |
31 Mar 23 |
nicklas |
121 |
String projectArchiveDNA = cfg.getRequiredConfig("project-archive-dna", null); |
7084 |
28 Mar 23 |
nicklas |
122 |
|
7086 |
31 Mar 23 |
nicklas |
// Options for the programs |
7372 |
06 Oct 23 |
nicklas |
124 |
String align_submit = cfg.getConfig("align-bwa-mem2/submit", alignParameterSet, null); |
7372 |
06 Oct 23 |
nicklas |
125 |
String align_submit_debug = cfg.getConfig("align-bwa-mem2/submit-debug", alignParameterSet, null); |
7086 |
31 Mar 23 |
nicklas |
126 |
String align_env = ScriptUtil.multilineIndent(cfg.getRequiredConfig("align-bwa-mem2/env", alignParameterSet)); |
7107 |
12 Apr 23 |
nicklas |
127 |
String align_env_novaseq = ScriptUtil.multilineIndent(cfg.getConfig("align-bwa-mem2/env-novaseq", alignParameterSet, null)); |
7107 |
12 Apr 23 |
nicklas |
128 |
String align_env_hiseq = ScriptUtil.multilineIndent(cfg.getConfig("align-bwa-mem2/env-hiseq", alignParameterSet, null)); |
7086 |
31 Mar 23 |
nicklas |
129 |
String align_envdebug = ScriptUtil.multilineIndent(cfg.getConfig("align-bwa-mem2/env-debug", alignParameterSet, null)); |
7086 |
31 Mar 23 |
nicklas |
130 |
String align_execute = ScriptUtil.multilineIndent(cfg.getConfig("align-bwa-mem2/execute", alignParameterSet, "./bwa-mem2.sh")); |
7086 |
31 Mar 23 |
nicklas |
131 |
|
7086 |
31 Mar 23 |
nicklas |
// Load common items |
7086 |
31 Mar 23 |
nicklas |
133 |
ItemSubtype alignType = Subtype.ALIGNED_SEQUENCES.get(dc); |
7086 |
31 Mar 23 |
nicklas |
134 |
ItemSubtype bwaMem2AlignJobType = Subtype.BWA_MEM2_ALIGN_JOB.get(dc); |
7086 |
31 Mar 23 |
nicklas |
135 |
|
7086 |
31 Mar 23 |
nicklas |
// Selected items must be removed from this list |
7086 |
31 Mar 23 |
nicklas |
137 |
ItemList bwaMem2Pipeline = BiomaterialList.BWA_MEM2_PIPELINE.load(dc); |
7086 |
31 Mar 23 |
nicklas |
138 |
|
7086 |
31 Mar 23 |
nicklas |
// Options common for all jobs |
7086 |
31 Mar 23 |
nicklas |
140 |
JobConfig jobConfig = new JobConfig(); |
7086 |
31 Mar 23 |
nicklas |
141 |
if (priority != null) jobConfig.setPriority(priority); |
7372 |
06 Oct 23 |
nicklas |
142 |
if (partition != null) jobConfig.setSbatchOption("partition", ScriptUtil.checkValidScriptParameter(partition)); |
7372 |
06 Oct 23 |
nicklas |
143 |
jobConfig.convertOptionsTo(clusterCfg.getType()); |
7372 |
06 Oct 23 |
nicklas |
144 |
if (submitOptionsOverride != null) |
7372 |
06 Oct 23 |
nicklas |
145 |
{ |
7372 |
06 Oct 23 |
nicklas |
146 |
ScriptUtil.addSubmitOptions(jobConfig, submitOptionsOverride, clusterCfg.getType()); |
7372 |
06 Oct 23 |
nicklas |
147 |
} |
7372 |
06 Oct 23 |
nicklas |
148 |
else |
7372 |
06 Oct 23 |
nicklas |
149 |
{ |
7372 |
06 Oct 23 |
nicklas |
150 |
ScriptUtil.addSubmitOptions(jobConfig, align_submit, clusterCfg.getType()); |
7372 |
06 Oct 23 |
nicklas |
151 |
if (debug) ScriptUtil.addSubmitOptions(jobConfig, align_submit_debug, clusterCfg.getType()); |
7372 |
06 Oct 23 |
nicklas |
152 |
} |
7086 |
31 Mar 23 |
nicklas |
153 |
|
7086 |
31 Mar 23 |
nicklas |
// We submit one job for each bioassay to the cluster |
7086 |
31 Mar 23 |
nicklas |
155 |
List<JobDefinition> jobDefs = new ArrayList<JobDefinition>(mergedSequences.size()); |
7086 |
31 Mar 23 |
nicklas |
156 |
|
7086 |
31 Mar 23 |
nicklas |
157 |
for (MergedSequences ms : mergedSequences) |
7086 |
31 Mar 23 |
nicklas |
158 |
{ |
7086 |
31 Mar 23 |
nicklas |
159 |
ms = MergedSequences.getById(dc, ms.getId()); // Ensure item is loaded in this transaction |
7086 |
31 Mar 23 |
nicklas |
// Get some information about the merged data that we need |
7086 |
31 Mar 23 |
nicklas |
161 |
DerivedBioAssay merged = ms.getDerivedBioAssay(); |
7086 |
31 Mar 23 |
nicklas |
162 |
bwaMem2Pipeline.removeItem(merged); |
7086 |
31 Mar 23 |
nicklas |
163 |
|
7086 |
31 Mar 23 |
nicklas |
164 |
Library lib = Library.get(merged.getExtract()); |
7086 |
31 Mar 23 |
nicklas |
165 |
Pipeline pipeline = Pipeline.getByName((String)Annotationtype.PIPELINE.getAnnotationValue(dc, merged)); |
7086 |
31 Mar 23 |
nicklas |
// The originating sample can be either a BLOOD or SPECIMEN item |
7086 |
31 Mar 23 |
nicklas |
167 |
Subtype expectedSampleType = pipeline == Pipeline.DNA_NORMAL_WGS ? Subtype.BLOOD : Subtype.SPECIMEN; |
7086 |
31 Mar 23 |
nicklas |
168 |
Sample sample = (Sample)lib.findSingleParent(dc, expectedSampleType); |
7086 |
31 Mar 23 |
nicklas |
169 |
if (sample == null) |
7086 |
31 Mar 23 |
nicklas |
170 |
{ |
7086 |
31 Mar 23 |
nicklas |
171 |
throw new ItemNotFoundException("Expected to find a parent "+ |
7086 |
31 Mar 23 |
nicklas |
172 |
expectedSampleType.getName()+" for "+merged.getName()); |
7086 |
31 Mar 23 |
nicklas |
173 |
} |
7086 |
31 Mar 23 |
nicklas |
174 |
String fastQFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, merged); |
7087 |
03 Apr 23 |
nicklas |
175 |
|
7100 |
06 Apr 23 |
nicklas |
// Get names to create a Read-Group tag |
7100 |
06 Apr 23 |
nicklas |
177 |
String mergedName = ScriptUtil.checkValidScriptParameter(merged.getName()); |
7100 |
06 Apr 23 |
nicklas |
178 |
String libName = ScriptUtil.checkValidScriptParameter(lib.getName()); |
7100 |
06 Apr 23 |
nicklas |
// Remove all after the first dot not followed by a number or 'b' |
7100 |
06 Apr 23 |
nicklas |
180 |
String sampleName = libName.replaceFirst("\\.(?!\\d|b).*", ""); |
7100 |
06 Apr 23 |
nicklas |
181 |
if (sample != null && sample.getExternalId() != null) |
7100 |
06 Apr 23 |
nicklas |
182 |
{ |
7100 |
06 Apr 23 |
nicklas |
// Replace SCANB-ID in libName/mergedName/sampleName with Sample.externalId |
7100 |
06 Apr 23 |
nicklas |
184 |
sampleName = ScriptUtil.checkValidScriptParameter(sample.getExternalId()); |
7100 |
06 Apr 23 |
nicklas |
185 |
libName = libName.replace(sample.getName(), sample.getExternalId()); |
7100 |
06 Apr 23 |
nicklas |
186 |
mergedName = mergedName.replace(sample.getName(), sample.getExternalId()); |
7100 |
06 Apr 23 |
nicklas |
187 |
} |
7100 |
06 Apr 23 |
nicklas |
// NOTE!!! We need \\t as output in the file so due to Java escape we need 4 |
7100 |
06 Apr 23 |
nicklas |
189 |
String rgSample = "SM:"+sampleName+"\\\\tLB:"+libName; |
7100 |
06 Apr 23 |
nicklas |
190 |
|
7100 |
06 Apr 23 |
nicklas |
// Get all FASTQ files and match into pairs... |
7087 |
03 Apr 23 |
nicklas |
192 |
List<File> fastqFiles = Datafiletype.FASTQ.getAllFiles(dc, merged); |
7100 |
06 Apr 23 |
nicklas |
193 |
Map<String, FastqPair> fastqPairs = new TreeMap<>(); |
7107 |
12 Apr 23 |
nicklas |
194 |
String flowCellType = null; |
7108 |
13 Apr 23 |
nicklas |
195 |
int readLengthSum = 0; |
7108 |
13 Apr 23 |
nicklas |
196 |
int readLengthCount = 0; |
7087 |
03 Apr 23 |
nicklas |
197 |
for (File f : fastqFiles) |
7087 |
03 Apr 23 |
nicklas |
198 |
{ |
7087 |
03 Apr 23 |
nicklas |
199 |
String name = f.getName(); |
7100 |
06 Apr 23 |
nicklas |
200 |
String flowCellId = (String)Annotationtype.FLOWCELL_ID.getAnnotationValue(dc, f); |
7100 |
06 Apr 23 |
nicklas |
201 |
Integer lane = (Integer)Annotationtype.LANE_NUMBER.getAnnotationValue(dc, f); |
7107 |
12 Apr 23 |
nicklas |
202 |
if (flowCellType == null) |
7107 |
12 Apr 23 |
nicklas |
203 |
{ |
7107 |
12 Apr 23 |
nicklas |
204 |
flowCellType = (String)Annotationtype.FLOWCELL_TYPE.getAnnotationValue(dc, f); |
7107 |
12 Apr 23 |
nicklas |
205 |
} |
7108 |
13 Apr 23 |
nicklas |
206 |
Integer readLength = (Integer)Annotationtype.READ_LENGTH.getAnnotationValue(dc, f); |
7108 |
13 Apr 23 |
nicklas |
207 |
if (readLength != null) |
7108 |
13 Apr 23 |
nicklas |
208 |
{ |
7108 |
13 Apr 23 |
nicklas |
209 |
readLengthSum += readLength; |
7108 |
13 Apr 23 |
nicklas |
210 |
readLengthCount++; |
7108 |
13 Apr 23 |
nicklas |
211 |
} |
7100 |
06 Apr 23 |
nicklas |
212 |
int readNum = 0; |
7100 |
06 Apr 23 |
nicklas |
213 |
String baseName; |
7087 |
03 Apr 23 |
nicklas |
214 |
if (name.endsWith("R1.fastq.gz")) |
7087 |
03 Apr 23 |
nicklas |
215 |
{ |
7100 |
06 Apr 23 |
nicklas |
216 |
readNum = 1; |
7100 |
06 Apr 23 |
nicklas |
217 |
baseName = name.replace("R1.fastq.gz", ""); |
7087 |
03 Apr 23 |
nicklas |
218 |
} |
7087 |
03 Apr 23 |
nicklas |
219 |
else if (name.endsWith("R2.fastq.gz")) |
7087 |
03 Apr 23 |
nicklas |
220 |
{ |
7100 |
06 Apr 23 |
nicklas |
221 |
readNum = 2; |
7100 |
06 Apr 23 |
nicklas |
222 |
baseName = name.replace("R2.fastq.gz", ""); |
7087 |
03 Apr 23 |
nicklas |
223 |
} |
7087 |
03 Apr 23 |
nicklas |
224 |
else |
7087 |
03 Apr 23 |
nicklas |
225 |
{ |
7087 |
03 Apr 23 |
nicklas |
226 |
throw new InvalidDataException("Expected that all FASTQ files end with R1.fastq.gz or R2.fastq.gz: " + name); |
7087 |
03 Apr 23 |
nicklas |
227 |
} |
7100 |
06 Apr 23 |
nicklas |
228 |
|
7100 |
06 Apr 23 |
nicklas |
229 |
FastqPair fq = fastqPairs.get(baseName); |
7100 |
06 Apr 23 |
nicklas |
230 |
if (fq == null) |
7100 |
06 Apr 23 |
nicklas |
231 |
{ |
7100 |
06 Apr 23 |
nicklas |
232 |
fq = new FastqPair(); |
7100 |
06 Apr 23 |
nicklas |
233 |
fastqPairs.put(baseName, fq); |
7100 |
06 Apr 23 |
nicklas |
234 |
} |
7100 |
06 Apr 23 |
nicklas |
235 |
if (readNum == 1) |
7100 |
06 Apr 23 |
nicklas |
236 |
{ |
7100 |
06 Apr 23 |
nicklas |
237 |
fq.R1 = name; |
7100 |
06 Apr 23 |
nicklas |
238 |
fq.flowCellId = flowCellId; |
7100 |
06 Apr 23 |
nicklas |
239 |
fq.lane = lane; |
7100 |
06 Apr 23 |
nicklas |
240 |
} |
7100 |
06 Apr 23 |
nicklas |
241 |
else if (readNum == 2) |
7100 |
06 Apr 23 |
nicklas |
242 |
{ |
7100 |
06 Apr 23 |
nicklas |
243 |
fq.R2 = name; |
7100 |
06 Apr 23 |
nicklas |
244 |
} |
7087 |
03 Apr 23 |
nicklas |
245 |
} |
7100 |
06 Apr 23 |
nicklas |
// Write each pair of R1 and R2 and their read-group into a tab-separated file |
7087 |
03 Apr 23 |
nicklas |
247 |
StringWriter fastq_info = new StringWriter(); |
7087 |
03 Apr 23 |
nicklas |
248 |
TableWriter fastq_info_writer = new TableWriter(fastq_info); |
7100 |
06 Apr 23 |
nicklas |
249 |
for (FastqPair fq : fastqPairs.values()) |
7087 |
03 Apr 23 |
nicklas |
250 |
{ |
7100 |
06 Apr 23 |
nicklas |
251 |
if (fq.R1 == null) throw new InvalidDataException("No matching R1 FASTQ:" + fq.R2); |
7100 |
06 Apr 23 |
nicklas |
252 |
if (fq.R2 == null) throw new InvalidDataException("No matching R2 FASTQ:" + fq.R1); |
7100 |
06 Apr 23 |
nicklas |
253 |
String readGroup = rgSample; |
7100 |
06 Apr 23 |
nicklas |
254 |
if (fq.flowCellId != null) |
7100 |
06 Apr 23 |
nicklas |
255 |
{ |
7100 |
06 Apr 23 |
nicklas |
// NOTE!!! We need \\t as output in the file so due to Java escape we need 4 |
7100 |
06 Apr 23 |
nicklas |
257 |
readGroup += "\\\\tPU:"+ScriptUtil.checkValidScriptParameter(fq.flowCellId); |
7100 |
06 Apr 23 |
nicklas |
258 |
if (fq.lane != null) readGroup += "."+fq.lane; |
7100 |
06 Apr 23 |
nicklas |
259 |
} |
7100 |
06 Apr 23 |
nicklas |
260 |
fastq_info_writer.tablePrintData(fq.R1, fq.R2, readGroup); |
7087 |
03 Apr 23 |
nicklas |
261 |
} |
7113 |
14 Apr 23 |
nicklas |
262 |
|
7086 |
31 Mar 23 |
nicklas |
// Create job |
7086 |
31 Mar 23 |
nicklas |
264 |
Job alignJob = Job.getNew(dc, null, null, null); |
7086 |
31 Mar 23 |
nicklas |
265 |
alignJob.setItemSubtype(bwaMem2AlignJobType); |
7086 |
31 Mar 23 |
nicklas |
266 |
alignJob.setPluginVersion("reggie-"+Reggie.VERSION); |
7086 |
31 Mar 23 |
nicklas |
267 |
alignJob.setSendMessage(Values.getBoolean(sc.getUserClientSetting("plugins.sendmessage"), false)); |
7086 |
31 Mar 23 |
nicklas |
268 |
alignJob.setName("BwaMem2Align " + merged.getName()); |
7086 |
31 Mar 23 |
nicklas |
269 |
alignJob.setParameterValue("pipeline", new StringParameterType(), pipeline.getName()); |
7183 |
17 May 23 |
nicklas |
270 |
if (alignParameterSet != null) |
7183 |
17 May 23 |
nicklas |
271 |
{ |
7183 |
17 May 23 |
nicklas |
272 |
alignJob.setParameterValue("parameter-set", new StringParameterType(), alignParameterSet); |
7183 |
17 May 23 |
nicklas |
273 |
} |
7086 |
31 Mar 23 |
nicklas |
274 |
if (debug) alignJob.setName(alignJob.getName() + " (debug)"); |
7086 |
31 Mar 23 |
nicklas |
275 |
if (partition != null) alignJob.setParameterValue("partition", new StringParameterType(), partition); |
7372 |
06 Oct 23 |
nicklas |
276 |
if (submitOptionsOverride != null) alignJob.setParameterValue("jobOptions", new StringParameterType(), submitOptionsOverride); |
7086 |
31 Mar 23 |
nicklas |
277 |
dc.saveItem(alignJob); |
7086 |
31 Mar 23 |
nicklas |
278 |
|
7086 |
31 Mar 23 |
nicklas |
// Create ALIGNED derived bioassay set |
7086 |
31 Mar 23 |
nicklas |
280 |
String alignedName = ms.getNextAlignedSequencesName(dc); |
7086 |
31 Mar 23 |
nicklas |
281 |
DerivedBioAssay aligned = DerivedBioAssay.getNew(dc, false, alignJob); |
7086 |
31 Mar 23 |
nicklas |
282 |
aligned.setItemSubtype(alignType); |
7086 |
31 Mar 23 |
nicklas |
283 |
pipeline.setAnnotation(dc, aligned); |
7086 |
31 Mar 23 |
nicklas |
284 |
aligned.setName(alignedName); |
7086 |
31 Mar 23 |
nicklas |
285 |
aligned.setExtract(merged.getExtract()); |
7086 |
31 Mar 23 |
nicklas |
286 |
aligned.setSoftware(alignSoftware); |
7086 |
31 Mar 23 |
nicklas |
287 |
aligned.setProtocol(alignProtocol); |
7086 |
31 Mar 23 |
nicklas |
288 |
aligned.addParent(merged); |
7086 |
31 Mar 23 |
nicklas |
289 |
DoNotUse.copyDoNotUseAnnotations(dc, merged, aligned, false); |
7086 |
31 Mar 23 |
nicklas |
290 |
dc.saveItem(aligned); |
7086 |
31 Mar 23 |
nicklas |
291 |
|
7086 |
31 Mar 23 |
nicklas |
292 |
String alignFolder = fastQFolder + "/"+aligned.getName().substring(merged.getName().length()+1); |
7086 |
31 Mar 23 |
nicklas |
293 |
if (debug && !alignFolder.startsWith("/debug")) |
7086 |
31 Mar 23 |
nicklas |
294 |
{ |
7086 |
31 Mar 23 |
nicklas |
295 |
alignFolder = "/debug" + alignFolder; |
7086 |
31 Mar 23 |
nicklas |
296 |
} |
7100 |
06 Apr 23 |
nicklas |
297 |
ScriptUtil.checkValidPath(alignFolder, true, true); |
7086 |
31 Mar 23 |
nicklas |
298 |
Annotationtype.DATA_FILES_FOLDER.setAnnotationValue(dc, aligned, alignFolder); |
7086 |
31 Mar 23 |
nicklas |
299 |
if (autoConfirm) |
7086 |
31 Mar 23 |
nicklas |
300 |
{ |
7086 |
31 Mar 23 |
nicklas |
301 |
Annotationtype.AUTO_PROCESSING.setAnnotationValue(dc, aligned, "AutoConfirm"); |
7086 |
31 Mar 23 |
nicklas |
302 |
} |
7086 |
31 Mar 23 |
nicklas |
303 |
|
7086 |
31 Mar 23 |
nicklas |
304 |
ScriptBuilder script = new ScriptBuilder(); |
7086 |
31 Mar 23 |
nicklas |
305 |
script.cmd(debug ? "set -ex" : "set -e"); |
7086 |
31 Mar 23 |
nicklas |
// Set file permissions based on consent or external group! |
7086 |
31 Mar 23 |
nicklas |
307 |
ScriptUtil.setUmaskForItem(dc, lib, null, script); |
7086 |
31 Mar 23 |
nicklas |
308 |
script.newLine(); |
7086 |
31 Mar 23 |
nicklas |
309 |
script.cmd(global_env); |
7086 |
31 Mar 23 |
nicklas |
310 |
script.export("ArchiveFolder", projectArchiveDNA); |
7086 |
31 Mar 23 |
nicklas |
311 |
script.export("FastqFolder", "${ArchiveFolder}"+fastQFolder); |
7086 |
31 Mar 23 |
nicklas |
312 |
script.export("AlignFolder", "${ArchiveFolder}"+alignFolder); |
7108 |
13 Apr 23 |
nicklas |
313 |
script.export("ReadLength", Integer.toString(readLengthCount == 0 ? 150 : readLengthSum/readLengthCount)); |
7086 |
31 Mar 23 |
nicklas |
314 |
script.newLine(); |
7086 |
31 Mar 23 |
nicklas |
315 |
script.cmd(align_env); |
7107 |
12 Apr 23 |
nicklas |
// Currently, this is used for the -d parameter to samtools markdup -d 2500 (NovaSeq), -d 100 (HiSeq) |
7107 |
12 Apr 23 |
nicklas |
317 |
script.cmd("HiSeq".equals(flowCellType) ? align_env_hiseq : align_env_novaseq); |
7086 |
31 Mar 23 |
nicklas |
318 |
if (debug) script.cmd(align_envdebug); |
7086 |
31 Mar 23 |
nicklas |
319 |
script.newLine(); |
7086 |
31 Mar 23 |
nicklas |
320 |
script.cmd(align_execute); |
7086 |
31 Mar 23 |
nicklas |
321 |
script.newLine(); |
7086 |
31 Mar 23 |
nicklas |
322 |
|
7086 |
31 Mar 23 |
nicklas |
323 |
JobDefinition jobDef = new JobDefinition("BwaMem2", jobConfig, batchConfig, alignJob); |
7087 |
03 Apr 23 |
nicklas |
324 |
jobDef.addFile(new StringUploadSource("fastq_info.txt", fastq_info.toString())); |
7086 |
31 Mar 23 |
nicklas |
325 |
jobDef.addFile(ScriptUtil.upload("bwa-mem2.sh")); |
7086 |
31 Mar 23 |
nicklas |
326 |
jobDef.addFile(ScriptUtil.upload("reggie-utils.sh")); |
7091 |
04 Apr 23 |
nicklas |
327 |
jobDef.addFile(ScriptUtil.upload("stdwrap.sh")); |
7089 |
04 Apr 23 |
nicklas |
328 |
jobDef.addFile(ScriptUtil.upload("alignment_statistics.sh")); |
7086 |
31 Mar 23 |
nicklas |
329 |
jobDef.setDebug(debug); |
7086 |
31 Mar 23 |
nicklas |
330 |
jobDef.setCmd(script.toString()); |
7086 |
31 Mar 23 |
nicklas |
331 |
jobDefs.add(jobDef); |
7086 |
31 Mar 23 |
nicklas |
332 |
|
7086 |
31 Mar 23 |
nicklas |
333 |
} |
7086 |
31 Mar 23 |
nicklas |
334 |
|
7086 |
31 Mar 23 |
nicklas |
335 |
return jobDefs; |
7084 |
28 Mar 23 |
nicklas |
336 |
} |
7084 |
28 Mar 23 |
nicklas |
337 |
|
7084 |
28 Mar 23 |
nicklas |
338 |
/** |
7084 |
28 Mar 23 |
nicklas |
Job completion handler for Bwa-mem2 align jobs. |
7084 |
28 Mar 23 |
nicklas |
340 |
*/ |
7084 |
28 Mar 23 |
nicklas |
341 |
public static class AlignJobCompletionHandler |
7084 |
28 Mar 23 |
nicklas |
342 |
implements JobCompletionHandler |
7084 |
28 Mar 23 |
nicklas |
343 |
{ |
7084 |
28 Mar 23 |
nicklas |
344 |
private static final ExtensionsLogger logger = |
7084 |
28 Mar 23 |
nicklas |
345 |
ExtensionsLog.getLogger(JobCompletionHandlerFactory.ID, true).wrap(LoggerFactory.getLogger(AlignJobCompletionHandler.class)); |
7084 |
28 Mar 23 |
nicklas |
346 |
|
7084 |
28 Mar 23 |
nicklas |
347 |
public AlignJobCompletionHandler() |
7084 |
28 Mar 23 |
nicklas |
348 |
{} |
7084 |
28 Mar 23 |
nicklas |
349 |
|
7084 |
28 Mar 23 |
nicklas |
350 |
@Override |
7084 |
28 Mar 23 |
nicklas |
351 |
public String jobCompleted(SessionControl sc, OpenGridSession session, Job job, JobStatus status) |
7084 |
28 Mar 23 |
nicklas |
352 |
{ |
7089 |
04 Apr 23 |
nicklas |
353 |
String jobName = status.getName(); |
7105 |
11 Apr 23 |
nicklas |
354 |
String alignStatistics = session.getJobFileAsString(jobName, "alignment_statistics.txt", "UTF-8"); |
7105 |
11 Apr 23 |
nicklas |
355 |
String markdupStats = session.getJobFileAsString(jobName, "samtools_markdup.txt", "UTF-8"); |
7106 |
12 Apr 23 |
nicklas |
356 |
String insertSizeMetrics = session.getJobFileAsString(jobName, "picard_CollectInsertSizeMetrics.txt", "UTF-8"); |
7106 |
12 Apr 23 |
nicklas |
357 |
String wgsMetrics = session.getJobFileAsString(jobName, "picard_CollectWgsMetrics.txt", "UTF-8"); |
7106 |
12 Apr 23 |
nicklas |
358 |
String alignmentSummaryMetrics = session.getJobFileAsString(jobName, "picard_CollectAlignmentSummaryMetrics.txt", "UTF-8"); |
7106 |
12 Apr 23 |
nicklas |
359 |
String qualityYieldMetrics = session.getJobFileAsString(jobName, "picard_CollectQualityYieldMetrics.txt", "UTF-8"); |
7089 |
04 Apr 23 |
nicklas |
360 |
String files = session.getJobFileAsString(jobName, "files.out", "UTF-8"); |
7089 |
04 Apr 23 |
nicklas |
361 |
|
7106 |
12 Apr 23 |
nicklas |
362 |
Metrics metrics = parseAlignedOut(sc, job, alignStatistics, markdupStats, insertSizeMetrics, wgsMetrics, alignmentSummaryMetrics, qualityYieldMetrics, files); |
7089 |
04 Apr 23 |
nicklas |
363 |
String msg = Values.formatNumber(metrics.numReadsAfterAlign/1000000f, 1) + "M reads after alignment; "; |
7105 |
11 Apr 23 |
nicklas |
364 |
msg += Values.formatNumber(metrics.fractionDuplication * 100, 1) + "% duplicates"; |
7105 |
11 Apr 23 |
nicklas |
365 |
msg += " ("+Values.formatNumber(metrics.fractionOpticalDuplication * 100, 1)+"% optical); "; |
7106 |
12 Apr 23 |
nicklas |
366 |
msg += Reggie.formatCount(metrics.alignedBases) + " aligned bases; "; |
7106 |
12 Apr 23 |
nicklas |
367 |
msg += Values.formatNumber(metrics.meanCoverage, 1) + " mean coverage; "; |
7091 |
04 Apr 23 |
nicklas |
368 |
msg += Values.formatNumber(metrics.hetPercentage, 0) + "% HET"; |
7089 |
04 Apr 23 |
nicklas |
369 |
return msg; |
7084 |
28 Mar 23 |
nicklas |
370 |
} |
7084 |
28 Mar 23 |
nicklas |
371 |
|
7106 |
12 Apr 23 |
nicklas |
372 |
private Metrics parseAlignedOut(SessionControl sc, Job job, String alignOut, String markdupOut, |
7106 |
12 Apr 23 |
nicklas |
373 |
String insertSizeMetrics, String wgsMetrics, String alignmentSummaryMetrics, String qualityYieldMetrics, String filesOut) |
7089 |
04 Apr 23 |
nicklas |
374 |
{ |
7089 |
04 Apr 23 |
nicklas |
375 |
Metrics metrics = new Metrics(); |
7089 |
04 Apr 23 |
nicklas |
376 |
|
7105 |
11 Apr 23 |
nicklas |
// alignment_statistics.txt |
7089 |
04 Apr 23 |
nicklas |
378 |
Pattern p = Pattern.compile("(\\d+)"); |
7089 |
04 Apr 23 |
nicklas |
379 |
for (String line : alignOut.split("\n")) |
7089 |
04 Apr 23 |
nicklas |
380 |
{ |
7089 |
04 Apr 23 |
nicklas |
381 |
Matcher m = p.matcher(line); |
7089 |
04 Apr 23 |
nicklas |
382 |
if (m.matches()) |
7089 |
04 Apr 23 |
nicklas |
383 |
{ |
7089 |
04 Apr 23 |
nicklas |
384 |
metrics.numReadsAfterAlign = Values.getLong(m.group(1), null); |
7089 |
04 Apr 23 |
nicklas |
385 |
if (logger.isDebugEnabled()) |
7089 |
04 Apr 23 |
nicklas |
386 |
{ |
7089 |
04 Apr 23 |
nicklas |
387 |
logger.debug("Found match: " + line + "; numReadsAfterAlign="+metrics.numReadsAfterAlign); |
7089 |
04 Apr 23 |
nicklas |
388 |
} |
7089 |
04 Apr 23 |
nicklas |
389 |
break; |
7089 |
04 Apr 23 |
nicklas |
390 |
} |
7089 |
04 Apr 23 |
nicklas |
391 |
} |
7089 |
04 Apr 23 |
nicklas |
392 |
|
7105 |
11 Apr 23 |
nicklas |
// samtools_markdup.txt |
7089 |
04 Apr 23 |
nicklas |
394 |
for (String line : markdupOut.split("\n")) |
7089 |
04 Apr 23 |
nicklas |
395 |
{ |
7089 |
04 Apr 23 |
nicklas |
396 |
String[] keyVal = line.split("\\:\\s*", 2); |
7089 |
04 Apr 23 |
nicklas |
397 |
if (keyVal.length == 2) |
7089 |
04 Apr 23 |
nicklas |
398 |
{ |
7089 |
04 Apr 23 |
nicklas |
399 |
String key = keyVal[0]; |
7089 |
04 Apr 23 |
nicklas |
400 |
long val = Values.getLong(keyVal[1]); |
7089 |
04 Apr 23 |
nicklas |
401 |
|
7089 |
04 Apr 23 |
nicklas |
402 |
if ("EXAMINED".equals(key)) |
7089 |
04 Apr 23 |
nicklas |
403 |
{ |
7089 |
04 Apr 23 |
nicklas |
404 |
metrics.totalExamined = val; |
7089 |
04 Apr 23 |
nicklas |
405 |
} |
7089 |
04 Apr 23 |
nicklas |
406 |
else if ("DUPLICATE TOTAL".equals(key)) |
7089 |
04 Apr 23 |
nicklas |
407 |
{ |
7089 |
04 Apr 23 |
nicklas |
408 |
metrics.totalDuplicates = val; |
7089 |
04 Apr 23 |
nicklas |
409 |
} |
7089 |
04 Apr 23 |
nicklas |
410 |
else if ("PAIRED".equals(key)) |
7089 |
04 Apr 23 |
nicklas |
411 |
{ |
7089 |
04 Apr 23 |
nicklas |
412 |
metrics.readPairsExamined = val / 2; |
7089 |
04 Apr 23 |
nicklas |
413 |
} |
7089 |
04 Apr 23 |
nicklas |
414 |
else if ("DUPLICATE PAIR".equals(key)) |
7089 |
04 Apr 23 |
nicklas |
415 |
{ |
7089 |
04 Apr 23 |
nicklas |
416 |
metrics.readPairDuplicates = val / 2; |
7089 |
04 Apr 23 |
nicklas |
417 |
} |
7105 |
11 Apr 23 |
nicklas |
418 |
else if ("DUPLICATE PAIR OPTICAL".equals(key)) |
7105 |
11 Apr 23 |
nicklas |
419 |
{ |
7105 |
11 Apr 23 |
nicklas |
420 |
metrics.readPairOpticalDuplicates = val / 2; |
7105 |
11 Apr 23 |
nicklas |
421 |
} |
7089 |
04 Apr 23 |
nicklas |
422 |
} |
7089 |
04 Apr 23 |
nicklas |
423 |
if (metrics.totalDuplicates != null && metrics.totalExamined != null) |
7089 |
04 Apr 23 |
nicklas |
424 |
{ |
7089 |
04 Apr 23 |
nicklas |
425 |
metrics.fractionDuplication = metrics.totalDuplicates.floatValue() / metrics.totalExamined.floatValue(); |
7089 |
04 Apr 23 |
nicklas |
426 |
} |
7105 |
11 Apr 23 |
nicklas |
427 |
if (metrics.readPairOpticalDuplicates != null && metrics.readPairsExamined != null) |
7105 |
11 Apr 23 |
nicklas |
428 |
{ |
7105 |
11 Apr 23 |
nicklas |
429 |
metrics.fractionOpticalDuplication = metrics.readPairOpticalDuplicates.floatValue() / metrics.readPairsExamined.floatValue(); |
7105 |
11 Apr 23 |
nicklas |
430 |
} |
7089 |
04 Apr 23 |
nicklas |
431 |
} |
7089 |
04 Apr 23 |
nicklas |
432 |
|
7105 |
11 Apr 23 |
nicklas |
// picard_CollectInsertSizeMetrics.txt |
7106 |
12 Apr 23 |
nicklas |
434 |
PicardMetrics pm = new PicardMetrics(insertSizeMetrics, "MEAN_INSERT_SIZE", "STANDARD_DEVIATION"); |
7106 |
12 Apr 23 |
nicklas |
435 |
if (pm.next()) |
7105 |
11 Apr 23 |
nicklas |
436 |
{ |
7106 |
12 Apr 23 |
nicklas |
437 |
metrics.fragmentSizeAvg = pm.getInteger("MEAN_INSERT_SIZE", null); |
7106 |
12 Apr 23 |
nicklas |
438 |
metrics.fragmentSizeStd = pm.getInteger("STANDARD_DEVIATION", null); |
7106 |
12 Apr 23 |
nicklas |
439 |
} |
7106 |
12 Apr 23 |
nicklas |
440 |
|
7106 |
12 Apr 23 |
nicklas |
441 |
pm = new PicardMetrics(wgsMetrics, "MEAN_COVERAGE", "SD_COVERAGE"); |
7106 |
12 Apr 23 |
nicklas |
442 |
if (pm.next()) |
7106 |
12 Apr 23 |
nicklas |
443 |
{ |
7106 |
12 Apr 23 |
nicklas |
444 |
metrics.meanCoverage = pm.getFloat("MEAN_COVERAGE", null); |
7106 |
12 Apr 23 |
nicklas |
445 |
metrics.sdCoverage = pm.getFloat("SD_COVERAGE", null); |
7106 |
12 Apr 23 |
nicklas |
446 |
} |
7106 |
12 Apr 23 |
nicklas |
447 |
|
7106 |
12 Apr 23 |
nicklas |
448 |
pm = new PicardMetrics(alignmentSummaryMetrics, "CATEGORY", "PF_ALIGNED_BASES"); |
7106 |
12 Apr 23 |
nicklas |
449 |
while (pm.next()) |
7106 |
12 Apr 23 |
nicklas |
450 |
{ |
7106 |
12 Apr 23 |
nicklas |
451 |
if ("PAIR".equals(pm.getString("CATEGORY", null))) |
7105 |
11 Apr 23 |
nicklas |
452 |
{ |
7106 |
12 Apr 23 |
nicklas |
453 |
metrics.alignedBases = pm.getLong("PF_ALIGNED_BASES", null); |
7106 |
12 Apr 23 |
nicklas |
454 |
break; |
7105 |
11 Apr 23 |
nicklas |
455 |
} |
7105 |
11 Apr 23 |
nicklas |
456 |
} |
7106 |
12 Apr 23 |
nicklas |
457 |
|
7106 |
12 Apr 23 |
nicklas |
458 |
pm = new PicardMetrics(qualityYieldMetrics, "PF_BASES", "PF_Q30_BASES"); |
7106 |
12 Apr 23 |
nicklas |
459 |
if (pm.next()) |
7106 |
12 Apr 23 |
nicklas |
460 |
{ |
7106 |
12 Apr 23 |
nicklas |
461 |
metrics.pfBases = pm.getLong("PF_BASES", null); |
7106 |
12 Apr 23 |
nicklas |
462 |
metrics.pfQ30Bases = pm.getLong("PF_Q30_BASES", null); |
7106 |
12 Apr 23 |
nicklas |
463 |
} |
7106 |
12 Apr 23 |
nicklas |
464 |
|
7089 |
04 Apr 23 |
nicklas |
465 |
DbControl dc = null; |
7089 |
04 Apr 23 |
nicklas |
466 |
try |
7089 |
04 Apr 23 |
nicklas |
467 |
{ |
7089 |
04 Apr 23 |
nicklas |
468 |
dc = sc.newDbControl("Reggie: Bwa-mem2 alignment completed handler"); |
7089 |
04 Apr 23 |
nicklas |
469 |
|
7089 |
04 Apr 23 |
nicklas |
470 |
AlignedSequences alignedSequences = AlignedSequences.getByJob(dc, job); |
7089 |
04 Apr 23 |
nicklas |
471 |
DerivedBioAssay aligned = alignedSequences.getItem(); |
7089 |
04 Apr 23 |
nicklas |
472 |
|
7089 |
04 Apr 23 |
nicklas |
473 |
Annotationtype.ALIGNED_PAIRS.setAnnotationValue(dc, aligned, metrics.numReadsAfterAlign); |
7089 |
04 Apr 23 |
nicklas |
474 |
Annotationtype.READ_PAIRS_EXAMINED.setAnnotationValue(dc, aligned, metrics.readPairsExamined); |
7089 |
04 Apr 23 |
nicklas |
475 |
Annotationtype.READ_PAIR_DUPLICATES.setAnnotationValue(dc, aligned, metrics.readPairDuplicates); |
7105 |
11 Apr 23 |
nicklas |
476 |
Annotationtype.READ_PAIR_OPTICAL_DUPLICATES.setAnnotationValue(dc, aligned, metrics.readPairOpticalDuplicates); |
7089 |
04 Apr 23 |
nicklas |
477 |
Annotationtype.FRACTION_DUPLICATION.setAnnotationValue(dc, aligned, metrics.fractionDuplication); |
7105 |
11 Apr 23 |
nicklas |
478 |
Annotationtype.FRACTION_OPTICAL_DUPLICATION.setAnnotationValue(dc, aligned, metrics.fractionOpticalDuplication); |
7105 |
11 Apr 23 |
nicklas |
479 |
Annotationtype.FRAGMENT_SIZE_AVG.setAnnotationValue(dc, aligned, metrics.fragmentSizeAvg); |
7105 |
11 Apr 23 |
nicklas |
480 |
Annotationtype.FRAGMENT_SIZE_STDEV.setAnnotationValue(dc, aligned, metrics.fragmentSizeStd); |
7089 |
04 Apr 23 |
nicklas |
481 |
|
7113 |
14 Apr 23 |
nicklas |
482 |
Annotationtype.PF_BASES.setAnnotationValue(dc, aligned, metrics.pfBases); |
7113 |
14 Apr 23 |
nicklas |
483 |
Annotationtype.PF_Q30_BASES.setAnnotationValue(dc, aligned, metrics.pfQ30Bases); |
7106 |
12 Apr 23 |
nicklas |
484 |
Annotationtype.ALIGNED_BASES.setAnnotationValue(dc, aligned, metrics.alignedBases); |
7106 |
12 Apr 23 |
nicklas |
485 |
Annotationtype.MEAN_COVERAGE.setAnnotationValue(dc, aligned, metrics.meanCoverage); |
7106 |
12 Apr 23 |
nicklas |
486 |
Annotationtype.SD_COVERAGE.setAnnotationValue(dc, aligned, metrics.sdCoverage); |
7106 |
12 Apr 23 |
nicklas |
487 |
|
7089 |
04 Apr 23 |
nicklas |
// Create file links |
7089 |
04 Apr 23 |
nicklas |
489 |
FileServer fileArchive = Fileserver.PROJECT_ARCHIVE_DNA.load(dc); |
7089 |
04 Apr 23 |
nicklas |
490 |
String analysisDir = Reggie.SECONDARY_ANALYSIS_DIR; |
7089 |
04 Apr 23 |
nicklas |
491 |
|
7089 |
04 Apr 23 |
nicklas |
492 |
String dataFilesFolder = (String)Annotationtype.DATA_FILES_FOLDER.getAnnotationValue(dc, aligned); |
7089 |
04 Apr 23 |
nicklas |
493 |
String baseFolder = Reggie.convertDataFilesFolderToBaseFolder(dataFilesFolder); |
7089 |
04 Apr 23 |
nicklas |
494 |
Directory localDataDir = Directory.getNew(dc, new Path(analysisDir+baseFolder, Path.Type.DIRECTORY)); |
7089 |
04 Apr 23 |
nicklas |
495 |
DataFileType bamData = Datafiletype.BAM.load(dc); |
7089 |
04 Apr 23 |
nicklas |
496 |
ItemSubtype bamType = bamData.getGenericType(); |
7089 |
04 Apr 23 |
nicklas |
497 |
ItemSubtype vcfType = Subtype.VARIANT_CALL_FORMAT.load(dc); |
7089 |
04 Apr 23 |
nicklas |
498 |
|
7089 |
04 Apr 23 |
nicklas |
499 |
int lineNo = 0; |
7089 |
04 Apr 23 |
nicklas |
500 |
for (String line : filesOut.split("\n")) |
7089 |
04 Apr 23 |
nicklas |
501 |
{ |
7089 |
04 Apr 23 |
nicklas |
502 |
lineNo++; |
7089 |
04 Apr 23 |
nicklas |
503 |
|
7089 |
04 Apr 23 |
nicklas |
504 |
File f = File.getFile(dc, localDataDir, line.substring(line.lastIndexOf("/")+1), true); |
7089 |
04 Apr 23 |
nicklas |
505 |
f.setFileServer(fileArchive); |
7089 |
04 Apr 23 |
nicklas |
506 |
String fileUrl = "sftp://" + fileArchive.getHost() + dataFilesFolder + "/" + f.getName(); |
7089 |
04 Apr 23 |
nicklas |
507 |
try |
7089 |
04 Apr 23 |
nicklas |
508 |
{ |
7089 |
04 Apr 23 |
nicklas |
509 |
f.setUrl(fileUrl, true); |
7089 |
04 Apr 23 |
nicklas |
510 |
} |
7089 |
04 Apr 23 |
nicklas |
511 |
catch (RuntimeException ex) |
7089 |
04 Apr 23 |
nicklas |
512 |
{ |
7089 |
04 Apr 23 |
nicklas |
513 |
f.setUrl(fileUrl, false); |
7089 |
04 Apr 23 |
nicklas |
514 |
} |
7089 |
04 Apr 23 |
nicklas |
515 |
|
7089 |
04 Apr 23 |
nicklas |
516 |
if (!f.isInDatabase()) |
7089 |
04 Apr 23 |
nicklas |
517 |
{ |
7089 |
04 Apr 23 |
nicklas |
518 |
dc.saveItem(f); |
7089 |
04 Apr 23 |
nicklas |
519 |
} |
7089 |
04 Apr 23 |
nicklas |
520 |
if (f.getName().equals("alignment.bam")) |
7089 |
04 Apr 23 |
nicklas |
521 |
{ |
7089 |
04 Apr 23 |
nicklas |
522 |
f.setDescription(metrics.numReadsAfterAlign + " ALIGNED PAIRS"); |
7089 |
04 Apr 23 |
nicklas |
523 |
f.setItemSubtype(bamType); |
7089 |
04 Apr 23 |
nicklas |
524 |
FileSetMember member = aligned.getFileSet().addMember(f, bamData); |
7089 |
04 Apr 23 |
nicklas |
525 |
} |
7089 |
04 Apr 23 |
nicklas |
526 |
else |
7089 |
04 Apr 23 |
nicklas |
527 |
{ |
7089 |
04 Apr 23 |
nicklas |
528 |
AnyToAny link = AnyToAny.getNewOrExisting(dc, aligned, f.getName(), f, true); |
7089 |
04 Apr 23 |
nicklas |
529 |
if (!link.isInDatabase()) dc.saveItem(link); |
7091 |
04 Apr 23 |
nicklas |
530 |
|
7089 |
04 Apr 23 |
nicklas |
// We copy the VCF file to the BASE server |
7089 |
04 Apr 23 |
nicklas |
532 |
if ("qc_genotype.vcf".equals(f.getName())) |
7089 |
04 Apr 23 |
nicklas |
533 |
{ |
7089 |
04 Apr 23 |
nicklas |
534 |
f.setMimeTypeAuto("text/plain", vcfType); |
7089 |
04 Apr 23 |
nicklas |
535 |
VcfData vcfData = copyToBaseAndParse(f, aligned); |
7089 |
04 Apr 23 |
nicklas |
536 |
if (vcfData != null) |
7089 |
04 Apr 23 |
nicklas |
537 |
{ |
7089 |
04 Apr 23 |
nicklas |
538 |
metrics.genotypeCount = vcfData.getGtCount(); |
7089 |
04 Apr 23 |
nicklas |
539 |
metrics.hetCount = vcfData.getHetCount(); |
7089 |
04 Apr 23 |
nicklas |
540 |
metrics.hetPercentage = vcfData.getHetPercentage(); |
7089 |
04 Apr 23 |
nicklas |
541 |
Annotationtype.QC_GENOTYPE_COUNT.setAnnotationValue(dc, aligned, metrics.genotypeCount); |
7089 |
04 Apr 23 |
nicklas |
542 |
if (metrics.genotypeCount > 0) |
7089 |
04 Apr 23 |
nicklas |
543 |
{ |
7089 |
04 Apr 23 |
nicklas |
544 |
Annotationtype.QC_GENOTYPE_HET_PCT.setAnnotationValue(dc, aligned, metrics.hetPercentage); |
7089 |
04 Apr 23 |
nicklas |
545 |
} |
7091 |
04 Apr 23 |
nicklas |
// TODO -- for now we disable genotype qc for this alignment since |
7091 |
04 Apr 23 |
nicklas |
// there is not yet support in the wizard for this type of alignment |
7091 |
04 Apr 23 |
nicklas |
548 |
Annotationtype.QC_GENOTYPE_STATUS.setAnnotationValue(dc, aligned, "Disabled"); |
7089 |
04 Apr 23 |
nicklas |
549 |
} |
7089 |
04 Apr 23 |
nicklas |
550 |
} |
7089 |
04 Apr 23 |
nicklas |
551 |
} |
7089 |
04 Apr 23 |
nicklas |
552 |
} |
7089 |
04 Apr 23 |
nicklas |
553 |
|
7089 |
04 Apr 23 |
nicklas |
554 |
dc.commit(); |
7089 |
04 Apr 23 |
nicklas |
555 |
} |
7089 |
04 Apr 23 |
nicklas |
556 |
finally |
7089 |
04 Apr 23 |
nicklas |
557 |
{ |
7089 |
04 Apr 23 |
nicklas |
558 |
if (dc != null) dc.close(); |
7089 |
04 Apr 23 |
nicklas |
559 |
} |
7089 |
04 Apr 23 |
nicklas |
560 |
|
7089 |
04 Apr 23 |
nicklas |
561 |
return metrics; |
7089 |
04 Apr 23 |
nicklas |
562 |
} |
7089 |
04 Apr 23 |
nicklas |
563 |
|
7091 |
04 Apr 23 |
nicklas |
564 |
/** |
7091 |
04 Apr 23 |
nicklas |
Helper method for copying the VCF file from the file server |
7091 |
04 Apr 23 |
nicklas |
while at the same time parsing it and extracting genotype |
7091 |
04 Apr 23 |
nicklas |
information and statistics. |
7091 |
04 Apr 23 |
nicklas |
568 |
*/ |
7091 |
04 Apr 23 |
nicklas |
569 |
private VcfData copyToBaseAndParse(File vcfFile, DerivedBioAssay alignment) |
7091 |
04 Apr 23 |
nicklas |
570 |
{ |
7091 |
04 Apr 23 |
nicklas |
// Stream for copying the vcfFile |
7091 |
04 Apr 23 |
nicklas |
572 |
InputStream fromFileServer = null; |
7091 |
04 Apr 23 |
nicklas |
573 |
OutputStream toBase = null; |
7091 |
04 Apr 23 |
nicklas |
574 |
|
7091 |
04 Apr 23 |
nicklas |
// The splitter will help us parse the file and at the same time copy it to BASE. |
7091 |
04 Apr 23 |
nicklas |
576 |
InputStreamSplitter splitter = null; |
7091 |
04 Apr 23 |
nicklas |
577 |
|
7091 |
04 Apr 23 |
nicklas |
578 |
VcfData vcfData = null; |
7091 |
04 Apr 23 |
nicklas |
579 |
try |
7091 |
04 Apr 23 |
nicklas |
580 |
{ |
7091 |
04 Apr 23 |
nicklas |
581 |
fromFileServer = vcfFile.getDownloadStream(0); |
7091 |
04 Apr 23 |
nicklas |
582 |
toBase = vcfFile.getUploadStream(false); |
7091 |
04 Apr 23 |
nicklas |
583 |
splitter = new InputStreamSplitter(fromFileServer, true, true, toBase); |
7091 |
04 Apr 23 |
nicklas |
584 |
|
7091 |
04 Apr 23 |
nicklas |
585 |
VcfParser parser = new VcfParser(); |
7091 |
04 Apr 23 |
nicklas |
586 |
vcfData = parser.parse(splitter, vcfFile.getName()); |
7091 |
04 Apr 23 |
nicklas |
587 |
vcfFile.setFileServer(null); |
7091 |
04 Apr 23 |
nicklas |
588 |
} |
7091 |
04 Apr 23 |
nicklas |
589 |
catch (Exception ex) |
7091 |
04 Apr 23 |
nicklas |
590 |
{ |
7091 |
04 Apr 23 |
nicklas |
591 |
throw new BaseException("Could not parse '" + vcfFile.getName() + "' for alignment: " + alignment.getName(), ex); |
7091 |
04 Apr 23 |
nicklas |
592 |
} |
7091 |
04 Apr 23 |
nicklas |
593 |
finally |
7091 |
04 Apr 23 |
nicklas |
594 |
{ |
7091 |
04 Apr 23 |
nicklas |
595 |
FileUtil.close(splitter); |
7091 |
04 Apr 23 |
nicklas |
596 |
FileUtil.close(toBase); |
7091 |
04 Apr 23 |
nicklas |
597 |
FileUtil.close(fromFileServer); |
7091 |
04 Apr 23 |
nicklas |
598 |
} |
7091 |
04 Apr 23 |
nicklas |
599 |
return vcfData; |
7091 |
04 Apr 23 |
nicklas |
600 |
} |
7091 |
04 Apr 23 |
nicklas |
601 |
|
7084 |
28 Mar 23 |
nicklas |
602 |
} |
7089 |
04 Apr 23 |
nicklas |
603 |
|
7089 |
04 Apr 23 |
nicklas |
604 |
static class Metrics |
7089 |
04 Apr 23 |
nicklas |
605 |
{ |
7089 |
04 Apr 23 |
nicklas |
606 |
Long numReadsAfterAlign = null; |
7089 |
04 Apr 23 |
nicklas |
607 |
Long readPairsExamined = null; |
7089 |
04 Apr 23 |
nicklas |
608 |
Long readPairDuplicates = null; |
7105 |
11 Apr 23 |
nicklas |
609 |
Long readPairOpticalDuplicates = null; |
7089 |
04 Apr 23 |
nicklas |
610 |
Float fractionDuplication = null; |
7105 |
11 Apr 23 |
nicklas |
611 |
Float fractionOpticalDuplication = null; |
7089 |
04 Apr 23 |
nicklas |
612 |
Long totalExamined = null; |
7089 |
04 Apr 23 |
nicklas |
613 |
Long totalDuplicates = null; |
7106 |
12 Apr 23 |
nicklas |
614 |
Float meanCoverage = null; |
7106 |
12 Apr 23 |
nicklas |
615 |
Float sdCoverage = null; |
7106 |
12 Apr 23 |
nicklas |
616 |
Long alignedBases = null; |
7106 |
12 Apr 23 |
nicklas |
617 |
Long pfBases = null; |
7106 |
12 Apr 23 |
nicklas |
618 |
Long pfQ30Bases = null; |
7106 |
12 Apr 23 |
nicklas |
619 |
Integer fragmentSizeAvg = null; |
7106 |
12 Apr 23 |
nicklas |
620 |
Integer fragmentSizeStd = null; |
7089 |
04 Apr 23 |
nicklas |
// int fragmentSizeCount = -1; |
7091 |
04 Apr 23 |
nicklas |
622 |
int genotypeCount = -1; |
7091 |
04 Apr 23 |
nicklas |
623 |
int hetCount = -1; |
7091 |
04 Apr 23 |
nicklas |
624 |
float hetPercentage = Float.NaN; |
7089 |
04 Apr 23 |
nicklas |
625 |
} |
7084 |
28 Mar 23 |
nicklas |
626 |
|
7084 |
28 Mar 23 |
nicklas |
627 |
|
7100 |
06 Apr 23 |
nicklas |
628 |
static class FastqPair |
7100 |
06 Apr 23 |
nicklas |
629 |
{ |
7100 |
06 Apr 23 |
nicklas |
630 |
String R1; |
7100 |
06 Apr 23 |
nicklas |
631 |
String R2; |
7100 |
06 Apr 23 |
nicklas |
632 |
String flowCellId; |
7100 |
06 Apr 23 |
nicklas |
633 |
Integer lane; |
7100 |
06 Apr 23 |
nicklas |
634 |
} |
7100 |
06 Apr 23 |
nicklas |
635 |
|
7084 |
28 Mar 23 |
nicklas |
636 |
} |