6200 |
08 Apr 21 |
nicklas |
1 |
package net.sf.basedb.reggie.plugins.cmd; |
6200 |
08 Apr 21 |
nicklas |
2 |
|
6955 |
12 Dec 22 |
nicklas |
3 |
import java.io.BufferedReader; |
6200 |
08 Apr 21 |
nicklas |
4 |
import java.io.InputStream; |
6955 |
12 Dec 22 |
nicklas |
5 |
import java.io.InputStreamReader; |
6955 |
12 Dec 22 |
nicklas |
6 |
import java.nio.charset.StandardCharsets; |
6200 |
08 Apr 21 |
nicklas |
7 |
import java.util.Date; |
6474 |
04 Nov 21 |
nicklas |
8 |
import java.util.Locale; |
6955 |
12 Dec 22 |
nicklas |
9 |
import java.util.zip.GZIPInputStream; |
6200 |
08 Apr 21 |
nicklas |
10 |
|
6200 |
08 Apr 21 |
nicklas |
11 |
import org.json.simple.JSONObject; |
6200 |
08 Apr 21 |
nicklas |
12 |
|
6201 |
09 Apr 21 |
nicklas |
13 |
import net.sf.basedb.core.DbControl; |
6896 |
28 Nov 22 |
nicklas |
14 |
import net.sf.basedb.opengrid.RemoteSession; |
6200 |
08 Apr 21 |
nicklas |
15 |
import net.sf.basedb.opengrid.filetransfer.FileMetaData; |
6200 |
08 Apr 21 |
nicklas |
16 |
import net.sf.basedb.reggie.Reggie; |
6955 |
12 Dec 22 |
nicklas |
17 |
import net.sf.basedb.util.EqualsHelper; |
6200 |
08 Apr 21 |
nicklas |
18 |
import net.sf.basedb.util.FileUtil; |
7050 |
17 Feb 23 |
nicklas |
19 |
import net.sf.basedb.util.uri.UriMetadata; |
6200 |
08 Apr 21 |
nicklas |
20 |
|
6200 |
08 Apr 21 |
nicklas |
21 |
/** |
6200 |
08 Apr 21 |
nicklas |
Holds information about the FASTQ files that are related |
6200 |
08 Apr 21 |
nicklas |
to the main JSON file. |
6200 |
08 Apr 21 |
nicklas |
@since 4.32 |
6200 |
08 Apr 21 |
nicklas |
25 |
*/ |
6200 |
08 Apr 21 |
nicklas |
26 |
public class FastqInfo |
6200 |
08 Apr 21 |
nicklas |
27 |
{ |
6200 |
08 Apr 21 |
nicklas |
28 |
|
6200 |
08 Apr 21 |
nicklas |
29 |
private final JsonSection section; |
6200 |
08 Apr 21 |
nicklas |
30 |
|
6200 |
08 Apr 21 |
nicklas |
31 |
public FastqFile R1; |
6200 |
08 Apr 21 |
nicklas |
32 |
public FastqFile R2; |
6955 |
12 Dec 22 |
nicklas |
33 |
public String sequencerSerial; |
6955 |
12 Dec 22 |
nicklas |
34 |
public String flowCellId; |
6955 |
12 Dec 22 |
nicklas |
35 |
public String runNumber; |
6960 |
13 Dec 22 |
nicklas |
36 |
public String barcodeSequences; |
6200 |
08 Apr 21 |
nicklas |
37 |
|
6200 |
08 Apr 21 |
nicklas |
38 |
public boolean valid; |
6200 |
08 Apr 21 |
nicklas |
39 |
|
6900 |
29 Nov 22 |
nicklas |
40 |
public FastqInfo(JsonSection fastq, ImportContext ctx) |
6200 |
08 Apr 21 |
nicklas |
41 |
{ |
6200 |
08 Apr 21 |
nicklas |
42 |
this.section = fastq; |
6200 |
08 Apr 21 |
nicklas |
43 |
if (fastq != null) |
6200 |
08 Apr 21 |
nicklas |
44 |
{ |
6200 |
08 Apr 21 |
nicklas |
45 |
R1 = fastq.getRequiredEntry("R1", new FastqFile()); |
6200 |
08 Apr 21 |
nicklas |
46 |
R2 = fastq.getRequiredEntry("R2", new FastqFile()); |
6457 |
01 Nov 21 |
nicklas |
47 |
if (R1 != null) |
6457 |
01 Nov 21 |
nicklas |
48 |
{ |
6457 |
01 Nov 21 |
nicklas |
//R1.md5 = fastq.getOptionalEntry("R1 MD5", PatternValidator.MD5); |
6510 |
03 Dec 21 |
nicklas |
50 |
R1.expectedSize = fastq.getOptionalEntry("R1size", NullValidator.allowNull(LongValidator.POSITIVE)); |
6900 |
29 Nov 22 |
nicklas |
51 |
if (ctx != null) |
6900 |
29 Nov 22 |
nicklas |
52 |
{ |
6900 |
29 Nov 22 |
nicklas |
53 |
JsonSection duplicate = ctx.add("FASTQ:"+R1.name, section); |
6900 |
29 Nov 22 |
nicklas |
54 |
if (duplicate != null) |
6900 |
29 Nov 22 |
nicklas |
55 |
{ |
6900 |
29 Nov 22 |
nicklas |
56 |
String msg = "FASTQ R1 ["+R1.name+"] duplicated in file: "; |
6900 |
29 Nov 22 |
nicklas |
57 |
section.addErrorMessage(msg+duplicate.getFile().getName()); |
6900 |
29 Nov 22 |
nicklas |
58 |
duplicate.addErrorMessage(msg+section.getFile().getName()); |
6900 |
29 Nov 22 |
nicklas |
59 |
} |
6900 |
29 Nov 22 |
nicklas |
60 |
} |
6457 |
01 Nov 21 |
nicklas |
61 |
} |
6457 |
01 Nov 21 |
nicklas |
62 |
if (R2 != null) |
6457 |
01 Nov 21 |
nicklas |
63 |
{ |
6457 |
01 Nov 21 |
nicklas |
//R2.md5 = fastq.getOptionalEntry("R2 MD5", PatternValidator.MD5); |
6510 |
03 Dec 21 |
nicklas |
65 |
R2.expectedSize = fastq.getOptionalEntry("R2size", NullValidator.allowNull(LongValidator.POSITIVE)); |
6900 |
29 Nov 22 |
nicklas |
66 |
if (ctx != null) |
6900 |
29 Nov 22 |
nicklas |
67 |
{ |
6900 |
29 Nov 22 |
nicklas |
68 |
JsonSection duplicate = ctx.add("FASTQ:"+R2.name, section); |
6900 |
29 Nov 22 |
nicklas |
69 |
if (duplicate != null) |
6900 |
29 Nov 22 |
nicklas |
70 |
{ |
6900 |
29 Nov 22 |
nicklas |
71 |
String msg = "FASTQ R2 ["+R2.name+"] duplicated in file: "; |
6900 |
29 Nov 22 |
nicklas |
72 |
section.addErrorMessage(msg+duplicate.getFile().getName()); |
6900 |
29 Nov 22 |
nicklas |
73 |
duplicate.addErrorMessage(msg+section.getFile().getName()); |
6900 |
29 Nov 22 |
nicklas |
74 |
} |
6900 |
29 Nov 22 |
nicklas |
75 |
} |
6457 |
01 Nov 21 |
nicklas |
76 |
} |
6200 |
08 Apr 21 |
nicklas |
77 |
} |
6200 |
08 Apr 21 |
nicklas |
78 |
valid = fastq != null && !fastq.hasError(); |
6200 |
08 Apr 21 |
nicklas |
79 |
} |
6200 |
08 Apr 21 |
nicklas |
80 |
|
6894 |
25 Nov 22 |
nicklas |
81 |
|
6955 |
12 Dec 22 |
nicklas |
82 |
public void loadFileInfo(ImportContext ctx, RemoteSession session, String directory, boolean fullValidation) |
6200 |
08 Apr 21 |
nicklas |
83 |
{ |
6955 |
12 Dec 22 |
nicklas |
84 |
if (R1 != null) loadMetadata(ctx, session, directory, R1, fullValidation); |
6955 |
12 Dec 22 |
nicklas |
85 |
if (R2 != null) loadMetadata(ctx, session, directory, R2, fullValidation); |
6955 |
12 Dec 22 |
nicklas |
86 |
if (fullValidation && R1 != null && R2 != null) |
6955 |
12 Dec 22 |
nicklas |
87 |
{ |
6955 |
12 Dec 22 |
nicklas |
// If the headers match (they should!) in both FASTQ files we copy the values |
6955 |
12 Dec 22 |
nicklas |
89 |
if (!EqualsHelper.equals(R1.flowCellId, R2.flowCellId)) |
6955 |
12 Dec 22 |
nicklas |
90 |
{ |
6955 |
12 Dec 22 |
nicklas |
91 |
section.addErrorMessage("FlowCellID mismatch in FASTQ header: R1="+R1.flowCellId+"; R2="+R2.flowCellId); |
6955 |
12 Dec 22 |
nicklas |
92 |
} |
6955 |
12 Dec 22 |
nicklas |
93 |
else |
6955 |
12 Dec 22 |
nicklas |
94 |
{ |
6955 |
12 Dec 22 |
nicklas |
95 |
this.flowCellId = R1.flowCellId; |
6955 |
12 Dec 22 |
nicklas |
96 |
} |
6955 |
12 Dec 22 |
nicklas |
97 |
if (!EqualsHelper.equals(R1.sequencerSerial, R2.sequencerSerial)) |
6955 |
12 Dec 22 |
nicklas |
98 |
{ |
6955 |
12 Dec 22 |
nicklas |
99 |
section.addErrorMessage("SerialNumber mismatch in FASTQ header: R1="+R1.sequencerSerial+"; R2="+R2.sequencerSerial); |
6955 |
12 Dec 22 |
nicklas |
100 |
} |
6955 |
12 Dec 22 |
nicklas |
101 |
else |
6955 |
12 Dec 22 |
nicklas |
102 |
{ |
6955 |
12 Dec 22 |
nicklas |
103 |
this.sequencerSerial = R1.sequencerSerial; |
6955 |
12 Dec 22 |
nicklas |
104 |
} |
6955 |
12 Dec 22 |
nicklas |
105 |
if (!EqualsHelper.equals(R1.runNumber, R2.runNumber)) |
6955 |
12 Dec 22 |
nicklas |
106 |
{ |
6955 |
12 Dec 22 |
nicklas |
107 |
section.addErrorMessage("RunNumber mismatch in FASTQ header: R1="+R1.runNumber+"; R2="+R2.runNumber); |
6955 |
12 Dec 22 |
nicklas |
108 |
} |
6955 |
12 Dec 22 |
nicklas |
109 |
else |
6955 |
12 Dec 22 |
nicklas |
110 |
{ |
6955 |
12 Dec 22 |
nicklas |
111 |
this.runNumber = R1.runNumber; |
6955 |
12 Dec 22 |
nicklas |
112 |
} |
6960 |
13 Dec 22 |
nicklas |
113 |
if (R1.barcodeSequences != null && R1.barcodeSequences != null) |
6960 |
13 Dec 22 |
nicklas |
114 |
{ |
6960 |
13 Dec 22 |
nicklas |
115 |
if (!BarcodeUtil.matches(R1.barcodeSequences, R2.barcodeSequences, 4, 2)) |
6960 |
13 Dec 22 |
nicklas |
116 |
{ |
6960 |
13 Dec 22 |
nicklas |
117 |
section.addErrorMessage("BarcodeSequence mismatch in FASTQ header: R1="+R1.barcodeSequences+"; R2="+R2.barcodeSequences); |
6960 |
13 Dec 22 |
nicklas |
118 |
} |
6960 |
13 Dec 22 |
nicklas |
119 |
else |
6960 |
13 Dec 22 |
nicklas |
120 |
{ |
6960 |
13 Dec 22 |
nicklas |
121 |
this.barcodeSequences = BarcodeUtil.mergeN(R1.barcodeSequences, R2.barcodeSequences); |
6960 |
13 Dec 22 |
nicklas |
122 |
} |
6960 |
13 Dec 22 |
nicklas |
123 |
} |
6960 |
13 Dec 22 |
nicklas |
124 |
|
6955 |
12 Dec 22 |
nicklas |
125 |
if (section.hasError()) valid = false; |
6955 |
12 Dec 22 |
nicklas |
126 |
} |
6200 |
08 Apr 21 |
nicklas |
127 |
} |
6200 |
08 Apr 21 |
nicklas |
128 |
|
6955 |
12 Dec 22 |
nicklas |
129 |
private void loadMetadata(ImportContext ctx, RemoteSession session, String directory, FastqFile f, boolean fullValidation) |
6200 |
08 Apr 21 |
nicklas |
130 |
{ |
6200 |
08 Apr 21 |
nicklas |
131 |
InputStream tmp = null; |
6200 |
08 Apr 21 |
nicklas |
132 |
try |
6200 |
08 Apr 21 |
nicklas |
133 |
{ |
6896 |
28 Nov 22 |
nicklas |
134 |
FileMetaData info = null; |
6909 |
30 Nov 22 |
nicklas |
135 |
if (ctx != null && ctx.hasFileInfo()) |
6896 |
28 Nov 22 |
nicklas |
136 |
{ |
6896 |
28 Nov 22 |
nicklas |
137 |
info = ctx.getFileInfo(f.name); |
6896 |
28 Nov 22 |
nicklas |
138 |
} |
6955 |
12 Dec 22 |
nicklas |
139 |
if (info == null || fullValidation) |
6896 |
28 Nov 22 |
nicklas |
140 |
{ |
6896 |
28 Nov 22 |
nicklas |
141 |
info = new FileMetaData(); |
6896 |
28 Nov 22 |
nicklas |
142 |
tmp = session.readFile(directory+f.name, info); |
6896 |
28 Nov 22 |
nicklas |
143 |
} |
6894 |
25 Nov 22 |
nicklas |
144 |
if (info != null) |
6894 |
25 Nov 22 |
nicklas |
145 |
{ |
6894 |
25 Nov 22 |
nicklas |
146 |
f.actualSize = info.getSize(); |
6894 |
25 Nov 22 |
nicklas |
147 |
f.lastModified = info.getLastModifiedTime(); |
6955 |
12 Dec 22 |
nicklas |
148 |
if (fullValidation && f.actualSize > 0) |
6955 |
12 Dec 22 |
nicklas |
149 |
{ |
6955 |
12 Dec 22 |
nicklas |
// Get the first line of the FASTQ file |
6955 |
12 Dec 22 |
nicklas |
151 |
tmp = new GZIPInputStream(tmp); |
6955 |
12 Dec 22 |
nicklas |
152 |
BufferedReader r = new BufferedReader(new InputStreamReader(tmp, StandardCharsets.US_ASCII)); |
6955 |
12 Dec 22 |
nicklas |
153 |
String headerLine = r.readLine(); |
6955 |
12 Dec 22 |
nicklas |
154 |
r.close(); |
6955 |
12 Dec 22 |
nicklas |
155 |
if (headerLine != null) |
6955 |
12 Dec 22 |
nicklas |
156 |
{ |
6960 |
13 Dec 22 |
nicklas |
// The header is formatted as: @SerialNumber:RunNumber:FlowCellId:...other fields...:Barcode sequecenes |
6955 |
12 Dec 22 |
nicklas |
// Split on ':' from second character |
6955 |
12 Dec 22 |
nicklas |
159 |
String[] headers = headerLine.substring(1).split(":"); |
6955 |
12 Dec 22 |
nicklas |
160 |
if (headers.length >= 3) |
6955 |
12 Dec 22 |
nicklas |
161 |
{ |
6955 |
12 Dec 22 |
nicklas |
162 |
f.sequencerSerial = headers[0]; |
6955 |
12 Dec 22 |
nicklas |
163 |
f.runNumber = headers[1]; |
6955 |
12 Dec 22 |
nicklas |
164 |
f.flowCellId = headers[2]; |
6960 |
13 Dec 22 |
nicklas |
165 |
f.barcodeSequences = headers[headers.length-1]; |
6955 |
12 Dec 22 |
nicklas |
166 |
} |
6955 |
12 Dec 22 |
nicklas |
167 |
} |
6955 |
12 Dec 22 |
nicklas |
168 |
} |
6894 |
25 Nov 22 |
nicklas |
169 |
} |
6457 |
01 Nov 21 |
nicklas |
170 |
if (f.actualSize == 0) |
6200 |
08 Apr 21 |
nicklas |
171 |
{ |
6200 |
08 Apr 21 |
nicklas |
172 |
section.addErrorMessage("FASTQ file is missing: " + f.name); |
6894 |
25 Nov 22 |
nicklas |
// section.addWarningMessage("FASTQ file is missing: " + f.name); |
6200 |
08 Apr 21 |
nicklas |
174 |
} |
6457 |
01 Nov 21 |
nicklas |
175 |
else if (f.expectedSize != null && f.actualSize != f.expectedSize) |
6457 |
01 Nov 21 |
nicklas |
176 |
{ |
6474 |
04 Nov 21 |
nicklas |
177 |
section.addErrorMessage("FASTQ file size: "+f.name+"="+String.format(Locale.US, "%,d", f.actualSize)+" bytes (expected "+String.format(Locale.US, "%,d", f.expectedSize)+" bytes)"); |
6457 |
01 Nov 21 |
nicklas |
178 |
} |
6200 |
08 Apr 21 |
nicklas |
179 |
} |
6200 |
08 Apr 21 |
nicklas |
180 |
catch (Exception ex) |
6200 |
08 Apr 21 |
nicklas |
181 |
{ |
6200 |
08 Apr 21 |
nicklas |
182 |
section.addErrorMessage("Could not stat FASTQ file '"+f.name+"': " + ex.getMessage()); |
6200 |
08 Apr 21 |
nicklas |
183 |
} |
6200 |
08 Apr 21 |
nicklas |
184 |
finally |
6200 |
08 Apr 21 |
nicklas |
185 |
{ |
6200 |
08 Apr 21 |
nicklas |
186 |
FileUtil.close(tmp); |
6200 |
08 Apr 21 |
nicklas |
187 |
if (section.hasError()) valid = false; |
6200 |
08 Apr 21 |
nicklas |
188 |
} |
6200 |
08 Apr 21 |
nicklas |
189 |
} |
6200 |
08 Apr 21 |
nicklas |
190 |
|
6200 |
08 Apr 21 |
nicklas |
191 |
public static class FastqFile |
6200 |
08 Apr 21 |
nicklas |
192 |
implements ValueValidator<String, FastqFile> |
6200 |
08 Apr 21 |
nicklas |
193 |
{ |
6200 |
08 Apr 21 |
nicklas |
194 |
public String name; |
6221 |
23 Apr 21 |
nicklas |
195 |
public String md5; |
6457 |
01 Nov 21 |
nicklas |
196 |
public Long expectedSize; |
6457 |
01 Nov 21 |
nicklas |
197 |
public long actualSize; |
6200 |
08 Apr 21 |
nicklas |
198 |
public long lastModified; |
6200 |
08 Apr 21 |
nicklas |
199 |
|
6955 |
12 Dec 22 |
nicklas |
200 |
public String sequencerSerial; |
6955 |
12 Dec 22 |
nicklas |
201 |
public String flowCellId; |
6955 |
12 Dec 22 |
nicklas |
202 |
public String runNumber; |
6960 |
13 Dec 22 |
nicklas |
203 |
public String barcodeSequences; |
6955 |
12 Dec 22 |
nicklas |
204 |
|
6200 |
08 Apr 21 |
nicklas |
205 |
@Override |
6200 |
08 Apr 21 |
nicklas |
206 |
public Class<String> getExpectedClass() |
6200 |
08 Apr 21 |
nicklas |
207 |
{ |
6200 |
08 Apr 21 |
nicklas |
208 |
return String.class; |
6200 |
08 Apr 21 |
nicklas |
209 |
} |
6200 |
08 Apr 21 |
nicklas |
210 |
@Override |
6201 |
09 Apr 21 |
nicklas |
211 |
public FastqFile isValid(DbControl dc, String value, JsonSection section, String entryKey) |
6200 |
08 Apr 21 |
nicklas |
212 |
{ |
6221 |
23 Apr 21 |
nicklas |
213 |
this.name = PatternValidator.FILE_NAME.isValid(dc, value, section, entryKey); |
6221 |
23 Apr 21 |
nicklas |
214 |
if (name != null && !name.endsWith(".fastq.gz")) |
6221 |
23 Apr 21 |
nicklas |
215 |
{ |
6221 |
23 Apr 21 |
nicklas |
216 |
section.addWarningMessage("FASTQ filename doesn't end with '.fastq.gz': "+entryKey+"="+value); |
6221 |
23 Apr 21 |
nicklas |
217 |
} |
6221 |
23 Apr 21 |
nicklas |
218 |
return name == null ? null : this; |
6200 |
08 Apr 21 |
nicklas |
219 |
} |
6200 |
08 Apr 21 |
nicklas |
220 |
|
7050 |
17 Feb 23 |
nicklas |
221 |
public UriMetadata asUriMetadata() |
7050 |
17 Feb 23 |
nicklas |
222 |
{ |
7050 |
17 Feb 23 |
nicklas |
223 |
UriMetadata meta = new UriMetadata(null); |
7050 |
17 Feb 23 |
nicklas |
224 |
meta.setLength(actualSize); |
7050 |
17 Feb 23 |
nicklas |
225 |
meta.setLastModified(new Date(lastModified)); |
7050 |
17 Feb 23 |
nicklas |
226 |
meta.setMimeType("application/x-gzip"); |
7050 |
17 Feb 23 |
nicklas |
227 |
meta.setMd5(md5); |
7050 |
17 Feb 23 |
nicklas |
228 |
return meta; |
7050 |
17 Feb 23 |
nicklas |
229 |
} |
7050 |
17 Feb 23 |
nicklas |
230 |
|
6200 |
08 Apr 21 |
nicklas |
231 |
public JSONObject asJSONObject() |
6200 |
08 Apr 21 |
nicklas |
232 |
{ |
6200 |
08 Apr 21 |
nicklas |
233 |
JSONObject j = new JSONObject(); |
6200 |
08 Apr 21 |
nicklas |
234 |
j.put("name", name); |
6221 |
23 Apr 21 |
nicklas |
235 |
j.put("md5", md5); |
6457 |
01 Nov 21 |
nicklas |
236 |
j.put("size", actualSize); |
6200 |
08 Apr 21 |
nicklas |
237 |
j.put("lastModified", Reggie.CONVERTER_DATETIME_TO_STRING_WITH_SEPARATOR.convert(new Date(lastModified))); |
6200 |
08 Apr 21 |
nicklas |
238 |
return j; |
6200 |
08 Apr 21 |
nicklas |
239 |
} |
6200 |
08 Apr 21 |
nicklas |
240 |
|
6200 |
08 Apr 21 |
nicklas |
241 |
} |
6200 |
08 Apr 21 |
nicklas |
242 |
} |