5730 |
18 Nov 19 |
nicklas |
1 |
package net.sf.basedb.reggie.vcf; |
5730 |
18 Nov 19 |
nicklas |
2 |
|
5730 |
18 Nov 19 |
nicklas |
3 |
import java.util.Collection; |
5730 |
18 Nov 19 |
nicklas |
4 |
import java.util.HashMap; |
5730 |
18 Nov 19 |
nicklas |
5 |
import java.util.Map; |
5730 |
18 Nov 19 |
nicklas |
6 |
import java.util.regex.Matcher; |
5730 |
18 Nov 19 |
nicklas |
7 |
import java.util.regex.Pattern; |
5730 |
18 Nov 19 |
nicklas |
8 |
|
5730 |
18 Nov 19 |
nicklas |
9 |
import net.sf.basedb.core.Type; |
5730 |
18 Nov 19 |
nicklas |
10 |
|
5730 |
18 Nov 19 |
nicklas |
11 |
/** |
5730 |
18 Nov 19 |
nicklas |
InfoFactory implementation that extract a pre-defined set of fields |
5730 |
18 Nov 19 |
nicklas |
and ignore all others. It recognises Integer, Float and Flag (=Boolean) |
5730 |
18 Nov 19 |
nicklas |
types and will try to parse values to those types. All other values are |
5730 |
18 Nov 19 |
nicklas |
kept as strings. |
5730 |
18 Nov 19 |
nicklas |
16 |
|
5730 |
18 Nov 19 |
nicklas |
The ANN field is treated specially and is extracted into sub-fields |
5730 |
18 Nov 19 |
nicklas |
from the Description fields. Values can be retreived by keys: ANN.<subkey>[index] |
5730 |
18 Nov 19 |
nicklas |
for example, ANN.Gene_Name[0] |
5730 |
18 Nov 19 |
nicklas |
20 |
|
5730 |
18 Nov 19 |
nicklas |
ANN# gives number of entries for the ANN field. |
5730 |
18 Nov 19 |
nicklas |
22 |
*/ |
5730 |
18 Nov 19 |
nicklas |
23 |
public class VariantCallingInfoFactory |
5730 |
18 Nov 19 |
nicklas |
24 |
implements InfoFactory |
5730 |
18 Nov 19 |
nicklas |
25 |
{ |
5730 |
18 Nov 19 |
nicklas |
26 |
|
5730 |
18 Nov 19 |
nicklas |
27 |
private final Map<String, Type> keys; |
5730 |
18 Nov 19 |
nicklas |
28 |
private final Collection<String> annKeys; |
5730 |
18 Nov 19 |
nicklas |
29 |
|
5730 |
18 Nov 19 |
nicklas |
30 |
private final Pattern getType; |
5730 |
18 Nov 19 |
nicklas |
31 |
private final Pattern getAnnKeys; |
5730 |
18 Nov 19 |
nicklas |
32 |
private String[] annColumns; |
5730 |
18 Nov 19 |
nicklas |
33 |
|
5730 |
18 Nov 19 |
nicklas |
34 |
/** |
5730 |
18 Nov 19 |
nicklas |
Create a new factory that is extracting the given fields |
5730 |
18 Nov 19 |
nicklas |
@param keys All regular fields |
5730 |
18 Nov 19 |
nicklas |
@param annKeys Sub-fields under the ANN key |
5730 |
18 Nov 19 |
nicklas |
38 |
*/ |
5730 |
18 Nov 19 |
nicklas |
39 |
public VariantCallingInfoFactory(Collection<String> keys, Collection<String> annKeys) |
5730 |
18 Nov 19 |
nicklas |
40 |
{ |
5730 |
18 Nov 19 |
nicklas |
41 |
this.keys = new HashMap<String, Type>(); |
5730 |
18 Nov 19 |
nicklas |
42 |
if (keys != null) |
5730 |
18 Nov 19 |
nicklas |
43 |
{ |
5730 |
18 Nov 19 |
nicklas |
// Start by assuming all fields are Strings |
5730 |
18 Nov 19 |
nicklas |
45 |
for (String k : keys) |
5730 |
18 Nov 19 |
nicklas |
46 |
{ |
5730 |
18 Nov 19 |
nicklas |
47 |
this.keys.put(k, Type.STRING); |
5730 |
18 Nov 19 |
nicklas |
48 |
} |
5730 |
18 Nov 19 |
nicklas |
49 |
} |
5730 |
18 Nov 19 |
nicklas |
50 |
this.annKeys = annKeys; |
5730 |
18 Nov 19 |
nicklas |
51 |
this.getType = Pattern.compile("Type=(\\w+)"); |
5730 |
18 Nov 19 |
nicklas |
52 |
this.getAnnKeys = Pattern.compile("Description=.*\\'(.*)\\'"); |
5730 |
18 Nov 19 |
nicklas |
53 |
} |
5730 |
18 Nov 19 |
nicklas |
54 |
|
5730 |
18 Nov 19 |
nicklas |
55 |
@Override |
5730 |
18 Nov 19 |
nicklas |
56 |
public void addInfoHeader(String id, String data) |
5730 |
18 Nov 19 |
nicklas |
57 |
{ |
5730 |
18 Nov 19 |
nicklas |
58 |
if (keys.containsKey(id)) |
5730 |
18 Nov 19 |
nicklas |
59 |
{ |
5730 |
18 Nov 19 |
nicklas |
// Parse Type of predefined keys |
5730 |
18 Nov 19 |
nicklas |
61 |
Matcher m = getType.matcher(data); |
5730 |
18 Nov 19 |
nicklas |
62 |
if (m.find()) |
5730 |
18 Nov 19 |
nicklas |
63 |
{ |
5730 |
18 Nov 19 |
nicklas |
64 |
String type = m.group(1); |
5730 |
18 Nov 19 |
nicklas |
65 |
if ("Float".equals(type)) |
5730 |
18 Nov 19 |
nicklas |
66 |
{ |
5730 |
18 Nov 19 |
nicklas |
67 |
keys.put(id, Type.FLOAT); |
5730 |
18 Nov 19 |
nicklas |
68 |
} |
5730 |
18 Nov 19 |
nicklas |
69 |
else if ("Integer".equals(type)) |
5730 |
18 Nov 19 |
nicklas |
70 |
{ |
5730 |
18 Nov 19 |
nicklas |
71 |
keys.put(id, Type.INT); |
5730 |
18 Nov 19 |
nicklas |
72 |
} |
5730 |
18 Nov 19 |
nicklas |
73 |
else if ("Flag".equals(type)) |
5730 |
18 Nov 19 |
nicklas |
74 |
{ |
5730 |
18 Nov 19 |
nicklas |
75 |
keys.put(id, Type.BOOLEAN); |
5730 |
18 Nov 19 |
nicklas |
76 |
} |
5730 |
18 Nov 19 |
nicklas |
77 |
} |
5730 |
18 Nov 19 |
nicklas |
78 |
} |
5730 |
18 Nov 19 |
nicklas |
79 |
|
5730 |
18 Nov 19 |
nicklas |
80 |
if ("ANN".equals(id) && annKeys != null) |
5730 |
18 Nov 19 |
nicklas |
81 |
{ |
5730 |
18 Nov 19 |
nicklas |
// The ANN key has subfields specified in Description that are separated by '|' |
5730 |
18 Nov 19 |
nicklas |
// We get all subfields and match against the specified annKeys and keep |
5730 |
18 Nov 19 |
nicklas |
// a string[] with only the specified subfields (including 'ANN.' prefix). |
5730 |
18 Nov 19 |
nicklas |
85 |
Matcher m = getAnnKeys.matcher(data); |
5730 |
18 Nov 19 |
nicklas |
86 |
if (m.find()) |
5730 |
18 Nov 19 |
nicklas |
87 |
{ |
5730 |
18 Nov 19 |
nicklas |
88 |
annColumns = m.group(1).split("\\s\\|\\s"); |
5730 |
18 Nov 19 |
nicklas |
89 |
for (int colNo = 0; colNo < annColumns.length; colNo++) |
5730 |
18 Nov 19 |
nicklas |
90 |
{ |
5730 |
18 Nov 19 |
nicklas |
91 |
annColumns[colNo] = annKeys.contains(annColumns[colNo]) ? |
5730 |
18 Nov 19 |
nicklas |
92 |
"ANN."+annColumns[colNo] : null; |
5730 |
18 Nov 19 |
nicklas |
93 |
} |
5730 |
18 Nov 19 |
nicklas |
94 |
} |
5730 |
18 Nov 19 |
nicklas |
95 |
} |
5730 |
18 Nov 19 |
nicklas |
96 |
} |
5730 |
18 Nov 19 |
nicklas |
97 |
|
5730 |
18 Nov 19 |
nicklas |
98 |
@Override |
5730 |
18 Nov 19 |
nicklas |
99 |
public InfoData getInfo(String info, SnpData snp) |
5730 |
18 Nov 19 |
nicklas |
100 |
{ |
5730 |
18 Nov 19 |
nicklas |
101 |
if (info == null) return null; |
5730 |
18 Nov 19 |
nicklas |
102 |
|
5730 |
18 Nov 19 |
nicklas |
103 |
InfoData data = new InfoData(); |
5730 |
18 Nov 19 |
nicklas |
104 |
String[] entries = info.split(";"); |
5730 |
18 Nov 19 |
nicklas |
105 |
for (String e : entries) |
5730 |
18 Nov 19 |
nicklas |
106 |
{ |
5730 |
18 Nov 19 |
nicklas |
107 |
String[] kv = e.split("=", 2); |
5730 |
18 Nov 19 |
nicklas |
108 |
String key = kv[0]; |
5730 |
18 Nov 19 |
nicklas |
109 |
String val = kv.length == 2 ? kv[1] : key; |
5730 |
18 Nov 19 |
nicklas |
110 |
Type vType = keys.get(key); |
5730 |
18 Nov 19 |
nicklas |
111 |
if (vType != null) |
5730 |
18 Nov 19 |
nicklas |
112 |
{ |
5730 |
18 Nov 19 |
nicklas |
113 |
data.setInfo(key, vType.parseString(val)); |
5730 |
18 Nov 19 |
nicklas |
114 |
} |
5730 |
18 Nov 19 |
nicklas |
115 |
|
5730 |
18 Nov 19 |
nicklas |
116 |
if ("ANN".equals(key) && annColumns != null) |
5730 |
18 Nov 19 |
nicklas |
117 |
{ |
5730 |
18 Nov 19 |
nicklas |
// There can be multiple entries that are separated by ',' |
5730 |
18 Nov 19 |
nicklas |
119 |
String[] annEntries = val.split(","); |
6387 |
15 Sep 21 |
nicklas |
120 |
int numAnn = -1; |
5730 |
18 Nov 19 |
nicklas |
121 |
for (int eNo = 0; eNo < annEntries.length; eNo++) |
5730 |
18 Nov 19 |
nicklas |
122 |
{ |
6387 |
15 Sep 21 |
nicklas |
123 |
String annRow = annEntries[eNo]; |
6387 |
15 Sep 21 |
nicklas |
124 |
if (annRow.contains("structural_interaction") || annRow.contains("protein_protein_contact")) |
6387 |
15 Sep 21 |
nicklas |
125 |
{ |
6387 |
15 Sep 21 |
nicklas |
126 |
continue; // with next row |
6387 |
15 Sep 21 |
nicklas |
127 |
} |
6387 |
15 Sep 21 |
nicklas |
128 |
numAnn++; |
6387 |
15 Sep 21 |
nicklas |
129 |
String[] annVals = annRow.split("\\|"); |
5730 |
18 Nov 19 |
nicklas |
130 |
for (int colNo = 0; colNo < Math.min(annVals.length, annColumns.length); colNo++) |
5730 |
18 Nov 19 |
nicklas |
131 |
{ |
5730 |
18 Nov 19 |
nicklas |
132 |
if (annColumns[colNo] != null) |
5730 |
18 Nov 19 |
nicklas |
133 |
{ |
6387 |
15 Sep 21 |
nicklas |
134 |
data.setInfo(annColumns[colNo]+"["+numAnn+"]", annVals[colNo]); |
5730 |
18 Nov 19 |
nicklas |
135 |
} |
5730 |
18 Nov 19 |
nicklas |
136 |
} |
5730 |
18 Nov 19 |
nicklas |
137 |
} |
6387 |
15 Sep 21 |
nicklas |
138 |
data.setInfo("ANN#", numAnn+1); // Number of entries is stored in ANN# |
5730 |
18 Nov 19 |
nicklas |
139 |
} |
5730 |
18 Nov 19 |
nicklas |
140 |
} |
5730 |
18 Nov 19 |
nicklas |
141 |
return data; |
5730 |
18 Nov 19 |
nicklas |
142 |
} |
5730 |
18 Nov 19 |
nicklas |
143 |
|
5730 |
18 Nov 19 |
nicklas |
144 |
} |