4646 |
14 Dec 17 |
nicklas |
1 |
package net.sf.basedb.reggie.vcf; |
4646 |
14 Dec 17 |
nicklas |
2 |
|
4671 |
07 Feb 18 |
nicklas |
3 |
import org.json.simple.JSONObject; |
4671 |
07 Feb 18 |
nicklas |
4 |
|
4646 |
14 Dec 17 |
nicklas |
5 |
/** |
4646 |
14 Dec 17 |
nicklas |
A pair of two VCF files. The constructor will |
4646 |
14 Dec 17 |
nicklas |
trigger a comparison of genotypes from the |
4646 |
14 Dec 17 |
nicklas |
the two files. |
4646 |
14 Dec 17 |
nicklas |
9 |
|
4646 |
14 Dec 17 |
nicklas |
@author nicklas |
4646 |
14 Dec 17 |
nicklas |
@since 4.14 |
4646 |
14 Dec 17 |
nicklas |
12 |
*/ |
4646 |
14 Dec 17 |
nicklas |
13 |
public class VcfPair |
4646 |
14 Dec 17 |
nicklas |
14 |
{ |
4646 |
14 Dec 17 |
nicklas |
15 |
|
4646 |
14 Dec 17 |
nicklas |
16 |
private final VcfData vcf1; |
4646 |
14 Dec 17 |
nicklas |
17 |
private final VcfData vcf2; |
4646 |
14 Dec 17 |
nicklas |
18 |
|
4646 |
14 Dec 17 |
nicklas |
19 |
private int commonGt; |
4646 |
14 Dec 17 |
nicklas |
20 |
private int mismatches; |
4646 |
14 Dec 17 |
nicklas |
21 |
|
4671 |
07 Feb 18 |
nicklas |
22 |
private int homHomMismatches; |
4671 |
07 Feb 18 |
nicklas |
23 |
private int homHetMismatches; |
4671 |
07 Feb 18 |
nicklas |
24 |
private int hetHomMismatches; |
4671 |
07 Feb 18 |
nicklas |
25 |
|
4648 |
15 Dec 17 |
nicklas |
26 |
private int mismatchesLowGQ; |
4671 |
07 Feb 18 |
nicklas |
27 |
private int mismatchesHighGQ; |
4648 |
15 Dec 17 |
nicklas |
28 |
|
4646 |
14 Dec 17 |
nicklas |
29 |
/** |
4646 |
14 Dec 17 |
nicklas |
Creates a new pair. |
4646 |
14 Dec 17 |
nicklas |
31 |
*/ |
4646 |
14 Dec 17 |
nicklas |
32 |
public VcfPair(VcfData vcf1, VcfData vcf2) |
4646 |
14 Dec 17 |
nicklas |
33 |
{ |
4646 |
14 Dec 17 |
nicklas |
34 |
this.vcf1 = vcf1; |
4646 |
14 Dec 17 |
nicklas |
35 |
this.vcf2 = vcf2; |
4646 |
14 Dec 17 |
nicklas |
36 |
|
4646 |
14 Dec 17 |
nicklas |
37 |
for (GtData gt1 : vcf1.getGtData()) |
4646 |
14 Dec 17 |
nicklas |
38 |
{ |
4646 |
14 Dec 17 |
nicklas |
39 |
GtData gt2 = vcf2.getGtData(gt1.getId()); |
4646 |
14 Dec 17 |
nicklas |
40 |
if (gt2 == null) continue; |
4646 |
14 Dec 17 |
nicklas |
41 |
commonGt++; |
4648 |
15 Dec 17 |
nicklas |
42 |
if (gt1.getGenoType() != gt2.getGenoType()) |
4648 |
15 Dec 17 |
nicklas |
43 |
{ |
4648 |
15 Dec 17 |
nicklas |
44 |
mismatches++; |
6513 |
07 Dec 21 |
nicklas |
45 |
if (vcf1.isLowQuality(gt1) || vcf2.isLowQuality(gt2)) |
4648 |
15 Dec 17 |
nicklas |
46 |
{ |
4648 |
15 Dec 17 |
nicklas |
47 |
mismatchesLowGQ++; |
4648 |
15 Dec 17 |
nicklas |
48 |
} |
6513 |
07 Dec 21 |
nicklas |
49 |
else if (vcf1.isHighQuality(gt1) && vcf2.isHighQuality(gt2)) |
4671 |
07 Feb 18 |
nicklas |
50 |
{ |
4671 |
07 Feb 18 |
nicklas |
51 |
mismatchesHighGQ++; |
4671 |
07 Feb 18 |
nicklas |
52 |
} |
4671 |
07 Feb 18 |
nicklas |
53 |
if (gt1.getGenoType() != GenoType.HET && gt2.getGenoType() != GenoType.HET) |
4671 |
07 Feb 18 |
nicklas |
54 |
{ |
4671 |
07 Feb 18 |
nicklas |
55 |
homHomMismatches++; |
4671 |
07 Feb 18 |
nicklas |
56 |
} |
4671 |
07 Feb 18 |
nicklas |
57 |
else if (gt1.getGenoType() != GenoType.HET) |
4671 |
07 Feb 18 |
nicklas |
58 |
{ |
4671 |
07 Feb 18 |
nicklas |
59 |
homHetMismatches++; |
4671 |
07 Feb 18 |
nicklas |
60 |
} |
4671 |
07 Feb 18 |
nicklas |
61 |
else |
4671 |
07 Feb 18 |
nicklas |
62 |
{ |
4671 |
07 Feb 18 |
nicklas |
63 |
hetHomMismatches++; |
4671 |
07 Feb 18 |
nicklas |
64 |
} |
4648 |
15 Dec 17 |
nicklas |
65 |
} |
4646 |
14 Dec 17 |
nicklas |
66 |
} |
4646 |
14 Dec 17 |
nicklas |
67 |
} |
4646 |
14 Dec 17 |
nicklas |
68 |
|
4646 |
14 Dec 17 |
nicklas |
69 |
/** |
6451 |
22 Oct 21 |
nicklas |
Get the first VCF in the pair. |
6451 |
22 Oct 21 |
nicklas |
@since 4.34 |
6451 |
22 Oct 21 |
nicklas |
72 |
*/ |
6451 |
22 Oct 21 |
nicklas |
73 |
public VcfData getVcf1() |
6451 |
22 Oct 21 |
nicklas |
74 |
{ |
6451 |
22 Oct 21 |
nicklas |
75 |
return vcf1; |
6451 |
22 Oct 21 |
nicklas |
76 |
} |
6451 |
22 Oct 21 |
nicklas |
77 |
|
6451 |
22 Oct 21 |
nicklas |
78 |
/** |
6451 |
22 Oct 21 |
nicklas |
Get the second VCF in the pair. |
6451 |
22 Oct 21 |
nicklas |
@since 4.34 |
6451 |
22 Oct 21 |
nicklas |
81 |
*/ |
6451 |
22 Oct 21 |
nicklas |
82 |
public VcfData getVcf2() |
6451 |
22 Oct 21 |
nicklas |
83 |
{ |
6451 |
22 Oct 21 |
nicklas |
84 |
return vcf2; |
6451 |
22 Oct 21 |
nicklas |
85 |
} |
6451 |
22 Oct 21 |
nicklas |
86 |
|
6451 |
22 Oct 21 |
nicklas |
87 |
/** |
4646 |
14 Dec 17 |
nicklas |
Get the number of common genotypes. |
4646 |
14 Dec 17 |
nicklas |
89 |
*/ |
4646 |
14 Dec 17 |
nicklas |
90 |
public int getCommonGt() |
4646 |
14 Dec 17 |
nicklas |
91 |
{ |
4646 |
14 Dec 17 |
nicklas |
92 |
return commonGt; |
4646 |
14 Dec 17 |
nicklas |
93 |
} |
4646 |
14 Dec 17 |
nicklas |
94 |
|
4646 |
14 Dec 17 |
nicklas |
95 |
/** |
4646 |
14 Dec 17 |
nicklas |
Get the number of mismatched genotypes. |
4646 |
14 Dec 17 |
nicklas |
97 |
*/ |
4646 |
14 Dec 17 |
nicklas |
98 |
public int getMismatches() |
4646 |
14 Dec 17 |
nicklas |
99 |
{ |
4646 |
14 Dec 17 |
nicklas |
100 |
return mismatches; |
4646 |
14 Dec 17 |
nicklas |
101 |
} |
4646 |
14 Dec 17 |
nicklas |
102 |
|
4648 |
15 Dec 17 |
nicklas |
103 |
/** |
4671 |
07 Feb 18 |
nicklas |
Get the percentage of mismatches. |
4671 |
07 Feb 18 |
nicklas |
105 |
*/ |
4671 |
07 Feb 18 |
nicklas |
106 |
public float getMismatchPercentage() |
4671 |
07 Feb 18 |
nicklas |
107 |
{ |
4671 |
07 Feb 18 |
nicklas |
108 |
return commonGt > 0 ? 100f * mismatches / commonGt : 0; |
4671 |
07 Feb 18 |
nicklas |
109 |
} |
4671 |
07 Feb 18 |
nicklas |
110 |
|
4671 |
07 Feb 18 |
nicklas |
111 |
/** |
4648 |
15 Dec 17 |
nicklas |
Get the number of mismatches where at least one |
4648 |
15 Dec 17 |
nicklas |
of the genotypes has a GQ value below 50. |
4648 |
15 Dec 17 |
nicklas |
114 |
*/ |
4648 |
15 Dec 17 |
nicklas |
115 |
public int getMismatchesWithLowGQ() |
4648 |
15 Dec 17 |
nicklas |
116 |
{ |
4648 |
15 Dec 17 |
nicklas |
117 |
return mismatchesLowGQ; |
4648 |
15 Dec 17 |
nicklas |
118 |
} |
4646 |
14 Dec 17 |
nicklas |
119 |
|
4671 |
07 Feb 18 |
nicklas |
120 |
/** |
4671 |
07 Feb 18 |
nicklas |
Get the percentage of low GQ mismatches. |
4671 |
07 Feb 18 |
nicklas |
122 |
*/ |
4671 |
07 Feb 18 |
nicklas |
123 |
public float getMismatchWithLowGQPercentage() |
4671 |
07 Feb 18 |
nicklas |
124 |
{ |
4671 |
07 Feb 18 |
nicklas |
125 |
return commonGt > 0 ? 100f * mismatchesLowGQ / commonGt : 0; |
4671 |
07 Feb 18 |
nicklas |
126 |
} |
4671 |
07 Feb 18 |
nicklas |
127 |
|
4671 |
07 Feb 18 |
nicklas |
128 |
/** |
4671 |
07 Feb 18 |
nicklas |
Get the number of mismatches where both |
4671 |
07 Feb 18 |
nicklas |
genotypes have a GQ value of 99. |
4671 |
07 Feb 18 |
nicklas |
131 |
*/ |
4671 |
07 Feb 18 |
nicklas |
132 |
public int getMismatchesWithHighGQ() |
4671 |
07 Feb 18 |
nicklas |
133 |
{ |
4671 |
07 Feb 18 |
nicklas |
134 |
return mismatchesHighGQ; |
4671 |
07 Feb 18 |
nicklas |
135 |
} |
4671 |
07 Feb 18 |
nicklas |
136 |
|
4671 |
07 Feb 18 |
nicklas |
137 |
/** |
4671 |
07 Feb 18 |
nicklas |
Get the percentage of high GQ mismatches. |
4671 |
07 Feb 18 |
nicklas |
139 |
*/ |
4671 |
07 Feb 18 |
nicklas |
140 |
public float getMismatchWithHighGQPercentage() |
4671 |
07 Feb 18 |
nicklas |
141 |
{ |
4671 |
07 Feb 18 |
nicklas |
142 |
return commonGt > 0 ? 100f * mismatchesHighGQ / commonGt : 0; |
4671 |
07 Feb 18 |
nicklas |
143 |
} |
4671 |
07 Feb 18 |
nicklas |
144 |
|
4671 |
07 Feb 18 |
nicklas |
145 |
|
4671 |
07 Feb 18 |
nicklas |
146 |
/** |
4671 |
07 Feb 18 |
nicklas |
Get the number of mismatches where both samples are homozygous. |
4671 |
07 Feb 18 |
nicklas |
148 |
*/ |
4671 |
07 Feb 18 |
nicklas |
149 |
public int getHomHomMismatches() |
4671 |
07 Feb 18 |
nicklas |
150 |
{ |
4671 |
07 Feb 18 |
nicklas |
151 |
return homHomMismatches; |
4671 |
07 Feb 18 |
nicklas |
152 |
} |
4671 |
07 Feb 18 |
nicklas |
153 |
|
4671 |
07 Feb 18 |
nicklas |
154 |
/** |
4681 |
21 Feb 18 |
nicklas |
Get the percentage of high HOMHOM mismatches (in relation to all mismatches). |
4681 |
21 Feb 18 |
nicklas |
156 |
*/ |
4681 |
21 Feb 18 |
nicklas |
157 |
public float getHomHomMismatchPercentage() |
4681 |
21 Feb 18 |
nicklas |
158 |
{ |
4681 |
21 Feb 18 |
nicklas |
159 |
return mismatches > 0 ? 100f * homHomMismatches / mismatches : 0; |
4681 |
21 Feb 18 |
nicklas |
160 |
} |
4681 |
21 Feb 18 |
nicklas |
161 |
|
4681 |
21 Feb 18 |
nicklas |
162 |
/** |
4671 |
07 Feb 18 |
nicklas |
Get the number of mismatches where the first sample |
4671 |
07 Feb 18 |
nicklas |
is homozygous and the other heterozygous. |
4671 |
07 Feb 18 |
nicklas |
165 |
*/ |
4671 |
07 Feb 18 |
nicklas |
166 |
public int getHomHetMismatches() |
4671 |
07 Feb 18 |
nicklas |
167 |
{ |
4671 |
07 Feb 18 |
nicklas |
168 |
return homHetMismatches; |
4671 |
07 Feb 18 |
nicklas |
169 |
} |
4671 |
07 Feb 18 |
nicklas |
170 |
|
4671 |
07 Feb 18 |
nicklas |
171 |
/** |
4671 |
07 Feb 18 |
nicklas |
Get the number of mismatches where the first sample |
4671 |
07 Feb 18 |
nicklas |
is heterozygous and the other homozygous. |
4671 |
07 Feb 18 |
nicklas |
174 |
*/ |
4671 |
07 Feb 18 |
nicklas |
175 |
public int getHetHomMismatches() |
4671 |
07 Feb 18 |
nicklas |
176 |
{ |
4671 |
07 Feb 18 |
nicklas |
177 |
return hetHomMismatches; |
4671 |
07 Feb 18 |
nicklas |
178 |
} |
4671 |
07 Feb 18 |
nicklas |
179 |
|
4671 |
07 Feb 18 |
nicklas |
180 |
public JSONObject asJSONObject() |
4671 |
07 Feb 18 |
nicklas |
181 |
{ |
4671 |
07 Feb 18 |
nicklas |
182 |
JSONObject json = new JSONObject(); |
4671 |
07 Feb 18 |
nicklas |
183 |
json.put("commonGt", commonGt); |
4671 |
07 Feb 18 |
nicklas |
184 |
json.put("mismatches", mismatches); |
4671 |
07 Feb 18 |
nicklas |
185 |
json.put("mismatchesLowGQ", mismatchesLowGQ); |
4671 |
07 Feb 18 |
nicklas |
186 |
json.put("mismatchesHighGQ", mismatchesHighGQ); |
4671 |
07 Feb 18 |
nicklas |
187 |
json.put("homHomMismatches", homHomMismatches); |
4671 |
07 Feb 18 |
nicklas |
188 |
json.put("homHetMismatches", homHetMismatches); |
4671 |
07 Feb 18 |
nicklas |
189 |
json.put("hetHomMismatches", hetHomMismatches); |
4671 |
07 Feb 18 |
nicklas |
190 |
return json; |
4671 |
07 Feb 18 |
nicklas |
191 |
} |
4646 |
14 Dec 17 |
nicklas |
192 |
} |