5040 |
19 Oct 18 |
nicklas |
1 |
package net.sf.basedb.reggie.baf; |
5040 |
19 Oct 18 |
nicklas |
2 |
|
5045 |
22 Oct 18 |
nicklas |
3 |
import java.util.List; |
5040 |
19 Oct 18 |
nicklas |
4 |
|
5054 |
26 Oct 18 |
nicklas |
5 |
import net.sf.basedb.core.DbControl; |
5054 |
26 Oct 18 |
nicklas |
6 |
import net.sf.basedb.core.DerivedBioAssay; |
5054 |
26 Oct 18 |
nicklas |
7 |
import net.sf.basedb.reggie.dao.Annotationtype; |
5045 |
22 Oct 18 |
nicklas |
8 |
import net.sf.basedb.reggie.vcf.GtData; |
5045 |
22 Oct 18 |
nicklas |
9 |
import net.sf.basedb.reggie.vcf.SnpData; |
5045 |
22 Oct 18 |
nicklas |
10 |
import net.sf.basedb.reggie.vcf.VcfData; |
5045 |
22 Oct 18 |
nicklas |
11 |
import net.sf.basedb.reggie.vcf.VcfDataFilter; |
5054 |
26 Oct 18 |
nicklas |
12 |
import net.sf.basedb.util.filter.Filter; |
5045 |
22 Oct 18 |
nicklas |
13 |
|
5040 |
19 Oct 18 |
nicklas |
14 |
/** |
5040 |
19 Oct 18 |
nicklas |
Class for collecting mBAF data and statistics. |
5040 |
19 Oct 18 |
nicklas |
16 |
|
5040 |
19 Oct 18 |
nicklas |
@author nicklas |
5040 |
19 Oct 18 |
nicklas |
@since 4.20 |
5040 |
19 Oct 18 |
nicklas |
@see MBafParser#parse(java.io.InputStream, String) |
5040 |
19 Oct 18 |
nicklas |
20 |
*/ |
5040 |
19 Oct 18 |
nicklas |
21 |
public class BafData |
5040 |
19 Oct 18 |
nicklas |
22 |
{ |
5045 |
22 Oct 18 |
nicklas |
23 |
|
5045 |
22 Oct 18 |
nicklas |
24 |
private final List<RegionStat> regions; |
5045 |
22 Oct 18 |
nicklas |
25 |
private final MBafOptions options; |
5045 |
22 Oct 18 |
nicklas |
26 |
|
5045 |
22 Oct 18 |
nicklas |
27 |
private int snpCount; |
5045 |
22 Oct 18 |
nicklas |
28 |
private int snpFiltered; |
5045 |
22 Oct 18 |
nicklas |
29 |
private VcfData vcfData; |
5045 |
22 Oct 18 |
nicklas |
30 |
|
5045 |
22 Oct 18 |
nicklas |
31 |
BafData(List<Region> regions, MBafOptions options) |
5045 |
22 Oct 18 |
nicklas |
32 |
{ |
5045 |
22 Oct 18 |
nicklas |
33 |
this.regions = RegionStat.create(regions); |
5045 |
22 Oct 18 |
nicklas |
34 |
this.options = options; |
5045 |
22 Oct 18 |
nicklas |
35 |
} |
5045 |
22 Oct 18 |
nicklas |
36 |
|
5045 |
22 Oct 18 |
nicklas |
37 |
/** |
5045 |
22 Oct 18 |
nicklas |
Get the total number of SNPs that were found in the VCF file |
5045 |
22 Oct 18 |
nicklas |
that has a valid genotype. |
5045 |
22 Oct 18 |
nicklas |
40 |
*/ |
5045 |
22 Oct 18 |
nicklas |
41 |
public int getSnpCount() |
5045 |
22 Oct 18 |
nicklas |
42 |
{ |
5045 |
22 Oct 18 |
nicklas |
43 |
return snpCount; |
5045 |
22 Oct 18 |
nicklas |
44 |
} |
5045 |
22 Oct 18 |
nicklas |
45 |
|
5045 |
22 Oct 18 |
nicklas |
46 |
/** |
5045 |
22 Oct 18 |
nicklas |
Get the total number of SNps in the VCF file that passed the |
5045 |
22 Oct 18 |
nicklas |
filter. See {@link MBafOptions#getFilter()}. |
5045 |
22 Oct 18 |
nicklas |
49 |
*/ |
5045 |
22 Oct 18 |
nicklas |
50 |
public int getFilteredCount() |
5045 |
22 Oct 18 |
nicklas |
51 |
{ |
5045 |
22 Oct 18 |
nicklas |
52 |
return snpFiltered; |
5045 |
22 Oct 18 |
nicklas |
53 |
} |
5045 |
22 Oct 18 |
nicklas |
54 |
|
5045 |
22 Oct 18 |
nicklas |
55 |
/** |
5045 |
22 Oct 18 |
nicklas |
Get statisics for each of the regions. |
5045 |
22 Oct 18 |
nicklas |
57 |
*/ |
5045 |
22 Oct 18 |
nicklas |
58 |
public List<RegionStat> getRegionStat() |
5045 |
22 Oct 18 |
nicklas |
59 |
{ |
5045 |
22 Oct 18 |
nicklas |
60 |
return regions; |
5045 |
22 Oct 18 |
nicklas |
61 |
} |
5045 |
22 Oct 18 |
nicklas |
62 |
|
5045 |
22 Oct 18 |
nicklas |
63 |
/** |
5045 |
22 Oct 18 |
nicklas |
Get the VcfData that was created by the VcfParser during the |
5045 |
22 Oct 18 |
nicklas |
parsing. |
5045 |
22 Oct 18 |
nicklas |
66 |
*/ |
5045 |
22 Oct 18 |
nicklas |
67 |
public VcfData getVcfData() |
5045 |
22 Oct 18 |
nicklas |
68 |
{ |
5045 |
22 Oct 18 |
nicklas |
69 |
return vcfData; |
5045 |
22 Oct 18 |
nicklas |
70 |
} |
5045 |
22 Oct 18 |
nicklas |
71 |
void setVcfData(VcfData vcfData) |
5045 |
22 Oct 18 |
nicklas |
72 |
{ |
5045 |
22 Oct 18 |
nicklas |
73 |
this.vcfData = vcfData; |
5045 |
22 Oct 18 |
nicklas |
74 |
} |
5045 |
22 Oct 18 |
nicklas |
75 |
|
5045 |
22 Oct 18 |
nicklas |
76 |
/** |
5045 |
22 Oct 18 |
nicklas |
Filter implementation that collects statistics from the VCF file. |
5045 |
22 Oct 18 |
nicklas |
It delegates to the {@link MBafOptions#getFilter()} filter to |
5045 |
22 Oct 18 |
nicklas |
determine which SNPs that should be included or not. |
5045 |
22 Oct 18 |
nicklas |
80 |
*/ |
5051 |
24 Oct 18 |
nicklas |
81 |
VcfDataFilter getVcfFilter() |
5045 |
22 Oct 18 |
nicklas |
82 |
{ |
5051 |
24 Oct 18 |
nicklas |
83 |
VcfDataFilter filter = options.getVcfFilter(); |
5045 |
22 Oct 18 |
nicklas |
84 |
|
5045 |
22 Oct 18 |
nicklas |
85 |
return new VcfDataFilter() |
5045 |
22 Oct 18 |
nicklas |
86 |
{ |
5045 |
22 Oct 18 |
nicklas |
87 |
@Override |
5045 |
22 Oct 18 |
nicklas |
88 |
public boolean accept(GtData gt, SnpData snp) |
5045 |
22 Oct 18 |
nicklas |
89 |
{ |
5045 |
22 Oct 18 |
nicklas |
// Ignore if the SNP has no genotype |
5045 |
22 Oct 18 |
nicklas |
91 |
if (gt.getGenoType() == null) return false; |
5045 |
22 Oct 18 |
nicklas |
92 |
snpCount++; |
5045 |
22 Oct 18 |
nicklas |
93 |
|
5045 |
22 Oct 18 |
nicklas |
// Delegate to outside filter |
5045 |
22 Oct 18 |
nicklas |
95 |
if (!filter.accept(gt, snp)) return false; |
5045 |
22 Oct 18 |
nicklas |
96 |
snpFiltered++; |
5045 |
22 Oct 18 |
nicklas |
97 |
|
5045 |
22 Oct 18 |
nicklas |
// Add to regions |
5045 |
22 Oct 18 |
nicklas |
99 |
for (RegionStat r : regions) |
5045 |
22 Oct 18 |
nicklas |
100 |
{ |
5045 |
22 Oct 18 |
nicklas |
101 |
if (r.getRegion().isInRegion(snp)) |
5045 |
22 Oct 18 |
nicklas |
102 |
{ |
5045 |
22 Oct 18 |
nicklas |
103 |
r.addToStat(gt); |
5045 |
22 Oct 18 |
nicklas |
104 |
} |
5045 |
22 Oct 18 |
nicklas |
105 |
} |
5045 |
22 Oct 18 |
nicklas |
106 |
return true; |
5045 |
22 Oct 18 |
nicklas |
107 |
} |
5045 |
22 Oct 18 |
nicklas |
108 |
}; |
5045 |
22 Oct 18 |
nicklas |
109 |
} |
5045 |
22 Oct 18 |
nicklas |
110 |
|
5054 |
26 Oct 18 |
nicklas |
111 |
/** |
5054 |
26 Oct 18 |
nicklas |
Collect some metrics from the result. |
5054 |
26 Oct 18 |
nicklas |
* Total number SNPs |
5054 |
26 Oct 18 |
nicklas |
* SNPs that passed the filter |
5054 |
26 Oct 18 |
nicklas |
* Number of regions that passed the filter |
5054 |
26 Oct 18 |
nicklas |
* Number of significant regions |
5054 |
26 Oct 18 |
nicklas |
* Region with lowest p-value |
5054 |
26 Oct 18 |
nicklas |
* The lowest p-value |
5054 |
26 Oct 18 |
nicklas |
119 |
*/ |
5054 |
26 Oct 18 |
nicklas |
120 |
public Metrics collectMetrics() |
5054 |
26 Oct 18 |
nicklas |
121 |
{ |
5054 |
26 Oct 18 |
nicklas |
122 |
Metrics m = new Metrics(); |
5054 |
26 Oct 18 |
nicklas |
123 |
m.snpCount = getSnpCount(); |
5054 |
26 Oct 18 |
nicklas |
124 |
m.filteredCount = getFilteredCount(); |
5054 |
26 Oct 18 |
nicklas |
125 |
|
5054 |
26 Oct 18 |
nicklas |
126 |
Filter<RegionStat> regionFilter = options.getRegionFilter(); |
5054 |
26 Oct 18 |
nicklas |
127 |
for (RegionStat rs : getRegionStat()) |
5054 |
26 Oct 18 |
nicklas |
128 |
{ |
5054 |
26 Oct 18 |
nicklas |
129 |
if (regionFilter.evaluate(rs)) |
5054 |
26 Oct 18 |
nicklas |
130 |
{ |
5054 |
26 Oct 18 |
nicklas |
131 |
m.numRegions++; |
5054 |
26 Oct 18 |
nicklas |
132 |
double p = rs.getPValue(); |
5054 |
26 Oct 18 |
nicklas |
133 |
if (p < options.getSignificantPVal()) |
5054 |
26 Oct 18 |
nicklas |
134 |
{ |
5054 |
26 Oct 18 |
nicklas |
135 |
m.numSignificantRegions++; |
5054 |
26 Oct 18 |
nicklas |
136 |
} |
5054 |
26 Oct 18 |
nicklas |
137 |
if (Double.isNaN(m.minP) || p < m.minP) |
5054 |
26 Oct 18 |
nicklas |
138 |
{ |
5054 |
26 Oct 18 |
nicklas |
139 |
m.minP = p; |
5054 |
26 Oct 18 |
nicklas |
140 |
m.minRegion = rs; |
5054 |
26 Oct 18 |
nicklas |
141 |
} |
5054 |
26 Oct 18 |
nicklas |
142 |
} |
5054 |
26 Oct 18 |
nicklas |
143 |
} |
5054 |
26 Oct 18 |
nicklas |
144 |
|
5054 |
26 Oct 18 |
nicklas |
145 |
return m; |
5054 |
26 Oct 18 |
nicklas |
146 |
} |
5054 |
26 Oct 18 |
nicklas |
147 |
|
5054 |
26 Oct 18 |
nicklas |
148 |
public static class Metrics |
5054 |
26 Oct 18 |
nicklas |
149 |
{ |
5054 |
26 Oct 18 |
nicklas |
150 |
public int snpCount; |
5054 |
26 Oct 18 |
nicklas |
151 |
public int filteredCount; |
5054 |
26 Oct 18 |
nicklas |
152 |
public int numRegions; |
5054 |
26 Oct 18 |
nicklas |
153 |
public int numSignificantRegions; |
5054 |
26 Oct 18 |
nicklas |
154 |
|
5054 |
26 Oct 18 |
nicklas |
155 |
public RegionStat minRegion; |
5054 |
26 Oct 18 |
nicklas |
156 |
public double minP = Double.NaN; |
5054 |
26 Oct 18 |
nicklas |
157 |
|
5054 |
26 Oct 18 |
nicklas |
158 |
/** |
5054 |
26 Oct 18 |
nicklas |
Update the mBAF-related annotations on the alignment with |
5054 |
26 Oct 18 |
nicklas |
the metrics. |
5054 |
26 Oct 18 |
nicklas |
161 |
*/ |
5054 |
26 Oct 18 |
nicklas |
162 |
public void updateAnnotations(DbControl dc, DerivedBioAssay alignment) |
5054 |
26 Oct 18 |
nicklas |
163 |
{ |
5054 |
26 Oct 18 |
nicklas |
164 |
Annotationtype.MBAF_SNP_COUNT.setAnnotationValue(dc, alignment, snpCount); |
5054 |
26 Oct 18 |
nicklas |
165 |
Annotationtype.MBAF_SNP_COUNT_FILTERED.setAnnotationValue(dc, alignment, filteredCount); |
5054 |
26 Oct 18 |
nicklas |
166 |
|
5054 |
26 Oct 18 |
nicklas |
167 |
Annotationtype.MBAF_NUM_REGIONS.setAnnotationValue(dc, alignment, numRegions); |
5054 |
26 Oct 18 |
nicklas |
168 |
Annotationtype.MBAF_SIGNIFICANT_REGIONS.setAnnotationValue(dc, alignment, numSignificantRegions); |
5054 |
26 Oct 18 |
nicklas |
169 |
Annotationtype.MBAF_MIN_PVALUE.setAnnotationValue(dc, alignment, minRegion == null ? null : (float)minP); |
5054 |
26 Oct 18 |
nicklas |
170 |
Annotationtype.MBAF_MIN_PREGION.setAnnotationValue(dc, alignment, minRegion == null ? null : minRegion.getRegion().getName()); |
5054 |
26 Oct 18 |
nicklas |
171 |
} |
5054 |
26 Oct 18 |
nicklas |
172 |
|
5054 |
26 Oct 18 |
nicklas |
173 |
} |
5054 |
26 Oct 18 |
nicklas |
174 |
|
5040 |
19 Oct 18 |
nicklas |
175 |
} |