5040 |
19 Oct 18 |
nicklas |
1 |
package net.sf.basedb.reggie.baf; |
5040 |
19 Oct 18 |
nicklas |
2 |
|
5040 |
19 Oct 18 |
nicklas |
3 |
import java.io.IOException; |
5040 |
19 Oct 18 |
nicklas |
4 |
import java.io.InputStream; |
5040 |
19 Oct 18 |
nicklas |
5 |
import java.util.ArrayList; |
5040 |
19 Oct 18 |
nicklas |
6 |
import java.util.List; |
5040 |
19 Oct 18 |
nicklas |
7 |
import java.util.regex.Pattern; |
5040 |
19 Oct 18 |
nicklas |
8 |
|
5045 |
22 Oct 18 |
nicklas |
9 |
import net.sf.basedb.reggie.vcf.SnpData; |
5040 |
19 Oct 18 |
nicklas |
10 |
import net.sf.basedb.util.FileUtil; |
5040 |
19 Oct 18 |
nicklas |
11 |
import net.sf.basedb.util.Values; |
5040 |
19 Oct 18 |
nicklas |
12 |
import net.sf.basedb.util.parser.FlatFileParser; |
5040 |
19 Oct 18 |
nicklas |
13 |
import net.sf.basedb.util.parser.Mapper; |
5040 |
19 Oct 18 |
nicklas |
14 |
import net.sf.basedb.util.parser.FlatFileParser.LineType; |
5040 |
19 Oct 18 |
nicklas |
15 |
|
5040 |
19 Oct 18 |
nicklas |
16 |
/** |
5040 |
19 Oct 18 |
nicklas |
Represents a range in the genome that is spanning a specified part |
5040 |
19 Oct 18 |
nicklas |
of a chromosome. A region must have a name that is unique. |
5040 |
19 Oct 18 |
nicklas |
19 |
|
5040 |
19 Oct 18 |
nicklas |
@author nicklas |
5040 |
19 Oct 18 |
nicklas |
@since 4.20 |
5040 |
19 Oct 18 |
nicklas |
22 |
*/ |
5040 |
19 Oct 18 |
nicklas |
23 |
public class Region |
5040 |
19 Oct 18 |
nicklas |
24 |
{ |
5040 |
19 Oct 18 |
nicklas |
25 |
|
5040 |
19 Oct 18 |
nicklas |
26 |
/** |
5040 |
19 Oct 18 |
nicklas |
Get the default regions that have been included as part of Reggie. |
5040 |
19 Oct 18 |
nicklas |
Data is found in the /net/sf/basedb/reggie/baf/default-regions.txt |
5040 |
19 Oct 18 |
nicklas |
file. |
5040 |
19 Oct 18 |
nicklas |
30 |
*/ |
5040 |
19 Oct 18 |
nicklas |
31 |
public static List<Region> defaultRegions() |
5040 |
19 Oct 18 |
nicklas |
32 |
{ |
5040 |
19 Oct 18 |
nicklas |
33 |
List<Region> regions = null; |
5040 |
19 Oct 18 |
nicklas |
34 |
InputStream in = null; |
5040 |
19 Oct 18 |
nicklas |
35 |
try |
5040 |
19 Oct 18 |
nicklas |
36 |
{ |
5040 |
19 Oct 18 |
nicklas |
37 |
String path = "/net/sf/basedb/reggie/baf/default-regions.txt"; |
5040 |
19 Oct 18 |
nicklas |
38 |
in = Region.class.getResourceAsStream(path); |
5040 |
19 Oct 18 |
nicklas |
39 |
regions = parse(in, path); |
5040 |
19 Oct 18 |
nicklas |
40 |
} |
5040 |
19 Oct 18 |
nicklas |
41 |
catch (IOException ex) |
5040 |
19 Oct 18 |
nicklas |
42 |
{ |
5040 |
19 Oct 18 |
nicklas |
43 |
throw new RuntimeException(ex); |
5040 |
19 Oct 18 |
nicklas |
44 |
} |
5040 |
19 Oct 18 |
nicklas |
45 |
finally |
5040 |
19 Oct 18 |
nicklas |
46 |
{ |
5040 |
19 Oct 18 |
nicklas |
47 |
FileUtil.close(in); |
5040 |
19 Oct 18 |
nicklas |
48 |
} |
5040 |
19 Oct 18 |
nicklas |
49 |
return regions; |
5040 |
19 Oct 18 |
nicklas |
50 |
} |
5040 |
19 Oct 18 |
nicklas |
51 |
|
5040 |
19 Oct 18 |
nicklas |
52 |
/** |
5040 |
19 Oct 18 |
nicklas |
Parse a file with region information. It should be a tab-separated |
5040 |
19 Oct 18 |
nicklas |
text file with one region per line. A header line is required and |
5040 |
19 Oct 18 |
nicklas |
with the following headers: |
5040 |
19 Oct 18 |
nicklas |
56 |
|
5040 |
19 Oct 18 |
nicklas |
* Region |
5040 |
19 Oct 18 |
nicklas |
* Chromosome |
5040 |
19 Oct 18 |
nicklas |
* Start |
5040 |
19 Oct 18 |
nicklas |
* Stop |
5040 |
19 Oct 18 |
nicklas |
* AvgBAF |
5040 |
19 Oct 18 |
nicklas |
* SdBAF |
5040 |
19 Oct 18 |
nicklas |
63 |
*/ |
5040 |
19 Oct 18 |
nicklas |
64 |
public static List<Region> parse(InputStream in, String fileName) |
5040 |
19 Oct 18 |
nicklas |
65 |
throws IOException |
5040 |
19 Oct 18 |
nicklas |
66 |
{ |
5040 |
19 Oct 18 |
nicklas |
67 |
FlatFileParser ffp = new FlatFileParser(); |
5040 |
19 Oct 18 |
nicklas |
68 |
ffp.setDataHeaderRegexp(Pattern.compile("Region\tChromosome\t.*")); |
5040 |
19 Oct 18 |
nicklas |
69 |
ffp.setDataSplitterRegexp(Pattern.compile("\\t")); |
5040 |
19 Oct 18 |
nicklas |
70 |
ffp.setIgnoreRegexp(Pattern.compile("#.*")); |
5040 |
19 Oct 18 |
nicklas |
71 |
|
5040 |
19 Oct 18 |
nicklas |
72 |
ffp.setInputStream(in, "UTF-8"); |
5040 |
19 Oct 18 |
nicklas |
73 |
LineType headerLine = ffp.parseHeaders(); |
5040 |
19 Oct 18 |
nicklas |
74 |
int lineNo = ffp.getParsedLines(); |
5040 |
19 Oct 18 |
nicklas |
75 |
if (headerLine != LineType.DATA_HEADER) |
5040 |
19 Oct 18 |
nicklas |
76 |
{ |
5040 |
19 Oct 18 |
nicklas |
77 |
throw new IOException("File '" + fileName + "' line " + lineNo + ": Could not find header line starting with 'Region{tab}Chromosome...'"); |
5040 |
19 Oct 18 |
nicklas |
78 |
} |
5040 |
19 Oct 18 |
nicklas |
79 |
List<String> headers = ffp.getColumnHeaders(); |
5040 |
19 Oct 18 |
nicklas |
80 |
|
5040 |
19 Oct 18 |
nicklas |
81 |
Mapper nameMapper = ffp.getMapper("\\Region\\"); |
5040 |
19 Oct 18 |
nicklas |
82 |
Mapper chromMapper = ffp.getMapper("\\Chromosome\\"); |
5040 |
19 Oct 18 |
nicklas |
83 |
Mapper startMapper = ffp.getMapper("\\Start\\"); |
5040 |
19 Oct 18 |
nicklas |
84 |
Mapper stopMapper = ffp.getMapper("\\Stop\\"); |
5040 |
19 Oct 18 |
nicklas |
85 |
Mapper bafMapper = ffp.getMapper("\\AvgBAF\\"); |
5040 |
19 Oct 18 |
nicklas |
86 |
Mapper sdMapper = ffp.getMapper("\\SdBAF\\"); |
5040 |
19 Oct 18 |
nicklas |
87 |
|
5040 |
19 Oct 18 |
nicklas |
88 |
List<Region> regions = new ArrayList<>(); |
5040 |
19 Oct 18 |
nicklas |
89 |
while (ffp.hasMoreData()) |
5040 |
19 Oct 18 |
nicklas |
90 |
{ |
5040 |
19 Oct 18 |
nicklas |
91 |
FlatFileParser.Data line = ffp.nextData(); |
5040 |
19 Oct 18 |
nicklas |
92 |
|
5364 |
16 Apr 19 |
nicklas |
93 |
String name = nameMapper.getString(line); |
5040 |
19 Oct 18 |
nicklas |
94 |
Region r = new Region(name); |
5040 |
19 Oct 18 |
nicklas |
95 |
|
5364 |
16 Apr 19 |
nicklas |
96 |
r.setChromosome(chromMapper.getString(line)); |
5364 |
16 Apr 19 |
nicklas |
97 |
r.setStart(Values.getLong(startMapper.getString(line))); |
5364 |
16 Apr 19 |
nicklas |
98 |
r.setStop(Values.getLong(stopMapper.getString(line))); |
5040 |
19 Oct 18 |
nicklas |
99 |
|
5040 |
19 Oct 18 |
nicklas |
100 |
r.setBafRef(bafMapper.getFloat(line), sdMapper.getFloat(line)); |
5040 |
19 Oct 18 |
nicklas |
101 |
regions.add(r); |
5040 |
19 Oct 18 |
nicklas |
102 |
} |
5040 |
19 Oct 18 |
nicklas |
103 |
return regions; |
5040 |
19 Oct 18 |
nicklas |
104 |
} |
5040 |
19 Oct 18 |
nicklas |
105 |
|
5040 |
19 Oct 18 |
nicklas |
106 |
private final String name; |
5040 |
19 Oct 18 |
nicklas |
107 |
private String chr; |
5040 |
19 Oct 18 |
nicklas |
108 |
private long start; |
5040 |
19 Oct 18 |
nicklas |
109 |
private long stop; |
5040 |
19 Oct 18 |
nicklas |
110 |
|
5040 |
19 Oct 18 |
nicklas |
111 |
private double avgBafRef; |
5040 |
19 Oct 18 |
nicklas |
112 |
private double sdBafRef; |
5040 |
19 Oct 18 |
nicklas |
113 |
|
5040 |
19 Oct 18 |
nicklas |
114 |
public Region(String name) |
5040 |
19 Oct 18 |
nicklas |
115 |
{ |
5040 |
19 Oct 18 |
nicklas |
116 |
this.name = name; |
5040 |
19 Oct 18 |
nicklas |
117 |
} |
5040 |
19 Oct 18 |
nicklas |
118 |
|
5040 |
19 Oct 18 |
nicklas |
119 |
/** |
5040 |
19 Oct 18 |
nicklas |
Get the name of the region. It should be unique |
5040 |
19 Oct 18 |
nicklas |
among all regions used together. |
5040 |
19 Oct 18 |
nicklas |
122 |
*/ |
5040 |
19 Oct 18 |
nicklas |
123 |
public String getName() |
5040 |
19 Oct 18 |
nicklas |
124 |
{ |
5040 |
19 Oct 18 |
nicklas |
125 |
return name; |
5040 |
19 Oct 18 |
nicklas |
126 |
} |
5040 |
19 Oct 18 |
nicklas |
127 |
|
5040 |
19 Oct 18 |
nicklas |
128 |
/** |
5040 |
19 Oct 18 |
nicklas |
Get the chromosome where this regions is located. |
5040 |
19 Oct 18 |
nicklas |
130 |
*/ |
5040 |
19 Oct 18 |
nicklas |
131 |
public String getChromosome() |
5040 |
19 Oct 18 |
nicklas |
132 |
{ |
5040 |
19 Oct 18 |
nicklas |
133 |
return chr; |
5040 |
19 Oct 18 |
nicklas |
134 |
} |
5040 |
19 Oct 18 |
nicklas |
135 |
public void setChromosome(String chr) |
5040 |
19 Oct 18 |
nicklas |
136 |
{ |
5040 |
19 Oct 18 |
nicklas |
137 |
this.chr = chr; |
5040 |
19 Oct 18 |
nicklas |
138 |
} |
5040 |
19 Oct 18 |
nicklas |
139 |
|
5040 |
19 Oct 18 |
nicklas |
140 |
/** |
5040 |
19 Oct 18 |
nicklas |
Get the start position of this region. |
5040 |
19 Oct 18 |
nicklas |
142 |
*/ |
5040 |
19 Oct 18 |
nicklas |
143 |
public long getStart() |
5040 |
19 Oct 18 |
nicklas |
144 |
{ |
5040 |
19 Oct 18 |
nicklas |
145 |
return start; |
5040 |
19 Oct 18 |
nicklas |
146 |
} |
5040 |
19 Oct 18 |
nicklas |
147 |
public void setStart(long start) |
5040 |
19 Oct 18 |
nicklas |
148 |
{ |
5040 |
19 Oct 18 |
nicklas |
149 |
this.start = start; |
5040 |
19 Oct 18 |
nicklas |
150 |
} |
5040 |
19 Oct 18 |
nicklas |
151 |
/** |
5040 |
19 Oct 18 |
nicklas |
Get the stop position of this region. |
5040 |
19 Oct 18 |
nicklas |
153 |
*/ |
5040 |
19 Oct 18 |
nicklas |
154 |
public long getStop() |
5040 |
19 Oct 18 |
nicklas |
155 |
{ |
5040 |
19 Oct 18 |
nicklas |
156 |
return stop; |
5040 |
19 Oct 18 |
nicklas |
157 |
} |
5040 |
19 Oct 18 |
nicklas |
158 |
public void setStop(long stop) |
5040 |
19 Oct 18 |
nicklas |
159 |
{ |
5040 |
19 Oct 18 |
nicklas |
160 |
this.stop = stop; |
5040 |
19 Oct 18 |
nicklas |
161 |
} |
5040 |
19 Oct 18 |
nicklas |
162 |
|
5040 |
19 Oct 18 |
nicklas |
163 |
/** |
5040 |
19 Oct 18 |
nicklas |
Set the average and standard deviation of the mBAF |
5040 |
19 Oct 18 |
nicklas |
values for the normal background reference. |
5040 |
19 Oct 18 |
nicklas |
166 |
*/ |
5040 |
19 Oct 18 |
nicklas |
167 |
public void setBafRef(double avgBaf, double sdBaf) |
5040 |
19 Oct 18 |
nicklas |
168 |
{ |
5040 |
19 Oct 18 |
nicklas |
169 |
this.avgBafRef = avgBaf; |
5040 |
19 Oct 18 |
nicklas |
170 |
this.sdBafRef = sdBaf; |
5040 |
19 Oct 18 |
nicklas |
171 |
} |
5040 |
19 Oct 18 |
nicklas |
172 |
public double getAvgBafRef() |
5040 |
19 Oct 18 |
nicklas |
173 |
{ |
5040 |
19 Oct 18 |
nicklas |
174 |
return avgBafRef; |
5040 |
19 Oct 18 |
nicklas |
175 |
} |
5040 |
19 Oct 18 |
nicklas |
176 |
public double getSdBafRef() |
5040 |
19 Oct 18 |
nicklas |
177 |
{ |
5040 |
19 Oct 18 |
nicklas |
178 |
return sdBafRef; |
5040 |
19 Oct 18 |
nicklas |
179 |
} |
5040 |
19 Oct 18 |
nicklas |
180 |
|
5045 |
22 Oct 18 |
nicklas |
181 |
/** |
5045 |
22 Oct 18 |
nicklas |
Check if the given SNP is located in this region or not. |
5045 |
22 Oct 18 |
nicklas |
183 |
*/ |
5045 |
22 Oct 18 |
nicklas |
184 |
public boolean isInRegion(SnpData snp) |
5045 |
22 Oct 18 |
nicklas |
185 |
{ |
5045 |
22 Oct 18 |
nicklas |
186 |
if (!chr.equals(snp.getChromosome())) return false; |
5045 |
22 Oct 18 |
nicklas |
187 |
return start <= snp.getPosition() && snp.getPosition() <= stop; |
5045 |
22 Oct 18 |
nicklas |
188 |
} |
5045 |
22 Oct 18 |
nicklas |
189 |
|
5040 |
19 Oct 18 |
nicklas |
190 |
@Override |
5040 |
19 Oct 18 |
nicklas |
191 |
public boolean equals(Object o) |
5040 |
19 Oct 18 |
nicklas |
192 |
{ |
5040 |
19 Oct 18 |
nicklas |
193 |
if (!(o instanceof Region)) return false; |
5040 |
19 Oct 18 |
nicklas |
194 |
Region r = (Region)o; |
5040 |
19 Oct 18 |
nicklas |
195 |
return name.equals(r.name); |
5040 |
19 Oct 18 |
nicklas |
196 |
} |
5040 |
19 Oct 18 |
nicklas |
197 |
|
5040 |
19 Oct 18 |
nicklas |
198 |
@Override |
5040 |
19 Oct 18 |
nicklas |
199 |
public int hashCode() |
5040 |
19 Oct 18 |
nicklas |
200 |
{ |
5040 |
19 Oct 18 |
nicklas |
201 |
return name.hashCode(); |
5040 |
19 Oct 18 |
nicklas |
202 |
} |
5040 |
19 Oct 18 |
nicklas |
203 |
|
5040 |
19 Oct 18 |
nicklas |
204 |
@Override |
5040 |
19 Oct 18 |
nicklas |
205 |
public String toString() |
5040 |
19 Oct 18 |
nicklas |
206 |
{ |
5040 |
19 Oct 18 |
nicklas |
207 |
return "Region[" + name + "]["+chr+":"+start+"-"+stop+"]"; |
5040 |
19 Oct 18 |
nicklas |
208 |
} |
5040 |
19 Oct 18 |
nicklas |
209 |
|
5040 |
19 Oct 18 |
nicklas |
210 |
} |