6540 |
17 Jan 22 |
nicklas |
1 |
package net.sf.basedb.varsearch.index; |
6540 |
17 Jan 22 |
nicklas |
2 |
|
6540 |
17 Jan 22 |
nicklas |
3 |
import java.io.IOException; |
6540 |
17 Jan 22 |
nicklas |
4 |
import java.util.ArrayList; |
6540 |
17 Jan 22 |
nicklas |
5 |
import java.util.Collections; |
6540 |
17 Jan 22 |
nicklas |
6 |
import java.util.HashMap; |
6555 |
27 Jan 22 |
nicklas |
7 |
import java.util.HashSet; |
6540 |
17 Jan 22 |
nicklas |
8 |
import java.util.List; |
6540 |
17 Jan 22 |
nicklas |
9 |
import java.util.Map; |
6551 |
26 Jan 22 |
nicklas |
10 |
import java.util.Set; |
6544 |
19 Jan 22 |
nicklas |
11 |
import java.util.concurrent.ExecutorService; |
6540 |
17 Jan 22 |
nicklas |
12 |
|
6540 |
17 Jan 22 |
nicklas |
13 |
import org.apache.lucene.analysis.Analyzer; |
6540 |
17 Jan 22 |
nicklas |
14 |
import org.apache.lucene.analysis.core.KeywordAnalyzer; |
6540 |
17 Jan 22 |
nicklas |
15 |
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; |
6540 |
17 Jan 22 |
nicklas |
16 |
import org.apache.lucene.document.IntPoint; |
6540 |
17 Jan 22 |
nicklas |
17 |
import org.apache.lucene.document.Field.Store; |
6545 |
21 Jan 22 |
nicklas |
18 |
import org.apache.lucene.index.ConcurrentMergeScheduler; |
6545 |
21 Jan 22 |
nicklas |
19 |
import org.apache.lucene.index.IndexReader; |
6540 |
17 Jan 22 |
nicklas |
20 |
import org.apache.lucene.index.IndexWriter; |
6545 |
21 Jan 22 |
nicklas |
21 |
import org.apache.lucene.index.IndexWriterConfig; |
6546 |
24 Jan 22 |
nicklas |
22 |
import org.apache.lucene.index.Term; |
6545 |
21 Jan 22 |
nicklas |
23 |
import org.apache.lucene.index.TieredMergePolicy; |
6545 |
21 Jan 22 |
nicklas |
24 |
import org.apache.lucene.index.IndexWriterConfig.OpenMode; |
6546 |
24 Jan 22 |
nicklas |
25 |
import org.apache.lucene.queryparser.classic.ParseException; |
6540 |
17 Jan 22 |
nicklas |
26 |
import org.apache.lucene.queryparser.classic.QueryParser; |
6546 |
24 Jan 22 |
nicklas |
27 |
import org.apache.lucene.search.BooleanQuery; |
6545 |
21 Jan 22 |
nicklas |
28 |
import org.apache.lucene.search.IndexSearcher; |
6546 |
24 Jan 22 |
nicklas |
29 |
import org.apache.lucene.search.MatchNoDocsQuery; |
6540 |
17 Jan 22 |
nicklas |
30 |
import org.apache.lucene.search.Query; |
6546 |
24 Jan 22 |
nicklas |
31 |
import org.apache.lucene.search.TermQuery; |
6546 |
24 Jan 22 |
nicklas |
32 |
import org.apache.lucene.search.BooleanClause.Occur; |
6545 |
21 Jan 22 |
nicklas |
33 |
import org.apache.lucene.store.Directory; |
6556 |
28 Jan 22 |
nicklas |
34 |
import org.apache.lucene.store.NIOFSDirectory; |
6540 |
17 Jan 22 |
nicklas |
35 |
import org.slf4j.LoggerFactory; |
6540 |
17 Jan 22 |
nicklas |
36 |
|
6545 |
21 Jan 22 |
nicklas |
37 |
import net.sf.basedb.core.ArrayDesign; |
6545 |
21 Jan 22 |
nicklas |
38 |
import net.sf.basedb.core.DbControl; |
6545 |
21 Jan 22 |
nicklas |
39 |
import net.sf.basedb.core.File; |
6545 |
21 Jan 22 |
nicklas |
40 |
import net.sf.basedb.core.Include; |
6546 |
24 Jan 22 |
nicklas |
41 |
import net.sf.basedb.core.ItemContext; |
6556 |
28 Jan 22 |
nicklas |
42 |
import net.sf.basedb.core.ItemNotFoundException; |
6545 |
21 Jan 22 |
nicklas |
43 |
import net.sf.basedb.core.ItemQuery; |
6540 |
17 Jan 22 |
nicklas |
44 |
import net.sf.basedb.core.RawBioAssay; |
6544 |
19 Jan 22 |
nicklas |
45 |
import net.sf.basedb.core.SimpleProgressReporter; |
6545 |
21 Jan 22 |
nicklas |
46 |
import net.sf.basedb.core.query.Expressions; |
6545 |
21 Jan 22 |
nicklas |
47 |
import net.sf.basedb.core.query.Hql; |
6546 |
24 Jan 22 |
nicklas |
48 |
import net.sf.basedb.core.query.QueryFilterAction; |
6545 |
21 Jan 22 |
nicklas |
49 |
import net.sf.basedb.core.query.Restrictions; |
6540 |
17 Jan 22 |
nicklas |
50 |
import net.sf.basedb.util.FileUtil; |
6546 |
24 Jan 22 |
nicklas |
51 |
import net.sf.basedb.util.Values; |
6546 |
24 Jan 22 |
nicklas |
52 |
import net.sf.basedb.util.extensions.InvokationContext; |
7074 |
24 Mar 23 |
nicklas |
53 |
import net.sf.basedb.util.extensions.logging.ExtensionsLog; |
7074 |
24 Mar 23 |
nicklas |
54 |
import net.sf.basedb.util.extensions.logging.ExtensionsLogger; |
6552 |
26 Jan 22 |
nicklas |
55 |
import net.sf.basedb.varsearch.VarSearch; |
6552 |
26 Jan 22 |
nicklas |
56 |
import net.sf.basedb.varsearch.LuceneColumnFactory.LuceneColumnAction; |
6540 |
17 Jan 22 |
nicklas |
57 |
import net.sf.basedb.varsearch.analyze.KeywordListAnalyzer; |
6545 |
21 Jan 22 |
nicklas |
58 |
import net.sf.basedb.varsearch.dao.Datafiletype; |
6540 |
17 Jan 22 |
nicklas |
59 |
import net.sf.basedb.varsearch.fields.ListField; |
6545 |
21 Jan 22 |
nicklas |
60 |
import net.sf.basedb.varsearch.index.VariantCallIndex.VariantCallIndexer; |
6550 |
25 Jan 22 |
nicklas |
61 |
import net.sf.basedb.varsearch.query.AllDocsCollector; |
6546 |
24 Jan 22 |
nicklas |
62 |
import net.sf.basedb.varsearch.query.AutoPrefixer; |
6540 |
17 Jan 22 |
nicklas |
63 |
import net.sf.basedb.varsearch.query.FieldAwareQueryParser; |
6546 |
24 Jan 22 |
nicklas |
64 |
import net.sf.basedb.varsearch.query.QueryField; |
6552 |
26 Jan 22 |
nicklas |
65 |
import net.sf.basedb.varsearch.query.QueryResult; |
6551 |
26 Jan 22 |
nicklas |
66 |
import net.sf.basedb.varsearch.query.RawBioAssayIdCollector; |
6553 |
27 Jan 22 |
nicklas |
67 |
import net.sf.basedb.varsearch.query.RawBioAssayIdCollectorManager; |
6546 |
24 Jan 22 |
nicklas |
68 |
import net.sf.basedb.varsearch.query.SnpCollector; |
6540 |
17 Jan 22 |
nicklas |
69 |
import net.sf.basedb.varsearch.query.SumFieldCollector; |
6546 |
24 Jan 22 |
nicklas |
70 |
import net.sf.basedb.varsearch.query.LuceneQueryFactory.LuceneFilterAction; |
6553 |
27 Jan 22 |
nicklas |
71 |
import net.sf.basedb.varsearch.query.QueryCache; |
6550 |
25 Jan 22 |
nicklas |
72 |
import net.sf.basedb.varsearch.query.SnpCollector.SnpDocument; |
6545 |
21 Jan 22 |
nicklas |
73 |
import net.sf.basedb.varsearch.service.VarSearchService; |
6540 |
17 Jan 22 |
nicklas |
74 |
import net.sf.basedb.varsearch.vcf.VcfParser; |
6540 |
17 Jan 22 |
nicklas |
75 |
import net.sf.basedb.varsearch.vcf.VcfParser.VcfHeader; |
6540 |
17 Jan 22 |
nicklas |
76 |
import net.sf.basedb.varsearch.vcf.VcfParser.VcfLine; |
6540 |
17 Jan 22 |
nicklas |
77 |
|
6540 |
17 Jan 22 |
nicklas |
78 |
/** |
6540 |
17 Jan 22 |
nicklas |
Index implementation for VCF files that are produced by the OncoArray 500K |
6540 |
17 Jan 22 |
nicklas |
genotyping pipeline. There are too many variants to be able to index each one |
6540 |
17 Jan 22 |
nicklas |
of them as a separate document. On the other hand there is "only" about 500K |
6540 |
17 Jan 22 |
nicklas |
genomic locations so all annotations that are related to gene and location |
6545 |
21 Jan 22 |
nicklas |
can be indexed in a static database and then we only need to have a |
6540 |
17 Jan 22 |
nicklas |
single document for each rawbioassay that specify a list of SNP-ID:s that |
6540 |
17 Jan 22 |
nicklas |
are 0/0, 0/1 and 1/1 genotypes respectively. This complicates the query results |
6540 |
17 Jan 22 |
nicklas |
processing a lot... |
6540 |
17 Jan 22 |
nicklas |
87 |
|
6540 |
17 Jan 22 |
nicklas |
@author nicklas |
6540 |
17 Jan 22 |
nicklas |
@since 1.5 |
6540 |
17 Jan 22 |
nicklas |
90 |
*/ |
6540 |
17 Jan 22 |
nicklas |
91 |
public class OncoArrayIndex |
6540 |
17 Jan 22 |
nicklas |
92 |
extends LuceneIndex |
6540 |
17 Jan 22 |
nicklas |
93 |
{ |
6546 |
24 Jan 22 |
nicklas |
94 |
/** |
6546 |
24 Jan 22 |
nicklas |
Max number of SNPs that can be matched in a single query. |
6546 |
24 Jan 22 |
nicklas |
96 |
*/ |
6553 |
27 Jan 22 |
nicklas |
97 |
public static final int MAX_SNP_HITS = 200; |
6546 |
24 Jan 22 |
nicklas |
98 |
|
7074 |
24 Mar 23 |
nicklas |
99 |
private static final ExtensionsLogger logger = |
7074 |
24 Mar 23 |
nicklas |
100 |
ExtensionsLog.getLogger(VarSearchService.ID, true).wrap(LoggerFactory.getLogger(OncoArrayIndex.class)); |
6540 |
17 Jan 22 |
nicklas |
101 |
|
6545 |
21 Jan 22 |
nicklas |
102 |
private java.io.File rootPathPrefix; |
6545 |
21 Jan 22 |
nicklas |
103 |
private java.io.File rbaPathPrefix; |
6545 |
21 Jan 22 |
nicklas |
104 |
private java.io.File refPathPrefix; |
6545 |
21 Jan 22 |
nicklas |
105 |
private java.io.File refPath; |
6545 |
21 Jan 22 |
nicklas |
106 |
private Directory refDirectory; |
6545 |
21 Jan 22 |
nicklas |
107 |
private IndexReader refReader; |
6545 |
21 Jan 22 |
nicklas |
108 |
private IndexSearcher refSearcher; |
6546 |
24 Jan 22 |
nicklas |
109 |
private Analyzer refAnalyzer; |
6546 |
24 Jan 22 |
nicklas |
110 |
private Map<String, QueryField> refQueryFields; |
6545 |
21 Jan 22 |
nicklas |
111 |
|
6553 |
27 Jan 22 |
nicklas |
112 |
private QueryCache snpGtCache; |
6553 |
27 Jan 22 |
nicklas |
113 |
|
6540 |
17 Jan 22 |
nicklas |
114 |
/**
	Create a new index instance. The id is passed on to the
	superclass constructor unchanged.
	@param id The id of the index
*/
public OncoArrayIndex(String id)
{
	super(id);
}
6544 |
19 Jan 22 |
nicklas |
118 |
|
6540 |
17 Jan 22 |
nicklas |
119 |
/** |
6544 |
19 Jan 22 |
nicklas |
Overrides the default open() method so that we can check if the |
6544 |
19 Jan 22 |
nicklas |
reference VCF from the array design has been indexed. |
6544 |
19 Jan 22 |
nicklas |
122 |
*/ |
6544 |
19 Jan 22 |
nicklas |
123 |
@Override |
6545 |
21 Jan 22 |
nicklas |
124 |
public void open(java.io.File pathPrefix, ExecutorService executor) |
6544 |
19 Jan 22 |
nicklas |
125 |
throws IOException |
6544 |
19 Jan 22 |
nicklas |
126 |
{ |
6545 |
21 Jan 22 |
nicklas |
127 |
this.rootPathPrefix = pathPrefix; |
6545 |
21 Jan 22 |
nicklas |
128 |
this.rbaPathPrefix = new java.io.File(pathPrefix, "rba"); |
6545 |
21 Jan 22 |
nicklas |
129 |
this.refPathPrefix = new java.io.File(pathPrefix, "ref"); |
6545 |
21 Jan 22 |
nicklas |
130 |
|
6545 |
21 Jan 22 |
nicklas |
131 |
this.refPath = getExistingOrNewPath(refPathPrefix); |
6545 |
21 Jan 22 |
nicklas |
132 |
this.refDirectory = createIndexIfNeeded(refPath); |
6545 |
21 Jan 22 |
nicklas |
133 |
this.refReader = createIndexReader(refDirectory); |
6545 |
21 Jan 22 |
nicklas |
134 |
this.refSearcher = createIndexSearcher(refReader); |
6546 |
24 Jan 22 |
nicklas |
135 |
this.refAnalyzer = VariantCallIndexer.createAnalyzer(true); |
6546 |
24 Jan 22 |
nicklas |
136 |
this.refQueryFields = VariantCallIndexer.createQueryFields(true); |
6553 |
27 Jan 22 |
nicklas |
137 |
this.snpGtCache = new QueryCache(60); |
6605 |
23 Feb 22 |
nicklas |
138 |
super.open(rbaPathPrefix, executor); |
6544 |
19 Jan 22 |
nicklas |
139 |
} |
6544 |
19 Jan 22 |
nicklas |
140 |
|
6545 |
21 Jan 22 |
nicklas |
141 |
@Override |
6553 |
27 Jan 22 |
nicklas |
142 |
protected void reOpen() |
6553 |
27 Jan 22 |
nicklas |
143 |
throws IOException |
6553 |
27 Jan 22 |
nicklas |
144 |
{ |
6553 |
27 Jan 22 |
nicklas |
145 |
super.reOpen(); |
6553 |
27 Jan 22 |
nicklas |
146 |
this.snpGtCache = new QueryCache(60); |
6553 |
27 Jan 22 |
nicklas |
147 |
} |
6553 |
27 Jan 22 |
nicklas |
148 |
|
6553 |
27 Jan 22 |
nicklas |
149 |
@Override |
6545 |
21 Jan 22 |
nicklas |
150 |
public void close() |
6545 |
21 Jan 22 |
nicklas |
151 |
{ |
6545 |
21 Jan 22 |
nicklas |
152 |
super.close(); |
6545 |
21 Jan 22 |
nicklas |
153 |
FileUtil.close(refReader); |
6545 |
21 Jan 22 |
nicklas |
154 |
FileUtil.close(refDirectory); |
6545 |
21 Jan 22 |
nicklas |
155 |
refReader = null; |
6545 |
21 Jan 22 |
nicklas |
156 |
refSearcher = null; |
6545 |
21 Jan 22 |
nicklas |
157 |
refDirectory = null; |
6546 |
24 Jan 22 |
nicklas |
158 |
refAnalyzer = null; |
6546 |
24 Jan 22 |
nicklas |
159 |
refQueryFields = null; |
6553 |
27 Jan 22 |
nicklas |
160 |
snpGtCache = null; |
6545 |
21 Jan 22 |
nicklas |
161 |
} |
6545 |
21 Jan 22 |
nicklas |
162 |
|
6544 |
19 Jan 22 |
nicklas |
163 |
/** |
6605 |
23 Feb 22 |
nicklas |
Overrides the default implementation since we also need to DISABLE |
6605 |
23 Feb 22 |
nicklas |
queries if the reference index is empty. |
6605 |
23 Feb 22 |
nicklas |
@since 1.6 |
6605 |
23 Feb 22 |
nicklas |
167 |
*/ |
6605 |
23 Feb 22 |
nicklas |
168 |
@Override |
6605 |
23 Feb 22 |
nicklas |
169 |
public Status autoSetQueryStatus() |
6605 |
23 Feb 22 |
nicklas |
170 |
{ |
6605 |
23 Feb 22 |
nicklas |
171 |
if (getNumIndexedSNPs() == 0) |
6605 |
23 Feb 22 |
nicklas |
172 |
{ |
6605 |
23 Feb 22 |
nicklas |
173 |
setQueryStatus(Status.DISABLED); |
6605 |
23 Feb 22 |
nicklas |
174 |
} |
6605 |
23 Feb 22 |
nicklas |
175 |
else |
6605 |
23 Feb 22 |
nicklas |
176 |
{ |
6605 |
23 Feb 22 |
nicklas |
177 |
super.autoSetQueryStatus(); |
6605 |
23 Feb 22 |
nicklas |
178 |
} |
6605 |
23 Feb 22 |
nicklas |
179 |
return getQueryStatus(); |
6605 |
23 Feb 22 |
nicklas |
180 |
} |
6605 |
23 Feb 22 |
nicklas |
181 |
|
6605 |
23 Feb 22 |
nicklas |
182 |
/** |
6545 |
21 Jan 22 |
nicklas |
Get the path to the index directory. |
6545 |
21 Jan 22 |
nicklas |
184 |
*/ |
6545 |
21 Jan 22 |
nicklas |
185 |
@Override |
6545 |
21 Jan 22 |
nicklas |
186 |
public java.io.File getPath() |
6545 |
21 Jan 22 |
nicklas |
187 |
{ |
6545 |
21 Jan 22 |
nicklas |
188 |
return rootPathPrefix; |
6545 |
21 Jan 22 |
nicklas |
189 |
} |
6545 |
21 Jan 22 |
nicklas |
190 |
|
6545 |
21 Jan 22 |
nicklas |
191 |
/** |
6545 |
21 Jan 22 |
nicklas |
Get the size of the index database on disk. |
6545 |
21 Jan 22 |
nicklas |
193 |
*/ |
6545 |
21 Jan 22 |
nicklas |
194 |
@Override |
6545 |
21 Jan 22 |
nicklas |
195 |
public long getSizeOnDisk() |
6545 |
21 Jan 22 |
nicklas |
196 |
{ |
6545 |
21 Jan 22 |
nicklas |
197 |
if (refPath == null) return -1; |
6545 |
21 Jan 22 |
nicklas |
198 |
return super.getSizeOnDisk()+getSizeOfDir(refPath); |
6545 |
21 Jan 22 |
nicklas |
199 |
} |
6545 |
21 Jan 22 |
nicklas |
200 |
|
6545 |
21 Jan 22 |
nicklas |
201 |
/** |
6540 |
17 Jan 22 |
nicklas |
The analyzer is a keyword analyzer that also support lists of comma- |
6540 |
17 Jan 22 |
nicklas |
and white-space separated values. |
6540 |
17 Jan 22 |
nicklas |
204 |
*/ |
6540 |
17 Jan 22 |
nicklas |
205 |
@Override |
6540 |
17 Jan 22 |
nicklas |
206 |
protected Analyzer createAnalyzer() |
6540 |
17 Jan 22 |
nicklas |
207 |
{ |
6540 |
17 Jan 22 |
nicklas |
208 |
Map<String, Analyzer> fieldAnalyzers = new HashMap<>(); |
6540 |
17 Jan 22 |
nicklas |
209 |
fieldAnalyzers.put("snps", new KeywordListAnalyzer()); |
6540 |
17 Jan 22 |
nicklas |
210 |
return new PerFieldAnalyzerWrapper(new KeywordAnalyzer(), fieldAnalyzers); |
6540 |
17 Jan 22 |
nicklas |
211 |
} |
6540 |
17 Jan 22 |
nicklas |
212 |
|
6546 |
24 Jan 22 |
nicklas |
213 |
|
6540 |
17 Jan 22 |
nicklas |
214 |
@Override |
6540 |
17 Jan 22 |
nicklas |
215 |
protected Indexer createIndexer(IndexWriter writer, int num, RawBioAssay rba, List<VcfFile> vcfFiles) |
6540 |
17 Jan 22 |
nicklas |
216 |
{ |
6540 |
17 Jan 22 |
nicklas |
217 |
return new OncoArrayIndexer(this, writer, num, rba, vcfFiles); |
6540 |
17 Jan 22 |
nicklas |
218 |
} |
6540 |
17 Jan 22 |
nicklas |
219 |
|
6540 |
17 Jan 22 |
nicklas |
220 |
/** |
6540 |
17 Jan 22 |
nicklas |
Create a new parser for creating queries from strings. |
6540 |
17 Jan 22 |
nicklas |
222 |
*/ |
6540 |
17 Jan 22 |
nicklas |
223 |
@Override |
6540 |
17 Jan 22 |
nicklas |
224 |
public QueryParser createQueryParser() |
6540 |
17 Jan 22 |
nicklas |
225 |
{ |
6540 |
17 Jan 22 |
nicklas |
226 |
return new FieldAwareQueryParser("snps", getAnalyzer(), Collections.emptyMap()); |
6540 |
17 Jan 22 |
nicklas |
227 |
} |
6540 |
17 Jan 22 |
nicklas |
228 |
|
6540 |
17 Jan 22 |
nicklas |
229 |
/** |
6540 |
17 Jan 22 |
nicklas |
Summarize the 'numGenotypes' field for all entries with a 'mainId'. |
6540 |
17 Jan 22 |
nicklas |
231 |
*/ |
6540 |
17 Jan 22 |
nicklas |
232 |
@Override |
6540 |
17 Jan 22 |
nicklas |
233 |
public long getNumVariants() |
6540 |
17 Jan 22 |
nicklas |
234 |
throws IOException |
6540 |
17 Jan 22 |
nicklas |
235 |
{ |
6540 |
17 Jan 22 |
nicklas |
236 |
if (getWorkingStatus() == Status.DISABLED) return -1; |
6540 |
17 Jan 22 |
nicklas |
237 |
Query query = IntPoint.newRangeQuery("mainId", 0, Integer.MAX_VALUE); |
6550 |
25 Jan 22 |
nicklas |
238 |
SumFieldCollector collector = new SumFieldCollector("numGenotypes"); |
6540 |
17 Jan 22 |
nicklas |
239 |
getIndexSearcher().search(query, collector); |
6540 |
17 Jan 22 |
nicklas |
240 |
return collector.getSum(); |
6540 |
17 Jan 22 |
nicklas |
241 |
} |
6540 |
17 Jan 22 |
nicklas |
242 |
|
6540 |
17 Jan 22 |
nicklas |
243 |
/** |
6544 |
19 Jan 22 |
nicklas |
Get the number of indexed SNPs from the reference design. |
6544 |
19 Jan 22 |
nicklas |
245 |
*/ |
6544 |
19 Jan 22 |
nicklas |
246 |
public int getNumIndexedSNPs() |
6544 |
19 Jan 22 |
nicklas |
247 |
{ |
6605 |
23 Feb 22 |
nicklas |
248 |
return refReader != null ? refReader.numDocs() : 0; |
6544 |
19 Jan 22 |
nicklas |
249 |
} |
6544 |
19 Jan 22 |
nicklas |
250 |
|
6544 |
19 Jan 22 |
nicklas |
251 |
/** |
6546 |
24 Jan 22 |
nicklas |
Get a reader for reference design database. |
6546 |
24 Jan 22 |
nicklas |
253 |
*/ |
6546 |
24 Jan 22 |
nicklas |
254 |
public IndexReader getRefReader() |
6546 |
24 Jan 22 |
nicklas |
255 |
{ |
6546 |
24 Jan 22 |
nicklas |
256 |
return refReader; |
6546 |
24 Jan 22 |
nicklas |
257 |
} |
6546 |
24 Jan 22 |
nicklas |
258 |
|
6546 |
24 Jan 22 |
nicklas |
259 |
/** |
6546 |
24 Jan 22 |
nicklas |
Get a searcher for reference design database. |
6546 |
24 Jan 22 |
nicklas |
261 |
*/ |
6546 |
24 Jan 22 |
nicklas |
262 |
public IndexSearcher getRefSearcher() |
6546 |
24 Jan 22 |
nicklas |
263 |
{ |
6546 |
24 Jan 22 |
nicklas |
264 |
return refSearcher; |
6546 |
24 Jan 22 |
nicklas |
265 |
} |
6546 |
24 Jan 22 |
nicklas |
266 |
|
6546 |
24 Jan 22 |
nicklas |
267 |
/** |
6546 |
24 Jan 22 |
nicklas |
Create a query parser for reference design database. |
6546 |
24 Jan 22 |
nicklas |
269 |
*/ |
6555 |
27 Jan 22 |
nicklas |
270 |
public QueryParser createRefQueryParser(final OncoArrayFilterAction filter) |
6546 |
24 Jan 22 |
nicklas |
271 |
{ |
6555 |
27 Jan 22 |
nicklas |
272 |
return new FieldAwareQueryParser("gene", refAnalyzer, refQueryFields) |
6555 |
27 Jan 22 |
nicklas |
273 |
{ |
6555 |
27 Jan 22 |
nicklas |
274 |
@Override |
6555 |
27 Jan 22 |
nicklas |
275 |
public Query newTermQuery(Term term, float boost) |
6555 |
27 Jan 22 |
nicklas |
276 |
{ |
6555 |
27 Jan 22 |
nicklas |
// Ignore queries against gt: and move that to the filter implementation |
6555 |
27 Jan 22 |
nicklas |
278 |
if ("gt".equals(term.field())) |
6555 |
27 Jan 22 |
nicklas |
279 |
{ |
6555 |
27 Jan 22 |
nicklas |
280 |
if (filter != null) filter.setRequestedGt(term.text()); |
6555 |
27 Jan 22 |
nicklas |
281 |
return null; |
6555 |
27 Jan 22 |
nicklas |
282 |
} |
6555 |
27 Jan 22 |
nicklas |
283 |
return super.newTermQuery(term, boost); |
6555 |
27 Jan 22 |
nicklas |
284 |
} |
6555 |
27 Jan 22 |
nicklas |
285 |
}; |
6546 |
24 Jan 22 |
nicklas |
286 |
} |
6546 |
24 Jan 22 |
nicklas |
287 |
|
6546 |
24 Jan 22 |
nicklas |
288 |
/** |
6551 |
26 Jan 22 |
nicklas |
Get the id of all raw bioassays where the specified variant has been found. |
6551 |
26 Jan 22 |
nicklas |
290 |
*/ |
6551 |
26 Jan 22 |
nicklas |
291 |
@Override |
6551 |
26 Jan 22 |
nicklas |
292 |
public Set<Integer> getRawBioAssaysWithVariant(String chrom, long pos, String ref, String alt, String snpId) |
6551 |
26 Jan 22 |
nicklas |
293 |
throws IOException |
6551 |
26 Jan 22 |
nicklas |
294 |
{ |
6551 |
26 Jan 22 |
nicklas |
295 |
if (getWorkingStatus() == Status.DISABLED) return Collections.emptySet(); |
6551 |
26 Jan 22 |
nicklas |
296 |
BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
6551 |
26 Jan 22 |
nicklas |
297 |
builder.add(new TermQuery(new Term("snps", snpId)), Occur.MUST); |
6551 |
26 Jan 22 |
nicklas |
298 |
if (getIndexAllGenotypes()) |
6551 |
26 Jan 22 |
nicklas |
299 |
{ |
6551 |
26 Jan 22 |
nicklas |
300 |
builder.add(new TermQuery(new Term("gt", "0/0")), Occur.MUST_NOT); |
6551 |
26 Jan 22 |
nicklas |
301 |
} |
6551 |
26 Jan 22 |
nicklas |
302 |
Query query = builder.build(); |
6553 |
27 Jan 22 |
nicklas |
303 |
RawBioAssayIdCollector collector = new RawBioAssayIdCollector("rbaId", -1); |
6551 |
26 Jan 22 |
nicklas |
304 |
getIndexSearcher().search(query, collector); |
6551 |
26 Jan 22 |
nicklas |
305 |
return collector.getRbaIds(); |
6551 |
26 Jan 22 |
nicklas |
306 |
} |
6551 |
26 Jan 22 |
nicklas |
307 |
|
6551 |
26 Jan 22 |
nicklas |
308 |
/** |
6556 |
28 Jan 22 |
nicklas |
Get the "OncoArray500K" array design. |
6556 |
28 Jan 22 |
nicklas |
Returns null if it can't be found. |
6556 |
28 Jan 22 |
nicklas |
311 |
*/ |
6556 |
28 Jan 22 |
nicklas |
312 |
public ArrayDesign getArrayDesign(DbControl dc) |
6556 |
28 Jan 22 |
nicklas |
313 |
{ |
6556 |
28 Jan 22 |
nicklas |
314 |
ItemQuery<ArrayDesign> query = ArrayDesign.getQuery(); |
6556 |
28 Jan 22 |
nicklas |
315 |
query.setIncludes(Include.ALL); |
6556 |
28 Jan 22 |
nicklas |
316 |
query.restrict(Restrictions.eq(Hql.property("name"), Expressions.string("OncoArray500K"))); |
6556 |
28 Jan 22 |
nicklas |
317 |
List<ArrayDesign> list = query.list(dc); |
6556 |
28 Jan 22 |
nicklas |
318 |
ArrayDesign design = null; |
6556 |
28 Jan 22 |
nicklas |
319 |
if (list.size() > 0) |
6556 |
28 Jan 22 |
nicklas |
320 |
{ |
6556 |
28 Jan 22 |
nicklas |
321 |
design = list.get(0); |
6556 |
28 Jan 22 |
nicklas |
322 |
} |
6556 |
28 Jan 22 |
nicklas |
323 |
return design; |
6556 |
28 Jan 22 |
nicklas |
324 |
} |
6556 |
28 Jan 22 |
nicklas |
325 |
|
6556 |
28 Jan 22 |
nicklas |
326 |
/** |
6544 |
19 Jan 22 |
nicklas |
The custom action "INDEX_SNP" is used to index the reference VCF |
6544 |
19 Jan 22 |
nicklas |
file on the "OncoArray500K" array design. |
6544 |
19 Jan 22 |
nicklas |
329 |
*/ |
6544 |
19 Jan 22 |
nicklas |
330 |
@Override |
6544 |
19 Jan 22 |
nicklas |
331 |
public void doCustomAction(String customAction) |
6544 |
19 Jan 22 |
nicklas |
332 |
{ |
6545 |
21 Jan 22 |
nicklas |
333 |
if (!"INDEX_SNP".equals(customAction)) return; |
6545 |
21 Jan 22 |
nicklas |
334 |
|
7074 |
24 Mar 23 |
nicklas |
335 |
logger.info("Starting full rebuild of reference index: " + getName()); |
6545 |
21 Jan 22 |
nicklas |
336 |
|
6545 |
21 Jan 22 |
nicklas |
337 |
SimpleProgressReporter progress = null; |
6545 |
21 Jan 22 |
nicklas |
338 |
DbControl dc = null; |
6545 |
21 Jan 22 |
nicklas |
339 |
IndexWriter writer = null; |
6556 |
28 Jan 22 |
nicklas |
340 |
Directory rebuildDir = null; |
6556 |
28 Jan 22 |
nicklas |
// A directory that we need to get rid of at the end -- either the old index, or the rebuild-directory (if something failed) |
6556 |
28 Jan 22 |
nicklas |
342 |
java.io.File toDelete = null; |
6556 |
28 Jan 22 |
nicklas |
343 |
boolean failed = false; |
6544 |
19 Jan 22 |
nicklas |
344 |
try |
6544 |
19 Jan 22 |
nicklas |
345 |
{ |
6604 |
23 Feb 22 |
nicklas |
346 |
dc = VarSearchService.getInstance().getRootSessionControl().newDbControl("Variant search: Rebuild reference index"); |
6545 |
21 Jan 22 |
nicklas |
347 |
progress = setProgressReporter(new SimpleProgressReporter(null)); |
6544 |
19 Jan 22 |
nicklas |
348 |
setWorkingStatus(Status.INDEXING); |
6545 |
21 Jan 22 |
nicklas |
349 |
|
6556 |
28 Jan 22 |
nicklas |
350 |
ArrayDesign design = getArrayDesign(dc); |
6556 |
28 Jan 22 |
nicklas |
351 |
if (design == null) |
6556 |
28 Jan 22 |
nicklas |
352 |
{ |
6556 |
28 Jan 22 |
nicklas |
353 |
throw new ItemNotFoundException("ArrayDesign[name=OncoArray500K]"); |
6556 |
28 Jan 22 |
nicklas |
354 |
} |
6545 |
21 Jan 22 |
nicklas |
355 |
|
6556 |
28 Jan 22 |
nicklas |
356 |
File vcf = Datafiletype.VCF_DESIGN.getFile(dc, design); |
6556 |
28 Jan 22 |
nicklas |
357 |
if (vcf == null) |
6556 |
28 Jan 22 |
nicklas |
358 |
{ |
6556 |
28 Jan 22 |
nicklas |
359 |
throw new ItemNotFoundException("VCF file[ArrayDesign=OncoArray500K"); |
6556 |
28 Jan 22 |
nicklas |
360 |
} |
6545 |
21 Jan 22 |
nicklas |
361 |
|
6556 |
28 Jan 22 |
nicklas |
362 |
java.io.File rebuildPath = getNewPath(refPathPrefix); |
6556 |
28 Jan 22 |
nicklas |
363 |
toDelete = rebuildPath; |
6556 |
28 Jan 22 |
nicklas |
364 |
rebuildDir = new NIOFSDirectory(rebuildPath.toPath()); |
6549 |
25 Jan 22 |
nicklas |
365 |
IndexWriterConfig config = new IndexWriterConfig(refAnalyzer); |
6545 |
21 Jan 22 |
nicklas |
366 |
config.setOpenMode(OpenMode.CREATE_OR_APPEND); |
6545 |
21 Jan 22 |
nicklas |
367 |
|
6545 |
21 Jan 22 |
nicklas |
// Create a multi-threader MergeScheduler with 1-8 threads |
6545 |
21 Jan 22 |
nicklas |
369 |
int numMergeThreads = getNumThreads(1, 8); |
6545 |
21 Jan 22 |
nicklas |
370 |
ConcurrentMergeScheduler mergeSceduler = new ConcurrentMergeScheduler(); |
6545 |
21 Jan 22 |
nicklas |
371 |
mergeSceduler.setMaxMergesAndThreads(numMergeThreads*2, numMergeThreads); |
6545 |
21 Jan 22 |
nicklas |
372 |
config.setMergeScheduler(mergeSceduler); |
6545 |
21 Jan 22 |
nicklas |
373 |
|
6545 |
21 Jan 22 |
nicklas |
// Create a MergePolicy for 1GB segments |
6545 |
21 Jan 22 |
nicklas |
375 |
TieredMergePolicy mergePolicy = new TieredMergePolicy(); |
6545 |
21 Jan 22 |
nicklas |
376 |
mergePolicy.setMaxMergedSegmentMB(5000); // The default is 5000 |
6545 |
21 Jan 22 |
nicklas |
377 |
config.setMergePolicy(mergePolicy); |
6545 |
21 Jan 22 |
nicklas |
378 |
|
6556 |
28 Jan 22 |
nicklas |
379 |
writer = new IndexWriter(rebuildDir, config); |
6545 |
21 Jan 22 |
nicklas |
380 |
|
6556 |
28 Jan 22 |
nicklas |
381 |
VariantCallIndexer indexer = new VariantCallIndexer(this, writer, design, vcf); |
6556 |
28 Jan 22 |
nicklas |
382 |
indexer.setProgressReporter(progress); |
6545 |
21 Jan 22 |
nicklas |
383 |
indexer.call(); |
6545 |
21 Jan 22 |
nicklas |
384 |
writer.commit(); |
6556 |
28 Jan 22 |
nicklas |
385 |
FileUtil.close(writer); |
6556 |
28 Jan 22 |
nicklas |
386 |
writer = null; |
6545 |
21 Jan 22 |
nicklas |
387 |
|
6556 |
28 Jan 22 |
nicklas |
388 |
design.setNumFileFeatures(indexer.getNumGenotypes()); |
6556 |
28 Jan 22 |
nicklas |
389 |
dc.commit(); |
7074 |
24 Mar 23 |
nicklas |
390 |
logger.info("Reference index rebuilt successfully: "+getName()); |
6556 |
28 Jan 22 |
nicklas |
391 |
|
6556 |
28 Jan 22 |
nicklas |
392 |
FileUtil.close(rebuildDir); |
6556 |
28 Jan 22 |
nicklas |
393 |
toDelete = refPath; |
6556 |
28 Jan 22 |
nicklas |
394 |
|
6556 |
28 Jan 22 |
nicklas |
395 |
FileUtil.close(refReader); |
6556 |
28 Jan 22 |
nicklas |
396 |
FileUtil.close(refDirectory); |
6556 |
28 Jan 22 |
nicklas |
397 |
|
6556 |
28 Jan 22 |
nicklas |
398 |
refPath = rebuildPath; |
6556 |
28 Jan 22 |
nicklas |
399 |
refDirectory = new NIOFSDirectory(refPath.toPath()); |
6545 |
21 Jan 22 |
nicklas |
400 |
refReader = createIndexReader(refDirectory); |
6545 |
21 Jan 22 |
nicklas |
401 |
refSearcher = createIndexSearcher(refReader); |
6544 |
19 Jan 22 |
nicklas |
402 |
} |
6545 |
21 Jan 22 |
nicklas |
403 |
catch (Exception ex) |
6545 |
21 Jan 22 |
nicklas |
404 |
{ |
6556 |
28 Jan 22 |
nicklas |
405 |
setError(new RuntimeException(customAction + " failed: "+ex.getMessage(), ex)); |
6545 |
21 Jan 22 |
nicklas |
406 |
logger.error("Custom action '"+customAction+"' failed ("+getName()+")", ex); |
6556 |
28 Jan 22 |
nicklas |
407 |
if (writer != null) |
6556 |
28 Jan 22 |
nicklas |
408 |
{ |
6556 |
28 Jan 22 |
nicklas |
409 |
try |
6556 |
28 Jan 22 |
nicklas |
410 |
{ |
6556 |
28 Jan 22 |
nicklas |
411 |
writer.rollback(); |
6556 |
28 Jan 22 |
nicklas |
412 |
} |
6556 |
28 Jan 22 |
nicklas |
413 |
catch (Exception ex2) |
6556 |
28 Jan 22 |
nicklas |
414 |
{ |
6556 |
28 Jan 22 |
nicklas |
415 |
logger.warn("Exception during rollback", ex2); |
6556 |
28 Jan 22 |
nicklas |
416 |
} |
6556 |
28 Jan 22 |
nicklas |
417 |
} |
6545 |
21 Jan 22 |
nicklas |
418 |
} |
6544 |
19 Jan 22 |
nicklas |
419 |
finally |
6544 |
19 Jan 22 |
nicklas |
420 |
{ |
6556 |
28 Jan 22 |
nicklas |
421 |
FileUtil.close(writer); |
6556 |
28 Jan 22 |
nicklas |
422 |
FileUtil.close(rebuildDir); |
6556 |
28 Jan 22 |
nicklas |
423 |
if (toDelete != null) |
6556 |
28 Jan 22 |
nicklas |
424 |
{ |
6556 |
28 Jan 22 |
nicklas |
425 |
logger.debug("Deleting "+(failed?"rebuild":"old")+" ref index ("+getName()+"): " + toDelete); |
6556 |
28 Jan 22 |
nicklas |
426 |
int numDeleted = FileUtil.deleteTempDirectory(toDelete); |
6556 |
28 Jan 22 |
nicklas |
427 |
if (toDelete.exists()) |
6556 |
28 Jan 22 |
nicklas |
428 |
{ |
6556 |
28 Jan 22 |
nicklas |
429 |
logger.warn((failed?"Rebuild":"Old")+" ref index could not be deleted ("+getName()+"): " + toDelete); |
6556 |
28 Jan 22 |
nicklas |
430 |
} |
6556 |
28 Jan 22 |
nicklas |
431 |
else |
6556 |
28 Jan 22 |
nicklas |
432 |
{ |
6556 |
28 Jan 22 |
nicklas |
433 |
logger.debug((failed?"Rebuild":"Old")+" ref index deleted ("+getName()+"): " + toDelete); |
6556 |
28 Jan 22 |
nicklas |
434 |
} |
6556 |
28 Jan 22 |
nicklas |
435 |
} |
6544 |
19 Jan 22 |
nicklas |
436 |
setWorkingStatus(Status.IDLE); |
6544 |
19 Jan 22 |
nicklas |
437 |
setProgressReporter(null); |
6545 |
21 Jan 22 |
nicklas |
438 |
if (dc != null) dc.close(); |
6544 |
19 Jan 22 |
nicklas |
439 |
} |
6544 |
19 Jan 22 |
nicklas |
440 |
} |
6544 |
19 Jan 22 |
nicklas |
441 |
|
6544 |
19 Jan 22 |
nicklas |
442 |
/** |
6546 |
24 Jan 22 |
nicklas |
Creates a OncoArrayFilterAction. |
6546 |
24 Jan 22 |
nicklas |
444 |
*/ |
6546 |
24 Jan 22 |
nicklas |
445 |
@Override |
6546 |
24 Jan 22 |
nicklas |
446 |
public OncoArrayFilterAction createFilterAction(InvokationContext<? super QueryFilterAction> context) |
6546 |
24 Jan 22 |
nicklas |
447 |
{ |
6551 |
26 Jan 22 |
nicklas |
448 |
return new OncoArrayFilterAction(context == null ? null : context.getClientContext().getCurrentItem(), this); |
6546 |
24 Jan 22 |
nicklas |
449 |
} |
6552 |
26 Jan 22 |
nicklas |
450 |
|
6552 |
26 Jan 22 |
nicklas |
451 |
@Override |
6552 |
26 Jan 22 |
nicklas |
452 |
public OncoArrayColumnAction createColumnAction(LuceneFilterAction<?> filter) |
6552 |
26 Jan 22 |
nicklas |
453 |
{ |
6552 |
26 Jan 22 |
nicklas |
454 |
return new OncoArrayColumnAction(this, (OncoArrayFilterAction)filter); |
6552 |
26 Jan 22 |
nicklas |
455 |
} |
6546 |
24 Jan 22 |
nicklas |
456 |
|
6546 |
24 Jan 22 |
nicklas |
457 |
/** |
6553 |
27 Jan 22 |
nicklas |
Get the raw bioassays genotype for the given SNP. |
6553 |
27 Jan 22 |
nicklas |
Note! This is a helper method that that only test for |
6553 |
27 Jan 22 |
nicklas |
0/1, 1/1 and 0/0 genotypes. To check for other genotypes |
6555 |
27 Jan 22 |
nicklas |
use one the {@link #getRawBioAssaysWithSnpGenotype(String, String)} |
6555 |
27 Jan 22 |
nicklas |
or {@link #hasGenotype(int, String, String)} methods. |
6553 |
27 Jan 22 |
nicklas |
463 |
*/ |
6553 |
27 Jan 22 |
nicklas |
464 |
public String getGenotype(int rbaId, String snpId) |
6553 |
27 Jan 22 |
nicklas |
465 |
throws IOException |
6553 |
27 Jan 22 |
nicklas |
466 |
{ |
6553 |
27 Jan 22 |
nicklas |
467 |
String[] test = { "0/1", "1/1", "0/0" }; |
6553 |
27 Jan 22 |
nicklas |
468 |
for (String gt : test) |
6553 |
27 Jan 22 |
nicklas |
469 |
{ |
6553 |
27 Jan 22 |
nicklas |
470 |
if (getRawBioAssaysWithSnpGenotype(snpId, gt).contains(rbaId)) return gt; |
6553 |
27 Jan 22 |
nicklas |
471 |
} |
6553 |
27 Jan 22 |
nicklas |
472 |
return null; |
6553 |
27 Jan 22 |
nicklas |
473 |
} |
6553 |
27 Jan 22 |
nicklas |
474 |
|
6555 |
27 Jan 22 |
nicklas |
475 |
public boolean hasGenotype(int rbaId, String snpId, String gt) |
6555 |
27 Jan 22 |
nicklas |
476 |
throws IOException |
6555 |
27 Jan 22 |
nicklas |
477 |
{ |
6555 |
27 Jan 22 |
nicklas |
478 |
return getRawBioAssaysWithSnpGenotype(snpId, gt).contains(rbaId); |
6555 |
27 Jan 22 |
nicklas |
479 |
} |
6555 |
27 Jan 22 |
nicklas |
480 |
|
6553 |
27 Jan 22 |
nicklas |
481 |
/** |
6553 |
27 Jan 22 |
nicklas |
Get all raw bioassays that has the specified genotype on the given snp. |
6553 |
27 Jan 22 |
nicklas |
483 |
*/ |
6553 |
27 Jan 22 |
nicklas |
484 |
public Set<Integer> getRawBioAssaysWithSnpGenotype(String snpId, String gt) |
6553 |
27 Jan 22 |
nicklas |
485 |
throws IOException |
6553 |
27 Jan 22 |
nicklas |
486 |
{ |
6553 |
27 Jan 22 |
nicklas |
487 |
String cacheKey = snpId+":"+gt; |
6553 |
27 Jan 22 |
nicklas |
488 |
QueryResult result = snpGtCache.get(cacheKey, -1); |
6553 |
27 Jan 22 |
nicklas |
489 |
if (result == null) |
6553 |
27 Jan 22 |
nicklas |
490 |
{ |
6553 |
27 Jan 22 |
nicklas |
491 |
BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
6553 |
27 Jan 22 |
nicklas |
492 |
builder.add(new TermQuery(new Term("snps", snpId)), Occur.MUST); |
6553 |
27 Jan 22 |
nicklas |
493 |
builder.add(new TermQuery(new Term("gt", gt)), Occur.MUST); |
6553 |
27 Jan 22 |
nicklas |
494 |
result = getIndexSearcher().search(builder.build(), new RawBioAssayIdCollectorManager("rbaId", -1)); |
6553 |
27 Jan 22 |
nicklas |
495 |
snpGtCache.store(cacheKey, result); |
6553 |
27 Jan 22 |
nicklas |
496 |
} |
6553 |
27 Jan 22 |
nicklas |
497 |
return result.getIdList(); |
6553 |
27 Jan 22 |
nicklas |
498 |
} |
6553 |
27 Jan 22 |
nicklas |
499 |
|
6553 |
27 Jan 22 |
nicklas |
500 |
/** |
6540 |
17 Jan 22 |
nicklas |
Implements indexing for a VCF file attached to a raw bioassay. |
6540 |
17 Jan 22 |
nicklas |
@author nicklas |
6540 |
17 Jan 22 |
nicklas |
503 |
*/ |
6540 |
17 Jan 22 |
nicklas |
504 |
public class OncoArrayIndexer |
6540 |
17 Jan 22 |
nicklas |
505 |
implements Indexer |
6540 |
17 Jan 22 |
nicklas |
506 |
{ |
6540 |
17 Jan 22 |
nicklas |
507 |
|
6540 |
17 Jan 22 |
nicklas |
508 |
|
6540 |
17 Jan 22 |
nicklas |
509 |
private final LuceneIndex idx; |
6540 |
17 Jan 22 |
nicklas |
510 |
private final IndexWriter writer; |
6540 |
17 Jan 22 |
nicklas |
511 |
private final int num; |
6540 |
17 Jan 22 |
nicklas |
512 |
private final RawBioAssay rba; |
6540 |
17 Jan 22 |
nicklas |
513 |
private final List<VcfFile> vcfFiles; |
6540 |
17 Jan 22 |
nicklas |
514 |
private int numVariants; |
6540 |
17 Jan 22 |
nicklas |
515 |
private int numGenotypes; |
6540 |
17 Jan 22 |
nicklas |
516 |
private boolean aborted; |
6540 |
17 Jan 22 |
nicklas |
517 |
|
6540 |
17 Jan 22 |
nicklas |
518 |
|
6540 |
17 Jan 22 |
nicklas |
519 |
public OncoArrayIndexer(LuceneIndex idx, IndexWriter writer, int num, RawBioAssay rba, List<VcfFile> vcfFiles) |
6540 |
17 Jan 22 |
nicklas |
520 |
{ |
6540 |
17 Jan 22 |
nicklas |
521 |
this.idx = idx; |
6540 |
17 Jan 22 |
nicklas |
522 |
this.num = num; |
6540 |
17 Jan 22 |
nicklas |
523 |
this.writer = writer; |
6540 |
17 Jan 22 |
nicklas |
524 |
this.rba = rba; |
6540 |
17 Jan 22 |
nicklas |
525 |
this.vcfFiles = vcfFiles; |
6540 |
17 Jan 22 |
nicklas |
526 |
} |
6540 |
17 Jan 22 |
nicklas |
527 |
|
6540 |
17 Jan 22 |
nicklas |
528 |
/** |
6540 |
17 Jan 22 |
nicklas |
Get the number of variants that was indexed. |
6540 |
17 Jan 22 |
nicklas |
530 |
*/ |
6540 |
17 Jan 22 |
nicklas |
531 |
@Override |
6540 |
17 Jan 22 |
nicklas |
532 |
public int getNumVariants() |
6540 |
17 Jan 22 |
nicklas |
533 |
{ |
6540 |
17 Jan 22 |
nicklas |
534 |
return numVariants; |
6540 |
17 Jan 22 |
nicklas |
535 |
} |
6540 |
17 Jan 22 |
nicklas |
536 |
|
6540 |
17 Jan 22 |
nicklas |
537 |
/** |
6540 |
17 Jan 22 |
nicklas |
Get the number of genotypes that was indexed. |
6540 |
17 Jan 22 |
nicklas |
@since 1.2 |
6540 |
17 Jan 22 |
nicklas |
540 |
*/ |
6540 |
17 Jan 22 |
nicklas |
541 |
@Override |
6540 |
17 Jan 22 |
nicklas |
542 |
public int getNumGenotypes() |
6540 |
17 Jan 22 |
nicklas |
543 |
{ |
6540 |
17 Jan 22 |
nicklas |
544 |
return numGenotypes; |
6540 |
17 Jan 22 |
nicklas |
545 |
} |
6540 |
17 Jan 22 |
nicklas |
546 |
|
6540 |
17 Jan 22 |
nicklas |
547 |
/** |
6540 |
17 Jan 22 |
nicklas |
Get the raw bioassay that was indexed. |
6540 |
17 Jan 22 |
nicklas |
549 |
*/ |
6540 |
17 Jan 22 |
nicklas |
550 |
@Override |
6540 |
17 Jan 22 |
nicklas |
551 |
public RawBioAssay getRawBioAssay() |
6540 |
17 Jan 22 |
nicklas |
552 |
{ |
6540 |
17 Jan 22 |
nicklas |
553 |
return rba; |
6540 |
17 Jan 22 |
nicklas |
554 |
} |
6540 |
17 Jan 22 |
nicklas |
555 |
|
6545 |
21 Jan 22 |
nicklas |
556 |
@Override |
6545 |
21 Jan 22 |
nicklas |
557 |
public ArrayDesign getArrayDesign() |
6545 |
21 Jan 22 |
nicklas |
558 |
{ |
6545 |
21 Jan 22 |
nicklas |
559 |
return null; |
6545 |
21 Jan 22 |
nicklas |
560 |
} |
6545 |
21 Jan 22 |
nicklas |
561 |
|
6540 |
17 Jan 22 |
nicklas |
562 |
/** |
6540 |
17 Jan 22 |
nicklas |
Return TRUE if the indexing was aborted due to |
6540 |
17 Jan 22 |
nicklas |
closing down. |
6540 |
17 Jan 22 |
nicklas |
565 |
*/ |
6540 |
17 Jan 22 |
nicklas |
566 |
@Override |
6540 |
17 Jan 22 |
nicklas |
567 |
public boolean wasAborted() |
6540 |
17 Jan 22 |
nicklas |
568 |
{ |
6540 |
17 Jan 22 |
nicklas |
569 |
return aborted; |
6540 |
17 Jan 22 |
nicklas |
570 |
} |
6540 |
17 Jan 22 |
nicklas |
571 |
|
6540 |
17 Jan 22 |
nicklas |
572 |
@Override |
6540 |
17 Jan 22 |
nicklas |
573 |
public OncoArrayIndexer call() |
6540 |
17 Jan 22 |
nicklas |
574 |
throws Exception |
6540 |
17 Jan 22 |
nicklas |
575 |
{ |
6540 |
17 Jan 22 |
nicklas |
576 |
if (idx.isClosing()) |
6540 |
17 Jan 22 |
nicklas |
577 |
{ |
6540 |
17 Jan 22 |
nicklas |
578 |
aborted = true; |
6540 |
17 Jan 22 |
nicklas |
579 |
return this; |
6540 |
17 Jan 22 |
nicklas |
580 |
} |
6540 |
17 Jan 22 |
nicklas |
581 |
VcfParser parser = null; |
6540 |
17 Jan 22 |
nicklas |
582 |
numVariants = 0; |
6540 |
17 Jan 22 |
nicklas |
583 |
numGenotypes = 0; |
6540 |
17 Jan 22 |
nicklas |
584 |
boolean indexAllGenotypes = idx.getIndexAllGenotypes(); |
6540 |
17 Jan 22 |
nicklas |
585 |
try |
6540 |
17 Jan 22 |
nicklas |
586 |
{ |
6540 |
17 Jan 22 |
nicklas |
587 |
logger.debug("Indexing #" + num +": " +rba.getName() + ": " + vcfFiles.size() + " VCF files"); |
6540 |
17 Jan 22 |
nicklas |
588 |
long time = -System.currentTimeMillis(); |
6540 |
17 Jan 22 |
nicklas |
589 |
|
6540 |
17 Jan 22 |
nicklas |
// Re-indexing: delete existing information about this raw bioassay id |
6540 |
17 Jan 22 |
nicklas |
591 |
writer.deleteDocuments(IntPoint.newExactQuery("mainId", rba.getId())); |
6540 |
17 Jan 22 |
nicklas |
592 |
writer.deleteDocuments(IntPoint.newExactQuery("rbaId", rba.getId())); |
6540 |
17 Jan 22 |
nicklas |
593 |
|
6540 |
17 Jan 22 |
nicklas |
594 |
for (VcfFile vcfFile : vcfFiles) |
6540 |
17 Jan 22 |
nicklas |
595 |
{ |
6540 |
17 Jan 22 |
nicklas |
596 |
logger.debug("Indexing #" + num +": " +rba.getName() + ": " + vcfFile.getName()); |
6540 |
17 Jan 22 |
nicklas |
597 |
parser = new VcfParser(vcfFile.getFile()); |
6540 |
17 Jan 22 |
nicklas |
598 |
VcfHeader header = parser.parseHeaders(); |
6540 |
17 Jan 22 |
nicklas |
599 |
|
6540 |
17 Jan 22 |
nicklas |
600 |
if (header == null) |
6540 |
17 Jan 22 |
nicklas |
601 |
{ |
6540 |
17 Jan 22 |
nicklas |
602 |
logger.warn("Unable to index (no header found): " +rba.getName() + "/" + vcfFile.getName()); |
6540 |
17 Jan 22 |
nicklas |
603 |
continue; // With the next VCF |
6540 |
17 Jan 22 |
nicklas |
604 |
} |
6540 |
17 Jan 22 |
nicklas |
605 |
|
6540 |
17 Jan 22 |
nicklas |
606 |
int chrCol = header.indexOf("#CHROM"); |
6540 |
17 Jan 22 |
nicklas |
607 |
int idCol = header.indexOf("ID"); |
6540 |
17 Jan 22 |
nicklas |
608 |
int formatCol = header.indexOf("FORMAT"); |
6540 |
17 Jan 22 |
nicklas |
609 |
|
6540 |
17 Jan 22 |
nicklas |
610 |
if (isMissingColumn(chrCol, idCol, formatCol)) |
6540 |
17 Jan 22 |
nicklas |
611 |
{ |
6540 |
17 Jan 22 |
nicklas |
612 |
logger.warn("Unable to index (missing header column): " +rba.getName() + "/" + vcfFile.getName()); |
7074 |
24 Mar 23 |
nicklas |
613 |
logger.debug("CHROM: "+chrCol+"; ID: "+idCol+"; FORMAT: "+formatCol); |
6540 |
17 Jan 22 |
nicklas |
614 |
continue; // With the next VCF |
6540 |
17 Jan 22 |
nicklas |
615 |
} |
6540 |
17 Jan 22 |
nicklas |
616 |
|
6540 |
17 Jan 22 |
nicklas |
617 |
Map<String, List<String>> genotypes = new HashMap<>(); |
6540 |
17 Jan 22 |
nicklas |
618 |
genotypes.put("0/0", new ArrayList<>()); |
6540 |
17 Jan 22 |
nicklas |
619 |
genotypes.put("0/1", new ArrayList<>()); |
6540 |
17 Jan 22 |
nicklas |
620 |
genotypes.put("1/1", new ArrayList<>()); |
6540 |
17 Jan 22 |
nicklas |
621 |
|
6540 |
17 Jan 22 |
nicklas |
622 |
int lineNo = 0; |
6540 |
17 Jan 22 |
nicklas |
623 |
do |
6540 |
17 Jan 22 |
nicklas |
624 |
{ |
6540 |
17 Jan 22 |
nicklas |
625 |
VcfLine line = parser.nextLine(); |
6540 |
17 Jan 22 |
nicklas |
626 |
if (line == null) break; |
6540 |
17 Jan 22 |
nicklas |
627 |
|
6540 |
17 Jan 22 |
nicklas |
628 |
lineNo++; |
6540 |
17 Jan 22 |
nicklas |
629 |
|
6540 |
17 Jan 22 |
nicklas |
630 |
if (lineNo % 10==0 && idx.isClosing()) |
6540 |
17 Jan 22 |
nicklas |
631 |
{ |
6540 |
17 Jan 22 |
nicklas |
632 |
aborted = true; |
6540 |
17 Jan 22 |
nicklas |
633 |
return this; |
6540 |
17 Jan 22 |
nicklas |
634 |
} |
6540 |
17 Jan 22 |
nicklas |
635 |
|
6540 |
17 Jan 22 |
nicklas |
636 |
String id = line.col(idCol); |
6540 |
17 Jan 22 |
nicklas |
637 |
String gt = line.format("GT"); |
6540 |
17 Jan 22 |
nicklas |
638 |
List<String> list = genotypes.get(gt); |
6540 |
17 Jan 22 |
nicklas |
639 |
if (list != null) |
6540 |
17 Jan 22 |
nicklas |
640 |
{ |
6540 |
17 Jan 22 |
nicklas |
641 |
numGenotypes++; |
6540 |
17 Jan 22 |
nicklas |
642 |
if (!"0/0".equals(gt)) numVariants++; |
6540 |
17 Jan 22 |
nicklas |
643 |
list.add(id); |
6540 |
17 Jan 22 |
nicklas |
644 |
} |
6540 |
17 Jan 22 |
nicklas |
645 |
|
6540 |
17 Jan 22 |
nicklas |
646 |
} while (true); |
6540 |
17 Jan 22 |
nicklas |
647 |
|
6540 |
17 Jan 22 |
nicklas |
648 |
FileUtil.close(parser); |
6540 |
17 Jan 22 |
nicklas |
649 |
|
6540 |
17 Jan 22 |
nicklas |
// Add SNP-IDs for each of the genotypes |
6540 |
17 Jan 22 |
nicklas |
651 |
for (Map.Entry<String, List<String>> e : genotypes.entrySet()) |
6540 |
17 Jan 22 |
nicklas |
652 |
{ |
6540 |
17 Jan 22 |
nicklas |
653 |
DocumentCreator doc = new DocumentCreator(); |
6540 |
17 Jan 22 |
nicklas |
654 |
doc.addRawBioAssayFields(rba, "rba"); |
6540 |
17 Jan 22 |
nicklas |
655 |
doc.addFileFields(vcfFile.getFile(), null); |
6540 |
17 Jan 22 |
nicklas |
656 |
doc.addStringField("gt", e.getKey(), Store.YES); |
6540 |
17 Jan 22 |
nicklas |
657 |
doc.addField(new ListField("snps", e.getValue(), Store.NO)); |
6540 |
17 Jan 22 |
nicklas |
658 |
writer.addDocument(doc.doc()); |
6540 |
17 Jan 22 |
nicklas |
659 |
} |
6540 |
17 Jan 22 |
nicklas |
660 |
} |
6540 |
17 Jan 22 |
nicklas |
661 |
|
6540 |
17 Jan 22 |
nicklas |
// Add summary fields for the complete rba |
6540 |
17 Jan 22 |
nicklas |
663 |
DocumentCreator main = new DocumentCreator(); |
6540 |
17 Jan 22 |
nicklas |
664 |
main.addRawBioAssayFields(rba, "main"); |
6540 |
17 Jan 22 |
nicklas |
665 |
main.addIntField("numVariants", numVariants, Store.YES); |
6540 |
17 Jan 22 |
nicklas |
666 |
main.addIntField("numGenotypes", numGenotypes, Store.YES); |
6540 |
17 Jan 22 |
nicklas |
667 |
writer.addDocument(main.doc()); |
6540 |
17 Jan 22 |
nicklas |
668 |
|
6540 |
17 Jan 22 |
nicklas |
669 |
time += System.currentTimeMillis(); |
6540 |
17 Jan 22 |
nicklas |
670 |
logger.debug("Index complete #"+num+": " +rba.getName() + ": " + numVariants + " variants; " + numGenotypes + " genotypes; " + time + " ms"); |
6540 |
17 Jan 22 |
nicklas |
671 |
if (LuceneIndex.SLOW_MODE) Thread.sleep(10); |
6540 |
17 Jan 22 |
nicklas |
672 |
} |
6540 |
17 Jan 22 |
nicklas |
673 |
catch (Exception ex) |
6540 |
17 Jan 22 |
nicklas |
674 |
{ |
6540 |
17 Jan 22 |
nicklas |
675 |
logger.warn("Indexing failed #"+num+": " + rba.getName(), ex); |
6540 |
17 Jan 22 |
nicklas |
676 |
throw ex; |
6540 |
17 Jan 22 |
nicklas |
677 |
} |
6540 |
17 Jan 22 |
nicklas |
678 |
finally |
6540 |
17 Jan 22 |
nicklas |
679 |
{ |
6540 |
17 Jan 22 |
nicklas |
680 |
FileUtil.close(parser); |
6540 |
17 Jan 22 |
nicklas |
681 |
} |
6540 |
17 Jan 22 |
nicklas |
682 |
return this; |
6540 |
17 Jan 22 |
nicklas |
683 |
} |
6540 |
17 Jan 22 |
nicklas |
684 |
|
6540 |
17 Jan 22 |
nicklas |
685 |
/** |
6540 |
17 Jan 22 |
nicklas |
Check if some column was missing (index < 0). |
6540 |
17 Jan 22 |
nicklas |
687 |
*/ |
6540 |
17 Jan 22 |
nicklas |
688 |
private boolean isMissingColumn(int... colIndexes) |
6540 |
17 Jan 22 |
nicklas |
689 |
{ |
6540 |
17 Jan 22 |
nicklas |
690 |
for (int i : colIndexes) |
6540 |
17 Jan 22 |
nicklas |
691 |
{ |
6540 |
17 Jan 22 |
nicklas |
692 |
if (i < 0) return true; |
6540 |
17 Jan 22 |
nicklas |
693 |
} |
6540 |
17 Jan 22 |
nicklas |
694 |
return false; |
6540 |
17 Jan 22 |
nicklas |
695 |
} |
6540 |
17 Jan 22 |
nicklas |
696 |
} |
6540 |
17 Jan 22 |
nicklas |
697 |
|
6546 |
24 Jan 22 |
nicklas |
698 |
/** |
6546 |
24 Jan 22 |
nicklas |
Filter action implementation for OncoArray index. The implementation need |
6546 |
24 Jan 22 |
nicklas |
to run a query against the reference index first. The results from that |
6546 |
24 Jan 22 |
nicklas |
query are used to create the "main" query running against the variant |
6546 |
24 Jan 22 |
nicklas |
index. |
6546 |
24 Jan 22 |
nicklas |
703 |
*/ |
6546 |
24 Jan 22 |
nicklas |
704 |
public static class OncoArrayFilterAction |
6546 |
24 Jan 22 |
nicklas |
705 |
extends LuceneFilterAction<OncoArrayIndex> |
6546 |
24 Jan 22 |
nicklas |
706 |
{ |
6546 |
24 Jan 22 |
nicklas |
707 |
|
6555 |
27 Jan 22 |
nicklas |
708 |
private final List<SnpCollector> snpCollectors; |
6555 |
27 Jan 22 |
nicklas |
709 |
private final Set<String> allSnps; |
6546 |
24 Jan 22 |
nicklas |
710 |
protected final QueryParser refParser; |
6555 |
27 Jan 22 |
nicklas |
711 |
private String requestedGt; |
6546 |
24 Jan 22 |
nicklas |
712 |
|
6546 |
24 Jan 22 |
nicklas |
713 |
public OncoArrayFilterAction(ItemContext context, OncoArrayIndex idx) |
6546 |
24 Jan 22 |
nicklas |
714 |
{ |
6546 |
24 Jan 22 |
nicklas |
715 |
super(context, idx); |
6555 |
27 Jan 22 |
nicklas |
716 |
this.refParser = idx.createRefQueryParser(this); |
6555 |
27 Jan 22 |
nicklas |
717 |
this.snpCollectors = new ArrayList<>(); |
6555 |
27 Jan 22 |
nicklas |
718 |
this.allSnps = new HashSet<>(); |
6555 |
27 Jan 22 |
nicklas |
719 |
|
6546 |
24 Jan 22 |
nicklas |
720 |
} |
6555 |
27 Jan 22 |
nicklas |
721 |
|
6546 |
24 Jan 22 |
nicklas |
722 |
/** |
6555 |
27 Jan 22 |
nicklas |
Set the requested genotype to return from the filter. |
6546 |
24 Jan 22 |
nicklas |
724 |
*/ |
6555 |
27 Jan 22 |
nicklas |
725 |
public void setRequestedGt(String gt) |
6546 |
24 Jan 22 |
nicklas |
726 |
{ |
6555 |
27 Jan 22 |
nicklas |
727 |
this.requestedGt = gt; |
6546 |
24 Jan 22 |
nicklas |
728 |
} |
6546 |
24 Jan 22 |
nicklas |
729 |
|
6546 |
24 Jan 22 |
nicklas |
730 |
@Override |
6546 |
24 Jan 22 |
nicklas |
731 |
protected Query createVariantQuery(String queryString) |
6546 |
24 Jan 22 |
nicklas |
732 |
throws ParseException, IOException |
6546 |
24 Jan 22 |
nicklas |
733 |
{ |
6546 |
24 Jan 22 |
nicklas |
734 |
queryString = AutoPrefixer.INSTANCE.autoPrefix(queryString); |
6559 |
01 Feb 22 |
nicklas |
735 |
if (queryString.indexOf("gt:") != queryString.lastIndexOf("gt:")) |
6559 |
01 Feb 22 |
nicklas |
736 |
{ |
6559 |
01 Feb 22 |
nicklas |
737 |
throw new ParseException("More than one term with 'gt:' is not supported: " + queryString.replace("\\", "")); |
6559 |
01 Feb 22 |
nicklas |
738 |
} |
6559 |
01 Feb 22 |
nicklas |
739 |
|
6555 |
27 Jan 22 |
nicklas |
// Reset this and let the parser handle 'gt' if it is present in the query |
6555 |
27 Jan 22 |
nicklas |
741 |
this.requestedGt = null; |
6546 |
24 Jan 22 |
nicklas |
742 |
Query query = refParser.parse(queryString); |
6546 |
24 Jan 22 |
nicklas |
743 |
|
6559 |
01 Feb 22 |
nicklas |
// Modify the query to not return gt:0/0 unless a gt: has been explicitely requested |
6559 |
01 Feb 22 |
nicklas |
745 |
Occur requestedGtOccur = null; |
6559 |
01 Feb 22 |
nicklas |
746 |
if (requestedGt != null) |
6559 |
01 Feb 22 |
nicklas |
747 |
{ |
6559 |
01 Feb 22 |
nicklas |
748 |
requestedGtOccur = queryString.contains("-gt:") ? Occur.MUST_NOT : Occur.MUST; |
6559 |
01 Feb 22 |
nicklas |
749 |
} |
6559 |
01 Feb 22 |
nicklas |
750 |
else |
6559 |
01 Feb 22 |
nicklas |
751 |
{ |
6559 |
01 Feb 22 |
nicklas |
752 |
requestedGtOccur = Occur.MUST_NOT; |
6559 |
01 Feb 22 |
nicklas |
753 |
this.requestedGt = "0/0"; |
6559 |
01 Feb 22 |
nicklas |
754 |
} |
6559 |
01 Feb 22 |
nicklas |
755 |
|
6546 |
24 Jan 22 |
nicklas |
756 |
logger.debug("RefQueryString: " + queryString); |
6546 |
24 Jan 22 |
nicklas |
757 |
logger.debug("RefQuery: " + query); |
6555 |
27 Jan 22 |
nicklas |
758 |
logger.debug("RequestedGt: " + requestedGt); |
6546 |
24 Jan 22 |
nicklas |
759 |
if (logger.isDebugEnabled()) |
6546 |
24 Jan 22 |
nicklas |
760 |
{ |
6546 |
24 Jan 22 |
nicklas |
761 |
logger.debug("RefCount: "+idx.getRefSearcher().count(query)); |
6546 |
24 Jan 22 |
nicklas |
762 |
} |
6546 |
24 Jan 22 |
nicklas |
763 |
|
6559 |
01 Feb 22 |
nicklas |
764 |
SnpCollector currentCollector = new SnpCollector(MAX_SNP_HITS, requestedGt, requestedGtOccur); |
6548 |
24 Jan 22 |
nicklas |
765 |
idx.getRefSearcher().search(query, currentCollector); |
6548 |
24 Jan 22 |
nicklas |
766 |
int numSnps = currentCollector.getNumSnps(); |
6548 |
24 Jan 22 |
nicklas |
767 |
if (numSnps > MAX_SNP_HITS) |
6546 |
24 Jan 22 |
nicklas |
768 |
{ |
6559 |
01 Feb 22 |
nicklas |
769 |
throw new ParseException("Too many SNPs matching query '"+queryString.replace("\\", "")+"': " + numSnps); |
6546 |
24 Jan 22 |
nicklas |
770 |
} |
6548 |
24 Jan 22 |
nicklas |
771 |
else if (numSnps == 0) |
6546 |
24 Jan 22 |
nicklas |
772 |
{ |
6548 |
24 Jan 22 |
nicklas |
773 |
query = new MatchNoDocsQuery("No matching SNPs"); |
6546 |
24 Jan 22 |
nicklas |
774 |
} |
6546 |
24 Jan 22 |
nicklas |
775 |
else |
6546 |
24 Jan 22 |
nicklas |
776 |
{ |
6548 |
24 Jan 22 |
nicklas |
777 |
query = parser.parse(escape(Values.getString(currentCollector.getSnpList(), " ", true))); |
6546 |
24 Jan 22 |
nicklas |
778 |
BooleanQuery.Builder b = new BooleanQuery.Builder(); |
6546 |
24 Jan 22 |
nicklas |
779 |
b.add(query, Occur.MUST); |
6559 |
01 Feb 22 |
nicklas |
780 |
b.add(new TermQuery(new Term("gt", requestedGt)), requestedGtOccur); |
6546 |
24 Jan 22 |
nicklas |
781 |
query = b.build(); |
6546 |
24 Jan 22 |
nicklas |
782 |
} |
6555 |
27 Jan 22 |
nicklas |
783 |
snpCollectors.add(currentCollector); |
6555 |
27 Jan 22 |
nicklas |
784 |
allSnps.addAll(currentCollector.getSnpList()); |
6546 |
24 Jan 22 |
nicklas |
785 |
return query; |
6546 |
24 Jan 22 |
nicklas |
786 |
} |
6546 |
24 Jan 22 |
nicklas |
787 |
|
6546 |
24 Jan 22 |
nicklas |
788 |
/** |
6546 |
24 Jan 22 |
nicklas |
Some SNP ID have ':' in their name which need to be escaped. |
6546 |
24 Jan 22 |
nicklas |
790 |
*/ |
6546 |
24 Jan 22 |
nicklas |
791 |
private String escape(String s) |
6546 |
24 Jan 22 |
nicklas |
792 |
{ |
6546 |
24 Jan 22 |
nicklas |
793 |
return s.replace(":", "\\;"); |
6546 |
24 Jan 22 |
nicklas |
794 |
} |
6550 |
25 Jan 22 |
nicklas |
795 |
|
6550 |
25 Jan 22 |
nicklas |
796 |
@Override |
6550 |
25 Jan 22 |
nicklas |
797 |
public AllDocsCollector getRawBioAssayHits(int rbaId, AllDocsCollector hits) |
6550 |
25 Jan 22 |
nicklas |
798 |
{ |
6550 |
25 Jan 22 |
nicklas |
799 |
try |
6550 |
25 Jan 22 |
nicklas |
800 |
{ |
6555 |
27 Jan 22 |
nicklas |
801 |
Set<String> accepted = new HashSet<>(); |
6555 |
27 Jan 22 |
nicklas |
802 |
for (SnpCollector c : snpCollectors) |
6550 |
25 Jan 22 |
nicklas |
803 |
{ |
6555 |
27 Jan 22 |
nicklas |
804 |
String requestedGt = c.getRequstedGt(); |
6559 |
01 Feb 22 |
nicklas |
805 |
Occur requestedGtOccur = c.getRequstedGtOccur(); |
6555 |
27 Jan 22 |
nicklas |
806 |
for (SnpDocument doc : c.getDocuments()) |
6550 |
25 Jan 22 |
nicklas |
807 |
{ |
6555 |
27 Jan 22 |
nicklas |
// Avoid duplicate matches from multiple filter rows |
6555 |
27 Jan 22 |
nicklas |
809 |
if (accepted.contains(doc.snpId)) continue; |
6555 |
27 Jan 22 |
nicklas |
810 |
|
6555 |
27 Jan 22 |
nicklas |
811 |
String acceptedGt = null; |
6559 |
01 Feb 22 |
nicklas |
812 |
if (requestedGtOccur == Occur.MUST) |
6555 |
27 Jan 22 |
nicklas |
813 |
{ |
6555 |
27 Jan 22 |
nicklas |
814 |
if (idx.hasGenotype(rbaId, doc.snpId, requestedGt)) |
6555 |
27 Jan 22 |
nicklas |
815 |
{ |
6555 |
27 Jan 22 |
nicklas |
816 |
acceptedGt = requestedGt; |
6555 |
27 Jan 22 |
nicklas |
817 |
} |
6555 |
27 Jan 22 |
nicklas |
818 |
} |
6559 |
01 Feb 22 |
nicklas |
819 |
else // MUST_NOT |
6555 |
27 Jan 22 |
nicklas |
820 |
{ |
6555 |
27 Jan 22 |
nicklas |
821 |
String gt = idx.getGenotype(rbaId, doc.snpId); |
6559 |
01 Feb 22 |
nicklas |
822 |
if (gt != null && !gt.equals(requestedGt)) |
6555 |
27 Jan 22 |
nicklas |
823 |
{ |
6555 |
27 Jan 22 |
nicklas |
824 |
acceptedGt = gt; |
6555 |
27 Jan 22 |
nicklas |
825 |
} |
6555 |
27 Jan 22 |
nicklas |
826 |
} |
6555 |
27 Jan 22 |
nicklas |
827 |
if (acceptedGt != null) |
6555 |
27 Jan 22 |
nicklas |
828 |
{ |
6555 |
27 Jan 22 |
nicklas |
829 |
doc.gt.setStringValue(acceptedGt); |
6555 |
27 Jan 22 |
nicklas |
830 |
hits.addDocument(doc.document); |
6555 |
27 Jan 22 |
nicklas |
831 |
accepted.add(doc.snpId); |
6555 |
27 Jan 22 |
nicklas |
832 |
} |
6550 |
25 Jan 22 |
nicklas |
833 |
} |
6550 |
25 Jan 22 |
nicklas |
834 |
} |
6550 |
25 Jan 22 |
nicklas |
835 |
} |
6550 |
25 Jan 22 |
nicklas |
836 |
catch (Exception ex) |
6550 |
25 Jan 22 |
nicklas |
837 |
{ |
6848 |
17 Oct 22 |
nicklas |
838 |
VarSearch.throwRuntimException(ex); |
6550 |
25 Jan 22 |
nicklas |
839 |
} |
6550 |
25 Jan 22 |
nicklas |
840 |
return hits; |
6550 |
25 Jan 22 |
nicklas |
841 |
} |
6546 |
24 Jan 22 |
nicklas |
842 |
} |
6552 |
26 Jan 22 |
nicklas |
843 |
|
6552 |
26 Jan 22 |
nicklas |
844 |
public static class OncoArrayColumnAction |
6552 |
26 Jan 22 |
nicklas |
845 |
extends LuceneColumnAction<OncoArrayIndex, OncoArrayFilterAction> |
6552 |
26 Jan 22 |
nicklas |
846 |
{ |
6552 |
26 Jan 22 |
nicklas |
847 |
|
6552 |
26 Jan 22 |
nicklas |
848 |
public OncoArrayColumnAction(OncoArrayIndex idx, OncoArrayFilterAction filter) |
6552 |
26 Jan 22 |
nicklas |
849 |
{ |
6552 |
26 Jan 22 |
nicklas |
850 |
super(idx, filter); |
6552 |
26 Jan 22 |
nicklas |
851 |
} |
6552 |
26 Jan 22 |
nicklas |
852 |
|
6552 |
26 Jan 22 |
nicklas |
853 |
@Override |
6552 |
26 Jan 22 |
nicklas |
854 |
public String getResultSummary() |
6552 |
26 Jan 22 |
nicklas |
855 |
{ |
6552 |
26 Jan 22 |
nicklas |
856 |
StringBuilder sb = new StringBuilder(); |
6552 |
26 Jan 22 |
nicklas |
857 |
QueryResult result = filter.getQueryResult(); |
6552 |
26 Jan 22 |
nicklas |
858 |
|
6552 |
26 Jan 22 |
nicklas |
859 |
if (result != null) |
6552 |
26 Jan 22 |
nicklas |
860 |
{ |
6552 |
26 Jan 22 |
nicklas |
861 |
if (result.hasTimedOut()) |
6552 |
26 Jan 22 |
nicklas |
862 |
{ |
6552 |
26 Jan 22 |
nicklas |
863 |
int loadedHits = result.getTotalDocuments()-result.getDocumentsAfterTimeout(); |
6552 |
26 Jan 22 |
nicklas |
864 |
sb.append("The query did not finish within " + result.getTimeoutInSeconds() + " seconds. "); |
6552 |
26 Jan 22 |
nicklas |
865 |
} |
6552 |
26 Jan 22 |
nicklas |
866 |
if (result.getHits() >= 0) |
6552 |
26 Jan 22 |
nicklas |
867 |
{ |
6552 |
26 Jan 22 |
nicklas |
868 |
sb.append("Found <span title=\""+VarSearch.niceCount(result.getTotalDocuments(),"","")+"\">"); |
6555 |
27 Jan 22 |
nicklas |
869 |
sb.append(VarSearch.formatCount(filter.allSnps.size()," variant", " variants")+"</span>"); |
6552 |
26 Jan 22 |
nicklas |
870 |
sb.append(" and "+VarSearch.niceCount(result.getHits(), " raw bioassay", " raw bioassays")+"."); |
6552 |
26 Jan 22 |
nicklas |
871 |
} |
6552 |
26 Jan 22 |
nicklas |
872 |
} |
6552 |
26 Jan 22 |
nicklas |
873 |
return sb.length() == 0 ? null : sb.toString(); |
6552 |
26 Jan 22 |
nicklas |
874 |
} |
6552 |
26 Jan 22 |
nicklas |
875 |
|
6552 |
26 Jan 22 |
nicklas |
876 |
} |
6552 |
26 Jan 22 |
nicklas |
877 |
|
6540 |
17 Jan 22 |
nicklas |
878 |
} |