6540 |
17 Jan 22 |
nicklas |
1 |
package net.sf.basedb.varsearch.index; |
6540 |
17 Jan 22 |
nicklas |
2 |
|
6540 |
17 Jan 22 |
nicklas |
3 |
import java.io.IOException; |
6540 |
17 Jan 22 |
nicklas |
4 |
import java.util.Arrays; |
6551 |
26 Jan 22 |
nicklas |
5 |
import java.util.Collections; |
6540 |
17 Jan 22 |
nicklas |
6 |
import java.util.HashMap; |
6540 |
17 Jan 22 |
nicklas |
7 |
import java.util.List; |
6540 |
17 Jan 22 |
nicklas |
8 |
import java.util.Map; |
6540 |
17 Jan 22 |
nicklas |
9 |
import java.util.Set; |
6540 |
17 Jan 22 |
nicklas |
10 |
|
6540 |
17 Jan 22 |
nicklas |
11 |
import org.apache.lucene.analysis.Analyzer; |
6540 |
17 Jan 22 |
nicklas |
12 |
import org.apache.lucene.analysis.core.KeywordAnalyzer; |
6540 |
17 Jan 22 |
nicklas |
13 |
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; |
6540 |
17 Jan 22 |
nicklas |
14 |
import org.apache.lucene.document.IntPoint; |
6551 |
26 Jan 22 |
nicklas |
15 |
import org.apache.lucene.document.LongPoint; |
6540 |
17 Jan 22 |
nicklas |
16 |
import org.apache.lucene.document.Field.Store; |
6540 |
17 Jan 22 |
nicklas |
17 |
import org.apache.lucene.index.IndexWriter; |
6546 |
24 Jan 22 |
nicklas |
18 |
import org.apache.lucene.index.Term; |
6546 |
24 Jan 22 |
nicklas |
19 |
import org.apache.lucene.queryparser.classic.ParseException; |
6540 |
17 Jan 22 |
nicklas |
20 |
import org.apache.lucene.queryparser.classic.QueryParser; |
6546 |
24 Jan 22 |
nicklas |
21 |
import org.apache.lucene.search.BooleanQuery; |
6546 |
24 Jan 22 |
nicklas |
22 |
import org.apache.lucene.search.Query; |
6546 |
24 Jan 22 |
nicklas |
23 |
import org.apache.lucene.search.TermQuery; |
6546 |
24 Jan 22 |
nicklas |
24 |
import org.apache.lucene.search.BooleanClause.Occur; |
6540 |
17 Jan 22 |
nicklas |
25 |
import org.slf4j.LoggerFactory; |
6540 |
17 Jan 22 |
nicklas |
26 |
|
6556 |
28 Jan 22 |
nicklas |
27 |
import net.sf.basedb.core.AbsoluteProgressReporter; |
6540 |
17 Jan 22 |
nicklas |
28 |
import net.sf.basedb.core.AnyToAny; |
6545 |
21 Jan 22 |
nicklas |
29 |
import net.sf.basedb.core.ArrayDesign; |
6545 |
21 Jan 22 |
nicklas |
30 |
import net.sf.basedb.core.File; |
6546 |
24 Jan 22 |
nicklas |
31 |
import net.sf.basedb.core.ItemContext; |
6556 |
28 Jan 22 |
nicklas |
32 |
import net.sf.basedb.core.ProgressReporter; |
6540 |
17 Jan 22 |
nicklas |
33 |
import net.sf.basedb.core.RawBioAssay; |
6556 |
28 Jan 22 |
nicklas |
34 |
import net.sf.basedb.core.SimpleAbsoluteProgressReporter; |
6546 |
24 Jan 22 |
nicklas |
35 |
import net.sf.basedb.core.query.QueryFilterAction; |
6540 |
17 Jan 22 |
nicklas |
36 |
import net.sf.basedb.util.FileUtil; |
6545 |
21 Jan 22 |
nicklas |
37 |
import net.sf.basedb.util.Values; |
6546 |
24 Jan 22 |
nicklas |
38 |
import net.sf.basedb.util.extensions.InvokationContext; |
7074 |
24 Mar 23 |
nicklas |
39 |
import net.sf.basedb.util.extensions.logging.ExtensionsLog; |
7074 |
24 Mar 23 |
nicklas |
40 |
import net.sf.basedb.util.extensions.logging.ExtensionsLogger; |
6552 |
26 Jan 22 |
nicklas |
41 |
import net.sf.basedb.varsearch.LuceneColumnFactory.LuceneColumnAction; |
6540 |
17 Jan 22 |
nicklas |
42 |
import net.sf.basedb.varsearch.VarSearch; |
6540 |
17 Jan 22 |
nicklas |
43 |
import net.sf.basedb.varsearch.analyze.AlphaNumericIgnoreCaseAnalyzer; |
6540 |
17 Jan 22 |
nicklas |
44 |
import net.sf.basedb.varsearch.analyze.AminoAcidTranslator; |
6540 |
17 Jan 22 |
nicklas |
45 |
import net.sf.basedb.varsearch.analyze.EffectAnalyzer; |
6540 |
17 Jan 22 |
nicklas |
46 |
import net.sf.basedb.varsearch.analyze.HgvsCdnaAnalyzer; |
6540 |
17 Jan 22 |
nicklas |
47 |
import net.sf.basedb.varsearch.analyze.HgvsProtAnalyzer; |
6545 |
21 Jan 22 |
nicklas |
48 |
import net.sf.basedb.varsearch.dao.Datafiletype; |
6540 |
17 Jan 22 |
nicklas |
49 |
import net.sf.basedb.varsearch.fields.ListField; |
6550 |
25 Jan 22 |
nicklas |
50 |
import net.sf.basedb.varsearch.query.AllDocsCollector; |
6546 |
24 Jan 22 |
nicklas |
51 |
import net.sf.basedb.varsearch.query.AutoPrefixer; |
6540 |
17 Jan 22 |
nicklas |
52 |
import net.sf.basedb.varsearch.query.FieldAwareQueryParser; |
6540 |
17 Jan 22 |
nicklas |
53 |
import net.sf.basedb.varsearch.query.FloatQueryField; |
6540 |
17 Jan 22 |
nicklas |
54 |
import net.sf.basedb.varsearch.query.IntQueryField; |
6540 |
17 Jan 22 |
nicklas |
55 |
import net.sf.basedb.varsearch.query.LongQueryField; |
6540 |
17 Jan 22 |
nicklas |
56 |
import net.sf.basedb.varsearch.query.QueryField; |
6551 |
26 Jan 22 |
nicklas |
57 |
import net.sf.basedb.varsearch.query.RawBioAssayIdCollector; |
6540 |
17 Jan 22 |
nicklas |
58 |
import net.sf.basedb.varsearch.query.StripWildcardQueryField; |
6546 |
24 Jan 22 |
nicklas |
59 |
import net.sf.basedb.varsearch.query.LuceneQueryFactory.LuceneFilterAction; |
7074 |
24 Mar 23 |
nicklas |
60 |
import net.sf.basedb.varsearch.service.VarSearchService; |
6540 |
17 Jan 22 |
nicklas |
61 |
import net.sf.basedb.varsearch.util.NullSafeLinkedSet; |
6540 |
17 Jan 22 |
nicklas |
62 |
import net.sf.basedb.varsearch.util.NullSafeTreeSet; |
6540 |
17 Jan 22 |
nicklas |
63 |
import net.sf.basedb.varsearch.vcf.VcfParser; |
6540 |
17 Jan 22 |
nicklas |
64 |
import net.sf.basedb.varsearch.vcf.VcfParser.Info; |
6540 |
17 Jan 22 |
nicklas |
65 |
import net.sf.basedb.varsearch.vcf.VcfParser.VcfHeader; |
6540 |
17 Jan 22 |
nicklas |
66 |
import net.sf.basedb.varsearch.vcf.VcfParser.VcfLine; |
6540 |
17 Jan 22 |
nicklas |
67 |
|
6540 |
17 Jan 22 |
nicklas |
68 |
/** |
6540 |
17 Jan 22 |
nicklas |
Index implementation for the VCF files that are produced by |
6540 |
17 Jan 22 |
nicklas |
the variant calling and targeted genotyping pipelines. Each |
6540 |
17 Jan 22 |
nicklas |
variant is indexed as a separate document, which provides full |
6540 |
17 Jan 22 |
nicklas |
support for searching on multiple fields at the same time. |
6540 |
17 Jan 22 |
nicklas |
73 |
|
6540 |
17 Jan 22 |
nicklas |
@author nicklas |
6540 |
17 Jan 22 |
nicklas |
@since 1.5 |
6540 |
17 Jan 22 |
nicklas |
76 |
*/ |
6540 |
17 Jan 22 |
nicklas |
77 |
public class VariantCallIndex |
6540 |
17 Jan 22 |
nicklas |
78 |
extends LuceneIndex |
6540 |
17 Jan 22 |
nicklas |
79 |
{ |
7074 |
24 Mar 23 |
nicklas |
80 |
private static final ExtensionsLogger logger = |
7074 |
24 Mar 23 |
nicklas |
81 |
ExtensionsLog.getLogger(VarSearchService.ID, true).wrap(LoggerFactory.getLogger(VariantCallIndex.class)); |
7074 |
24 Mar 23 |
nicklas |
82 |
|
6540 |
17 Jan 22 |
nicklas |
83 |
private Map<String, QueryField> queryFields; |
6540 |
17 Jan 22 |
nicklas |
84 |
|
6540 |
17 Jan 22 |
nicklas |
85 |
public VariantCallIndex(String id) |
6540 |
17 Jan 22 |
nicklas |
86 |
{ |
6540 |
17 Jan 22 |
nicklas |
87 |
super(id); |
6546 |
24 Jan 22 |
nicklas |
88 |
queryFields = VariantCallIndexer.createQueryFields(false); |
6540 |
17 Jan 22 |
nicklas |
89 |
} |
6540 |
17 Jan 22 |
nicklas |
90 |
|
6540 |
17 Jan 22 |
nicklas |
91 |
/** |
6540 |
17 Jan 22 |
nicklas |
Create the Analyzer. Most fields are indexed literally with the |
6540 |
17 Jan 22 |
nicklas |
default KeywordAnalyzer, but some fields use special analyzers. |
6540 |
17 Jan 22 |
nicklas |
94 |
*/ |
6540 |
17 Jan 22 |
nicklas |
95 |
@Override |
6540 |
17 Jan 22 |
nicklas |
96 |
protected Analyzer createAnalyzer() |
6540 |
17 Jan 22 |
nicklas |
97 |
{ |
6546 |
24 Jan 22 |
nicklas |
98 |
return VariantCallIndexer.createAnalyzer(false); |
6540 |
17 Jan 22 |
nicklas |
99 |
} |
6540 |
17 Jan 22 |
nicklas |
100 |
|
6540 |
17 Jan 22 |
nicklas |
101 |
@Override |
6540 |
17 Jan 22 |
nicklas |
102 |
protected Indexer createIndexer(IndexWriter writer, int num, RawBioAssay rba, List<VcfFile> vcfFiles) |
6540 |
17 Jan 22 |
nicklas |
103 |
{ |
6540 |
17 Jan 22 |
nicklas |
104 |
return new VariantCallIndexer(this, writer, num, rba, vcfFiles); |
6540 |
17 Jan 22 |
nicklas |
105 |
} |
6540 |
17 Jan 22 |
nicklas |
106 |
|
6540 |
17 Jan 22 |
nicklas |
107 |
/** |
6540 |
17 Jan 22 |
nicklas |
Create a new parser for creating queries from strings. |
6540 |
17 Jan 22 |
nicklas |
109 |
*/ |
6540 |
17 Jan 22 |
nicklas |
110 |
@Override |
6540 |
17 Jan 22 |
nicklas |
111 |
public QueryParser createQueryParser() |
6540 |
17 Jan 22 |
nicklas |
112 |
{ |
6540 |
17 Jan 22 |
nicklas |
113 |
return new FieldAwareQueryParser("gene", getAnalyzer(), queryFields); |
6540 |
17 Jan 22 |
nicklas |
114 |
} |
6540 |
17 Jan 22 |
nicklas |
115 |
|
6540 |
17 Jan 22 |
nicklas |
116 |
/** |
6540 |
17 Jan 22 |
nicklas |
There is one Document for each variant and one extra for each Raw bioassay |
6540 |
17 Jan 22 |
nicklas |
which we need to account for. |
6540 |
17 Jan 22 |
nicklas |
119 |
*/ |
6540 |
17 Jan 22 |
nicklas |
120 |
@Override |
6540 |
17 Jan 22 |
nicklas |
121 |
public long getNumVariants() |
6540 |
17 Jan 22 |
nicklas |
122 |
throws IOException |
6540 |
17 Jan 22 |
nicklas |
123 |
{ |
6540 |
17 Jan 22 |
nicklas |
124 |
if (getWorkingStatus() == Status.DISABLED) return -1; |
6540 |
17 Jan 22 |
nicklas |
125 |
return getIndexReader().numDocs() - getNumRawBioAssays(); |
6540 |
17 Jan 22 |
nicklas |
126 |
} |
6540 |
17 Jan 22 |
nicklas |
127 |
|
6545 |
21 Jan 22 |
nicklas |
128 |
/** |
6551 |
26 Jan 22 |
nicklas |
Get the id of all raw bioassays where the specified variant has been found. |
6551 |
26 Jan 22 |
nicklas |
130 |
*/ |
6551 |
26 Jan 22 |
nicklas |
131 |
@Override |
6551 |
26 Jan 22 |
nicklas |
132 |
public Set<Integer> getRawBioAssaysWithVariant(String chrom, long pos, String ref, String alt, String snpId) |
6551 |
26 Jan 22 |
nicklas |
133 |
throws IOException |
6551 |
26 Jan 22 |
nicklas |
134 |
{ |
6551 |
26 Jan 22 |
nicklas |
135 |
if (getWorkingStatus() == Status.DISABLED) return Collections.emptySet(); |
6551 |
26 Jan 22 |
nicklas |
136 |
BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
6551 |
26 Jan 22 |
nicklas |
137 |
if (snpId == null || snpId.equals(".")) |
6551 |
26 Jan 22 |
nicklas |
138 |
{ |
6551 |
26 Jan 22 |
nicklas |
139 |
builder.add(LongPoint.newExactQuery(chrom, pos), Occur.MUST); |
6551 |
26 Jan 22 |
nicklas |
140 |
builder.add(new TermQuery(new Term("ref", ref)), Occur.MUST); |
6551 |
26 Jan 22 |
nicklas |
141 |
builder.add(new TermQuery(new Term("alt", alt)), Occur.MUST); |
6551 |
26 Jan 22 |
nicklas |
142 |
} |
6551 |
26 Jan 22 |
nicklas |
143 |
else |
6551 |
26 Jan 22 |
nicklas |
144 |
{ |
6551 |
26 Jan 22 |
nicklas |
145 |
builder.add(new TermQuery(new Term("snpId", snpId)), Occur.MUST); |
6551 |
26 Jan 22 |
nicklas |
146 |
} |
6551 |
26 Jan 22 |
nicklas |
147 |
if (getIndexAllGenotypes()) |
6551 |
26 Jan 22 |
nicklas |
148 |
{ |
6551 |
26 Jan 22 |
nicklas |
149 |
builder.add(new TermQuery(new Term("gt", "0/0")), Occur.MUST_NOT); |
6551 |
26 Jan 22 |
nicklas |
150 |
} |
6551 |
26 Jan 22 |
nicklas |
151 |
Query query = builder.build(); |
6553 |
27 Jan 22 |
nicklas |
152 |
RawBioAssayIdCollector collector = new RawBioAssayIdCollector("rbaId", -1); |
6551 |
26 Jan 22 |
nicklas |
153 |
getIndexSearcher().search(query, collector); |
6551 |
26 Jan 22 |
nicklas |
154 |
return collector.getRbaIds(); |
6551 |
26 Jan 22 |
nicklas |
155 |
} |
6551 |
26 Jan 22 |
nicklas |
156 |
|
6551 |
26 Jan 22 |
nicklas |
157 |
|
6551 |
26 Jan 22 |
nicklas |
158 |
/** |
6546 |
24 Jan 22 |
nicklas |
Creates a VariantCallFilterAction. |
6546 |
24 Jan 22 |
nicklas |
160 |
*/ |
6546 |
24 Jan 22 |
nicklas |
161 |
@Override |
6546 |
24 Jan 22 |
nicklas |
162 |
public VariantCallFilterAction createFilterAction(InvokationContext<? super QueryFilterAction> context) |
6546 |
24 Jan 22 |
nicklas |
163 |
{ |
6551 |
26 Jan 22 |
nicklas |
164 |
return new VariantCallFilterAction(context == null ? null : context.getClientContext().getCurrentItem(), this); |
6546 |
24 Jan 22 |
nicklas |
165 |
} |
6552 |
26 Jan 22 |
nicklas |
166 |
|
6546 |
24 Jan 22 |
nicklas |
167 |
|
6552 |
26 Jan 22 |
nicklas |
168 |
@Override |
6552 |
26 Jan 22 |
nicklas |
169 |
public VariantCallColumnAction createColumnAction(LuceneFilterAction<?> filter) |
6552 |
26 Jan 22 |
nicklas |
170 |
{ |
6552 |
26 Jan 22 |
nicklas |
171 |
return new VariantCallColumnAction(this, (VariantCallFilterAction)filter); |
6552 |
26 Jan 22 |
nicklas |
172 |
} |
6546 |
24 Jan 22 |
nicklas |
173 |
|
6546 |
24 Jan 22 |
nicklas |
174 |
/** |
6545 |
21 Jan 22 |
nicklas |
Indexer implementation for VCF files with information and annotations |
6545 |
21 Jan 22 |
nicklas |
that are compatible with the standard Variant calling pipeline in Reggie. |
6545 |
21 Jan 22 |
nicklas |
We support two versions: |
6545 |
21 Jan 22 |
nicklas |
* VCF files attached to raw bioassays with full genotype and annotation information per SNP |
6545 |
21 Jan 22 |
nicklas |
* VCF files attached to array designs with annotation information per SNP (no genotype information) |
6545 |
21 Jan 22 |
nicklas |
180 |
*/ |
6545 |
21 Jan 22 |
nicklas |
181 |
public static class VariantCallIndexer |
6540 |
17 Jan 22 |
nicklas |
182 |
implements Indexer |
6540 |
17 Jan 22 |
nicklas |
183 |
{ |
6540 |
17 Jan 22 |
nicklas |
184 |
|
6546 |
24 Jan 22 |
nicklas |
185 |
/** |
6546 |
24 Jan 22 |
nicklas |
Creates an analyzer implementation that is suitable for this |
6546 |
24 Jan 22 |
nicklas |
indexer implementation. |
6546 |
24 Jan 22 |
nicklas |
@param refOnly If TRUE, only analyzers that are needed for array design VCF are created |
6546 |
24 Jan 22 |
nicklas |
189 |
*/ |
6546 |
24 Jan 22 |
nicklas |
190 |
public static Analyzer createAnalyzer(boolean refOnly) |
6546 |
24 Jan 22 |
nicklas |
191 |
{ |
6546 |
24 Jan 22 |
nicklas |
192 |
Map<String, Analyzer> fieldAnalyzers = new HashMap<>(); |
6546 |
24 Jan 22 |
nicklas |
193 |
fieldAnalyzers.put("gene", new AlphaNumericIgnoreCaseAnalyzer()); // Gene names can be a list and we ignore case |
6546 |
24 Jan 22 |
nicklas |
194 |
fieldAnalyzers.put("c", new HgvsCdnaAnalyzer()); // HGVS.c analyzer |
6546 |
24 Jan 22 |
nicklas |
195 |
fieldAnalyzers.put("p", new HgvsProtAnalyzer()); // HGVS.p analyzer |
6546 |
24 Jan 22 |
nicklas |
196 |
fieldAnalyzers.put("effect", new EffectAnalyzer()); // ANN.Annotation (=Effect) analyzer |
6546 |
24 Jan 22 |
nicklas |
197 |
return new PerFieldAnalyzerWrapper(new KeywordAnalyzer(), fieldAnalyzers); |
6546 |
24 Jan 22 |
nicklas |
198 |
} |
6546 |
24 Jan 22 |
nicklas |
199 |
|
6546 |
24 Jan 22 |
nicklas |
200 |
/** |
6546 |
24 Jan 22 |
nicklas |
Create a map with custom query fields that are suitable when parsing |
6546 |
24 Jan 22 |
nicklas |
queries for thie indexer implementation. |
6546 |
24 Jan 22 |
nicklas |
@param refOnly If TRUE, only fields that are needed for array design VCF are created |
6546 |
24 Jan 22 |
nicklas |
204 |
*/ |
6546 |
24 Jan 22 |
nicklas |
205 |
public static Map<String, QueryField> createQueryFields(boolean refOnly) |
6546 |
24 Jan 22 |
nicklas |
206 |
{ |
6546 |
24 Jan 22 |
nicklas |
207 |
Map<String, QueryField> queryFields = new HashMap<>(); |
6546 |
24 Jan 22 |
nicklas |
208 |
queryFields.put("pos", LongQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
209 |
for (int i = 1; i < 23; i++) |
6546 |
24 Jan 22 |
nicklas |
210 |
{ |
6546 |
24 Jan 22 |
nicklas |
211 |
queryFields.put("chr"+i, LongQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
212 |
} |
6546 |
24 Jan 22 |
nicklas |
213 |
queryFields.put("chrX", LongQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
214 |
queryFields.put("chrY", LongQueryField.INSTANCE); |
7304 |
28 Aug 23 |
nicklas |
215 |
queryFields.put("chrM", LongQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
216 |
queryFields.put("c", StripWildcardQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
217 |
queryFields.put("p", StripWildcardQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
218 |
if (!refOnly) |
6546 |
24 Jan 22 |
nicklas |
219 |
{ |
6546 |
24 Jan 22 |
nicklas |
220 |
queryFields.put("dp", IntQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
221 |
queryFields.put("vd", IntQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
222 |
queryFields.put("af", FloatQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
223 |
queryFields.put("rbaId", IntQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
224 |
queryFields.put("file", IntQueryField.INSTANCE); |
6546 |
24 Jan 22 |
nicklas |
225 |
} |
6546 |
24 Jan 22 |
nicklas |
226 |
return queryFields; |
6546 |
24 Jan 22 |
nicklas |
227 |
} |
6546 |
24 Jan 22 |
nicklas |
228 |
|
6540 |
17 Jan 22 |
nicklas |
229 |
private final LuceneIndex idx; |
6540 |
17 Jan 22 |
nicklas |
230 |
private final IndexWriter writer; |
6540 |
17 Jan 22 |
nicklas |
231 |
private final int num; |
6540 |
17 Jan 22 |
nicklas |
232 |
private final RawBioAssay rba; |
6545 |
21 Jan 22 |
nicklas |
233 |
private final ArrayDesign design; |
6540 |
17 Jan 22 |
nicklas |
234 |
private final List<VcfFile> vcfFiles; |
6556 |
28 Jan 22 |
nicklas |
235 |
private ProgressReporter progress; |
6540 |
17 Jan 22 |
nicklas |
236 |
private int numVariants; |
6540 |
17 Jan 22 |
nicklas |
237 |
private int numGenotypes; |
6540 |
17 Jan 22 |
nicklas |
238 |
private boolean aborted; |
6540 |
17 Jan 22 |
nicklas |
239 |
|
6545 |
21 Jan 22 |
nicklas |
240 |
/**
  Create an indexer for indexing VCF files linked to a raw bioassay.
  The array design of this indexer is set to null.
  @param idx The index the indexer works for
  @param writer The writer for the index
  @param num A number identifying this indexer in log messages
  @param rba The raw bioassay to index
  @param vcfFiles The VCF files linked to the raw bioassay
*/
public VariantCallIndexer(LuceneIndex idx, IndexWriter writer, int num, RawBioAssay rba, List<VcfFile> vcfFiles)
{
  this.idx = idx;
  this.writer = writer;
  this.num = num;
  this.rba = rba;
  this.vcfFiles = vcfFiles;
  // No array design in this mode
  this.design = null;
}
6540 |
17 Jan 22 |
nicklas |
252 |
|
6540 |
17 Jan 22 |
nicklas |
253 |
/** |
6545 |
21 Jan 22 |
nicklas |
Create an indexer for indexing VCF files linked to an array design. |
6545 |
21 Jan 22 |
nicklas |
255 |
*/ |
6545 |
21 Jan 22 |
nicklas |
256 |
public VariantCallIndexer(LuceneIndex idx, IndexWriter writer, ArrayDesign design, File vcfFile) |
6545 |
21 Jan 22 |
nicklas |
257 |
{ |
6545 |
21 Jan 22 |
nicklas |
258 |
this.idx = idx; |
6545 |
21 Jan 22 |
nicklas |
259 |
this.num = 1; |
6545 |
21 Jan 22 |
nicklas |
260 |
this.writer = writer; |
6545 |
21 Jan 22 |
nicklas |
261 |
this.rba = null; |
6545 |
21 Jan 22 |
nicklas |
262 |
this.design = design; |
6545 |
21 Jan 22 |
nicklas |
263 |
this.vcfFiles = Arrays.asList(new VcfFile(vcfFile, Datafiletype.VCF_DESIGN.get(design.getDbControl()))); |
6545 |
21 Jan 22 |
nicklas |
264 |
} |
6545 |
21 Jan 22 |
nicklas |
265 |
|
6545 |
21 Jan 22 |
nicklas |
266 |
/** |
6556 |
28 Jan 22 |
nicklas |
Set a progress reporter for reporting the indexing progress. |
6556 |
28 Jan 22 |
nicklas |
268 |
*/ |
6556 |
28 Jan 22 |
nicklas |
269 |
public void setProgressReporter(ProgressReporter progress) |
6556 |
28 Jan 22 |
nicklas |
270 |
{ |
6556 |
28 Jan 22 |
nicklas |
271 |
this.progress = progress; |
6556 |
28 Jan 22 |
nicklas |
272 |
} |
6556 |
28 Jan 22 |
nicklas |
273 |
|
6556 |
28 Jan 22 |
nicklas |
274 |
/** |
6540 |
17 Jan 22 |
nicklas |
Get the number of variants that was indexed. |
6540 |
17 Jan 22 |
nicklas |
276 |
*/ |
6540 |
17 Jan 22 |
nicklas |
277 |
@Override |
6540 |
17 Jan 22 |
nicklas |
278 |
public int getNumVariants() |
6540 |
17 Jan 22 |
nicklas |
279 |
{ |
6540 |
17 Jan 22 |
nicklas |
280 |
return numVariants; |
6540 |
17 Jan 22 |
nicklas |
281 |
} |
6540 |
17 Jan 22 |
nicklas |
282 |
|
6540 |
17 Jan 22 |
nicklas |
283 |
/** |
6540 |
17 Jan 22 |
nicklas |
Get the number of genotypes that was indexed. |
6540 |
17 Jan 22 |
nicklas |
@since 1.2 |
6540 |
17 Jan 22 |
nicklas |
286 |
*/ |
6540 |
17 Jan 22 |
nicklas |
287 |
@Override |
6540 |
17 Jan 22 |
nicklas |
288 |
public int getNumGenotypes() |
6540 |
17 Jan 22 |
nicklas |
289 |
{ |
6540 |
17 Jan 22 |
nicklas |
290 |
return numGenotypes; |
6540 |
17 Jan 22 |
nicklas |
291 |
} |
6540 |
17 Jan 22 |
nicklas |
292 |
|
6540 |
17 Jan 22 |
nicklas |
293 |
/** |
6540 |
17 Jan 22 |
nicklas |
Get the raw bioassay that was indexed. |
6540 |
17 Jan 22 |
nicklas |
295 |
*/ |
6540 |
17 Jan 22 |
nicklas |
296 |
@Override |
6540 |
17 Jan 22 |
nicklas |
297 |
public RawBioAssay getRawBioAssay() |
6540 |
17 Jan 22 |
nicklas |
298 |
{ |
6540 |
17 Jan 22 |
nicklas |
299 |
return rba; |
6540 |
17 Jan 22 |
nicklas |
300 |
} |
6540 |
17 Jan 22 |
nicklas |
301 |
|
6545 |
21 Jan 22 |
nicklas |
302 |
@Override |
6545 |
21 Jan 22 |
nicklas |
303 |
public ArrayDesign getArrayDesign() |
6545 |
21 Jan 22 |
nicklas |
304 |
{ |
6545 |
21 Jan 22 |
nicklas |
305 |
return design; |
6545 |
21 Jan 22 |
nicklas |
306 |
} |
6545 |
21 Jan 22 |
nicklas |
307 |
|
6540 |
17 Jan 22 |
nicklas |
308 |
/** |
6540 |
17 Jan 22 |
nicklas |
Return TRUE if the indexing was aborted due to |
6540 |
17 Jan 22 |
nicklas |
closing down. |
6540 |
17 Jan 22 |
nicklas |
311 |
*/ |
6540 |
17 Jan 22 |
nicklas |
312 |
@Override |
6540 |
17 Jan 22 |
nicklas |
313 |
public boolean wasAborted() |
6540 |
17 Jan 22 |
nicklas |
314 |
{ |
6540 |
17 Jan 22 |
nicklas |
315 |
return aborted; |
6540 |
17 Jan 22 |
nicklas |
316 |
} |
6540 |
17 Jan 22 |
nicklas |
317 |
|
6540 |
17 Jan 22 |
nicklas |
318 |
@Override |
6540 |
17 Jan 22 |
nicklas |
319 |
public VariantCallIndexer call() |
6540 |
17 Jan 22 |
nicklas |
320 |
throws Exception |
6540 |
17 Jan 22 |
nicklas |
321 |
{ |
6540 |
17 Jan 22 |
nicklas |
322 |
if (idx.isClosing()) |
6540 |
17 Jan 22 |
nicklas |
323 |
{ |
6540 |
17 Jan 22 |
nicklas |
324 |
aborted = true; |
6540 |
17 Jan 22 |
nicklas |
325 |
return this; |
6540 |
17 Jan 22 |
nicklas |
326 |
} |
6540 |
17 Jan 22 |
nicklas |
327 |
VcfParser parser = null; |
6556 |
28 Jan 22 |
nicklas |
328 |
AbsoluteProgressReporter aProgress = null; |
6540 |
17 Jan 22 |
nicklas |
329 |
numVariants = 0; |
6540 |
17 Jan 22 |
nicklas |
330 |
numGenotypes = 0; |
6540 |
17 Jan 22 |
nicklas |
331 |
boolean indexAllGenotypes = idx.getIndexAllGenotypes(); |
6540 |
17 Jan 22 |
nicklas |
332 |
boolean updateLinkDescriptions = !VarSearch.isReggieInstalled(); |
6545 |
21 Jan 22 |
nicklas |
333 |
String itemName = rba != null ? rba.getName() : design.getName(); |
6540 |
17 Jan 22 |
nicklas |
334 |
try |
6540 |
17 Jan 22 |
nicklas |
335 |
{ |
6545 |
21 Jan 22 |
nicklas |
336 |
logger.debug("Indexing #" + num +": " +itemName + ": " + vcfFiles.size() + " VCF files"); |
6540 |
17 Jan 22 |
nicklas |
337 |
long time = -System.currentTimeMillis(); |
6540 |
17 Jan 22 |
nicklas |
338 |
|
6545 |
21 Jan 22 |
nicklas |
339 |
if (rba != null) |
6545 |
21 Jan 22 |
nicklas |
340 |
{ |
6545 |
21 Jan 22 |
nicklas |
// Re-indexing: delete existing information about this raw bioassay id |
6545 |
21 Jan 22 |
nicklas |
342 |
writer.deleteDocuments(IntPoint.newExactQuery("rbaId", rba.getId())); |
6545 |
21 Jan 22 |
nicklas |
343 |
writer.deleteDocuments(IntPoint.newExactQuery("mainId", rba.getId())); |
6545 |
21 Jan 22 |
nicklas |
344 |
} |
6545 |
21 Jan 22 |
nicklas |
345 |
if (design != null) |
6545 |
21 Jan 22 |
nicklas |
346 |
{ |
6545 |
21 Jan 22 |
nicklas |
// Re-indexing: delete existing information about this array design |
6545 |
21 Jan 22 |
nicklas |
348 |
writer.deleteDocuments(IntPoint.newExactQuery("designId", design.getId())); |
6545 |
21 Jan 22 |
nicklas |
349 |
} |
6556 |
28 Jan 22 |
nicklas |
350 |
|
6556 |
28 Jan 22 |
nicklas |
351 |
long parsedBefore = 0; |
6556 |
28 Jan 22 |
nicklas |
352 |
if (progress != null) |
6556 |
28 Jan 22 |
nicklas |
353 |
{ |
6556 |
28 Jan 22 |
nicklas |
354 |
long totalFileSize = 0; |
6556 |
28 Jan 22 |
nicklas |
355 |
for (VcfFile vcfFile : vcfFiles) |
6556 |
28 Jan 22 |
nicklas |
356 |
{ |
6556 |
28 Jan 22 |
nicklas |
357 |
totalFileSize += vcfFile.getFile().getSize(); |
6556 |
28 Jan 22 |
nicklas |
358 |
} |
6556 |
28 Jan 22 |
nicklas |
359 |
aProgress = new SimpleAbsoluteProgressReporter(progress, totalFileSize); |
6556 |
28 Jan 22 |
nicklas |
360 |
} |
6540 |
17 Jan 22 |
nicklas |
361 |
|
6540 |
17 Jan 22 |
nicklas |
362 |
for (VcfFile vcfFile : vcfFiles) |
6540 |
17 Jan 22 |
nicklas |
363 |
{ |
6540 |
17 Jan 22 |
nicklas |
364 |
int numFileVariants = 0; |
6540 |
17 Jan 22 |
nicklas |
365 |
int numFileGenotypes = 0; |
6540 |
17 Jan 22 |
nicklas |
366 |
|
6545 |
21 Jan 22 |
nicklas |
367 |
logger.debug("Indexing #" + num +": " +itemName + ": " + vcfFile.getName()); |
6540 |
17 Jan 22 |
nicklas |
368 |
parser = new VcfParser(vcfFile.getFile()); |
6540 |
17 Jan 22 |
nicklas |
369 |
VcfHeader header = parser.parseHeaders(); |
6540 |
17 Jan 22 |
nicklas |
370 |
|
6540 |
17 Jan 22 |
nicklas |
371 |
if (header == null) |
6540 |
17 Jan 22 |
nicklas |
372 |
{ |
6545 |
21 Jan 22 |
nicklas |
373 |
logger.warn("Unable to index (no header found): " +itemName + "/" + vcfFile.getName()); |
6540 |
17 Jan 22 |
nicklas |
374 |
continue; // With the next VCF |
6540 |
17 Jan 22 |
nicklas |
375 |
} |
6540 |
17 Jan 22 |
nicklas |
376 |
|
6540 |
17 Jan 22 |
nicklas |
377 |
int chrCol = header.indexOf("#CHROM"); |
6540 |
17 Jan 22 |
nicklas |
378 |
int posCol = header.indexOf("POS"); |
6545 |
21 Jan 22 |
nicklas |
379 |
int idCol = header.indexOf("ID"); |
6540 |
17 Jan 22 |
nicklas |
380 |
int refCol = header.indexOf("REF"); |
6540 |
17 Jan 22 |
nicklas |
381 |
int altCol = header.indexOf("ALT"); |
6540 |
17 Jan 22 |
nicklas |
382 |
int infoCol = header.indexOf("INFO"); |
6540 |
17 Jan 22 |
nicklas |
383 |
|
6545 |
21 Jan 22 |
nicklas |
384 |
if (isMissingColumn(chrCol, posCol, idCol, refCol, altCol, infoCol)) |
6540 |
17 Jan 22 |
nicklas |
385 |
{ |
6545 |
21 Jan 22 |
nicklas |
386 |
logger.warn("Unable to index (missing header column): " +itemName + "/" + vcfFile.getName()); |
7074 |
24 Mar 23 |
nicklas |
387 |
logger.debug("CHROM: "+chrCol+"; POS: "+posCol+"; ID: "+idCol+"; REF: "+refCol+"; ALT: "+altCol+"; INFO: "+infoCol); |
6540 |
17 Jan 22 |
nicklas |
388 |
continue; // With the next VCF |
6540 |
17 Jan 22 |
nicklas |
389 |
} |
6540 |
17 Jan 22 |
nicklas |
390 |
|
6540 |
17 Jan 22 |
nicklas |
391 |
int geneCol = header.annIndexOf("Gene_Name"); |
6540 |
17 Jan 22 |
nicklas |
392 |
int hgvsCCol = header.annIndexOf("HGVS.c"); |
6540 |
17 Jan 22 |
nicklas |
393 |
int hgvsPCol = header.annIndexOf("HGVS.p"); |
6540 |
17 Jan 22 |
nicklas |
394 |
int effectCol = header.annIndexOf("Annotation"); |
6540 |
17 Jan 22 |
nicklas |
395 |
int lineNo = 0; |
6540 |
17 Jan 22 |
nicklas |
396 |
do |
6540 |
17 Jan 22 |
nicklas |
397 |
{ |
6540 |
17 Jan 22 |
nicklas |
398 |
VcfLine line = parser.nextLine(); |
6540 |
17 Jan 22 |
nicklas |
399 |
if (line == null) break; |
6540 |
17 Jan 22 |
nicklas |
400 |
|
6540 |
17 Jan 22 |
nicklas |
401 |
lineNo++; |
6540 |
17 Jan 22 |
nicklas |
402 |
|
6540 |
17 Jan 22 |
nicklas |
403 |
if (lineNo % 10==0 && idx.isClosing()) |
6540 |
17 Jan 22 |
nicklas |
404 |
{ |
6540 |
17 Jan 22 |
nicklas |
405 |
aborted = true; |
6540 |
17 Jan 22 |
nicklas |
406 |
return this; |
6540 |
17 Jan 22 |
nicklas |
407 |
} |
6540 |
17 Jan 22 |
nicklas |
408 |
|
6556 |
28 Jan 22 |
nicklas |
409 |
if (aProgress != null && lineNo % 1000 == 0) |
6556 |
28 Jan 22 |
nicklas |
410 |
{ |
6556 |
28 Jan 22 |
nicklas |
411 |
aProgress.displayAbsolute(parsedBefore+parser.getParsedBytes(), vcfFile.getName() + ": " + lineNo + "..."); |
6556 |
28 Jan 22 |
nicklas |
412 |
} |
6556 |
28 Jan 22 |
nicklas |
413 |
|
6540 |
17 Jan 22 |
nicklas |
414 |
DocumentCreator variant = new DocumentCreator(); |
6540 |
17 Jan 22 |
nicklas |
415 |
|
6545 |
21 Jan 22 |
nicklas |
// Add information about the raw bioassay / array design |
6545 |
21 Jan 22 |
nicklas |
417 |
if (rba != null) |
6545 |
21 Jan 22 |
nicklas |
418 |
{ |
6545 |
21 Jan 22 |
nicklas |
419 |
variant.addRawBioAssayFields(rba, "rba"); |
6545 |
21 Jan 22 |
nicklas |
420 |
} |
6545 |
21 Jan 22 |
nicklas |
421 |
if (design != null) |
6545 |
21 Jan 22 |
nicklas |
422 |
{ |
6545 |
21 Jan 22 |
nicklas |
423 |
variant.addArrayDesignFields(design); |
6545 |
21 Jan 22 |
nicklas |
424 |
} |
6540 |
17 Jan 22 |
nicklas |
425 |
variant.addFileFields(vcfFile.getFile(), line); |
6540 |
17 Jan 22 |
nicklas |
426 |
|
6540 |
17 Jan 22 |
nicklas |
// Chromosome, position, ref and alt |
6540 |
17 Jan 22 |
nicklas |
428 |
String chr = variant.addStringField("chrom", line.col(chrCol), Store.YES); |
6540 |
17 Jan 22 |
nicklas |
429 |
Long pos = variant.addLongField("pos", line.longValue(posCol), Store.YES); |
6540 |
17 Jan 22 |
nicklas |
430 |
if (chr != null && chr.startsWith("chr")) |
6540 |
17 Jan 22 |
nicklas |
431 |
{ |
6540 |
17 Jan 22 |
nicklas |
// Alternate index allows us to use searches like: |
6540 |
17 Jan 22 |
nicklas |
// 'chr6:123456' <==> chrom:chr6 AND pos:123456 |
6540 |
17 Jan 22 |
nicklas |
434 |
variant.addLongField(chr, pos, Store.NO); |
6540 |
17 Jan 22 |
nicklas |
435 |
} |
6545 |
21 Jan 22 |
nicklas |
436 |
String snpId = Values.getString(line.col(idCol), "."); |
6545 |
21 Jan 22 |
nicklas |
437 |
if (!snpId.equals(".")) |
6545 |
21 Jan 22 |
nicklas |
438 |
{ |
6545 |
21 Jan 22 |
nicklas |
439 |
variant.addStringField("snpId", snpId, Store.YES); |
6545 |
21 Jan 22 |
nicklas |
440 |
} |
6545 |
21 Jan 22 |
nicklas |
441 |
|
6540 |
17 Jan 22 |
nicklas |
442 |
variant.addStringField("ref", line.col(refCol), Store.YES); |
6540 |
17 Jan 22 |
nicklas |
443 |
variant.addStringField("alt", line.col(altCol), Store.YES); |
6540 |
17 Jan 22 |
nicklas |
444 |
|
6540 |
17 Jan 22 |
nicklas |
// Annotations from the ANN field |
6540 |
17 Jan 22 |
nicklas |
// Combined from ncbiRefSeq and ANN.Gene_Name annotations -- TreeSet is sorting the values |
6540 |
17 Jan 22 |
nicklas |
447 |
Set<String> genes = new NullSafeTreeSet<>(); |
6540 |
17 Jan 22 |
nicklas |
448 |
genes.add(line.ann(geneCol)); |
6540 |
17 Jan 22 |
nicklas |
449 |
|
6540 |
17 Jan 22 |
nicklas |
// HGVS.c from ANN.HGVS.c and cosmic_CDS -- LinkedSet is keeping insertion order |
6540 |
17 Jan 22 |
nicklas |
451 |
Set<String> hgvsc = new NullSafeLinkedSet<>(); |
6540 |
17 Jan 22 |
nicklas |
452 |
hgvsc.add(line.ann(hgvsCCol)); |
6540 |
17 Jan 22 |
nicklas |
453 |
|
6540 |
17 Jan 22 |
nicklas |
// HGVS.p from ANN.HGVS.p and cosmic_AA -- LinkedSet is keeping insertion order |
6540 |
17 Jan 22 |
nicklas |
455 |
Set<String> hgvsp = new NullSafeLinkedSet<>(); |
6540 |
17 Jan 22 |
nicklas |
456 |
hgvsp.add(AminoAcidTranslator.INSTANCE.translate(line.ann(hgvsPCol))); |
6540 |
17 Jan 22 |
nicklas |
457 |
|
6540 |
17 Jan 22 |
nicklas |
// Effect from the Annotation field -- List of values separated with '&', we change this to ', ' |
6540 |
17 Jan 22 |
nicklas |
459 |
String effect = line.ann(effectCol); |
6540 |
17 Jan 22 |
nicklas |
460 |
if (effect != null) |
6540 |
17 Jan 22 |
nicklas |
461 |
{ |
6540 |
17 Jan 22 |
nicklas |
462 |
variant.addField(new ListField("effect", Arrays.asList(effect.split("&")), Store.YES)); |
6540 |
17 Jan 22 |
nicklas |
463 |
} |
6540 |
17 Jan 22 |
nicklas |
464 |
|
6540 |
17 Jan 22 |
nicklas |
// Annotations from the INFO fields |
6540 |
17 Jan 22 |
nicklas |
466 |
for (Info info : line.info()) |
6540 |
17 Jan 22 |
nicklas |
467 |
{ |
6540 |
17 Jan 22 |
nicklas |
468 |
String key = info.key; |
6540 |
17 Jan 22 |
nicklas |
469 |
String val = info.value; |
6540 |
17 Jan 22 |
nicklas |
470 |
|
6540 |
17 Jan 22 |
nicklas |
// stats.add(key, val); |
6540 |
17 Jan 22 |
nicklas |
472 |
|
6540 |
17 Jan 22 |
nicklas |
473 |
if ("TYPE".equals(key)) |
6540 |
17 Jan 22 |
nicklas |
474 |
{ |
6540 |
17 Jan 22 |
nicklas |
475 |
variant.addStringField("type", val, Store.YES); |
6540 |
17 Jan 22 |
nicklas |
476 |
} |
6540 |
17 Jan 22 |
nicklas |
477 |
else if ("dbsnp_ID".equals(key)) |
6540 |
17 Jan 22 |
nicklas |
478 |
{ |
6540 |
17 Jan 22 |
nicklas |
479 |
variant.addStringField("rsid", val, Store.YES); |
6540 |
17 Jan 22 |
nicklas |
480 |
} |
6540 |
17 Jan 22 |
nicklas |
481 |
else if ("cosmic_ID".equals(key)) |
6540 |
17 Jan 22 |
nicklas |
482 |
{ |
6540 |
17 Jan 22 |
nicklas |
483 |
variant.addStringField("cosmic", val, Store.YES); |
6540 |
17 Jan 22 |
nicklas |
484 |
} |
6540 |
17 Jan 22 |
nicklas |
485 |
else if ("ncbiRefSeq".equals(key)) |
6540 |
17 Jan 22 |
nicklas |
486 |
{ |
6540 |
17 Jan 22 |
nicklas |
487 |
genes.addAll(Arrays.asList(val.split(","))); |
6540 |
17 Jan 22 |
nicklas |
488 |
} |
6540 |
17 Jan 22 |
nicklas |
489 |
else if ("cosmic_CDS".equals(key)) |
6540 |
17 Jan 22 |
nicklas |
490 |
{ |
6540 |
17 Jan 22 |
nicklas |
491 |
hgvsc.addAll(Arrays.asList(val.split(","))); |
6540 |
17 Jan 22 |
nicklas |
492 |
} |
6540 |
17 Jan 22 |
nicklas |
493 |
else if ("cosmic_AA".equals(key)) |
6540 |
17 Jan 22 |
nicklas |
494 |
{ |
6540 |
17 Jan 22 |
nicklas |
495 |
hgvsp.addAll(Arrays.asList(val.split(","))); |
6540 |
17 Jan 22 |
nicklas |
496 |
} |
6540 |
17 Jan 22 |
nicklas |
497 |
} |
6540 |
17 Jan 22 |
nicklas |
498 |
|
6540 |
17 Jan 22 |
nicklas |
499 |
if (genes.size() > 0) variant.addField(new ListField("gene", genes, Store.YES)); |
6540 |
17 Jan 22 |
nicklas |
500 |
if (hgvsc.size() > 0) variant.addField(new ListField("c", hgvsc, Store.YES)); |
6540 |
17 Jan 22 |
nicklas |
501 |
if (hgvsp.size() > 0) |
6540 |
17 Jan 22 |
nicklas |
502 |
{ |
6540 |
17 Jan 22 |
nicklas |
503 |
hgvsp.remove("p.?"); |
6540 |
17 Jan 22 |
nicklas |
504 |
variant.addField(new ListField("p", hgvsp, Store.YES)); |
6540 |
17 Jan 22 |
nicklas |
505 |
} |
6540 |
17 Jan 22 |
nicklas |
506 |
|
6540 |
17 Jan 22 |
nicklas |
507 |
boolean isVariant = false; |
6540 |
17 Jan 22 |
nicklas |
508 |
boolean isGenotype = false; |
6545 |
21 Jan 22 |
nicklas |
509 |
if (rba != null) |
6540 |
17 Jan 22 |
nicklas |
510 |
{ |
6545 |
21 Jan 22 |
nicklas |
511 |
for (Info info : line.format()) |
6540 |
17 Jan 22 |
nicklas |
512 |
{ |
6545 |
21 Jan 22 |
nicklas |
513 |
String key = info.key; |
6545 |
21 Jan 22 |
nicklas |
514 |
String val = info.value; |
6545 |
21 Jan 22 |
nicklas |
515 |
|
6545 |
21 Jan 22 |
nicklas |
516 |
if ("GT".equals(key)) |
6545 |
21 Jan 22 |
nicklas |
517 |
{ |
6545 |
21 Jan 22 |
nicklas |
518 |
variant.addStringField("gt", val, Store.YES); |
6545 |
21 Jan 22 |
nicklas |
519 |
isGenotype = !"./.".equals(val); |
6545 |
21 Jan 22 |
nicklas |
520 |
isVariant = isGenotype && !"0/0".equals(val); |
6545 |
21 Jan 22 |
nicklas |
521 |
} |
6545 |
21 Jan 22 |
nicklas |
522 |
else if ("DP".equals(key)) |
6545 |
21 Jan 22 |
nicklas |
523 |
{ |
6545 |
21 Jan 22 |
nicklas |
524 |
variant.addIntField("dp", info.intValue(), Store.YES); |
6545 |
21 Jan 22 |
nicklas |
525 |
} |
6545 |
21 Jan 22 |
nicklas |
526 |
else if ("VD".equals(key)) |
6545 |
21 Jan 22 |
nicklas |
527 |
{ |
6545 |
21 Jan 22 |
nicklas |
528 |
variant.addIntField("vd", info.intValue(), Store.YES); |
6545 |
21 Jan 22 |
nicklas |
529 |
} |
6545 |
21 Jan 22 |
nicklas |
530 |
else if ("AF".equals(key)) |
6545 |
21 Jan 22 |
nicklas |
531 |
{ |
6545 |
21 Jan 22 |
nicklas |
532 |
variant.addFloatField("af", info.floatValue(), val); |
6545 |
21 Jan 22 |
nicklas |
533 |
} |
6540 |
17 Jan 22 |
nicklas |
534 |
} |
6545 |
21 Jan 22 |
nicklas |
535 |
} |
6545 |
21 Jan 22 |
nicklas |
536 |
else if (design != null) |
6545 |
21 Jan 22 |
nicklas |
537 |
{ |
6545 |
21 Jan 22 |
nicklas |
538 |
isGenotype = true; |
6545 |
21 Jan 22 |
nicklas |
539 |
isVariant = true; |
6545 |
21 Jan 22 |
nicklas |
540 |
} |
6540 |
17 Jan 22 |
nicklas |
541 |
if (isGenotype) |
6540 |
17 Jan 22 |
nicklas |
542 |
{ |
6540 |
17 Jan 22 |
nicklas |
543 |
numGenotypes++; |
6540 |
17 Jan 22 |
nicklas |
544 |
numFileGenotypes++; |
6540 |
17 Jan 22 |
nicklas |
545 |
if (indexAllGenotypes || isVariant) |
6540 |
17 Jan 22 |
nicklas |
546 |
{ |
6540 |
17 Jan 22 |
nicklas |
547 |
writer.addDocument(variant.doc()); |
6540 |
17 Jan 22 |
nicklas |
548 |
} |
6540 |
17 Jan 22 |
nicklas |
549 |
if (isVariant) |
6540 |
17 Jan 22 |
nicklas |
550 |
{ |
6540 |
17 Jan 22 |
nicklas |
551 |
numVariants++; |
6540 |
17 Jan 22 |
nicklas |
552 |
numFileVariants++; |
6540 |
17 Jan 22 |
nicklas |
553 |
} |
6540 |
17 Jan 22 |
nicklas |
554 |
} |
6540 |
17 Jan 22 |
nicklas |
555 |
} while (true); |
6540 |
17 Jan 22 |
nicklas |
556 |
|
6540 |
17 Jan 22 |
nicklas |
557 |
FileUtil.close(parser); |
6556 |
28 Jan 22 |
nicklas |
558 |
parsedBefore += vcfFile.getFile().getSize(); |
6540 |
17 Jan 22 |
nicklas |
559 |
|
6545 |
21 Jan 22 |
nicklas |
560 |
if (rba != null) |
6540 |
17 Jan 22 |
nicklas |
561 |
{ |
6545 |
21 Jan 22 |
nicklas |
562 |
AnyToAny link = vcfFile.getLink(); |
6545 |
21 Jan 22 |
nicklas |
563 |
if (link != null && updateLinkDescriptions) |
6540 |
17 Jan 22 |
nicklas |
564 |
{ |
6545 |
21 Jan 22 |
nicklas |
565 |
if (indexAllGenotypes) |
6545 |
21 Jan 22 |
nicklas |
566 |
{ |
6545 |
21 Jan 22 |
nicklas |
567 |
link.setDescription("Found "+VarSearch.formatCount(numFileVariants, " variant", " variants")+ |
6545 |
21 Jan 22 |
nicklas |
568 |
" in "+VarSearch.formatCount(numFileGenotypes, " genotyped target.", " genotyped targets.")); |
6545 |
21 Jan 22 |
nicklas |
569 |
} |
6545 |
21 Jan 22 |
nicklas |
570 |
else |
6545 |
21 Jan 22 |
nicklas |
571 |
{ |
6545 |
21 Jan 22 |
nicklas |
572 |
link.setDescription("Found "+VarSearch.formatCount(numFileVariants, " variant.", " variants.")); |
6545 |
21 Jan 22 |
nicklas |
573 |
} |
6545 |
21 Jan 22 |
nicklas |
574 |
vcfFile.getFile().setDescription(link.getDescription()); |
6540 |
17 Jan 22 |
nicklas |
575 |
} |
6540 |
17 Jan 22 |
nicklas |
576 |
} |
6540 |
17 Jan 22 |
nicklas |
577 |
} |
6540 |
17 Jan 22 |
nicklas |
578 |
|
6545 |
21 Jan 22 |
nicklas |
579 |
if (rba != null) |
6545 |
21 Jan 22 |
nicklas |
580 |
{ |
6545 |
21 Jan 22 |
nicklas |
// Add summary fields for the complete rba |
6545 |
21 Jan 22 |
nicklas |
582 |
DocumentCreator main = new DocumentCreator(); |
6545 |
21 Jan 22 |
nicklas |
583 |
main.addRawBioAssayFields(rba, "main"); |
6545 |
21 Jan 22 |
nicklas |
584 |
main.addIntField("numVariants", numVariants, Store.YES); |
6545 |
21 Jan 22 |
nicklas |
585 |
main.addIntField("numGenotypes", numGenotypes, Store.YES); |
6545 |
21 Jan 22 |
nicklas |
586 |
writer.addDocument(main.doc()); |
6545 |
21 Jan 22 |
nicklas |
587 |
} |
6540 |
17 Jan 22 |
nicklas |
588 |
|
6540 |
17 Jan 22 |
nicklas |
589 |
time += System.currentTimeMillis(); |
6545 |
21 Jan 22 |
nicklas |
590 |
logger.debug("Index complete #"+num+": " +itemName + ": " + numVariants + " variants; " + numGenotypes + " genotypes; " + time + " ms"); |
6540 |
17 Jan 22 |
nicklas |
591 |
if (LuceneIndex.SLOW_MODE) Thread.sleep(10); |
6540 |
17 Jan 22 |
nicklas |
592 |
} |
6540 |
17 Jan 22 |
nicklas |
593 |
catch (Exception ex) |
6540 |
17 Jan 22 |
nicklas |
594 |
{ |
6545 |
21 Jan 22 |
nicklas |
595 |
logger.warn("Indexing failed #"+num+": " + itemName, ex); |
6540 |
17 Jan 22 |
nicklas |
596 |
throw ex; |
6540 |
17 Jan 22 |
nicklas |
597 |
} |
6540 |
17 Jan 22 |
nicklas |
598 |
finally |
6540 |
17 Jan 22 |
nicklas |
599 |
{ |
6540 |
17 Jan 22 |
nicklas |
600 |
FileUtil.close(parser); |
6540 |
17 Jan 22 |
nicklas |
601 |
} |
6540 |
17 Jan 22 |
nicklas |
602 |
return this; |
6540 |
17 Jan 22 |
nicklas |
603 |
} |
6540 |
17 Jan 22 |
nicklas |
604 |
|
6540 |
17 Jan 22 |
nicklas |
605 |
/** |
6540 |
17 Jan 22 |
nicklas |
Check if some column was missing (index < 0). |
6540 |
17 Jan 22 |
nicklas |
607 |
*/ |
6540 |
17 Jan 22 |
nicklas |
608 |
private boolean isMissingColumn(int... colIndexes) |
6540 |
17 Jan 22 |
nicklas |
609 |
{ |
6540 |
17 Jan 22 |
nicklas |
610 |
for (int i : colIndexes) |
6540 |
17 Jan 22 |
nicklas |
611 |
{ |
6540 |
17 Jan 22 |
nicklas |
612 |
if (i < 0) return true; |
6540 |
17 Jan 22 |
nicklas |
613 |
} |
6540 |
17 Jan 22 |
nicklas |
614 |
return false; |
6540 |
17 Jan 22 |
nicklas |
615 |
} |
6540 |
17 Jan 22 |
nicklas |
616 |
} |
6540 |
17 Jan 22 |
nicklas |
617 |
|
6546 |
24 Jan 22 |
nicklas |
618 |
/** |
6546 |
24 Jan 22 |
nicklas |
Filter action implementation for the VariantCallIndex. |
6546 |
24 Jan 22 |
nicklas |
620 |
*/ |
6546 |
24 Jan 22 |
nicklas |
621 |
public static class VariantCallFilterAction |
6546 |
24 Jan 22 |
nicklas |
622 |
extends LuceneFilterAction<VariantCallIndex> |
6546 |
24 Jan 22 |
nicklas |
623 |
{ |
6546 |
24 Jan 22 |
nicklas |
624 |
|
6546 |
24 Jan 22 |
nicklas |
625 |
public VariantCallFilterAction(ItemContext context, VariantCallIndex idx) |
6546 |
24 Jan 22 |
nicklas |
626 |
{ |
6546 |
24 Jan 22 |
nicklas |
627 |
super(context, idx); |
6546 |
24 Jan 22 |
nicklas |
628 |
} |
6546 |
24 Jan 22 |
nicklas |
629 |
|
6546 |
24 Jan 22 |
nicklas |
630 |
@Override |
6546 |
24 Jan 22 |
nicklas |
631 |
protected Query createVariantQuery(String queryString) |
6546 |
24 Jan 22 |
nicklas |
632 |
throws ParseException |
6546 |
24 Jan 22 |
nicklas |
633 |
{ |
6546 |
24 Jan 22 |
nicklas |
634 |
queryString = AutoPrefixer.INSTANCE.autoPrefix(queryString); |
6546 |
24 Jan 22 |
nicklas |
635 |
Query query = parser.parse(queryString); |
6546 |
24 Jan 22 |
nicklas |
636 |
if (idx.getIndexAllGenotypes() && excludeHomRefGenotypes(queryString)) |
6546 |
24 Jan 22 |
nicklas |
637 |
{ |
6546 |
24 Jan 22 |
nicklas |
// Modify the query to not return gt:0/0 |
6546 |
24 Jan 22 |
nicklas |
639 |
BooleanQuery.Builder b = new BooleanQuery.Builder(); |
6546 |
24 Jan 22 |
nicklas |
640 |
b.add(query, Occur.MUST); |
6546 |
24 Jan 22 |
nicklas |
641 |
b.add(new TermQuery(new Term("gt", "0/0")), Occur.MUST_NOT); |
6546 |
24 Jan 22 |
nicklas |
642 |
query = b.build(); |
6546 |
24 Jan 22 |
nicklas |
643 |
} |
6546 |
24 Jan 22 |
nicklas |
644 |
return query; |
6546 |
24 Jan 22 |
nicklas |
645 |
} |
6546 |
24 Jan 22 |
nicklas |
646 |
|
6546 |
24 Jan 22 |
nicklas |
647 |
/** |
6546 |
24 Jan 22 |
nicklas |
Inspect the query string and check if the query should be |
6546 |
24 Jan 22 |
nicklas |
modified to not include results with gt:0/0. |
6546 |
24 Jan 22 |
nicklas |
@since 1.4 |
6546 |
24 Jan 22 |
nicklas |
651 |
*/ |
6546 |
24 Jan 22 |
nicklas |
652 |
public boolean excludeHomRefGenotypes(String queryString) |
6546 |
24 Jan 22 |
nicklas |
653 |
{ |
6546 |
24 Jan 22 |
nicklas |
654 |
return !queryString.contains("gt:") && |
6546 |
24 Jan 22 |
nicklas |
655 |
!queryString.contains("dp:") && |
6546 |
24 Jan 22 |
nicklas |
656 |
!queryString.contains("af:") && |
6546 |
24 Jan 22 |
nicklas |
657 |
!queryString.contains("vd:"); |
6546 |
24 Jan 22 |
nicklas |
658 |
} |
6546 |
24 Jan 22 |
nicklas |
659 |
|
6550 |
25 Jan 22 |
nicklas |
660 |
|
6550 |
25 Jan 22 |
nicklas |
661 |
@Override |
6550 |
25 Jan 22 |
nicklas |
662 |
public AllDocsCollector getRawBioAssayHits(int rbaId, AllDocsCollector hits) |
6550 |
25 Jan 22 |
nicklas |
663 |
{ |
6550 |
25 Jan 22 |
nicklas |
// Combine the original queries -- at least one should match |
6550 |
25 Jan 22 |
nicklas |
665 |
BooleanQuery.Builder builder = new BooleanQuery.Builder(); |
6550 |
25 Jan 22 |
nicklas |
666 |
builder.setMinimumNumberShouldMatch(1); |
6550 |
25 Jan 22 |
nicklas |
667 |
for (Query q : getQueries()) |
6550 |
25 Jan 22 |
nicklas |
668 |
{ |
6550 |
25 Jan 22 |
nicklas |
669 |
builder.add(q, Occur.SHOULD); |
6550 |
25 Jan 22 |
nicklas |
670 |
} |
6550 |
25 Jan 22 |
nicklas |
// Add a filter for the current RawBioassay ID |
6550 |
25 Jan 22 |
nicklas |
672 |
BooleanQuery.Builder builder2 = new BooleanQuery.Builder(); |
6550 |
25 Jan 22 |
nicklas |
673 |
builder2.add(builder.build(), Occur.MUST); |
6550 |
25 Jan 22 |
nicklas |
674 |
builder2.add(IntPoint.newExactQuery("rbaId", rbaId), Occur.MUST); |
6550 |
25 Jan 22 |
nicklas |
675 |
Query queryByRba = builder2.build(); |
6550 |
25 Jan 22 |
nicklas |
676 |
|
6550 |
25 Jan 22 |
nicklas |
677 |
try |
6550 |
25 Jan 22 |
nicklas |
678 |
{ |
6550 |
25 Jan 22 |
nicklas |
// The results are the variants that matches the current RawBioassay |
6550 |
25 Jan 22 |
nicklas |
680 |
idx.getIndexSearcher().search(queryByRba, hits); |
6550 |
25 Jan 22 |
nicklas |
681 |
} |
6550 |
25 Jan 22 |
nicklas |
682 |
catch (Exception ex) |
6550 |
25 Jan 22 |
nicklas |
683 |
{ |
6848 |
17 Oct 22 |
nicklas |
684 |
VarSearch.throwRuntimException(ex); |
6550 |
25 Jan 22 |
nicklas |
685 |
} |
6550 |
25 Jan 22 |
nicklas |
686 |
return hits; |
6550 |
25 Jan 22 |
nicklas |
687 |
} |
6550 |
25 Jan 22 |
nicklas |
688 |
|
6546 |
24 Jan 22 |
nicklas |
689 |
} |
6552 |
26 Jan 22 |
nicklas |
690 |
|
6552 |
26 Jan 22 |
nicklas |
691 |
public static class VariantCallColumnAction |
6552 |
26 Jan 22 |
nicklas |
692 |
extends LuceneColumnAction<VariantCallIndex, VariantCallFilterAction> |
6552 |
26 Jan 22 |
nicklas |
693 |
{ |
6552 |
26 Jan 22 |
nicklas |
694 |
|
6552 |
26 Jan 22 |
nicklas |
695 |
public VariantCallColumnAction(VariantCallIndex idx, VariantCallFilterAction filter) |
6552 |
26 Jan 22 |
nicklas |
696 |
{ |
6552 |
26 Jan 22 |
nicklas |
697 |
super(idx, filter); |
6552 |
26 Jan 22 |
nicklas |
698 |
} |
6552 |
26 Jan 22 |
nicklas |
699 |
|
6552 |
26 Jan 22 |
nicklas |
700 |
} |
6540 |
17 Jan 22 |
nicklas |
701 |
} |