5709 |
08 Nov 19 |
nicklas |
#!/usr/bin/python |
5709 |
08 Nov 19 |
nicklas |
2 |
|
5709 |
08 Nov 19 |
nicklas |
3 |
""" |
5719 |
12 Nov 19 |
nicklas |
$Id$ |
5719 |
12 Nov 19 |
nicklas |
5 |
|
5709 |
08 Nov 19 |
nicklas |
Script to create a VCF-like file with GC content and DNA strength INFO fields. This script is |
5709 |
08 Nov 19 |
nicklas |
part of a pipeline that requires a specially prepared bed-file as input (reading from stdin). |
5709 |
08 Nov 19 |
nicklas |
Columns (0-based for convenience): |
5709 |
08 Nov 19 |
nicklas |
9 |
|
5709 |
08 Nov 19 |
nicklas |
0: Chromosome (printed to VCF#CHROM) |
5753 |
22 Nov 19 |
nicklas |
1: Start (=original location - 50; or 0 if original location < 50) |
5709 |
08 Nov 19 |
nicklas |
2: Stop (=original location + 50; not used) |
5709 |
08 Nov 19 |
nicklas |
3: Original location (printed to VCF#POS) |
5709 |
08 Nov 19 |
nicklas |
4: REF (printed to VCF#REF) |
5709 |
08 Nov 19 |
nicklas |
5: ALT (printed to VCF#ALT) |
5709 |
08 Nov 19 |
nicklas |
6: SEQUENCE between start and stop (used for calculating GC and DNA strength, which are printed to VCF#INFO) |
5709 |
08 Nov 19 |
nicklas |
17 |
|
5709 |
08 Nov 19 |
nicklas |
The VCF file that is produced must be compressed (bgzip) and indexed (tabix) before it can be used by vcfanno |
5709 |
08 Nov 19 |
nicklas |
to annotate the orginal VCF file (from VarDict) with the GC content and DNA strength. A .toml file like |
5709 |
08 Nov 19 |
nicklas |
this is needed: |
5709 |
08 Nov 19 |
nicklas |
21 |
|
5709 |
08 Nov 19 |
nicklas |
[[annotation]] |
5709 |
08 Nov 19 |
nicklas |
file="/path/to/vcf-file-produced-by-this-script.gz" |
5709 |
08 Nov 19 |
nicklas |
fields=["GC_CONT","GC_CONT_LOCAL","STRENGTH","STRENGTH_LOCAL"] |
5709 |
08 Nov 19 |
nicklas |
names=["GC_CONT","GC_CONT_LOCAL","STRENGTH","STRENGTH_LOCAL"] |
5709 |
08 Nov 19 |
nicklas |
ops=["self","self","self","self"] |
5709 |
08 Nov 19 |
nicklas |
27 |
|
5709 |
08 Nov 19 |
nicklas |
28 |
""" |
5709 |
08 Nov 19 |
nicklas |
29 |
|
5709 |
08 Nov 19 |
nicklas |
import sys |
5709 |
08 Nov 19 |
nicklas |
31 |
|
5709 |
08 Nov 19 |
nicklas |
NORM_LEN = 50 |
5709 |
08 Nov 19 |
nicklas |
LOCAL_LEN = 6 |
5709 |
08 Nov 19 |
nicklas |
34 |
|
5709 |
08 Nov 19 |
nicklas |
# Column indexes from the BED file we are reading |
5709 |
08 Nov 19 |
nicklas |
COL_CHR = 0 |
5753 |
22 Nov 19 |
nicklas |
COL_START = 1 |
5753 |
22 Nov 19 |
nicklas |
COL_END = 2 |
5709 |
08 Nov 19 |
nicklas |
COL_POS = 3 |
5709 |
08 Nov 19 |
nicklas |
COL_REF = 4 |
5709 |
08 Nov 19 |
nicklas |
COL_ALT = 5 |
5709 |
08 Nov 19 |
nicklas |
COL_SEQ = 6 |
5709 |
08 Nov 19 |
nicklas |
43 |
|
5709 |
08 Nov 19 |
nicklas |
# GC content and DNA strength values |
5747 |
21 Nov 19 |
nicklas |
GC_CONT_VALUES = { "A": 0.0, "C": 1.0, "G": 1.0, "N": 0.5, "T": 0.0 } |
5709 |
08 Nov 19 |
nicklas |
DNA_STRENGTH_VALUES = { |
5747 |
21 Nov 19 |
nicklas |
"AA": 5.0, "AC": 10.0, "AG": 8.0, "AN": 7.5, "AT": 7.0, |
5747 |
21 Nov 19 |
nicklas |
"CA": 7.0, "CC": 11.0, "CG": 10.0, "CN": 9.0, "CT": 8.0, |
5747 |
21 Nov 19 |
nicklas |
"GA": 8.0, "GC": 13.0, "GG": 11.0, "GN": 10.5, "GT": 10.0, |
5747 |
21 Nov 19 |
nicklas |
"NA": 6.0, "NC": 10.5, "NG": 9.0, "NN": 8.25, "NT": 7.5, |
5747 |
21 Nov 19 |
nicklas |
"TA": 4.0, "TC": 8.0, "TG": 7.0, "TN": 6.0, "TT": 5.0 |
5709 |
08 Nov 19 |
nicklas |
52 |
} |
5709 |
08 Nov 19 |
nicklas |
53 |
|
5709 |
08 Nov 19 |
nicklas |
def getSequenceInfo(seqIn): |
5709 |
08 Nov 19 |
nicklas |
55 |
""" |
5709 |
08 Nov 19 |
nicklas |
Returns GC-content and DNA-strength of the seqIn |
5709 |
08 Nov 19 |
nicklas |
57 |
""" |
5709 |
08 Nov 19 |
nicklas |
seqIn = seqIn.upper() |
5709 |
08 Nov 19 |
nicklas |
gcContSum = 0 |
5709 |
08 Nov 19 |
nicklas |
strengthSum = 0 |
5709 |
08 Nov 19 |
nicklas |
for seqPos in range(0, len(seqIn)): |
5709 |
08 Nov 19 |
nicklas |
if seqPos > 1: |
5709 |
08 Nov 19 |
nicklas |
strengthSum += DNA_STRENGTH_VALUES[seqIn[seqPos - 2:seqPos]] |
5709 |
08 Nov 19 |
nicklas |
gcContSum += GC_CONT_VALUES[seqIn[seqPos]] |
5709 |
08 Nov 19 |
nicklas |
65 |
|
5709 |
08 Nov 19 |
nicklas |
gcCont = gcContSum / len(seqIn) |
5709 |
08 Nov 19 |
nicklas |
dnaStrength = strengthSum / len(seqIn) |
5709 |
08 Nov 19 |
nicklas |
return gcCont, dnaStrength |
5709 |
08 Nov 19 |
nicklas |
69 |
|
5709 |
08 Nov 19 |
nicklas |
70 |
|
5709 |
08 Nov 19 |
nicklas |
header = '''##fileformat=VCFv4.2 |
5709 |
08 Nov 19 |
nicklas |
##INFO=<ID=GC_CONT,Number=1,Type=Float,Description="Proportion of bases that are GC within {norm} bases up- and down-stream of position"> |
5709 |
08 Nov 19 |
nicklas |
##INFO=<ID=GC_CONT_LOCAL,Number=1,Type=Float,Description="Proportion of bases that are GC within {loc} bases up and downstream of position"> |
5709 |
08 Nov 19 |
nicklas |
##INFO=<ID=STRENGTH,Number=1,Type=Float,Description="DNA strength in the region within {norm} bases up- and down-stream of position"> |
5709 |
08 Nov 19 |
nicklas |
##INFO=<ID=STRENGTH_LOCAL,Number=1,Type=Float,Description="DNA strength in the region within {loc} bases up- and down-stream of position"> |
5709 |
08 Nov 19 |
nicklas |
#CHROM POS ID REF ALT QUAL FILTER INFO'''.format(norm=NORM_LEN, loc=LOCAL_LEN) |
5709 |
08 Nov 19 |
nicklas |
77 |
|
5709 |
08 Nov 19 |
nicklas |
print(header) |
5709 |
08 Nov 19 |
nicklas |
79 |
|
5709 |
08 Nov 19 |
nicklas |
for line in sys.stdin: |
5709 |
08 Nov 19 |
nicklas |
cols = line.rstrip().split('\t') |
5709 |
08 Nov 19 |
nicklas |
seq = cols[COL_SEQ] |
5753 |
22 Nov 19 |
nicklas |
posInSeq = int(cols[COL_POS])-int(cols[COL_START]) |
5753 |
22 Nov 19 |
nicklas |
seqLocal = cols[COL_SEQ][max(posInSeq-LOCAL_LEN, 0):posInSeq+LOCAL_LEN] |
5709 |
08 Nov 19 |
nicklas |
gcNorm, strengthNorm = getSequenceInfo(seq) |
5709 |
08 Nov 19 |
nicklas |
gcLocal, strengthLocal = getSequenceInfo(seqLocal) |
5709 |
08 Nov 19 |
nicklas |
cout = (cols[COL_CHR], cols[COL_POS], '.', cols[COL_REF], cols[COL_ALT], '.', '.', gcNorm, gcLocal, strengthNorm, strengthLocal) |
5709 |
08 Nov 19 |
nicklas |
print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tGC_CONT={7};GC_CONT_LOCAL={8};STRENGTH={9};STRENGTH_LOCAL={10}'.format(*cout)) |