5767 |
02 Dec 19 |
nicklas |
#!/usr/bin/python |
5767 |
02 Dec 19 |
nicklas |
2 |
""" |
5767 |
02 Dec 19 |
nicklas |
$Id $ |
5767 |
02 Dec 19 |
nicklas |
4 |
|
5767 |
02 Dec 19 |
nicklas |
Script that calculates statistics for variants in a list of VCF files. |
5771 |
03 Dec 19 |
nicklas |
The script need three parameters: |
5767 |
02 Dec 19 |
nicklas |
7 |
|
5769 |
03 Dec 19 |
nicklas |
1: Path to text file with list of VCF files to process (se below) |
5769 |
03 Dec 19 |
nicklas |
2: Path to progress reporting file |
5771 |
03 Dec 19 |
nicklas |
3: Path to a log file |
5769 |
03 Dec 19 |
nicklas |
11 |
|
5769 |
03 Dec 19 |
nicklas |
The text file given as the first parameter should be a tab-separated |
5769 |
03 Dec 19 |
nicklas |
text file. Each line should have three columns: |
5769 |
03 Dec 19 |
nicklas |
14 |
|
5767 |
02 Dec 19 |
nicklas |
1: Patient identifier |
5767 |
02 Dec 19 |
nicklas |
2: Path to VCF file |
5769 |
03 Dec 19 |
nicklas |
3: Name of alignment (used for progress reporting) |
5767 |
02 Dec 19 |
nicklas |
18 |
|
5767 |
02 Dec 19 |
nicklas |
Counts and frequencies for all variants will calculated for the total and |
5767 |
02 Dec 19 |
nicklas |
per-patient. |
5767 |
02 Dec 19 |
nicklas |
21 |
|
5767 |
02 Dec 19 |
nicklas |
The output is a VCF file with all variants and the following annotations: |
5767 |
02 Dec 19 |
nicklas |
CNT_ALL: The number of VCF files the variants was seen in |
5767 |
02 Dec 19 |
nicklas |
MF_ALL: The mutation frequency among all VCF files |
5767 |
02 Dec 19 |
nicklas |
CNT_NR: The number of patients the variants was seen at least one time |
5767 |
02 Dec 19 |
nicklas |
MF_NR: The mutation frequency among all patients |
5767 |
02 Dec 19 |
nicklas |
27 |
|
5767 |
02 Dec 19 |
nicklas |
28 |
""" |
5767 |
02 Dec 19 |
nicklas |
29 |
|
5767 |
02 Dec 19 |
nicklas |
import sys |
5767 |
02 Dec 19 |
nicklas |
import gzip |
5767 |
02 Dec 19 |
nicklas |
import datetime |
5769 |
03 Dec 19 |
nicklas |
import time |
5767 |
02 Dec 19 |
nicklas |
34 |
|
5767 |
02 Dec 19 |
nicklas |
# Store all variants that we have seen in the VCF files we load |
5767 |
02 Dec 19 |
nicklas |
# The 'key' is a combination of chromosome, position, id, ref and alt columns |
5767 |
02 Dec 19 |
nicklas |
# The 'value' is an array that is counting the number of times we have seen the variant |
5767 |
02 Dec 19 |
nicklas |
# [0] = Total number of times including replicates |
5767 |
02 Dec 19 |
nicklas |
# [1] = Number of times in unique patients |
5767 |
02 Dec 19 |
nicklas |
allVariants = {} |
5767 |
02 Dec 19 |
nicklas |
41 |
|
5767 |
02 Dec 19 |
nicklas |
# The total number of samples, 0=including replicates, 1=only non-replicates |
5767 |
02 Dec 19 |
nicklas |
tots = [0, 0] |
5767 |
02 Dec 19 |
nicklas |
44 |
|
5767 |
02 Dec 19 |
nicklas |
# Store all variants that we have seen for the current patient |
5767 |
02 Dec 19 |
nicklas |
# The 'key' is the same as for the allVarints but we only store |
5767 |
02 Dec 19 |
nicklas |
# single count value |
5767 |
02 Dec 19 |
nicklas |
currentVariants = {} |
5767 |
02 Dec 19 |
nicklas |
currentVariants['name'] = '' |
5767 |
02 Dec 19 |
nicklas |
50 |
|
5767 |
02 Dec 19 |
nicklas |
def loadVcf(path, patient): |
5767 |
02 Dec 19 |
nicklas |
tots[0] += 1 |
5767 |
02 Dec 19 |
nicklas |
53 |
|
5767 |
02 Dec 19 |
nicklas |
# If this is the first time we see the patient |
5767 |
02 Dec 19 |
nicklas |
# the current information is reset |
5767 |
02 Dec 19 |
nicklas |
if patient != currentVariants['name']: |
5767 |
02 Dec 19 |
nicklas |
currentVariants.clear() |
5767 |
02 Dec 19 |
nicklas |
currentVariants['name'] = patient |
5767 |
02 Dec 19 |
nicklas |
tots[1] += 1 |
5767 |
02 Dec 19 |
nicklas |
60 |
|
5767 |
02 Dec 19 |
nicklas |
try: |
5767 |
02 Dec 19 |
nicklas |
# Open the VCF file and read it line by line |
5767 |
02 Dec 19 |
nicklas |
vcf = gzip.open(path, 'rt') |
5767 |
02 Dec 19 |
nicklas |
line = vcf.readline() |
5767 |
02 Dec 19 |
nicklas |
while line: |
5767 |
02 Dec 19 |
nicklas |
if not line.startswith("#"): |
5767 |
02 Dec 19 |
nicklas |
cols = line.rstrip().split('\t') |
5767 |
02 Dec 19 |
nicklas |
# Key is <chr>|<pos>|<id>|<ref>|<alt> |
5767 |
02 Dec 19 |
nicklas |
key = '|'.join(cols[i] for i in [0,1,2,3,4]) |
5767 |
02 Dec 19 |
nicklas |
70 |
|
5767 |
02 Dec 19 |
nicklas |
# Update the total count |
5767 |
02 Dec 19 |
nicklas |
if key in allVariants: |
5767 |
02 Dec 19 |
nicklas |
allVariants[key][0] += 1 |
5767 |
02 Dec 19 |
nicklas |
else: |
5767 |
02 Dec 19 |
nicklas |
allVariants[key] = [1, 0] |
5767 |
02 Dec 19 |
nicklas |
76 |
|
5767 |
02 Dec 19 |
nicklas |
# Update count for this patient |
5767 |
02 Dec 19 |
nicklas |
if key in currentVariants: |
5767 |
02 Dec 19 |
nicklas |
currentVariants[key] += 1 |
5767 |
02 Dec 19 |
nicklas |
else: |
5767 |
02 Dec 19 |
nicklas |
currentVariants[key] = 1 |
5767 |
02 Dec 19 |
nicklas |
allVariants[key][1] += 1 |
5767 |
02 Dec 19 |
nicklas |
83 |
|
5767 |
02 Dec 19 |
nicklas |
line = vcf.readline() |
5767 |
02 Dec 19 |
nicklas |
finally: |
5767 |
02 Dec 19 |
nicklas |
vcf.close() |
5767 |
02 Dec 19 |
nicklas |
87 |
|
5769 |
03 Dec 19 |
nicklas |
# Report progress to the progressFile (1-90%) |
5769 |
03 Dec 19 |
nicklas |
def progressReport(progressFile, alignment, current, total): |
5769 |
03 Dec 19 |
nicklas |
percent = 1+current * 89 / total |
5769 |
03 Dec 19 |
nicklas |
with open(progressFile, 'w') as p: |
5769 |
03 Dec 19 |
nicklas |
p.write("{0} Reading VCF file for '{1}' ({2} of {3})".format(percent, alignment, current, total)) |
5769 |
03 Dec 19 |
nicklas |
93 |
|
5767 |
02 Dec 19 |
nicklas |
# Read the patient/VCF list |
5767 |
02 Dec 19 |
nicklas |
# Each line should be tab-separated <PAT>\t<PATH-TO-VCF> |
5767 |
02 Dec 19 |
nicklas |
vcfList = sys.argv[1] |
5767 |
02 Dec 19 |
nicklas |
with open(vcfList) as f: |
5767 |
02 Dec 19 |
nicklas |
lines = f.read().splitlines() |
5767 |
02 Dec 19 |
nicklas |
99 |
|
5767 |
02 Dec 19 |
nicklas |
# Sort lines so that entries for the same patient is after each other |
5767 |
02 Dec 19 |
nicklas |
# The counting algorithm depends on it |
5767 |
02 Dec 19 |
nicklas |
lines.sort() |
5767 |
02 Dec 19 |
nicklas |
103 |
|
5769 |
03 Dec 19 |
nicklas |
progressFile = sys.argv[2] |
5771 |
03 Dec 19 |
nicklas |
logFile = sys.argv[3] |
5769 |
03 Dec 19 |
nicklas |
106 |
|
5767 |
02 Dec 19 |
nicklas |
# Load the VCF files and count the variants in them |
5769 |
03 Dec 19 |
nicklas |
vcfCount = 0 |
5769 |
03 Dec 19 |
nicklas |
totalVcf = len(lines) |
5769 |
03 Dec 19 |
nicklas |
nextProgressReport = time.time()+15 |
5769 |
03 Dec 19 |
nicklas |
111 |
|
5767 |
02 Dec 19 |
nicklas |
for line in lines: |
5769 |
03 Dec 19 |
nicklas |
cols = line.split('\t') |
5769 |
03 Dec 19 |
nicklas |
vcfCount += 1 |
5769 |
03 Dec 19 |
nicklas |
# Report progress every 15 seconds |
5769 |
03 Dec 19 |
nicklas |
if time.time() > nextProgressReport: |
5769 |
03 Dec 19 |
nicklas |
progressReport(progressFile, cols[2], vcfCount, totalVcf) |
5769 |
03 Dec 19 |
nicklas |
nextProgressReport = time.time()+15 |
5767 |
02 Dec 19 |
nicklas |
loadVcf(cols[1], cols[0]) |
5767 |
02 Dec 19 |
nicklas |
120 |
|
5771 |
03 Dec 19 |
nicklas |
# Write the log file |
5771 |
03 Dec 19 |
nicklas |
with open(logFile, 'w') as log: |
5771 |
03 Dec 19 |
nicklas |
log.write("NumberOfVCF: {0}\n".format(tots[0])) |
5771 |
03 Dec 19 |
nicklas |
log.write("NumberOfPatients: {0}\n".format(tots[1])) |
5771 |
03 Dec 19 |
nicklas |
log.write("NumberOfVariants: {0}\n".format(len(allVariants))) |
5771 |
03 Dec 19 |
nicklas |
126 |
|
5767 |
02 Dec 19 |
nicklas |
# Generate header for the output VCF |
5767 |
02 Dec 19 |
nicklas |
header = '''##fileformat=VCFv4.2 |
5767 |
02 Dec 19 |
nicklas |
##build={0} |
5767 |
02 Dec 19 |
nicklas |
'''.format(datetime.datetime.today().strftime('%Y-%m-%d')) |
5767 |
02 Dec 19 |
nicklas |
header += '''##INFO=<ID=CNT_ALL,Number=1,Type=Integer,Description="Number of samples with this mutation among ALL ({all}) samples"> |
5767 |
02 Dec 19 |
nicklas |
##INFO=<ID=CNT_NR,Number=1,Type=Integer,Description="Number of NON-replicate ({nonrep}) samples with this mutation"> |
5767 |
02 Dec 19 |
nicklas |
##INFO=<ID=MF_ALL,Number=1,Type=Float,Description="Mutant frequency among ALL ({all}) samples"> |
5767 |
02 Dec 19 |
nicklas |
##INFO=<ID=MF_NR,Number=1,Type=Float,Description="Mutant frequency among NON-replicate ({nonrep}) samples"> |
5767 |
02 Dec 19 |
nicklas |
#CHROM POS ID REF ALT QUAL FILTER INFO'''.format(all=tots[0], nonrep=tots[1]) |
5767 |
02 Dec 19 |
nicklas |
136 |
|
5767 |
02 Dec 19 |
nicklas |
print(header) |
5767 |
02 Dec 19 |
nicklas |
138 |
|
5767 |
02 Dec 19 |
nicklas |
# Print out all variants that was found |
5767 |
02 Dec 19 |
nicklas |
for key in allVariants.keys(): |
5767 |
02 Dec 19 |
nicklas |
cols = key.split('|') |
5767 |
02 Dec 19 |
nicklas |
vals = (cols + ['.', '.'] + allVariants[key] + [ float(num) / tot for num, tot in zip(allVariants[key], tots) ] ) |
5767 |
02 Dec 19 |
nicklas |
print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tCNT_ALL={7};CNT_NR={8};MF_ALL={9:.4};MF_NR={10:.4}".format(*vals)) |
5767 |
02 Dec 19 |
nicklas |
144 |
|