svndigest - svndigest

extensions/net.sf.basedb.reggie/trunk/src/net/sf/basedb/reggie/grid/scripts/gc_stat.py

: Code
: Comments
: Other

Rev	Date	Author	Line
6656	29 Mar 22	nicklas	1	#!/usr/bin/env python
6656	29 Mar 22	nicklas	2
6656	29 Mar 22	nicklas	3	"""
6656	29 Mar 22	nicklas	4	$Id: gc_stat.py 5753 2019-11-22 12:10:32Z nicklas $
6656	29 Mar 22	nicklas	5
6656	29 Mar 22	nicklas	6	Script to create a VCF-like file with GC content and DNA strength INFO fields. This script is
6656	29 Mar 22	nicklas	7	part of a pipeline that requires a specially prepared bed-file as input (reading from stdin).
6656	29 Mar 22	nicklas	8	Columns (0-based for convenience):
6656	29 Mar 22	nicklas	9
6656	29 Mar 22	nicklas	10	0: Chromosome (printed to VCF#CHROM)
6656	29 Mar 22	nicklas	11	1: Start (=original location - 50; or 0 if original location < 50)
6656	29 Mar 22	nicklas	12	2: Stop (=original location + 50; not used)
6656	29 Mar 22	nicklas	13	3: Original location (printed to VCF#POS)
6656	29 Mar 22	nicklas	14	4: REF (printed to VCF#REF)
6656	29 Mar 22	nicklas	15	5: ALT (printed to VCF#ALT)
6656	29 Mar 22	nicklas	16	6: SEQUENCE between start and stop (used for calculating GC and DNA strength, which are printed to VCF#INFO)
6656	29 Mar 22	nicklas	17
6656	29 Mar 22	nicklas	18	The VCF file that is produced must be compressed (bgzip) and indexed (tabix) before it can be used by vcfanno
6656	29 Mar 22	nicklas	19	to annotate the orginal VCF file (from VarDict) with the GC content and DNA strength. A .toml file like
6656	29 Mar 22	nicklas	20	this is needed:
6656	29 Mar 22	nicklas	21
6656	29 Mar 22	nicklas	22	[[annotation]]
6656	29 Mar 22	nicklas	23	file="/path/to/vcf-file-produced-by-this-script.gz"
6656	29 Mar 22	nicklas	24	fields=["GC_CONT","GC_CONT_LOCAL","STRENGTH","STRENGTH_LOCAL"]
6656	29 Mar 22	nicklas	25	names=["GC_CONT","GC_CONT_LOCAL","STRENGTH","STRENGTH_LOCAL"]
6656	29 Mar 22	nicklas	26	ops=["self","self","self","self"]
6656	29 Mar 22	nicklas	27
6656	29 Mar 22	nicklas	28	"""
6656	29 Mar 22	nicklas	29
6656	29 Mar 22	nicklas	30	import sys
6656	29 Mar 22	nicklas	31
6656	29 Mar 22	nicklas	32	NORM_LEN = 50
6656	29 Mar 22	nicklas	33	LOCAL_LEN = 6
6656	29 Mar 22	nicklas	34
6656	29 Mar 22	nicklas	35	# Column indexes from the BED file we are reading
6656	29 Mar 22	nicklas	36	COL_CHR = 0
6656	29 Mar 22	nicklas	37	COL_START = 1
6656	29 Mar 22	nicklas	38	COL_END = 2
6656	29 Mar 22	nicklas	39	COL_POS = 3
6656	29 Mar 22	nicklas	40	COL_REF = 4
6656	29 Mar 22	nicklas	41	COL_ALT = 5
6656	29 Mar 22	nicklas	42	COL_SEQ = 6
6656	29 Mar 22	nicklas	43
6656	29 Mar 22	nicklas	44	# GC content and DNA strength values
6656	29 Mar 22	nicklas	45	GC_CONT_VALUES = { "A": 0.0, "C": 1.0, "G": 1.0, "N": 0.5, "T": 0.0 }
6656	29 Mar 22	nicklas	46	DNA_STRENGTH_VALUES = {
6656	29 Mar 22	nicklas	47	"AA": 5.0, "AC": 10.0, "AG": 8.0, "AN": 7.5, "AT": 7.0,
6656	29 Mar 22	nicklas	48	"CA": 7.0, "CC": 11.0, "CG": 10.0, "CN": 9.0, "CT": 8.0,
6656	29 Mar 22	nicklas	49	"GA": 8.0, "GC": 13.0, "GG": 11.0, "GN": 10.5, "GT": 10.0,
6656	29 Mar 22	nicklas	50	"NA": 6.0, "NC": 10.5, "NG": 9.0, "NN": 8.25, "NT": 7.5,
6656	29 Mar 22	nicklas	51	"TA": 4.0, "TC": 8.0, "TG": 7.0, "TN": 6.0, "TT": 5.0
6656	29 Mar 22	nicklas	52	}
6656	29 Mar 22	nicklas	53
6656	29 Mar 22	nicklas	54	def getSequenceInfo(seqIn):
6656	29 Mar 22	nicklas	55	"""
6656	29 Mar 22	nicklas	56	Returns GC-content and DNA-strength of the seqIn
6656	29 Mar 22	nicklas	57	"""
6656	29 Mar 22	nicklas	58	seqIn = seqIn.upper()
6656	29 Mar 22	nicklas	59	gcContSum = 0
6656	29 Mar 22	nicklas	60	strengthSum = 0
6656	29 Mar 22	nicklas	61	for seqPos in range(0, len(seqIn)):
6656	29 Mar 22	nicklas	62	if seqPos > 1:
6656	29 Mar 22	nicklas	63	strengthSum += DNA_STRENGTH_VALUES[seqIn[seqPos - 2:seqPos]]
6656	29 Mar 22	nicklas	64	gcContSum += GC_CONT_VALUES[seqIn[seqPos]]
6656	29 Mar 22	nicklas	65
6656	29 Mar 22	nicklas	66	gcCont = gcContSum / len(seqIn)
6656	29 Mar 22	nicklas	67	dnaStrength = strengthSum / len(seqIn)
6656	29 Mar 22	nicklas	68	return gcCont, dnaStrength
6656	29 Mar 22	nicklas	69
6656	29 Mar 22	nicklas	70
6656	29 Mar 22	nicklas	71	header = '''##fileformat=VCFv4.2
6656	29 Mar 22	nicklas	72	##INFO=<ID=GC_CONT,Number=1,Type=Float,Description="Proportion of bases that are GC within {norm} bases up- and down-stream of position">
6656	29 Mar 22	nicklas	73	##INFO=<ID=GC_CONT_LOCAL,Number=1,Type=Float,Description="Proportion of bases that are GC within {loc} bases up and downstream of position">
6656	29 Mar 22	nicklas	74	##INFO=<ID=STRENGTH,Number=1,Type=Float,Description="DNA strength in the region within {norm} bases up- and down-stream of position">
6656	29 Mar 22	nicklas	75	##INFO=<ID=STRENGTH_LOCAL,Number=1,Type=Float,Description="DNA strength in the region within {loc} bases up- and down-stream of position">
6656	29 Mar 22	nicklas	76	#CHROM POS ID REF ALT QUAL FILTER INFO'''.format(norm=NORM_LEN, loc=LOCAL_LEN)
6656	29 Mar 22	nicklas	77
6656	29 Mar 22	nicklas	78	print(header)
6656	29 Mar 22	nicklas	79
6656	29 Mar 22	nicklas	80	for line in sys.stdin:
6656	29 Mar 22	nicklas	81	cols = line.rstrip().split('\t')
6656	29 Mar 22	nicklas	82	seq = cols[COL_SEQ]
6656	29 Mar 22	nicklas	83	posInSeq = int(cols[COL_POS])-int(cols[COL_START])
6656	29 Mar 22	nicklas	84	seqLocal = cols[COL_SEQ][max(posInSeq-LOCAL_LEN, 0):posInSeq+LOCAL_LEN]
6656	29 Mar 22	nicklas	85	gcNorm, strengthNorm = getSequenceInfo(seq)
6656	29 Mar 22	nicklas	86	gcLocal, strengthLocal = getSequenceInfo(seqLocal)
6656	29 Mar 22	nicklas	87	cout = (cols[COL_CHR], cols[COL_POS], '.', cols[COL_REF], cols[COL_ALT], '.', '.', gcNorm, gcLocal, strengthNorm, strengthLocal)
6656	29 Mar 22	nicklas	88	print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\tGC_CONT={7};GC_CONT_LOCAL={8};STRENGTH={9};STRENGTH_LOCAL={10}'.format(*cout))