Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
# given
# <gene_set>.<ID>.sample_interval_summary - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
# <gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
#
# generate a coverage output file
# all the columns from the coverage file for this regions+ another column with the number of P/LP (ACP) ClinVar variants in this region
# all the columns from the ClinVar file + another column with the percentage of bases covered at least 20x from the coverage file
#
# Author: MH
# last modified: SEPT 26, 2019
import sys
import os
import csv
import gzip
COV_DICT = {} # key: 'chr:start-1:end'; value: coverage column (data[8])
def go(in_gatk_file,in_clin_file,out_cov_file):
# read in the coverage file
read_coverage(in_gatk_file)
out_han = open(out_cov_file,'w')
out_cntr = 0
out_han.write('chr\tstart\tend\tgene\tnum ClinVar P/LP (ACP)\tpercent bases covered > 20x\n')
in_han = open(in_clin_file,'r')
in_cntr = 0
for line in in_han:
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
chr = data[0]
start = int(data[1])
end = int(data[2])
key = '%s:%s:%s' % (chr,start,end)
if key in COV_DICT:
cov = COV_DICT[key]
else:
print "ERROR: cannot find the coverage for key = %s" % (key)
raiseSystemExit
to_write = ''
for d in data:
to_write = to_write + '%s\t' % (d)
to_write = to_write + '%.1f\n' % (cov)
out_han.write(to_write)
out_cntr += 1
in_han.close()
out_han.close()
print "Read %s intervals from %s" % (in_cntr,in_clin_file)
print "Recorder %s intervals in output" % (out_cntr)
print ""
print "Output coverage file = %s" % (out_cov_file)
print ""
sys.stdout.flush()
def read_coverage(in_file):
in_han = open(in_file,'r')
for line in in_han:
if line.startswith('Target'):
if line.endswith('%_above_20\n'):
# header line, the last column is percenatge of bases covered > 20x
continue
else:
print "ERROR: found header line, but the last column is not bases above 20x"
print line
raise SystemExit
data = [x.strip() for x in line.strip().split('\t')]
if len(data) != 9:
print "ERROR: wrong number of columns in coverage file = %s" % (in_file)
print line
raise SystemExit
locus = data[0]
if data[8] == 'NaN':
cov = float(0.0)
else:
cov = float(data[8])
chr,pos = locus.split(':')
s,e = pos.split('-')
start = int(s)-1
end = int(e)
key = '%s:%s:%s' % (chr,start,end)
if key not in COV_DICT:
COV_DICT[key] = cov
else:
print "ERROR: key from GATK coverage file not unique = %s" % (key)
raise SystemExit
in_han.close()
print "Recorded the coverage for %s unique keys" % (len(COV_DICT))
sys.stdout.flush()
if __name__ == '__main__':
if len(sys.argv) == 4:
go(sys.argv[1],sys.argv[2],sys.argv[3])
else:
print "Suggested use: time $PYTHON /home/u035/u035/shared/scripts/generate_coverage_result_file.py \
DDG2P.s14-NFE-Twist-NA12878.sample_interval_summary \
/home/u035/u035/shared/resources/G2P/DDG2P.20180830.ClinVar.20190520.plus15bp.txt \
DDG2P.s14-NFE-Twist-NA12878.COV.txt"
raise SystemExit