Newer
Older
# <gene_set>.<ID>.sample_interval_summary - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
# <gene_set>.<gene_set_date>.ClinVar.<clinvar_date>.plus15bp.txt - e.g. DDG2P.20180830.ClinVar.20190520.plus15bp.txt
#
# generate a coverage output file
# all the columns from the coverage file for this regions+ another column with the number of P/LP (ACP) ClinVar variants in this region
# all the columns from the ClinVar file + another column with the percentage of bases covered at least 20x from the coverage file
#
# Author: MH
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import sys
import os
import csv
import gzip
COV_DICT = {} # key: 'chr:start-1:end'; value: coverage column (data[8])
def go(in_gatk_file,in_clin_file,out_cov_file):
# read in the coverage file
read_coverage(in_gatk_file)
out_han = open(out_cov_file,'w')
out_cntr = 0
out_han.write('chr\tstart\tend\tgene\tnum ClinVar P/LP (ACP)\tpercent bases covered > 20x\n')
in_han = open(in_clin_file,'r')
in_cntr = 0
for line in in_han:
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
chr = data[0]
start = int(data[1])
end = int(data[2])
key = '%s:%s:%s' % (chr,start,end)
if key in COV_DICT:
cov = COV_DICT[key]
else:
print "ERROR: cannot find the coverage for key = %s" % (key)
raiseSystemExit
to_write = ''
for d in data:
to_write = to_write + '%s\t' % (d)
to_write = to_write + '%.1f\n' % (cov)
out_han.write(to_write)
out_cntr += 1
in_han.close()
out_han.close()
print "Read %s intervals from %s" % (in_cntr,in_clin_file)
print "Recorder %s intervals in output" % (out_cntr)
print ""
print "Output coverage file = %s" % (out_cov_file)
print ""
sys.stdout.flush()
def read_coverage(in_file):
in_han = open(in_file,'r')
for line in in_han:
if line.startswith('Target'):
if line.endswith('%_above_20\n'):
# header line, the last column is percenatge of bases covered > 20x
continue
else:
print "ERROR: found header line, but the last column is not bases above 20x"
print line
raise SystemExit
data = [x.strip() for x in line.strip().split('\t')]
if len(data) != 9:
print "ERROR: wrong number of columns in coverage file = %s" % (in_file)
print line
raise SystemExit
locus = data[0]
if data[8] == 'NaN':
cov = float(0.0)
else:
cov = float(data[8])
chr,pos = locus.split(':')
s,e = pos.split('-')
start = int(s)-1
end = int(e)
key = '%s:%s:%s' % (chr,start,end)
if key not in COV_DICT:
COV_DICT[key] = cov
else:
print "ERROR: key from GATK coverage file not unique = %s" % (key)
raise SystemExit
in_han.close()
print "Recorded the coverage for %s unique keys" % (len(COV_DICT))
sys.stdout.flush()
if __name__ == '__main__':
if len(sys.argv) == 4:
go(sys.argv[1],sys.argv[2],sys.argv[3])
else:
print "Suggested use: time $PYTHON /home/u035/u035/shared/scripts/generate_coverage_result_file.py \
DDG2P.s14-NFE-Twist-NA12878.sample_interval_summary \
/home/u035/u035/shared/resources/G2P/DDG2P.20180830.ClinVar.20190520.plus15bp.txt \
DDG2P.s14-NFE-Twist-NA12878.COV.txt"
raise SystemExit