Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# given a VCF file and a list of IDs (as in the #CHROM line)
# extract a subset VCF containing only variants which are not HOM REF in *all* the listed IDs
#
#
# Author: MH
# last modified: AUG 18, 2020
import sys
import os
import gzip
ID_DICT = {} # key: ID, value: field index (0 = chr, 1 = pos, ....)
def go(in_file,list_ids,out_file):
ids = [x.strip() for x in list_ids.strip().split(',')]
print "Found %s IDs for which the shared variants are to be extracted" % (len(ids))
print ids
in_han = gzip.open(in_file,'r')
out_han = open(out_file,'w')
in_cntr = 0
out_cntr = 0
for line in in_han:
if line.startswith('#CHROM'):
print line
# find the index for each ID
# potential problems - ID more than once in the VCF (will return only the first match) - should not occur in VCFs
# - ID not in the list
header_fields = [x.strip() for x in line.strip().split('\t')]
for i in ids:
try:
index = header_fields.index(i)
if i not in ID_DICT:
ID_DICT[i] = index
else:
print "The supplied list of IDs contain duplicates for %s" % (i)
raise SystemExit
except ValueError:
print "%s not found in %s" % (i,line)
raise SystemExit
if len(ID_DICT) != len(ids):
print "Could not find the indexes for all IDs"
print ID_DICT
raise SystemExit
# write the new line in out_file
new_header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
for z in xrange(0,len(ids)):
new_header = new_header + '%s\t' % (ids[z])
new_header = new_header[:-1] + '\n'
out_han.write(new_header)
continue
if line.startswith('#'):
out_han.write(line)
continue
# startng to read the variants
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
desired = {} # key: id; value: the variant info
for i in ids:
ind = ID_DICT[i]
var = data[ind]
desired[i] = var
print desired
out_cntr += 1
in_han.close()
out_han.close()
print "Read %s vars from the family file, wrote %s shared variants" % (in_cntr,out_cntr)
sys.stdout.flush()
def temp():
AFF_PROBANDS = []
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
pro_fam_id = data[1]
par_1 = int(data[2])
par_2 = int(data[3])
aff = int(data[5])
if (par_1 != 0) or (par_2 != 0):
print "ERROR: Found a proband with a parent"
print line
raise SystemExit(1)
if aff != 2:
print "ERROR: Found unaffected proband"
print line
raise SystemExit(1)
if pro_fam_id not in AFF_PROBANDS:
AFF_PROBANDS.append(pro_fam_id)
else:
print "ERROR: Found duplicate proband"
print line
raise SystemExit(1)
in_han.close()
print "PED file checks: success"
print "Found %s affected probands with no parents in %s" % (len(AFF_PROBANDS),in_file)
sys.stdout.flush()
if __name__ == '__main__':
if len(sys.argv) == 4:
go(sys.argv[1],sys.argv[2],sys.argv[3])
else:
print "Suggested use: time python /home/u035/project/scripts/NHS_WES_extract_shared_vars.py in_vcf comma,sep,list,of,ids out_vcf"
raise SystemExit