Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# given a VCF file and a list of IDs (as in the #CHROM line)
# extract a subset VCF containing only variants which are not HOM REF in *all* the listed IDs
#
#
# Author: MH
# last modified: AUG 18, 2020
import sys
import os
import gzip
ID_DICT = {} # key: ID, value: field index (0 = chr, 1 = pos, ....)
def go(in_file,list_ids,out_file):
ids = [x.strip() for x in list_ids.strip().split(',')]
print "Found %s IDs for which the shared variants are to be extracted" % (len(ids))
print ids
in_han = gzip.open(in_file,'r')
out_han = open(out_file,'w')
in_cntr = 0
out_cntr = 0
for line in in_han:
if line.startswith('#CHROM'):
print line
# find the index for each ID
# potential problems - ID more than once in the VCF (will return only the first match) - should not occur in VCFs
# - ID not in the list
header_fields = [x.strip() for x in line.strip().split('\t')]
for i in ids:
try:
index = header_fields.index(i)
if i not in ID_DICT:
ID_DICT[i] = index
else:
print "The supplied list of IDs contain duplicates for %s" % (i)
raise SystemExit
except ValueError:
print "%s not found in %s" % (i,line)
raise SystemExit
if len(ID_DICT) != len(ids):
print "Could not find the indexes for all IDs"
print ID_DICT
raise SystemExit
# write the new line in out_file
new_header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
for z in xrange(0,len(ids)):
new_header = new_header + '%s\t' % (ids[z])
new_header = new_header[:-1] + '\n'
out_han.write(new_header)
continue
if line.startswith('#'):
out_han.write(line)
continue
# startng to read the variants
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
desired = {} # key: id; value: the variant info
for i in ids:
ind = ID_DICT[i]
var = data[ind]
desired[i] = var
print desired
out_cntr += 1
in_han.close()
out_han.close()
print "Read %s vars from the family file, wrote %s shared variants" % (in_cntr,out_cntr)
sys.stdout.flush()
def temp():
AFF_PROBANDS = []
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
pro_fam_id = data[1]
par_1 = int(data[2])
par_2 = int(data[3])
aff = int(data[5])
if (par_1 != 0) or (par_2 != 0):
print "ERROR: Found a proband with a parent"
print line
raise SystemExit(1)
if aff != 2:
print "ERROR: Found unaffected proband"
print line
raise SystemExit(1)
if pro_fam_id not in AFF_PROBANDS:
AFF_PROBANDS.append(pro_fam_id)
else:
print "ERROR: Found duplicate proband"
print line
raise SystemExit(1)
in_han.close()
print "PED file checks: success"
print "Found %s affected probands with no parents in %s" % (len(AFF_PROBANDS),in_file)
sys.stdout.flush()
if __name__ == '__main__':
if len(sys.argv) == 4:
go(sys.argv[1],sys.argv[2],sys.argv[3])
else:
print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_extract_shared_vars.py in_vcf comma,sep,list,of,ids out_vcf"
raise SystemExit