Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# given the G2P FAM_ID.report.txt for a trio ran thru G2P
# given an individual VCF file for a sample from this trio
# extract a VCF file for the variants in this individual which are in the G2P output
#
# Author: MH
# last modified: APR 15, 2019
import sys
import os
import csv
import gzip
VAR_DICT = {} # key: chr:start:end:ref:alt; value: irrelevant - for variants found in G2P output for this sample_id
NUM_UNIQ_VARS = 0
def go(in_vcf_file,in_g2p_file,sample_id,out_vcf_file):
# read the G2P output for this sample_id
read_G2P(in_g2p_file,sample_id)
# now read the input VCF, check if the variant in the G2P output: if yes - put it in the out file
in_cntr = 0
out_cntr = 0
in_han = gzip.open(in_vcf_file,'r')
out_han = open(out_vcf_file,'w')
for line in in_han:
if line.startswith('#'):
out_han.write(line)
continue
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
chr = data[0]
pos = int(data[1])
ref = data[3]
alt = data[4]
# did the splitting and normalizing - should noe have multiallelic variants
if alt.find(',') != -1:
print "ERROR: found multiallelic variant"
print line
raiseSystemExit
##############################################################
# different processing depending on being a SNP, INS, or DEL #
##############################################################
if len(ref) == len(alt): # SNP
if len(ref) != 1:
print "ERROR: MNPs are not supported!"
print line
raise SystemExit
key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
if key_to_match in VAR_DICT:
out_cntr += 1
out_han.write(line)
elif len(ref) > len(alt): # DEL
if len(alt) != 1:
print "ERROR with a deletion"
print line
raise SystemExit
key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
if key_to_match in VAR_DICT:
out_cntr += 1
out_han.write(line)
elif len(ref) < len(alt): # INS
if len(ref) != 1:
print "ERROR with an insertion"
print line
raise SystemExit
key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
if key_to_match in VAR_DICT:
out_cntr += 1
out_han.write(line)
else:
print "Cannot establish the type of this VCF variant"
print line
raise SystemExit
in_han.close()
out_han.close()
print "read %s VCF vars for %s, wrote %s G2P vars in %s" % (in_cntr,sample_id,out_cntr,out_vcf_file)
if out_cntr == NUM_UNIQ_VARS:
print "OK: All G2P vars are recorded in the G2P VCF file"
else:
print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file"
def read_G2P(in_file,sample_id):
global NUM_UNIQ_VARS
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
# ignore variants in other memebers of the trio
sam_id = data[0]
if sam_id != sample_id:
continue
var_list = [y.strip() for y in data[6].split(';')]
for v in var_list:
v_details = [z.strip() for z in v.split(':')]
chr = v_details[0]
start = int(v_details[1])
end = int(v_details[2])
ref = v_details[3]
alt = v_details[4]
if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized
if len(ref) < len(alt): # an INS
orig_start = start
orig_ref = ref
orig_alt = alt
start = orig_start
ref = '-'
alt = orig_alt[len(orig_ref):]
print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
else: # a DEL
print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
print line
raise SystemExit
key = '%s:%s:%s:%s' % (chr,start,ref,alt)
if key not in VAR_DICT:
VAR_DICT[key] = 1
else:
pass
in_han.close()
NUM_UNIQ_VARS = len(VAR_DICT)
print "Found %s unique G2P variants for %s" % (NUM_UNIQ_VARS,sample_id)
sys.stdout.flush()
# def go(in_vcf_file,in_g2p_file,sample_id,out_vcf_file):
if __name__ == '__main__':
if len(sys.argv) == 5:
go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4])
else:
print "Suggested use: time python /home/u035/u035/shared/scripts/generate_G2P_out_VCF.py 2820-gatk-haplotype-annotated.2820_2820.vcf.gz ../output_dd/2820_log_dir/2820.report.txt 2820_2820 2820_2820.G2P.vcf "