Newer
Older
# the family PED file [${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped]
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# individual VCF file for the trio proband [${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz]
# G2P text output for the trio [${FAMILY_ID}.report.txt]
# VASE output [${SAMPLE_ID}.clean.strict.denovo.vcf]
#
#
# output:
# DECIPHER formated file for the proband
# - all G2P variants
# - denovo variants marked as such
#
# checks:
# all G2P variants found in the individual VCF
# all VASE denovo variants found in the individual VCF
#
# Author: MH
# last modified: DEC 05, 2019
import sys
import os
import csv
import gzip
from collections import defaultdict
ASSEMBLY = 'GRCh38'
INTERGENIC = 'No'
ACCESS = 'No'
G2P_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
G2P_DATA = {} # key: chr:pos:ref:alt; value: (transcript,gene,GT)
VASE_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID
NUM_UNIQ_G2P_VARS = 0
NUM_UNIQ_VASE_VARS = 0
CHILD_ID = 0
CHILD_SEX = 0
DEC_CHILD_SEX = 'unknown'
MOM_ID = 0
MOM_STAT = 0 # 1 = UNAFF, 2 = AFF
DAD_ID = 0
DAD_STAT = 0 # 1 = UNAFF, 2 = AFF
ALL_CHILD_DICT = {} # key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
ALL_MOM_DICT = {} # key: chr:pos:ref:alt; value: irrelevant
ALL_DAD_DICT = {} # key: chr:pos:ref:alt; value: irrelevant
CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown'
SNAP_FLANK = 25
MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) ID
TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
### call the python scrpit
#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV.py \
#${DEC_MAP} \
#${TRANS_MAP} \
#${PED_FILE} \
#${IN_G2P_FILE} \
#${IN_VASE_FILE} \
#${FAM_IGV_DIR} \
#${VCF_DIR} \
#${PLATE_ID} \
#${FAMILY_ID} \
#${DEC_DIR} \
#${FAM_BAM_DIR}
def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
# read the decipher to internal ID mapping file
read_map_file(dec_map_file)
# read the transcript mapping file
read_trans_map(trans_map_file)
# read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
read_ped(ped_file)
if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (MOM_STAT != 0):
print "======================================"
print "Analyzing:"
print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT)
print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT)
print "======================================"
sys.stdout.flush()
else:
print "ERROR: problems reading the PED file = %s" % (ped_file)
raise SystemExit
# read the G2P output for this family
read_G2P(in_g2p_file)
# read the VASE output for this family
read_VASE(in_vase_file)
# now read the individual VCFs and record all the variants
child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID)
dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID)
read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
sys.stdout.flush()
read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT)
print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID)
sys.stdout.flush()
read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT)
print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID)
sys.stdout.flush()
# now go over all child variants and set the inheritance
num_child_vars_assigned = 0
for key,v in ALL_CHILD_DICT.iteritems():
if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT):
CHILD_INHER_DICT[key] = 'Biparental'
num_child_vars_assigned += 1
elif key in ALL_MOM_DICT:
CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother'
num_child_vars_assigned += 1
elif key in ALL_DAD_DICT:
CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father'
num_child_vars_assigned += 1
else:
CHILD_INHER_DICT[key] = 'Unknown'
assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0
print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
sys.stdout.flush()
# setup the DECIPHER output file
out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID) ################################
out_han = open(out_dec_file,'w')
out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
# setup the IGV snapshot file
out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID) #################################
out_igv_han = open(out_igv_file,'w')
out_igv_han.write('new\n')
out_igv_han.write('genome hg38\n')
out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
out_igv_han.write('new\n')
child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID)
dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID)
out_igv_han.write('load %s\n' % (child_bam))
out_igv_han.write('load %s\n' % (mom_bam))
out_igv_han.write('load %s\n' % (dad_bam))
out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
out_igv_han.write('\n')
# now read the child VCF, check if the variant in the G2P/VASE output, if yes:
# set the value in the dict to 1
# print out to to output file
Loading
Loading full blame...