NHS_WES_generate_DEC_IGV.py_wrong_gene_trans 51.54 KiB
# input:
# the family PED file
# G2P text output for the trio [${FAMILY_ID}.report.txt]
# VASE output [${SAMPLE_ID}.clean.strict.denovo.vcf]
#
#
# individual VCF file for the trio proband [${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz]
# G2P text output for the trio [${FAMILY_ID}.report.txt]
# VASE output [${SAMPLE_ID}.clean.strict.denovo.vcf]
#
#
# output:
# DECIPHER formated file for the proband
# - all G2P variants
# - denovo variants marked as such
#
# checks:
# all G2P variants found in the individual VCF
# all VASE denovo variants found in the individual VCF
#
# Author: MH
# last modified: NOV 11, 2019
import sys
import os
import csv
import gzip
from collections import defaultdict
ASSEMBLY = 'GRCh38'
INTERGENIC = 'No'
ACCESS = 'No'
G2P_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
G2P_DATA = {} # key: chr:pos:ref:alt; value: (transcript,gene,GT)
VASE_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID
NUM_UNIQ_G2P_VARS = 0
NUM_UNIQ_VASE_VARS = 0
CHILD_ID = 0
CHILD_SEX = 0
DEC_CHILD_SEX = 'unknown'
MOM_ID = 0
MOM_STAT = 0 # 1 = UNAFF, 2 = AFF
DAD_ID = 0
DAD_STAT = 0 # 1 = UNAFF, 2 = AFF
ALL_CHILD_DICT = {} # key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
ALL_MOM_DICT = {} # key: chr:pos:ref:alt; value: irrelevant
ALL_DAD_DICT = {} # key: chr:pos:ref:alt; value: irrelevant
CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown'
SNAP_FLANK = 25
MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) ID
TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
### call the python scrpit
#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV.py \
#${DEC_MAP} \
#${TRANS_MAP} \
#${PED_FILE} \
#${IN_G2P_FILE} \
#${IN_VASE_FILE} \
#${FAM_IGV_DIR} \
#${VCF_DIR} \
#${PLATE_ID} \
#${FAMILY_ID} \
#${DEC_DIR} \
#${FAM_BAM_DIR}
def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
# read the decipher to internal ID mapping file
read_map_file(dec_map_file)
# read the transcript mapping file
read_trans_map(trans_map_file)
# read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
read_ped(ped_file)
if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (MOM_STAT != 0):
print "======================================"
print "Analyzing:"
print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT)
print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT)
print "======================================"
sys.stdout.flush()
else:
print "ERROR: problems reading the PED file = %s" % (ped_file)
raise SystemExit
# read the G2P output for this family
read_G2P(in_g2p_file)
# read the VASE output for this family
read_VASE(in_vase_file)
# now read the individual VCFs and record all the variants
child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID)
dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID)
read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
sys.stdout.flush()
read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT)
print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID)
sys.stdout.flush()
read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT)
print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID)
sys.stdout.flush()
# now go over all child variants and set the inheritance
num_child_vars_assigned = 0
for key,v in ALL_CHILD_DICT.iteritems():
if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT):
CHILD_INHER_DICT[key] = 'Biparental'
num_child_vars_assigned += 1
elif key in ALL_MOM_DICT:
CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother'
num_child_vars_assigned += 1
elif key in ALL_DAD_DICT:
CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father'
num_child_vars_assigned += 1
else:
CHILD_INHER_DICT[key] = 'Unknown'
assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0
print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
sys.stdout.flush()
# setup the DECIPHER output file
out_dec_file = '%s/%s_DEC.FLT.csv' % (dec_dir,CHILD_ID) ################################
out_han = open(out_dec_file,'w')
out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
# setup the IGV snapshot file
out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID) #################################
out_igv_han = open(out_igv_file,'w')
out_igv_han.write('new\n')
out_igv_han.write('genome hg38\n')
out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
out_igv_han.write('new\n')
child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID)
dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID)
out_igv_han.write('load %s\n' % (child_bam))
out_igv_han.write('load %s\n' % (mom_bam))
out_igv_han.write('load %s\n' % (dad_bam))
out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
out_igv_han.write('\n')
# now read the child VCF, check if the variant in the G2P/VASE output, if yes:
# set the value in the dict to 1
# print out to to output file
in_cntr = 0
out_cntr = 0
child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
in_han = gzip.open(child_vcf_file,'r')
for line in in_han:
if line.startswith('#'):
continue
in_cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
chr = data[0]
pos = int(data[1])
ref = data[3]
alt = data[4]
VCF_VAR = data[9]
key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
inher_stat = CHILD_INHER_DICT[key]
##############################################################
# different processing depending on being a SNP, INS, or DEL #
##############################################################
if len(ref) == len(alt): # SNP
if len(ref) != 1:
print "ERROR: MNPs are not supported!"
print line
raise SystemExit
key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
is_denovo = False
if key_to_match in VASE_DICT:
VASE_DICT[key_to_match] = 1
is_denovo = True
if key_to_match in G2P_DICT:
G2P_DICT[key_to_match] = 1
trans = G2P_DATA[key_to_match][0]
gene = G2P_DATA[key_to_match][1]
GT = G2P_DATA[key_to_match][2]
if is_denovo:
if inher_stat == 'Unknown':
inher_stat = 'De novo constitutive'
else:
print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
raise SystemExit
if (chr != 'chrX') and (chr != 'chrY'):
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif (chr == 'chrX') or (chr == 'chrY'):
if DEC_CHILD_SEX == '46XX': # a girl
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif DEC_CHILD_SEX == '46XY': # a boy
if GT == 'HET':
genotype = 'Heterozygous'
print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
elif GT == 'HOM':
genotype = 'Hemizygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
else:
print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
raise SystemExit
else:
print "ERROR: unknown chr"
print line
raise SystemExit
# write to the DECIPHER file
gene_id_idx = gene.find('(')
if gene_id_idx == -1:
gene_id_idx = len(gene)
gene_id = gene[0:gene_id_idx]
int_ID = MAP_DICT[fam_id]
if trans in TRANS_DICT: # if the transcriptID is to be replaced
safe_trans = TRANS_DICT[trans]
else:
safe_trans = trans
to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
out_cntr += 1
out_han.write(to_write)
# write to the IGV file
i_s = pos - SNAP_FLANK
i_e = pos + SNAP_FLANK
i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt)
out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
out_igv_han.write('sort strand\n')
out_igv_han.write('squish\n')
out_igv_han.write('snapshot %s\n' % (i_name))
out_igv_han.write('\n')
elif len(ref) > len(alt): # DEL
if len(alt) != 1:
print "ERROR with a deletion"
print line
raise SystemExit
G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
is_denovo = False
if VASE_key_to_match in VASE_DICT:
VASE_DICT[VASE_key_to_match] = 1
is_denovo = True
if G2P_key_to_match in G2P_DICT:
G2P_DICT[G2P_key_to_match] = 1
trans = G2P_DATA[G2P_key_to_match][0]
gene = G2P_DATA[G2P_key_to_match][1]
GT = G2P_DATA[G2P_key_to_match][2]
if is_denovo:
if inher_stat == 'Unknown':
inher_stat = 'De novo constitutive'
else:
print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
raise SystemExit
if (chr != 'chrX') and (chr != 'chrY'):
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif (chr == 'chrX') or (chr == 'chrY'):
if DEC_CHILD_SEX == '46XX': # a girl
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif DEC_CHILD_SEX == '46XY': # a boy
if GT == 'HET':
genotype = 'Heterozygous'
print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
elif GT == 'HOM':
genotype = 'Hemizygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
else:
print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
raise SystemExit
else:
print "ERROR: unknown chr"
print line
raise SystemExit
# write to the DECIPHER file
gene_id_idx = gene.find('(')
if gene_id_idx == -1:
gene_id_idx = len(gene)
gene_id = gene[0:gene_id_idx]
int_ID = MAP_DICT[fam_id]
if trans in TRANS_DICT: # if the transcriptID is to be replaced
safe_trans = TRANS_DICT[trans]
else:
safe_trans = trans
to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
out_cntr += 1
out_han.write(to_write)
# write to the IGV file
i_s = pos - SNAP_FLANK
i_e = pos + SNAP_FLANK
i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt)
out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
out_igv_han.write('sort strand\n')
out_igv_han.write('squish\n')
out_igv_han.write('snapshot %s\n' % (i_name))
out_igv_han.write('\n')
elif len(ref) < len(alt): # INS
if len(ref) != 1:
print "ERROR with an insertion"
print line
raise SystemExit
G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
is_denovo = False
if VASE_key_to_match in VASE_DICT:
VASE_DICT[VASE_key_to_match] = 1
is_denovo = True
if G2P_key_to_match in G2P_DICT:
G2P_DICT[G2P_key_to_match] = 1
trans = G2P_DATA[G2P_key_to_match][0]
gene = G2P_DATA[G2P_key_to_match][1]
GT = G2P_DATA[G2P_key_to_match][2]
if is_denovo:
if inher_stat == 'Unknown':
inher_stat = 'De novo constitutive'
else:
print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
raise SystemExit
if (chr != 'chrX') and (chr != 'chrY'):
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif (chr == 'chrX') or (chr == 'chrY'):
if DEC_CHILD_SEX == '46XX': # a girl
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif DEC_CHILD_SEX == '46XY': # a boy
if GT == 'HET':
genotype = 'Heterozygous'
print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
elif GT == 'HOM':
genotype = 'Hemizygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
else:
print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
raise SystemExit
else:
print "ERROR: unknown chr"
print line
raise SystemExit
# write to the DECIPHER file
gene_id_idx = gene.find('(')
if gene_id_idx == -1:
gene_id_idx = len(gene)
gene_id = gene[0:gene_id_idx]
int_ID = MAP_DICT[fam_id]
if trans in TRANS_DICT: # if the transcriptID is to be replaced
safe_trans = TRANS_DICT[trans]
else:
safe_trans = trans
to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
out_cntr += 1
out_han.write(to_write)
# write to the IGV file
i_s = pos - SNAP_FLANK
i_e = pos + SNAP_FLANK
i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt)
out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
out_igv_han.write('sort strand\n')
out_igv_han.write('squish\n')
out_igv_han.write('snapshot %s\n' % (i_name))
out_igv_han.write('\n')
else:
print "Cannot establish the type of this VCF variant"
print line
raise SystemExit
in_han.close()
out_han.close()
out_igv_han.close()
### check if all G2P and VASE variants were found/matched in the proband's VCF
found_all_G2P = True
found_all_VASE = True
for k,v in G2P_DICT.iteritems():
if int(v) == 0:
print k
found_all_G2P = False
break
for k,v in VASE_DICT.iteritems():
if int(v) == 0:
print k
found_all_VASE = False
break
if found_all_G2P:
print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT))
else:
print "ERROR: Could not find all G2P variants in the probands VCF file"
raise SystemExit
if found_all_VASE:
print "OK: Found all %s VASE variants in the proband's VCF file" % (len(VASE_DICT))
else:
print "ERROR: Could not find all VASE variants in the probands VCF file"
raise SystemExit
### check if all G2P variants are written out
if out_cntr == NUM_UNIQ_G2P_VARS:
print "OK: All G2P vars are recorded in the output DECIPHER file"
else:
print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file"
print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file)
print "The batch snapshot file = %s" % (out_igv_file)
sys.stdout.flush()
def read_all_VCF_vars(in_vcf_file,THIS_DICT):
in_han = gzip.open(in_vcf_file,'r')
for line in in_han:
if line.startswith('#'):
continue
data = [x.strip() for x in line.strip().split('\t')]
chr = data[0]
pos = int(data[1])
ref = data[3]
alt = data[4]
# did the splitting and normalizing - should not have multiallelic variants
if alt.find(',') != -1:
print "ERROR: found multiallelic variant"
print line
raiseSystemExit
key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
if key not in THIS_DICT:
THIS_DICT[key] = 1
else:
print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
raise SystemExit
in_han.close()
def read_VASE(in_file):
global NUM_UNIQ_VASE_VARS
in_han = open(in_file,'r')
for line in in_han:
# ignore header lines
if line.startswith('#'):
continue
data = [x.strip() for x in line.strip().split('\t')]
chr = data[0]
pos = data[1]
ref = data[3]
alt = data[4]
key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
if key not in VASE_DICT:
VASE_DICT[key] = 0
else:
print "ERROR: duplicate VASE variant key = %s" % (key)
raise SystemExit
in_han.close()
NUM_UNIQ_VASE_VARS = len(VASE_DICT)
print "Found %s unique VASE denovo variants for CHILD (%s)" % (NUM_UNIQ_VASE_VARS,CHILD_ID)
sys.stdout.flush()
def read_G2P(in_file):
global NUM_UNIQ_G2P_VARS
known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance']
# first, read the G2P variants on canonical transcripts for each of the family members
CHILD_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene)
MOM_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene)
DAD_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene)
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
# get the individual_id
sam_id = data[0]
# ignore variants not on canonical transcripts
is_canon = data[3]
if is_canon != 'is_canonical':
continue
# select only variants in genes being considered under the dominant model
inher_model = data[4]
aaa,OBS_state = inher_model.split('=')
if OBS_state not in known_OBS_states:
print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
raise SystemExit
# get the gene name in format ENSG00000165899(C12orf64,OTOGL)
gene_name = data[1]
# get the transcript name in format ENST00000238647
transcript = data[2]
# this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
var_list = [y.strip() for y in data[6].split(';')]
for v in var_list:
v_details = [z.strip() for z in v.split(':')]
chr = v_details[0]
start = int(v_details[1])
end = int(v_details[2])
ref = v_details[3]
alt = v_details[4]
GT = v_details[5]
second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
if sam_id == CHILD_ID:
# check for duplication
if OBS_state not in CHILD_DICT:
pass
elif second_key not in CHILD_DICT[OBS_state]:
pass
else:
print "ERROR: a duplicate variant %s in %s gene for CHILD" % (second_key,OBS_state)
raise SystemExit
CHILD_DICT[OBS_state][second_key] = (GT,gene_name)
elif sam_id == MOM_ID:
# check for duplication
if OBS_state not in MOM_DICT:
pass
elif second_key not in MOM_DICT[OBS_state]:
pass
else:
print "ERROR: a duplicate variant %s in %s gene for MOM" % (second_key,OBS_state)
raise SystemExit
MOM_DICT[OBS_state][second_key] = (GT,gene_name)
elif sam_id == DAD_ID:
# check for duplication
if OBS_state not in DAD_DICT:
pass
elif second_key not in DAD_DICT[OBS_state]:
pass
else:
print "ERROR: a duplicate variant %s in %s gene for DAD" % (second_key,OBS_state)
raise SystemExit
DAD_DICT[OBS_state][second_key] = (GT,gene_name)
else:
print "ERROR: cannot identify the person for this variant"
print line
raise SystemExit
in_han.close()
### print out the number of unique G2P variants in CHILD ###
child_mono = 0
child_bi = 0
child_hemi = 0
child_xld = 0
child_xlod = 0
if 'monoallelic' in CHILD_DICT:
child_mono = len(CHILD_DICT['monoallelic'])
if 'biallelic' in CHILD_DICT:
child_bi = len(CHILD_DICT['biallelic'])
if 'hemizygous' in CHILD_DICT:
child_hemi = len(CHILD_DICT['hemizygous'])
if 'x-linked dominant' in CHILD_DICT:
child_xld = len(CHILD_DICT['x-linked dominant'])
if 'x-linked over-dominance' in CHILD_DICT:
child_xlod = len(CHILD_DICT['x-linked over-dominance'])
print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID)
print " monoallelic: %s" % (child_mono)
print " biallelic: %s" % (child_bi)
print " hemizygous: %s" % (child_hemi)
print " x-linked dominant: %s" % (child_xld)
print " x-linked over-dominance: %s" % (child_xlod)
### print out the number of unique G2P variants in MOM ###
mom_mono = 0
mom_bi = 0
mom_hemi = 0
mom_xld = 0
mom_xlod = 0
if 'monoallelic' in MOM_DICT:
mom_mono = len(MOM_DICT['monoallelic'])
if 'biallelic' in MOM_DICT:
mom_bi = len(MOM_DICT['biallelic'])
if 'hemizygous' in MOM_DICT:
mom_hemi = len(MOM_DICT['hemizygous'])
if 'x-linked dominant' in MOM_DICT:
mom_xld = len(MOM_DICT['x-linked dominant'])
if 'x-linked over-dominance' in MOM_DICT:
mom_xlod = len(MOM_DICT['x-linked over-dominance'])
print "MOM (%s): number of unique G2P variants on canon transcript in the following OBS states" % (MOM_ID)
print " monoallelic: %s" % (mom_mono)
print " biallelic: %s" % (mom_bi)
print " hemizygous: %s" % (mom_hemi)
print " x-linked dominant: %s" % (mom_xld)
print " x-linked over-dominance: %s" % (mom_xlod)
### print out the number of unique G2P variants in DAD ###
dad_mono = 0
dad_bi = 0
dad_hemi = 0
dad_xld = 0
dad_xlod = 0
if 'monoallelic' in DAD_DICT:
dad_mono = len(DAD_DICT['monoallelic'])
if 'biallelic' in DAD_DICT:
dad_bi = len(DAD_DICT['biallelic'])
if 'hemizygous' in DAD_DICT:
dad_hemi = len(DAD_DICT['hemizygous'])
if 'x-linked dominant' in DAD_DICT:
dad_xld = len(DAD_DICT['x-linked dominant'])
if 'x-linked over-dominance' in DAD_DICT:
dad_xlod = len(DAD_DICT['x-linked over-dominance'])
print "DAD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (DAD_ID)
print " monoallelic: %s" % (dad_mono)
print " biallelic: %s" % (dad_bi)
print " hemizygous: %s" % (dad_hemi)
print " x-linked dominant: %s" % (dad_xld)
print " x-linked over-dominance: %s" % (dad_xlod)
sys.stdout.flush()
###############################################################################################
#### DOM filtering ####
#### if the gene has been considered under the dominant model (OBS == monoallelic) ####
#### exclude child variants seen in UNAFFECTED mother/father, regardless of GT ####
###############################################################################################
print ""
print "=== DOM filtering ==="
for key in CHILD_DICT['monoallelic']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene)
CHILD_GT = CHILD_DICT['monoallelic'][key][0]
if (key in MOM_DICT['monoallelic']) and (MOM_STAT == "UNAFFECTED"):
MOM_GT = MOM_DICT['monoallelic'][key][0]
print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GT,MOM_GT,MOM_STAT)
continue
if (key in DAD_DICT['monoallelic']) and (DAD_STAT == "UNAFFECTED"):
DAD_GT = DAD_DICT['monoallelic'][key][0]
print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT)
continue
# if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
chr,start,end,ref,alt = key.split(":")
if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized
if len(ref) < len(alt): # an INS
orig_start = start
orig_ref = ref
orig_alt = alt
start = orig_start
ref = '-'
alt = orig_alt[len(orig_ref):]
print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
else: # a DEL
print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
print line
raise SystemExit
new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
# record the data for CHILD G2P variants (for OBS=monoallelic)
if new_key not in G2P_DICT:
G2P_DICT[new_key] = 0
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
# and record the required data (transcript,gene_name,CHILD_GT) in G2P_DATA
if new_key not in G2P_DATA:
G2P_DATA[new_key] = (transcript,gene_name,CHILD_GT)
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
NUM_UNIQ_G2P_VARS = len(G2P_DICT)
print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
sys.stdout.flush()
print ""
####################################################################################################################
#### X-linked filtering ####
#### under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance) ####
#### exclude child HET variants if seen as HOM in UNAFFECTED father ####
####################################################################################################################
print ""
print "=== X-linked filtering ==="
################################
### process hemizygous genes ###
################################
for key in CHILD_DICT['hemizygous']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene)
CHILD_GT = CHILD_DICT['hemizygous'][key][0]
if CHILD_GT == 'HOM': # do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl)
pass
else:
if (key in DAD_DICT['hemizygous']) and (DAD_STAT == "UNAFFECTED"):
DAD_GT = DAD_DICT['hemizygous'][key][0]
if DAD_GT == 'HOM': # i.e., hemizygous variant in unaffected father
print "***[X-linked model (hemizygous)]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT)
continue
# if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
chr,start,end,ref,alt = key.split(":")
if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized
if len(ref) < len(alt): # an INS
orig_start = start
orig_ref = ref
orig_alt = alt
start = orig_start
ref = '-'
alt = orig_alt[len(orig_ref):]
print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
else: # a DEL
print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
print line
raise SystemExit
new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
# record the data for CHILD G2P variants (for OBS=hemizygous)
if new_key not in G2P_DICT:
G2P_DICT[new_key] = 0
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
# and record the required data (transcript,gene_name,CHILD_GT) in G2P_DATA
if new_key not in G2P_DATA:
G2P_DATA[new_key] = (transcript,gene_name,CHILD_GT)
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
#######################################
### process x-linked dominant genes ###
#######################################
for key in CHILD_DICT['x-linked dominant']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene)
CHILD_GT = CHILD_DICT['x-linked dominant'][key][0]
if CHILD_GT == 'HOM': # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl)
pass
else:
if (key in DAD_DICT['x-linked dominant']) and (DAD_STAT == "UNAFFECTED"):
DAD_GT = DAD_DICT['x-linked dominant'][key][0]
if DAD_GT == 'HOM': # i.e., x-linked dominant variant in unnafected father
print "***[X-linked model (x-linked dominant)]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT)
continue
# if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
chr,start,end,ref,alt = key.split(":")
if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized
if len(ref) < len(alt): # an INS
orig_start = start
orig_ref = ref
orig_alt = alt
start = orig_start
ref = '-'
alt = orig_alt[len(orig_ref):]
print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
else: # a DEL
print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
print line
raise SystemExit
new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
# record the data for CHILD G2P variants (for OBS=x-linked dominant)
if new_key not in G2P_DICT:
G2P_DICT[new_key] = 0
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
# and record the required data (transcript,gene_name,CHILD_GT) in G2P_DATA
if new_key not in G2P_DATA:
G2P_DATA[new_key] = (transcript,gene_name,CHILD_GT)
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
##############################################
### process x-linked over-dominance genes ###
##############################################
for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene)
CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0]
# if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
chr,start,end,ref,alt = key.split(":")
if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized
if len(ref) < len(alt): # an INS
orig_start = start
orig_ref = ref
orig_alt = alt
start = orig_start
ref = '-'
alt = orig_alt[len(orig_ref):]
print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
else: # a DEL
print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
print line
raise SystemExit
new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
# record the data for CHILD G2P variants (for OBS=x-linked dominant)
if new_key not in G2P_DICT:
G2P_DICT[new_key] = 0
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
# and record the required data (transcript,gene_name,CHILD_GT) in G2P_DATA
if new_key not in G2P_DATA:
G2P_DATA[new_key] = (transcript,gene_name,CHILD_GT)
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
NUM_UNIQ_G2P_VARS = len(G2P_DICT)
print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
sys.stdout.flush()
print ""
##############################################################################################################
#### BIALLELIC filtering ####
#### under the biallelic model (OBS == biallelic) - consider ALL variants per gene ####
#### must all be HET in CHILD, GT in parent does not matter ####
#### all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) ####
#### and this parent must be unaffected ####
#### if all these: then exclude all child variants in this gene ####
##############################################################################################################
print ""
print "=== BIALLELIC filtering ==="
GENE_KEY_GT = defaultdict(dict) # for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: GT
# process all variants in biallelic genes in child
for key in CHILD_DICT['biallelic']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene)
b_GT = CHILD_DICT['biallelic'][key][0]
b_gene = CHILD_DICT['biallelic'][key][1]
GENE_KEY_GT[b_gene][key] = b_GT
# iterate over genes in GENE_KEY_GT
for g in GENE_KEY_GT: # this is the biallelic gene name
all_HET = True
# iterate over variants in this gene
for k in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt
if GENE_KEY_GT[g][k] == 'HOM': # there is a HOM variant in the child - NO filtering
all_HET = False
break
if all_HET:
# all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent
# if yes, filter out and write a message to the log file
# if not, to be added to G2P_DICT and G2P_DATA for further processing
all_from_one_parent = True
# iterate again over the variants in this gene
VAR_SOURCE_LIST = {} # key: chr:start:end:ref:alt in child; value: (NONE) or (MOM or DAD or BOTH and the parent is UNAFFECTED)
for k in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt
this_var_status = 'NONE'
if ((k in MOM_DICT['biallelic']) or (k in MOM_DICT['monoallelic'])) and (MOM_STAT == "UNAFFECTED"):
this_var_status = 'MOM'
if ((k in DAD_DICT['biallelic']) or (k in DAD_DICT['monoallelic'])) and (DAD_STAT == "UNAFFECTED"):
if this_var_status == 'NONE':
this_var_status = 'DAD'
elif this_var_status == 'MOM':
this_var_status = 'BOTH'
VAR_SOURCE_LIST[k] = this_var_status
# have collected the parent source for all variants in this gene
tot_num_vars = len(VAR_SOURCE_LIST)
num_mom = 0
num_dad = 0
num_none = 0
for k,v in VAR_SOURCE_LIST.iteritems():
if v == 'NONE':
num_none += 1
elif v == 'MOM':
num_mom += 1
elif v == 'DAD':
num_dad += 1
elif v == 'BOTH':
num_mom += 1
num_dad += 1
else:
print "ERROR: cannot understand the source parent = %s" % (v)
raise SystemExit
if num_none > 0:
all_from_one_parent = False
elif num_mom < tot_num_vars and num_dad < tot_num_vars:
all_from_one_parent = False
# if all variants in the child in this gene are found in single unaffected parent - filter out
if all_from_one_parent:
for k in GENE_KEY_GT[g]:
print "***[Biallelic model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (k,g,VAR_SOURCE_LIST[k])
continue
# end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written
# and gone to evaluating the next biallelic gene in the child
# if here
# - either not all CHILD variants in this gene are not HET, or
# - not all of them can be traced to a single unaffected parent
# --> add to be processed
# here we are at gene level, must iterate aver all variants in this gene
# iterate over variants in this gene
for kkk in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt
CHILD_GT = CHILD_DICT['biallelic'][kkk][0]
# if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
chr,start,end,ref,alt = kkk.split(":")
if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized
if len(ref) < len(alt): # an INS
orig_start = start
orig_ref = ref
orig_alt = alt
start = orig_start
ref = '-'
alt = orig_alt[len(orig_ref):]
print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
else: # a DEL
print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
print line
raise SystemExit
new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
# record the data for CHILD G2P variants (for OBS=x-linked dominant)
if new_key not in G2P_DICT:
G2P_DICT[new_key] = 0
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
# and record the required data (transcript,gene_name,CHILD_GT) in G2P_DATA
if new_key not in G2P_DATA:
G2P_DATA[new_key] = (transcript,gene_name,b_GT)
else:
# print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
# raise SystemExit
# this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
pass
NUM_UNIQ_G2P_VARS = len(G2P_DICT)
print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, X-LINKED and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
sys.stdout.flush()
print ""
print ""
def read_ped(in_file):
global CHILD_ID
global CHILD_SEX
global DEC_CHILD_SEX
global MOM_ID
global MOM_STAT
global DAD_ID
global DAD_STAT
CHILD_ID = 0
CHILD_SEX = 0
MOM_ID = 0
MOM_STAT = 0
DAD_ID = 0
DAD_STAT = 0
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
if data[2] != '0' and data[3] != '0': # this is the child in the trio
if CHILD_ID == 0:
CHILD_ID = data[1]
else: # seen another child
print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment"
raise SystemExit
if DAD_ID == 0:
DAD_ID = data[2]
else:
if data[2] != DAD_ID:
print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID)
raise SystemExit
if MOM_ID == 0:
MOM_ID = data[3]
else:
if data[3] != MOM_ID:
print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID)
raise SystemExit
CHILD_SEX = int(data[4])
if CHILD_SEX == 1: # boy
DEC_CHILD_SEX = '46XY'
elif CHILD_SEX == 2: # girl
DEC_CHILD_SEX = '46XX'
else:
print "ERROR: proband sex unknown"
print line
raise SystemExit
if int(data[5]) != 2:
print "ERROR: child not affected"
print line
raise SystemExit
elif int(data[2]) == 0 and int(data[3]) == 0: # this is a parent record
if int(data[4]) == 1: # this is the dad
if int(data[5]) == 1:
DAD_STAT = "UNAFFECTED"
elif int(data[5]) == 2:
DAD_STAT = "AFFECTED"
else:
print "ERROR: cannot establish the dad's status"
print line
raise SystemExit
if DAD_ID == 0:
DAD_ID = data[1]
else:
if data[1] != DAD_ID:
print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID)
raise SystemExit
if int(data[4]) == 2: # this is the mom
if int(data[5]) == 1:
MOM_STAT = "UNAFFECTED"
elif int(data[5]) == 2:
MOM_STAT = "AFFECTED"
else:
print "ERROR: cannot establish mom's status"
print line
raise SystemExit
if MOM_ID == 0:
MOM_ID = data[1]
else:
if data[1] != MOM_ID:
print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID)
raise SystemExit
else:
print "ERROR: problematic PED line"
print line
raise SystemExit
def read_map_file(in_file):
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
dec_id = data[0]
int_id = data[1]
if dec_id not in MAP_DICT:
MAP_DICT[dec_id] = int_id
else:
print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id)
raise SystemExit
in_han.close()
def read_trans_map(in_file):
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
old_trans_id = data[0]
new_trans_id = data[1]
if old_trans_id not in TRANS_DICT:
TRANS_DICT[old_trans_id] = new_trans_id
else:
print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
raise SystemExit
in_han.close()
if __name__ == '__main__':
if len(sys.argv) == 12:
go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11])
else:
print "Suggested use: time python /home/u027/project/scripts/NHS_WES_generate_DEC_IGV.py \
dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
raise SystemExit