-
not populated not populated authorednot populated not populated authored
generate_DEC_IGV_aff_sib_scripts_from_quad.py 22.28 KiB
# input:
# the family PED file
# G2P text output for the trio [${FAMILY_ID}.report.txt]
# the joint and individual VCFs
#
#
# output (per affected proband_:
# DECIPHER formated file (all shared G2P variants)
# IGV snapshot script file
#
# checks:
# all G2P variants found in the individual VCF
#
# Author: MH
# last modified: DEC 07, 2021
import sys
import os
import csv
import gzip
from collections import defaultdict
ASSEMBLY = 'GRCh38'
INTERGENIC = 'No'
ACCESS = 'No'
TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
KIDS_SEX_DICT = {} # key: <indi_fam_id>; value: sex (in the format 46XX/46XY)
KIDS_G2P_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR)
SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans)
NUM_SHARED_G2P_VARS = 0
SNAP_FLANK = 25
FS_THRESH = float(60)
SOR_THRESH = float(3)
## create the names of the needed files
#PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped
#IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_shared.report.txt
#FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_shared
#FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}
### call the python scrpit
#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV_sib_from_quad.py \
#${DECIPHER_ID} \
#${TRANS_MAP} \
#${PED_FILE} \
#${IN_G2P_FILE} \
#${FAM_IGV_DIR} \
#${VCF_DIR} \
#${PLATE_ID} \
#${FAMILY_ID} \
#${DEC_DIR} \
#${FAM_BAM_DIR}
def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):
# read the transcript mapping file
read_trans_map(trans_map_file)
# read the ped file and establish KID_ID + KID_SEX
read_ped(ped_file)
# read the G2P output for this family
read_G2P(in_g2p_file)
# now read the individual VCFs and record all the variants
# list of all ids
proband_ids = KIDS_G2P_DICT.keys()
for pro_id in proband_ids:
vcf_file = '%s/%s_%s_shared.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,pro_id)
read_all_VCF_vars(vcf_file,KIDS_VCF_DICT,pro_id)
print ""
for k,v in KIDS_VCF_DICT.iteritems():
print "Found %s unique VCF variants for affected proband (%s)" % (len(v),k)
print ""
sys.stdout.flush()
print "Going over the varaints in each affected proband and checking each if it is shared G2P variant (to keep)"
# setup the DECIPHER and IGV snapshot output files - per each affected proband
proband_ids = KIDS_G2P_DICT.keys()
for pro_id in proband_ids:
num_out_vars = 0 # must be == NUM_SHARED_G2P_VARS
out_dec_file = '%s/%s_shared_DEC_FLT.csv' % (dec_dir,pro_id)
out_han = open(out_dec_file,'w')
out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')
# setup the IGV snapshot file
out_igv_file = '%s/IGV/%s.shared.snapshot.FLT.txt' % (dec_dir,pro_id)
out_igv_han = open(out_igv_file,'w')
out_igv_han.write('new\n')
out_igv_han.write('genome hg38\n')
out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
out_igv_han.write('new\n')
child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,pro_id,pro_id)
out_igv_han.write('load %s\n' % (child_bam))
out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
out_igv_han.write('\n')
# go over the individual's VCF variants, check if found in the shared G2P variants, if yes - output it (with VCF's coordinates)
# KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: irrelevant
# SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans)
pro_vcf_vars = KIDS_VCF_DICT[pro_id]
for pro_vcf_var,fs_sor in pro_vcf_vars.iteritems():
chr,pos,ref,alt = pro_vcf_var.split(':')
pos = int(pos)
FS = fs_sor[0]
SOR = fs_sor[1]
# adjust pro_vcf_var for indels to match G2P style of recording
if len(ref) == len(alt): # SNP
if len(ref) != 1:
print "ERROR: MNPs are not supported!"
print line
raise SystemExit
G2P_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
elif len(ref) > len(alt): # DEL
if len(alt) != 1:
print "ERROR with a deletion"
print line
raise SystemExit
G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
elif len(ref) < len(alt): # INS
if len(ref) != 1:
print "ERROR with an insertion"
print line
raise SystemExit
G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
else:
print "Cannot establish the type of this VCF variant"
print line
raise SystemExit
if G2P_key_to_match not in SHARED_DICT: # an individual variant which is not in the shared G2P output
continue
# if here, this variant is in the shared G2P output, write it out
print "\t%s:\tfound %s (VCF) -> %s (shared G2P)" % (pro_id,pro_vcf_var,G2P_key_to_match)
GT = SHARED_DICT[G2P_key_to_match][0]
gene = SHARED_DICT[G2P_key_to_match][1]
trans = SHARED_DICT[G2P_key_to_match][2]
inher_stat = 'Unknown'
if (chr != 'chrX') and (chr != 'chrY'):
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif (chr == 'chrX') or (chr == 'chrY'):
if KIDS_SEX_DICT[pro_id] == '46XX': # a girl
if GT == 'HET':
genotype = 'Heterozygous'
elif GT == 'HOM':
genotype = 'Homozygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
elif KIDS_SEX_DICT[pro_id] == '46XY': # a boy
if GT == 'HET':
genotype = 'Heterozygous'
print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (pro_id,chr,pos,ref,alt,pro_vcf_var)
elif GT == 'HOM':
genotype = 'Hemizygous'
else:
print "ERROR: Cannot understand GT = %s" % (GT)
raise SystemExit
else:
print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
raise SystemExit
else:
print "ERROR: unknown chr"
print line
raise SystemExit
# write to the DECIPHER file
gene_id_idx = gene.find('(')
if gene_id_idx == -1:
gene_id_idx = len(gene)
gene_id = gene[0:gene_id_idx]
if trans in TRANS_DICT: # if the transcriptID is to be replaced
safe_trans = TRANS_DICT[trans]
else:
safe_trans = trans
to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (dec_id,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,KIDS_SEX_DICT[pro_id],ACCESS,inher_stat,genotype)
out_han.write(to_write)
# write to the IGV file
i_s = pos - SNAP_FLANK
i_e = pos + SNAP_FLANK
# check if above FS/SOR_THRESH to include in the snapshot name
if (FS == '') or (SOR == ''):
flag = 'NA'
elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
else:
flag = 'OK'
i_name = '%s_%s_%s_%s_%s_%s.png' % (pro_id,chr,pos,ref,alt,flag)
out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
out_igv_han.write('sort strand\n')
out_igv_han.write('squish\n')
out_igv_han.write('snapshot %s\n' % (i_name))
out_igv_han.write('\n')
num_out_vars += 1
out_han.close()
out_igv_han.close()
if num_out_vars == NUM_SHARED_G2P_VARS:
print "\t%s:\tNumber of output variants matches the number of shared variants: OK" % (pro_id)
else:
print "\t%s:\tERROR: number of output variants does NOT match the number of shared variants" % (pro_id)
print "\t%s:\tdecipher file = %s" % (pro_id,out_dec_file)
print "\t%s:\tigv snapshot file for %s" % (pro_id,out_igv_file)
print "\t--------------------------------"
def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id):
cntr = 0
in_han = gzip.open(in_vcf_file,'r')
for line in in_han:
if line.startswith('#'):
continue
cntr += 1
data = [x.strip() for x in line.strip().split('\t')]
chr = data[0]
pos = int(data[1])
ref = data[3]
alt = data[4]
# extract FS and SOR
FS = ''
SOR = ''
infos = [y.strip() for y in data[7].strip().split(';')]
for info in infos:
if info.startswith('FS='):
tag,FS = info.split('=')
FS = float(FS)
elif info.startswith('SOR='):
tag,SOR = info.split('=')
SOR = float(SOR)
# did the splitting and normalizing - should not have multiallelic variants
if alt.find(',') != -1:
print "ERROR: found multiallelic variant"
print line
raiseSystemExit
key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
if pro_id not in THIS_DICT:
THIS_DICT[pro_id][key] = (FS,SOR)
elif key not in THIS_DICT[pro_id]:
THIS_DICT[pro_id][key] = (FS,SOR)
else:
print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
raise SystemExit
in_han.close()
def read_G2P(in_file):
global NUM_SHARED_G2P_VARS
known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance']
# to make sure no duplicate vars per indi
CHECK_DICT = defaultdict(dict) # 1st level key: indi_fam_id:chr:start:end:ref:alt; 2nd level key: OBS_state; value: irrelevant
# first, read the G2P variants on canonical transcripts for each of the affected probands
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
# get the individual_id
sam_id = data[0]
# if, in addition to the affected siblings there is an unaffected parent, they would be in the family VCF
# they would be G2P-ed, must be excluded by now!!!
if sam_id not in KIDS_SEX_DICT:
print "ERROR: In the G2P file found a sample which is not an affected kid = %s !!!" % (sam_id)
raise SystemExit
# ignore variants not on canonical transcripts
is_canon = data[3]
if is_canon != 'is_canonical':
continue
# split the variants based on the gene's OBS model of inheritance
inher_model = data[4]
aaa,OBS_state = inher_model.split('=')
if OBS_state not in known_OBS_states:
print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
raise SystemExit
# get the gene name in format ENSG00000165899(C12orf64,OTOGL)
gene_name = data[1]
# get the transcript name in format ENST00000238647
transcript = data[2]
# this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
var_list = [y.strip() for y in data[6].split(';')]
for v in var_list:
v_details = [z.strip() for z in v.split(':')]
chr = v_details[0]
start = int(v_details[1])
end = int(v_details[2])
ref = v_details[3]
alt = v_details[4]
GT = v_details[5]
second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
##########################################################################
# check_key = '%s:%s' % (sam_id,second_key)
# if check_key not in CHECK_DICT:
# CHECK_DICT[check_key][OBS_state] = 1
# elif OBS_state not in CHECK_DICT[check_key].keys():
# CHECK_DICT[check_key][OBS_state] = 1
# else:
# print "ERROR: a duplicate variant %s in %s gene for CHILD = %s, OBS_state = %s" % (check_key,gene_name,sam_id,OBS_state)
# raise SystemExit
#
# if sam_id not in KIDS_G2P_DICT:
# KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
# elif second_key not in KIDS_G2P_DICT[sam_id]:
# KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
# else:
## print "ERROR: a duplicate variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
## raise SystemExit
# pass # the same variant in diff OBS_state - see above !
############################################################################
##########################################
### to deal with the new output of G2P ###
##########################################
check_key = '%s:%s' % (sam_id,second_key)
if check_key not in CHECK_DICT: # first time we see this var in this sample, any OBS_state
CHECK_DICT[check_key][OBS_state] = 1
if sam_id not in KIDS_G2P_DICT:
KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
elif second_key not in KIDS_G2P_DICT[sam_id]:
KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
else: # sanity check
print "ERROR: first time var already seen?: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
raise SystemExit
elif OBS_state not in CHECK_DICT[check_key].keys(): # first time we see this var in this sample with this OBS_state
CHECK_DICT[check_key][OBS_state] = 1
if sam_id not in KIDS_G2P_DICT:
KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
elif second_key not in KIDS_G2P_DICT[sam_id]:
KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
elif KIDS_G2P_DICT[sam_id][second_key] == (GT,gene_name,transcript): # diff OBS_state, but must have same (GT,gene_name,transcript)
pass
else:
print "ERROR: diff (GT,gene_name,transcript) for variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
raise SystemExit
else: # same individual, same variant, known OBS_state
# due to the new output of G2P we may have the same variant but with different gene names - ensembl/refseq
# check the gene name in KIDS_G2P_DICT[sam_id][second_key]
if not KIDS_G2P_DICT[sam_id][second_key][1].startswith('ENSG'): # recorded is refseq
if gene_name.startswith('ENSG'): # this is ensembl
KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) # replace
else: # this is refseq again, ignore
pass
else: # recorded is ensembl, ignore
pass
in_han.close()
print ""
print ""
print "Found the following variants on canonical transcripts in the G2P output for these affected probands"
for id,val in KIDS_G2P_DICT.iteritems():
print "--------------------------"
for k,v in val.iteritems():
print " %s\t%s\t%s" % (id,k,v)
print ""
print ""
###################################################################################
#### SHARED variant filtering ####
#### select only variants seen in all affected probands with the same GT ####
###################################################################################
print ""
print "=== SHARED variant filtering ==="
# list of all ids
proband_ids = KIDS_G2P_DICT.keys()
print "All affected probands = %s" % (proband_ids)
# for each proband, go thru all of their variants, check if seen in all probands excl this one, iff yes, record in SHARED_DICT
for pro_id in proband_ids:
other_pro_ids = []
for aaa in proband_ids:
other_pro_ids.append(aaa)
other_pro_ids.remove(pro_id)
print "Analyzing variants in %s, to be compared against the variants in all other affected probands %s" % (pro_id,other_pro_ids)
# go thru all of their variants
pro_vars = KIDS_G2P_DICT[pro_id] # a dict with keys: chr,start,end,ref,alt and values: (GT,gene_name,transcript)
for var_loc,var_info in pro_vars.iteritems():
found_in_all = True
# check if seen in all probands excl this one
for o_id in other_pro_ids:
if var_loc not in KIDS_G2P_DICT[o_id]:
print " Excluding variant %s in %s, since not seen in %s" % (var_loc,pro_id,o_id)
found_in_all = False
break
# if variant found, check if GT matches
else:
o_info = KIDS_G2P_DICT[o_id][var_loc]
if var_info[0] != o_info[0]:
print " Excluding variant %s in %s (GT = %s); it is seen in %s but GT does not match (ST = %s)" % (var_loc,pro_id,var_info[0],o_id,o_info[0])
found_in_all = False
break
if found_in_all: # this variant has been found in all affected probands with matching GT, keep it
if var_loc not in SHARED_DICT: # it has not been recorded previously when considering another proband
# for consistency with the standard trio-based processing
# if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
chr,start,end,ref,alt = var_loc.split(":")
if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized
if len(ref) < len(alt): # an INS
orig_start = start
orig_ref = ref
orig_alt = alt
start = orig_start
ref = '-'
alt = orig_alt[len(orig_ref):]
print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
else: # a DEL
print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
print line
raise SystemExit
new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
SHARED_DICT[new_key] = var_info
print " Keeping %s found in all affected probands, same GT" % (new_key)
print "---------------------"
NUM_SHARED_G2P_VARS = len(SHARED_DICT)
print "Found %s unique and canonical G2P variants SHARED between all %s affected probands in this family" % (NUM_SHARED_G2P_VARS,len(proband_ids))
sys.stdout.flush()
print ""
def read_ped(in_file):
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
kid_id = data[1]
kid_se = int(data[4])
if kid_se == 1: # boy
kid_sex = '46XY'
elif kid_se == 2: # girl
kid_sex = '46XX'
else:
print "ERROR: proband sex unknown"
print line
raise SystemExit
if kid_id not in KIDS_SEX_DICT:
KIDS_SEX_DICT[kid_id] = kid_sex
else:
print "ERROR: proband sex unknown"
print line
raise SystemExit
in_han.close()
print "Found the following affected probands"
for k,v in KIDS_SEX_DICT.iteritems():
print " %s: %s" % (k,v)
sys.stdout.flush()
def read_trans_map(in_file):
in_han = open(in_file,'r')
for line in in_han:
data = [x.strip() for x in line.strip().split('\t')]
old_trans_id = data[0]
new_trans_id = data[1]
if old_trans_id not in TRANS_DICT:
TRANS_DICT[old_trans_id] = new_trans_id
else:
print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
raise SystemExit
in_han.close()
if __name__ == '__main__':
if len(sys.argv) == 11:
go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10])
else:
print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV_aff_probands.py \
dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
raise SystemExit