# input: # the family PED file # G2P text output for the trio [${FAMILY_ID}.report.txt] # the joint and individual VCFs # # # output (per affected proband_: # DECIPHER formated file (all shared G2P variants) # IGV snapshot script file # # checks: # all G2P variants found in the individual VCF # # Author: MH # last modified: MAR 23, 2020 import sys import os import csv import gzip from collections import defaultdict ASSEMBLY = 'GRCh38' INTERGENIC = 'No' ACCESS = 'No' TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER KIDS_SEX_DICT = {} # key: <indi_fam_id>; value: sex (in the format 46XX/46XY) KIDS_G2P_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: irrelevant SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans) NUM_SHARED_G2P_VARS = 0 SNAP_FLANK = 25 ### call the python scrpit #time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV_aff_probands.py \ #${DECIPHER_ID} \ #${TRANS_MAP} \ #${PED_FILE} \ #${IN_G2P_FILE} \ #${FAM_IGV_DIR} \ #${VCF_DIR} \ #${PLATE_ID} \ #${FAMILY_ID} \ #${DEC_DIR} \ #${FAM_BAM_DIR} def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir): # read the transcript mapping file read_trans_map(trans_map_file) # read the ped file and establish KID_ID + KID_SEX read_ped(ped_file) # read the G2P output for this family read_G2P(in_g2p_file) # now read the individual VCFs and record all the variants # list of all ids proband_ids = KIDS_G2P_DICT.keys() for pro_id in proband_ids: vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,pro_id) read_all_VCF_vars(vcf_file,KIDS_VCF_DICT,pro_id) print "" for k,v in KIDS_VCF_DICT.iteritems(): print "Found %s unique VCF variants for affected proband (%s)" % (len(v),k) print "" sys.stdout.flush() print "Going over the varaints in each affected proband and checking each if it is shared G2P variant (to keep)" # setup the DECIPHER and IGV snapshot output files - per each affected proband proband_ids = KIDS_G2P_DICT.keys() for pro_id in proband_ids: num_out_vars = 0 # must be == NUM_SHARED_G2P_VARS out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,pro_id) out_han = open(out_dec_file,'w') out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n') # setup the IGV snapshot file out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,pro_id) out_igv_han = open(out_igv_file,'w') out_igv_han.write('new\n') out_igv_han.write('genome hg38\n') out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir)) out_igv_han.write('new\n') child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,pro_id,pro_id) out_igv_han.write('load %s\n' % (child_bam)) out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir)) out_igv_han.write('\n') # go over the individual's VCF variants, check if found in the shared G2P variants, if yes - output it (with VCF's coordinates) # KIDS_VCF_DICT = defaultdict(dict) # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: irrelevant # SHARED_DICT = {} # key: chr:start:ref:alt; value: (ZYG,gene,trans) pro_vcf_vars = KIDS_VCF_DICT[pro_id] for pro_vcf_var,rub in pro_vcf_vars.iteritems(): chr,pos,ref,alt = pro_vcf_var.split(':') pos = int(pos) # adjust pro_vcf_var for indels to match G2P style of recording if len(ref) == len(alt): # SNP if len(ref) != 1: print "ERROR: MNPs are not supported!" print line raise SystemExit G2P_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) elif len(ref) > len(alt): # DEL if len(alt) != 1: print "ERROR with a deletion" print line raise SystemExit G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:]) elif len(ref) < len(alt): # INS if len(ref) != 1: print "ERROR with an insertion" print line raise SystemExit G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:]) else: print "Cannot establish the type of this VCF variant" print line raise SystemExit if G2P_key_to_match not in SHARED_DICT: # an individual variant which is not in the shared G2P output continue # if here, this variant is in the shared G2P output, write it out print "\t%s:\tfound %s (VCF) -> %s (shared G2P)" % (pro_id,pro_vcf_var,G2P_key_to_match) GT = SHARED_DICT[G2P_key_to_match][0] gene = SHARED_DICT[G2P_key_to_match][1] trans = SHARED_DICT[G2P_key_to_match][2] inher_stat = 'Unknown' if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if KIDS_SEX_DICT[pro_id] == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif KIDS_SEX_DICT[pro_id] == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (pro_id,chr,pos,ref,alt,pro_vcf_var) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (dec_id,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,KIDS_SEX_DICT[pro_id],ACCESS,inher_stat,genotype) out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK i_name = '%s_%s_%s_%s_%s.png' % (pro_id,chr,pos,ref,alt) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') num_out_vars += 1 out_han.close() out_igv_han.close() if num_out_vars == NUM_SHARED_G2P_VARS: print "\t%s:\tNumber of output variants matches the number of shared variants: OK" % (pro_id) else: print "\t%s:\tERROR: number of output variants does NOT match the number of shared variants" % (pro_id) print "\t%s:\tdecipher file = %s" % (pro_id,out_dec_file) print "\t%s:\tigv snapshot file for %s" % (pro_id,out_igv_file) print "\t--------------------------------" def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id): cntr = 0 in_han = gzip.open(in_vcf_file,'r') for line in in_han: if line.startswith('#'): continue cntr += 1 data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # did the splitting and normalizing - should not have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" print line raiseSystemExit key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if pro_id not in THIS_DICT: THIS_DICT[pro_id][key] = 1 elif key not in THIS_DICT[pro_id]: THIS_DICT[pro_id][key] = 1 else: print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file) raise SystemExit in_han.close() def read_G2P(in_file): global NUM_SHARED_G2P_VARS known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] # to make sure no duplicate vars per indi CHECK_DICT = defaultdict(dict) # 1st level key: indi_fam_id:chr:start:end:ref:alt; 2nd level key: OBS_state; value: irrelevant # first, read the G2P variants on canonical transcripts for each of the affected probands in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # get the individual_id sam_id = data[0] # if, in addition to the affected siblings there is an unaffected parent, they would be in the family VCF # they would be G2P-ed, must be excluded by now!!! if sam_id not in KIDS_SEX_DICT: print "ERROR: In the G2P file found a sample which is not an affected kid = %s !!!" % (sam_id) raise SystemExit # ignore variants not on canonical transcripts is_canon = data[3] if is_canon != 'is_canonical': continue # split the variants based on the gene's OBS model of inheritance inher_model = data[4] aaa,OBS_state = inher_model.split('=') if OBS_state not in known_OBS_states: print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file) raise SystemExit # get the gene name in format ENSG00000165899(C12orf64,OTOGL) gene_name = data[1] # get the transcript name in format ENST00000238647 transcript = data[2] # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] GT = v_details[5] second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt) ################################################################################### # check_key = '%s:%s' % (sam_id,second_key) # if check_key not in CHECK_DICT: # CHECK_DICT[check_key][OBS_state] = 1 # elif OBS_state not in CHECK_DICT[check_key].keys(): # CHECK_DICT[check_key][OBS_state] = 1 # else: # print "ERROR: a duplicate variant %s in %s gene for CHILD = %s, OBS_state = %s" % (check_key,gene_name,sam_id,OBS_state) # raise SystemExit # # if sam_id not in KIDS_G2P_DICT: # KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) # elif second_key not in KIDS_G2P_DICT[sam_id]: # KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) # else: ## print "ERROR: a duplicate variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id) ## raise SystemExit # pass # the same variant in diff OBS_state - see above ! ################################################################################### ########################################## ### to deal with the new output of G2P ### ########################################## check_key = '%s:%s' % (sam_id,second_key) if check_key not in CHECK_DICT: # first time we see this var in this sample, any OBS_state CHECK_DICT[check_key][OBS_state] = 1 if sam_id not in KIDS_G2P_DICT: KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) elif second_key not in KIDS_G2P_DICT[sam_id]: KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) else: # sanity check print "ERROR: first time var already seen?: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id) raise SystemExit elif OBS_state not in CHECK_DICT[check_key].keys(): # first time we see this var in this sample with this OBS_state CHECK_DICT[check_key][OBS_state] = 1 if sam_id not in KIDS_G2P_DICT: KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) elif second_key not in KIDS_G2P_DICT[sam_id]: KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) elif KIDS_G2P_DICT[sam_id][second_key] == (GT,gene_name,transcript): # diff OBS_state, but must have same (GT,gene_name,transcript) pass else: print "ERROR: diff (GT,gene_name,transcript) for variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id) raise SystemExit else: # same individual, same variant, known OBS_state # due to the new output of G2P we may have the same variant but with different gene names - ensembl/refseq # check the gene name in KIDS_G2P_DICT[sam_id][second_key] if not KIDS_G2P_DICT[sam_id][second_key][1].startswith('ENSG'): # recorded is refseq if gene_name.startswith('ENSG'): # this is ensembl KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript) # replace else: # this is refseq again, ignore pass else: # recorded is ensembl, ignore pass in_han.close() print "" print "" print "Found the following variants on canonical transcripts in the G2P output for these affected probands" for id,val in KIDS_G2P_DICT.iteritems(): print "--------------------------" for k,v in val.iteritems(): print " %s\t%s\t%s" % (id,k,v) print "" print "" ################################################################################### #### SHARED variant filtering #### #### select only variants seen in all affected probands with the same GT #### ################################################################################### print "" print "=== SHARED variant filtering ===" # list of all ids proband_ids = KIDS_G2P_DICT.keys() print "All affected probands = %s" % (proband_ids) # for each proband, go thru all of their variants, check if seen in all probands excl this one, iff yes, record in SHARED_DICT for pro_id in proband_ids: other_pro_ids = [] for aaa in proband_ids: other_pro_ids.append(aaa) other_pro_ids.remove(pro_id) print "Analyzing variants in %s, to be compared against the variants in all other affected probands %s" % (pro_id,other_pro_ids) # go thru all of their variants pro_vars = KIDS_G2P_DICT[pro_id] # a dict with keys: chr,start,end,ref,alt and values: (GT,gene_name,transcript) for var_loc,var_info in pro_vars.iteritems(): found_in_all = True # check if seen in all probands excl this one for o_id in other_pro_ids: if var_loc not in KIDS_G2P_DICT[o_id]: print " Excluding variant %s in %s, since not seen in %s" % (var_loc,pro_id,o_id) found_in_all = False break # if variant found, check if GT matches else: o_info = KIDS_G2P_DICT[o_id][var_loc] if var_info[0] != o_info[0]: print " Excluding variant %s in %s (GT = %s); it is seen in %s but GT does not match (ST = %s)" % (var_loc,pro_id,var_info[0],o_id,o_info[0]) found_in_all = False break if found_in_all: # this variant has been found in all affected probands with matching GT, keep it if var_loc not in SHARED_DICT: # it has not been recorded previously when considering another proband # for consistency with the standard trio-based processing # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = var_loc.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) SHARED_DICT[new_key] = var_info print " Keeping %s found in all affected probands, same GT" % (new_key) print "---------------------" NUM_SHARED_G2P_VARS = len(SHARED_DICT) print "Found %s unique and canonical G2P variants SHARED between all %s affected probands in this family" % (NUM_SHARED_G2P_VARS,len(proband_ids)) sys.stdout.flush() print "" def read_ped(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] kid_id = data[1] kid_se = int(data[4]) if kid_se == 1: # boy kid_sex = '46XY' elif kid_se == 2: # girl kid_sex = '46XX' else: print "ERROR: proband sex unknown" print line raise SystemExit if kid_id not in KIDS_SEX_DICT: KIDS_SEX_DICT[kid_id] = kid_sex else: print "ERROR: proband sex unknown" print line raise SystemExit in_han.close() print "Found the following affected probands" for k,v in KIDS_SEX_DICT.iteritems(): print " %s: %s" % (k,v) sys.stdout.flush() def read_trans_map(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] old_trans_id = data[0] new_trans_id = data[1] if old_trans_id not in TRANS_DICT: TRANS_DICT[old_trans_id] = new_trans_id else: print "ERROR: duplicate old transcript ID = %s" % (old_trans_id) raise SystemExit in_han.close() if __name__ == '__main__': if len(sys.argv) == 11: go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV_aff_probands.py \ dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir" raise SystemExit