# input: # the family PED file [${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped] # the VCF folder containing the proband VCF [${PROJECT_ID}/VCF/ ${PLATE_ID}_${FAMILY_ID}.ready.${CHILD_ID}_${FAMILY_ID}.vcf.gz] # G2P text output for the trio [${FAMILY_ID}.report.txt] # # # output: # DECIPHER formated file for the proband # - all (filtered) G2P variants # # checks: # all G2P variants found in the individual VCF # # Author: MH # last modified: MARCH 04, 2022 import sys import os import csv import gzip from collections import defaultdict ASSEMBLY = 'GRCh38' INTERGENIC = 'No' ACCESS = 'No' G2P_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID G2P_DATA = {} # key: chr:pos:ref:alt; value: (transcript,gene,GT) NUM_UNIQ_G2P_VARS = 0 CHILD_ID = 0 CHILD_SEX = 0 DEC_CHILD_SEX = 'unknown' ALL_CHILD_DICT = {} # key: chr:pos:ref:alt; value: (num_ALT_reads,VAF) CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: "Unknown" - all variants in a singleton are of unknown inheritance SNAP_FLANK = 25 MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) ID TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER FS_THRESH = float(60) SOR_THRESH = float(3) def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir): # read the decipher to internal ID mapping file read_map_file(dec_map_file) # read the transcript mapping file read_trans_map(trans_map_file) # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID read_ped(ped_file) if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown'): print "======================================" print "Analyzing singleton CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX) print "======================================" sys.stdout.flush() else: print "ERROR: problems reading the PED file = %s" % (ped_file) raise SystemExit # read the G2P output for this family read_G2P(in_g2p_file) # now read the proband VCFs and record all the variants child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT) print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID) sys.stdout.flush() # now go over all child variants and set the inheritance to "Unknown" num_child_vars_assigned = 0 for key,v in ALL_CHILD_DICT.iteritems(): CHILD_INHER_DICT[key] = 'Unknown' assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0 print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID) sys.stdout.flush() # setup the DECIPHER output file out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID) ################################ out_han = open(out_dec_file,'w') out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n') # setup the IGV snapshot file out_igv_file = '%s/IGV/%s.solo.snapshot.FLT.txt' % (dec_dir,CHILD_ID) ################################# out_igv_han = open(out_igv_file,'w') out_igv_han.write('new\n') out_igv_han.write('genome hg38\n') out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir)) out_igv_han.write('new\n') child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID) out_igv_han.write('load %s\n' % (child_bam)) out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir)) out_igv_han.write('\n') # now read the child VCF, check if the variant in the G2P output, if yes: # set the value in the dict to 1 # print out to to output file in_cntr = 0 out_cntr = 0 child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) in_han = gzip.open(child_vcf_file,'r') for line in in_han: if line.startswith('#'): continue in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # extract FS and SOR FS = '' SOR = '' infos = [y.strip() for y in data[7].strip().split(';')] for info in infos: if info.startswith('FS='): tag,FS = info.split('=') FS = float(FS) elif info.startswith('SOR='): tag,SOR = info.split('=') SOR = float(SOR) VCF_VAR = data[9] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) inher_stat = CHILD_INHER_DICT[key] ############################################################## # different processing depending on being a SNP, INS, or DEL # ############################################################## if len(ref) == len(alt): # SNP if len(ref) != 1: print "ERROR: MNPs are not supported!" print line raise SystemExit key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key_to_match in G2P_DICT: G2P_DICT[key_to_match] = 1 trans = G2P_DATA[key_to_match][0] gene = G2P_DATA[key_to_match][1] GT = G2P_DATA[key_to_match][2] if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK # check if above FS/SOR_THRESH to include in the snapshot name if (FS == '') or (SOR == ''): flag = 'NA' elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) else: flag = 'OK' i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) > len(alt): # DEL if len(alt) != 1: print "ERROR with a deletion" print line raise SystemExit G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:]) if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK # check if above FS/SOR_THRESH to include in the snapshot name if (FS == '') or (SOR == ''): flag = 'NA' elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) else: flag = 'OK' i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) < len(alt): # INS if len(ref) != 1: print "ERROR with an insertion" print line raise SystemExit G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:]) if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK # check if above FS/SOR_THRESH to include in the snapshot name if (FS == '') or (SOR == ''): flag = 'NA' elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) else: flag = 'OK' i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') else: print "Cannot establish the type of this VCF variant" print line raise SystemExit in_han.close() out_han.close() out_igv_han.close() ### check if all G2P and VASE variants were found/matched in the proband's VCF found_all_G2P = True for k,v in G2P_DICT.iteritems(): if int(v) == 0: print k found_all_G2P = False break if found_all_G2P: print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT)) else: print "ERROR: Could not find all G2P variants in the probands VCF file" raise SystemExit ### check if all G2P variants are written out if out_cntr == NUM_UNIQ_G2P_VARS: print "OK: All G2P vars are recorded in the output DECIPHER file" else: print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file" print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file) print "The batch snapshot file = %s" % (out_igv_file) sys.stdout.flush() def read_all_VCF_vars(in_vcf_file,THIS_DICT): in_han = gzip.open(in_vcf_file,'r') for line in in_han: if line.startswith('#'): continue data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # did the splitting and normalizing - should not have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" print line raiseSystemExit key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key not in THIS_DICT: THIS_DICT[key] = 1 else: print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file) raise SystemExit in_han.close() def read_G2P(in_file): global NUM_UNIQ_G2P_VARS known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het'] # first, read the G2P variants on canonical transcripts for the singleton CHILD_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # get the individual_id sam_id = data[0] # ignore variants not on canonical transcripts is_canon = data[3] if is_canon != 'is_canonical': continue # split the variants based on the gene's OBS model of inheritance inher_model = data[4] aaa,OBS_state = inher_model.split('=') if OBS_state not in known_OBS_states: print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file) raise SystemExit # get the gene name in format ENSG00000165899(C12orf64,OTOGL) or gene-MYT1L(MYT1L) gene_name = data[1] # get the transcript name in format ENST00000238647 or gene-MYT1L(MYT1L) transcript = data[2] # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] GT = v_details[5] second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt) if sam_id == CHILD_ID: # check for duplication if OBS_state not in CHILD_DICT: CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) elif second_key not in CHILD_DICT[OBS_state]: CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) else: # already recorded this variant # if we have refseq recorded and this is ensembl --> replace if not CHILD_DICT[OBS_state][second_key][1].startswith('ENSG'): # recorded is refseq if gene_name.startswith('ENSG'): # this is ensembl CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) # replace else: # this is refseq again, ignore pass else: # recorded is ensembl, ignore pass else: print "ERROR: cannot identify the person for this variant" print line raise SystemExit in_han.close() ### print out the number of unique G2P variants in CHILD ### child_mono = 0 child_bi = 0 child_hem = 0 child_het = 0 if 'monoallelic_autosomal' in CHILD_DICT: child_mono = len(CHILD_DICT['monoallelic_autosomal']) if 'biallelic_autosomal' in CHILD_DICT: child_bi = len(CHILD_DICT['biallelic_autosomal']) if 'monoallelic_X_hem' in CHILD_DICT: child_hem = len(CHILD_DICT['monoallelic_X_hem']) if 'monoallelic_X_het' in CHILD_DICT: child_het = len(CHILD_DICT['monoallelic_X_het']) print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID) print " monoallelic_autosomal: %s" % (child_mono) print " biallelic_autosomal: %s" % (child_bi) print " monoallelic_X_hem: %s" % (child_hem) print " monoallelic_X_het: %s" % (child_het) ###################################################################################################### #### Dominant filtering #### #### if the gene has been considered under the dominant model (OBS == monoallelic_autosomal) #### #### exclude child variants seen in UNAFFECTED mother/father, regardless of GT #### ###################################################################################################### print "" print "=== monoallelic autosomal (DOMINANT) filtering ===" for key in CHILD_DICT['monoallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0] CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1] CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2] # if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"): # MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0] # print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT) # continue # # if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"): # DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0] # print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) # continue # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = key.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=monoallelic) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() print "" ############################################################################################################## #### Recessive filtering #### #### under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene #### #### must all be HET in CHILD, GT in parent does not matter #### #### all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) #### #### and this parent must be unaffected #### #### if all these: then exclude all child variants in this gene #### ############################################################################################################## print "" print "=== biallelic autosomal (RECESSIVE) filtering ===" GENE_KEY_GT = defaultdict(dict) # for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans) # process all variants in biallelic genes in child for key in CHILD_DICT['biallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) b_GT = CHILD_DICT['biallelic_autosomal'][key][0] b_gene = CHILD_DICT['biallelic_autosomal'][key][1] b_trans = CHILD_DICT['biallelic_autosomal'][key][2] GENE_KEY_GT[b_gene][key] = (b_GT,b_trans) # iterate over genes in GENE_KEY_GT for g in GENE_KEY_GT: # this is the biallelic gene name # all_HET = True # # # iterate over variants in this gene # for kx in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt # if GENE_KEY_GT[g][kx][0] == 'HOM': # there is a HOM variant in the child - NO filtering # all_HET = False # break # # if all_HET: # for this gene # # all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent # # if yes, filter out and write a message to the log file # # if not, to be added to G2P_DICT and G2P_DATA for further processing # # all_from_one_parent = True # # # iterate again over the variants in this gene # VAR_SOURCE_LIST = {} # key: chr:start:end:ref:alt in child; value: (NONE) or (MOM or DAD or BOTH and the parent is UNAFFECTED) # # for ky in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt # # this_var_status = 'NONE' # # if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"): # this_var_status = 'MOM' # if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"): # if this_var_status == 'NONE': # this_var_status = 'DAD' # elif this_var_status == 'MOM': # this_var_status = 'BOTH' # # VAR_SOURCE_LIST[ky] = this_var_status # # # have collected the parent source for all variants in this gene # tot_num_vars = len(VAR_SOURCE_LIST) # num_mom = 0 # num_dad = 0 # num_none = 0 # for kt,v in VAR_SOURCE_LIST.iteritems(): # if v == 'NONE': # num_none += 1 # elif v == 'MOM': # num_mom += 1 # elif v == 'DAD': # num_dad += 1 # elif v == 'BOTH': # num_mom += 1 # num_dad += 1 # else: # print "ERROR: cannot understand the source parent = %s" % (v) # raise SystemExit # # if num_none > 0: # all_from_one_parent = False # elif num_mom < tot_num_vars and num_dad < tot_num_vars: # all_from_one_parent = False # # # if all variants in the child in this gene are found in single unaffected parent - filter out # if all_from_one_parent: # for kz in GENE_KEY_GT[g]: # print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz]) # continue # # # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written # # and gone to evaluating the next biallelic gene in the child # # # if here # # - either not all CHILD variants in this gene are not HET, or # # - not all of them can be traced to a single unaffected parent # # --> add to be processed # # here we are at gene level, must iterate over all variants in this gene # iterate over variants in this gene for kkk in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0] CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1] CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2] # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = kkk.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=biallelic) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() print "" #################################################################################################################### #### X-linked filtering #### #.# #### under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance) #### #### under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het) #### #### exclude child HET variants if seen as HOM in UNAFFECTED father #### #### #### #### Note 18/01/2022 #### #### This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het #### #### and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance #### #### the code below treats x-linked over-dominance as the others (i.e. filters, while it should not) #### #### Issue flagged to G2P plug-in team, awaiting their fix #### #### for now manually scan the output of G2P for the proband (both for boys and girls) #### #### to check if any variant has been called in PCDH19 and EFNB1 #### #### also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name #### #################################################################################################################### print "" print "=== X-linked filtering ===" ####################################### ### process monoallelic_X_hem genes ### ####################################### for key in CHILD_DICT['monoallelic_X_hem']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0] CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1] CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2] # if CHILD_GT == 'HOM': # do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl) # pass # else: # if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"): # DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0] # if DAD_GT == 'HOM': # i.e., hemizygous variant in unaffected father # print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) # continue # # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = key.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass ####################################### ### process monoallelic_X_het genes ### ####################################### for key in CHILD_DICT['monoallelic_X_het']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0] CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1] CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2] # if CHILD_GT == 'HOM': # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl) # pass # else: # if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"): # DAD_GT = DAD_DICT['monoallelic_X_het'][key][0] # if DAD_GT == 'HOM': # i.e., x-linked dominant variant in unnafected father # print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) # continue # # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = key.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=monoallelic_X_het) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass ##.# ######################################################################## ##.# ### process x-linked over-dominance genes - no filtering to be done ### ##.# ######################################################################## # ##.# for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) # ##.# CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0] ##.# CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1] ##.# CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2] # ##.# # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) ##.# chr,start,end,ref,alt = key.split(":") ##.# if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized ##.# if len(ref) < len(alt): # an INS ##.# orig_start = start ##.# orig_ref = ref ##.# orig_alt = alt ##.# start = orig_start ##.# ref = '-' ##.# alt = orig_alt[len(orig_ref):] ##.# print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) ##.# else: # a DEL ##.# print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" ##.# print line ##.# raise SystemExit # ##.# new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # ##.# # record the data for CHILD G2P variants (for OBS=x-linked over-dominance) ##.# if new_key not in G2P_DICT: ##.# G2P_DICT[new_key] = 0 ##.# else: ##.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) ##.# # raise SystemExit ##.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req ##.# pass # ##.# # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA ##.# if new_key not in G2P_DATA: ##.# G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) ##.# else: ##.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) ##.# # raise SystemExit ##.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req ##.# pass NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() print "" print "" def read_ped(in_file): global CHILD_ID global CHILD_SEX global DEC_CHILD_SEX CHILD_ID = 0 CHILD_SEX = 0 # no need to do PED checks, did them for singletons at trio_setup.sh in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] CHILD_ID = data[1] CHILD_SEX = int(data[4]) if CHILD_SEX == 1: # boy DEC_CHILD_SEX = '46XY' elif CHILD_SEX == 2: # girl DEC_CHILD_SEX = '46XX' else: print "ERROR: proband sex unknown" print line raise SystemExit def read_map_file(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] dec_id = data[0] int_id = data[1] if dec_id not in MAP_DICT: MAP_DICT[dec_id] = int_id else: print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id) raise SystemExit in_han.close() def read_trans_map(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] old_trans_id = data[0] new_trans_id = data[1] if old_trans_id not in TRANS_DICT: TRANS_DICT[old_trans_id] = new_trans_id else: print "ERROR: duplicate old transcript ID = %s" % (old_trans_id) raise SystemExit in_han.close() if __name__ == '__main__': if len(sys.argv) == 11: go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \ dec_map_file,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir" raise SystemExit