# input: # the family PED file # G2P text output for the trio [${FAMILY_ID}.report.txt] # VASE output [${SAMPLE_ID}.clean.strict.denovo.vcf] # # # individual VCF file for the trio proband [${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz] # G2P text output for the trio [${FAMILY_ID}.report.txt] # VASE output [${SAMPLE_ID}.clean.strict.denovo.vcf] # # # output: # DECIPHER formated file for the proband # - all G2P variants # - denovo variants marked as such # # checks: # all G2P variants found in the individual VCF # all VASE denovo variants found in the individual VCF # # Author: MH # last modified: SEPT 27, 2019 import sys import os import csv import gzip ASSEMBLY = 'GRCh38' INTERGENIC = 'No' ACCESS = 'No' G2P_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID G2P_DATA = {} # key: chr:pos:ref:alt; value: (transcript,gene,GT) VASE_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID NUM_UNIQ_G2P_VARS = 0 NUM_UNIQ_VASE_VARS = 0 CHILD_ID = 0 CHILD_SEX = 0 DEC_CHILD_SEX = 'unknown' MOM_ID = 0 MOM_STAT = 0 # 1 = UNAFF, 2 = AFF DAD_ID = 0 DAD_STAT = 0 # 1 = UNAFF, 2 = AFF ALL_CHILD_DICT = {} # key: chr:pos:ref:alt; value: (num_ALT_reads,VAF) ALL_MOM_DICT = {} # key: chr:pos:ref:alt; value: irrelevant ALL_DAD_DICT = {} # key: chr:pos:ref:alt; value: irrelevant CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown' SNAP_FLANK = 25 MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) ID TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER def go(ped_file,in_g2p_file,in_vase_file,in_vcf,dec_dir,igv_dir,fam_igv_dir,data_source_dir,fam_id,map_dir,trans_map_file): # read the decipher to internal ID mapping file map_file = '%s/DECIPHER_INTERNAL_IDs.txt' % (map_dir) read_map_file(map_file) # read the transcript mapping file read_trans_map(trans_map_file) # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID read_ped(ped_file) if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (MOM_STAT != 0): print "======================================" print "Analyzing:" print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX) print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT) print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT) print "======================================" sys.stdout.flush() else: print "ERROR: problems reading the PED file = %s" % (ped_file) raise SystemExit # read the G2P output for this family read_G2P(in_g2p_file) # read the VASE output for this family read_VASE(in_vase_file) # now read the individual VCFs and record all the variants child_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (CHILD_ID) mom_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (MOM_ID) dad_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (DAD_ID) read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT) print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID) sys.stdout.flush() read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT) print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID) sys.stdout.flush() read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT) print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID) sys.stdout.flush() # now go over all child variants and set the inheritance num_child_vars_assigned = 0 for key,v in ALL_CHILD_DICT.iteritems(): if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT): CHILD_INHER_DICT[key] = 'Biparental' num_child_vars_assigned += 1 elif key in ALL_MOM_DICT: CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother' num_child_vars_assigned += 1 elif key in ALL_DAD_DICT: CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father' num_child_vars_assigned += 1 else: CHILD_INHER_DICT[key] = 'Unknown' assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0 print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID) sys.stdout.flush() # now read the child VCF, check if the variant in the G2P/VASE output, if yes: # set the value in the dict to 1 # print out to to output file in_cntr = 0 out_cntr = 0 child_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (CHILD_ID) in_han = gzip.open(child_vcf_file,'r') # setup the DECIPHER output file out_dec_file = '%s/%s_DEC.csv' % (dec_dir,CHILD_ID) out_han = open(out_dec_file,'w') out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n') # setup the IGV snapshot file out_igv_file = '%s/%s.snapshot.txt' % (igv_dir,CHILD_ID) out_igv_han = open(out_igv_file,'w') out_igv_han.write('new\n') out_igv_han.write('genome hg38\n') out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir)) out_igv_han.write('new\n') out_igv_han.write('load %s/%s/%s-ready.bam\n' % (data_source_dir,CHILD_ID,CHILD_ID)) out_igv_han.write('load %s/%s/%s-ready.bam\n' % (data_source_dir,MOM_ID,MOM_ID)) out_igv_han.write('load %s/%s/%s-ready.bam\n' % (data_source_dir,DAD_ID,DAD_ID)) out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir)) out_igv_han.write('\n') for line in in_han: if line.startswith('#'): continue in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # VCF_FORMAT = data[8] # VCF_FORMAT_SPLIT = [y.strip() for y in VCF_FORMAT.strip().split(':')] # if VCF_FORMAT_SPLIT[1] != 'AD': # print "ERROR: AD tag not found where expected" # print VCF_FORMAT # raise SystemExit VCF_VAR = data[9] # VCF_VAR_SPLIT = [z.strip() for z in VCF_VAR.strip().split(':')] # num_REF,num_ALT = [int(w.strip()) for w in VCF_VAR_SPLIT[1].strip().split(',')] # if num_REF+num_ALT == 0: # the result of spliting the family VCF to indivudual VCFs, non-variants are not excluded by bcftools # continue # VAF = float(num_ALT)/float(num_REF+num_ALT) key = '%s:%s:%s:%s' % (chr,pos,ref,alt) inher_stat = CHILD_INHER_DICT[key] ############################################################## # different processing depending on being a SNP, INS, or DEL # ############################################################## if len(ref) == len(alt): # SNP if len(ref) != 1: print "ERROR: MNPs are not supported!" print line raise SystemExit key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if key_to_match in VASE_DICT: VASE_DICT[key_to_match] = 1 is_denovo = True if key_to_match in G2P_DICT: G2P_DICT[key_to_match] = 1 trans = G2P_DATA[key_to_match][0] gene = G2P_DATA[key_to_match][1] GT = G2P_DATA[key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) ### print to_write[:-1] # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('collapse\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) > len(alt): # DEL if len(alt) != 1: print "ERROR with a deletion" print line raise SystemExit G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:]) VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if VASE_key_to_match in VASE_DICT: VASE_DICT[VASE_key_to_match] = 1 is_denovo = True if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) ### print to_write[:-1] # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('collapse\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) < len(alt): # INS if len(ref) != 1: print "ERROR with an insertion" print line raise SystemExit G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:]) VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if VASE_key_to_match in VASE_DICT: VASE_DICT[VASE_key_to_match] = 1 is_denovo = True if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) # to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) ### print to_write[:-1] # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('collapse\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') else: print "Cannot establish the type of this VCF variant" print line raise SystemExit in_han.close() out_han.close() out_igv_han.close() ### check if all G2P and VASE variants were found/matched in the proband's VCF found_all_G2P = True found_all_VASE = True for k,v in G2P_DICT.iteritems(): if int(v) == 0: print k found_all_G2P = False break for k,v in VASE_DICT.iteritems(): if int(v) == 0: print k found_all_VASE = False break if found_all_G2P: print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT)) else: print "ERROR: Could not find all G2P variants in the probands VCF file" raise SystemExit if found_all_VASE: print "OK: Found all %s VASE variants in the proband's VCF file" % (len(VASE_DICT)) else: print "ERROR: Could not find all VASE variants in the probands VCF file" raise SystemExit ### check if all G2P variants are written out - does not work at the moment, since we are excluding some G2P vars based on num_ALT_reads, VAF if out_cntr == NUM_UNIQ_G2P_VARS: print "OK: All G2P vars are recorded in the output DECIPHER file" else: print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file" print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file) print "The batch snapshot file = %s" % (out_igv_file) sys.stdout.flush() ######################################################################### def read_all_VCF_vars(in_vcf_file,THIS_DICT): in_han = gzip.open(in_vcf_file,'r') for line in in_han: if line.startswith('#'): continue data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # did the splitting and normalizing - should not have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" print line raiseSystemExit key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key not in THIS_DICT: THIS_DICT[key] = 1 else: print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file) raise SystemExit in_han.close() def read_VASE(in_file): global NUM_UNIQ_VASE_VARS in_han = open(in_file,'r') for line in in_han: # ignore header lines if line.startswith('#'): continue data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = data[1] ref = data[3] alt = data[4] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key not in VASE_DICT: VASE_DICT[key] = 0 else: print "ERROR: duplicate VASE variant key = %s" % (key) raise SystemExit in_han.close() NUM_UNIQ_VASE_VARS = len(VASE_DICT) print "Found %s unique VASE denovo variants for CHILD (%s)" % (NUM_UNIQ_VASE_VARS,CHILD_ID) sys.stdout.flush() def read_G2P(in_file): global NUM_UNIQ_G2P_VARS # first, read the G2P variants on canonical transcripts for each of the parents in genes considered under the OBS=monoallelic scenario MOM_DICT = {} # key: chr:start:end:ref:alt ; value: ZYG DAD_DICT = {} # key: chr:start:end:ref:alt ; value: ZYG in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # ignore variants in the child sam_id = data[0] if sam_id == CHILD_ID: continue # ignore variants not on canonical transcripts is_canon = data[3] if is_canon != 'is_canonical': continue # select only variants in genes being considered under the dominant model inher_model = data[4] if inher_model != 'OBS=monoallelic': continue # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under the dominant model # and seen in one of the parents var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] GT = v_details[5] key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt) if sam_id == MOM_ID: if key not in MOM_DICT: MOM_DICT[key] = GT else: print "ERROR: a duplicate variant in dominant gene for MOM" print line raise SystemExit elif sam_id == DAD_ID: if key not in DAD_DICT: DAD_DICT[key] = GT else: print "ERROR: a duplicate variant in dominant gene for DAD" print line raise SystemExit else: print "ERROR: cannot identify the person for this variant" print line raise SystemExit in_han.close() print "Found %s unique G2P variants in MOM (%s) on canon transcript in genes being considered under the DOM model" % (len(MOM_DICT),MOM_ID) print "Found %s unique G2P variants in DAD (%s) on canon transcript in genes being considered under the DOM model" % (len(DAD_DICT),DAD_ID) sys.stdout.flush() # now, read the file again to collect child variants: under any inheritance model # if the gene is being considered under the dominant model, exclude variants seen in UNAFFECTED mother/father in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # ignore variants in other memebers of the trio sam_id = data[0] if sam_id != CHILD_ID: continue # ignore variants not on canonical transcripts is_canon = data[3] if is_canon != 'is_canonical': continue inher_model = data[4] gene = data[1] transcript = data[2] var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] GT = v_details[5] if inher_model == 'OBS=monoallelic': key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt) if (key in MOM_DICT) and (MOM_STAT == "UNAFFECTED"): MOM_GT = MOM_DICT[key] print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,GT,MOM_GT,MOM_STAT) continue # if GT == 'HET' and (MOM_GT == 'HET' or MOM_GT == 'HOM'): # print "***Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s [DOM model]" % (key,GT,MOM_GT,MOM_STAT) # continue # else: # pass if (key in DAD_DICT) and (DAD_STAT == "UNAFFECTED"): DAD_GT = DAD_DICT[key] print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,GT,DAD_GT,DAD_STAT) continue # if GT == 'HET' and (DAD_GT == 'HET' or DAD_GT == 'HOM'): # print "***Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s [DOM model]" % (key,GT,DAD_GT,DAD_STAT) # continue # else: # pass # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit # record the data for CHILD G2P variants (both DOM and REC model) key = '%s:%s:%s:%s' % (chr,start,ref,alt) if key not in G2P_DICT: G2P_DICT[key] = 0 else: # print "ERROR: duplicate G2P variant key = %s" % (key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (transcript,gene,GT) in G2P_DATA if key not in G2P_DATA: G2P_DATA[key] = (transcript,gene,GT) else: # print "ERROR: duplicate G2P variant key = %s" % (key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass in_han.close() NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants for CHILD (%s) after DOM model filtering" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() def read_ped(in_file): global CHILD_ID global CHILD_SEX global DEC_CHILD_SEX global MOM_ID global MOM_STAT global DAD_ID global DAD_STAT CHILD_ID = 0 CHILD_SEX = 0 MOM_ID = 0 MOM_STAT = 0 DAD_ID = 0 DAD_STAT = 0 in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] if data[2] != '0' and data[3] != '0': # this is the child in the trio if CHILD_ID == 0: CHILD_ID = data[1] else: # seen another child print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment" raise SystemExit if DAD_ID == 0: DAD_ID = data[2] else: if data[2] != DAD_ID: print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID) raise SystemExit if MOM_ID == 0: MOM_ID = data[3] else: if data[3] != MOM_ID: print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID) raise SystemExit CHILD_SEX = int(data[4]) if CHILD_SEX == 1: # boy DEC_CHILD_SEX = '46XY' elif CHILD_SEX == 2: # girl DEC_CHILD_SEX = '46XX' else: print "ERROR: proband sex unknown" print line raise SystemExit if int(data[5]) != 2: print "ERROR: child not affected" print line raise SystemExit elif int(data[2]) == 0 and int(data[3]) == 0: # this is a parent record if int(data[4]) == 1: # this is the dad if int(data[5]) == 1: DAD_STAT = "UNAFFECTED" elif int(data[5]) == 2: DAD_STAT = "AFFECTED" else: print "ERROR: cannot establish the dad's status" print line raise SystemExit if DAD_ID == 0: DAD_ID = data[1] else: if data[1] != DAD_ID: print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID) raise SystemExit if int(data[4]) == 2: # this is the mom if int(data[5]) == 1: MOM_STAT = "UNAFFECTED" elif int(data[5]) == 2: MOM_STAT = "AFFECTED" else: print "ERROR: cannot establish mom's status" print line raise SystemExit if MOM_ID == 0: MOM_ID = data[1] else: if data[1] != MOM_ID: print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID) raise SystemExit else: print "ERROR: problematic PED line" print line raise SystemExit def read_map_file(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] dec_id = data[0] int_id = data[1] if dec_id not in MAP_DICT: MAP_DICT[dec_id] = int_id else: print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id) raise SystemExit in_han.close() def read_trans_map(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] old_trans_id = data[0] new_trans_id = data[1] if old_trans_id not in TRANS_DICT: TRANS_DICT[old_trans_id] = new_trans_id else: print "ERROR: duplicate old transcript ID = %s" % (old_trans_id) raise SystemExit in_han.close() if __name__ == '__main__': if len(sys.argv) == 12: go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/generate_DEC_IGV.py \ 2820-gatk-haplotype-annotated.2820_2820.vcf.gz \ ../output_dd/2820_log_dir/2820.report.txt \ /scratch/u035/u035/shared/analysis/wes_pilot/VASE/08042019/output/2820_2820.strict.denovo.vcf \ 2820_2820 \ 2820_2820.DEC.txt \ DECIPHER_DIR \ IGV_DIR \ FAM_IGV_DIR \ SOURCE_DIR \ fam_id \ map_dir \ trans_map_file" raise SystemExit