# input: # this trio PED file [${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped] # individual VCF file for the trio proband [${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz] ???? # G2P text output for the trio [${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt] # VASE output [${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf] # # # output: # DECIPHER formated file for the proband # - all G2P variants # - denovo variants marked as such # # checks: # all G2P variants found in the individual VCF # all VASE denovo variants found in the individual VCF # # Author: MH # last modified: JAN 21, 2022 import sys import os import csv import gzip from collections import defaultdict ASSEMBLY = 'GRCh38' INTERGENIC = 'No' ACCESS = 'No' G2P_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID G2P_DATA = {} # key: chr:pos:ref:alt; value: (transcript,gene,GT) VASE_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID NUM_UNIQ_G2P_VARS = 0 NUM_UNIQ_VASE_VARS = 0 CHILD_ID = 0 CHILD_SEX = 0 DEC_CHILD_SEX = 'unknown' MOM_ID = 0 MOM_STAT = 0 # 1 = UNAFF, 2 = AFF DAD_ID = 0 DAD_STAT = 0 # 1 = UNAFF, 2 = AFF ALL_CHILD_DICT = {} # key: chr:pos:ref:alt; value: (num_ALT_reads,VAF) ALL_MOM_DICT = {} # key: chr:pos:ref:alt; value: irrelevant ALL_DAD_DICT = {} # key: chr:pos:ref:alt; value: irrelevant CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown' SNAP_FLANK = 25 MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) ID TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER FS_THRESH = float(60) SOR_THRESH = float(3) ### call the python scrpit #time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV_trio_from_quad.py \ #${DECIPHER_ID} \ #${TRANS_MAP} \ #${PED_FILE} \ #${IN_G2P_FILE} \ #${IN_VASE_FILE} \ #${FAM_IGV_DIR} \ #${VCF_DIR} \ #${PLATE_ID} \ #${FAMILY_ID} \ #${DEC_DIR} \ #${FAM_BAM_DIR} \ #${KID_ID} def go(dec_id,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir,this_kid): ### # read the decipher to internal ID mapping file ### read_map_file(dec_map_file) # set the Internal ID (called dec_id) for this Decipher ID (called fam_id) MAP_DICT[fam_id] = dec_id # read the transcript mapping file read_trans_map(trans_map_file) # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID read_ped(ped_file) if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (MOM_STAT != 0): print "======================================" print "Analyzing:" print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX) print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT) print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT) print "======================================" sys.stdout.flush() else: print "ERROR: problems reading the PED file = %s" % (ped_file) raise SystemExit # read the G2P output for this family read_G2P(in_g2p_file) # read the VASE output for this family read_VASE(in_vase_file) # now read the individual VCFs and record all the variants child_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,CHILD_ID) mom_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,MOM_ID) dad_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,DAD_ID) read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT) print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID) sys.stdout.flush() read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT) print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID) sys.stdout.flush() read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT) print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID) sys.stdout.flush() # now go over all child variants and set the inheritance num_child_vars_assigned = 0 for key,v in ALL_CHILD_DICT.iteritems(): if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT): CHILD_INHER_DICT[key] = 'Biparental' num_child_vars_assigned += 1 elif key in ALL_MOM_DICT: CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother' num_child_vars_assigned += 1 elif key in ALL_DAD_DICT: CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father' num_child_vars_assigned += 1 else: CHILD_INHER_DICT[key] = 'Unknown' assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0 print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID) sys.stdout.flush() # setup the DECIPHER output file out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID) ################################ out_han = open(out_dec_file,'w') out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n') # setup the IGV snapshot file out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID) ################################# out_igv_han = open(out_igv_file,'w') out_igv_han.write('new\n') out_igv_han.write('genome hg38\n') out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir)) out_igv_han.write('new\n') child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID) mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID) dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID) out_igv_han.write('load %s\n' % (child_bam)) out_igv_han.write('load %s\n' % (mom_bam)) out_igv_han.write('load %s\n' % (dad_bam)) out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir)) out_igv_han.write('\n') # now read the child VCF, check if the variant in the G2P/VASE output, if yes: # set the value in the dict to 1 # print out to to output file in_cntr = 0 out_cntr = 0 child_vcf_file = '%s/%s_%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,this_kid,CHILD_ID) in_han = gzip.open(child_vcf_file,'r') for line in in_han: if line.startswith('#'): continue in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # extract FS and SOR FS = '' SOR = '' infos = [y.strip() for y in data[7].strip().split(';')] for info in infos: if info.startswith('FS='): tag,FS = info.split('=') FS = float(FS) elif info.startswith('SOR='): tag,SOR = info.split('=') SOR = float(SOR) VCF_VAR = data[9] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) inher_stat = CHILD_INHER_DICT[key] ############################################################## # different processing depending on being a SNP, INS, or DEL # ############################################################## if len(ref) == len(alt): # SNP if len(ref) != 1: print "ERROR: MNPs are not supported!" print line raise SystemExit key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if key_to_match in VASE_DICT: VASE_DICT[key_to_match] = 1 is_denovo = True if key_to_match in G2P_DICT: G2P_DICT[key_to_match] = 1 trans = G2P_DATA[key_to_match][0] gene = G2P_DATA[key_to_match][1] GT = G2P_DATA[key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK # check if above FS/SOR_THRESH to include in the snapshot name if (FS == '') or (SOR == ''): flag = 'NA' elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) else: flag = 'OK' i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) > len(alt): # DEL if len(alt) != 1: print "ERROR with a deletion" print line raise SystemExit G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:]) VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if VASE_key_to_match in VASE_DICT: VASE_DICT[VASE_key_to_match] = 1 is_denovo = True if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK # check if above FS/SOR_THRESH to include in the snapshot name if (FS == '') or (SOR == ''): flag = 'NA' elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) else: flag = 'OK' i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) < len(alt): # INS if len(ref) != 1: print "ERROR with an insertion" print line raise SystemExit G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:]) VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if VASE_key_to_match in VASE_DICT: VASE_DICT[VASE_key_to_match] = 1 is_denovo = True if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK # check if above FS/SOR_THRESH to include in the snapshot name if (FS == '') or (SOR == ''): flag = 'NA' elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH): flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR) else: flag = 'OK' i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') else: print "Cannot establish the type of this VCF variant" print line raise SystemExit in_han.close() out_han.close() out_igv_han.close() ### check if all G2P and VASE variants were found/matched in the proband's VCF found_all_G2P = True found_all_VASE = True for k,v in G2P_DICT.iteritems(): if int(v) == 0: print k found_all_G2P = False break for k,v in VASE_DICT.iteritems(): if int(v) == 0: print k found_all_VASE = False break if found_all_G2P: print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT)) else: print "ERROR: Could not find all G2P variants in the probands VCF file" raise SystemExit if found_all_VASE: print "OK: Found all %s VASE variants in the proband's VCF file" % (len(VASE_DICT)) else: print "ERROR: Could not find all VASE variants in the probands VCF file" raise SystemExit ### check if all G2P variants are written out if out_cntr == NUM_UNIQ_G2P_VARS: print "OK: All G2P vars are recorded in the output DECIPHER file" else: print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file" print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file) print "The batch snapshot file = %s" % (out_igv_file) sys.stdout.flush() def read_all_VCF_vars(in_vcf_file,THIS_DICT): in_han = gzip.open(in_vcf_file,'r') for line in in_han: if line.startswith('#'): continue data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # did the splitting and normalizing - should not have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" print line raiseSystemExit key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key not in THIS_DICT: THIS_DICT[key] = 1 else: print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file) raise SystemExit in_han.close() def read_VASE(in_file): global NUM_UNIQ_VASE_VARS in_han = open(in_file,'r') for line in in_han: # ignore header lines if line.startswith('#'): continue data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = data[1] ref = data[3] alt = data[4] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key not in VASE_DICT: VASE_DICT[key] = 0 else: print "ERROR: duplicate VASE variant key = %s" % (key) raise SystemExit in_han.close() NUM_UNIQ_VASE_VARS = len(VASE_DICT) print "Found %s unique VASE denovo variants for CHILD (%s)" % (NUM_UNIQ_VASE_VARS,CHILD_ID) sys.stdout.flush() def read_G2P(in_file): global NUM_UNIQ_G2P_VARS #.# known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het'] # first, read the G2P variants on canonical transcripts for each of the family members CHILD_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) MOM_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) DAD_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # get the individual_id sam_id = data[0] # ignore variants not on canonical transcripts is_canon = data[3] if is_canon != 'is_canonical': continue # split the variants based on the gene's OBS model of inheritance inher_model = data[4] aaa,OBS_state = inher_model.split('=') if OBS_state not in known_OBS_states: print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file) raise SystemExit # get the gene name in format ENSG00000165899(C12orf64,OTOGL) gene_name = data[1] # get the transcript name in format ENST00000238647 transcript = data[2] # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] GT = v_details[5] second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt) if sam_id == CHILD_ID: # check for duplication if OBS_state not in CHILD_DICT: CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) elif second_key not in CHILD_DICT[OBS_state]: CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) else: # already recorded this variant # if we have refseq recorded and this is ensembl --> replace if not CHILD_DICT[OBS_state][second_key][1].startswith('ENSG'): # recorded is refseq if gene_name.startswith('ENSG'): # this is ensembl CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) # replace else: # this is refseq again, ignore pass else: # recorded is ensembl, ignore pass elif sam_id == MOM_ID: # check for duplication if OBS_state not in MOM_DICT: MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript) elif second_key not in MOM_DICT[OBS_state]: MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript) else: # already recorded this variant # if we have refseq recorded and this is ensembl --> replace if not MOM_DICT[OBS_state][second_key][1].startswith('ENSG'): # recorded is refseq if gene_name.startswith('ENSG'): # this is ensembl MOM_DICT[OBS_state][second_key] = (GT,gene_name,transcript) # replace else: # this is refseq again, ignore pass else: # recorded is ensembl, ignore pass elif sam_id == DAD_ID: # check for duplication if OBS_state not in DAD_DICT: DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) elif second_key not in DAD_DICT[OBS_state]: DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) else: # already recorded this variant # if we have refseq recorded and this is ensembl --> replace if not DAD_DICT[OBS_state][second_key][1].startswith('ENSG'): # recorded is refseq if gene_name.startswith('ENSG'): # this is ensembl DAD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) # replace else: # this is refseq again, ignore pass else: # recorded is ensembl, ignore pass else: print "ERROR: cannot identify the person for this variant" print line raise SystemExit in_han.close() ### print out the number of unique G2P variants in CHILD ### child_mono = 0 child_bi = 0 child_hem = 0 child_het = 0 if 'monoallelic_autosomal' in CHILD_DICT: child_mono = len(CHILD_DICT['monoallelic_autosomal']) if 'biallelic_autosomal' in CHILD_DICT: child_bi = len(CHILD_DICT['biallelic_autosomal']) if 'monoallelic_X_hem' in CHILD_DICT: child_hem = len(CHILD_DICT['monoallelic_X_hem']) if 'monoallelic_X_het' in CHILD_DICT: child_het = len(CHILD_DICT['monoallelic_X_het']) print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID) print " monoallelic_autosomal: %s" % (child_mono) print " biallelic_autosomal: %s" % (child_bi) print " monoallelic_X_hem: %s" % (child_hem) print " monoallelic_X_het: %s" % (child_het) ### print out the number of unique G2P variants in MOM ### mom_mono = 0 mom_bi = 0 mom_hem = 0 mom_het = 0 if 'monoallelic_autosomal' in MOM_DICT: mom_mono = len(MOM_DICT['monoallelic_autosomal']) if 'biallelic_autosomal' in MOM_DICT: mom_bi = len(MOM_DICT['biallelic_autosomal']) if 'monoallelic_X_hem' in MOM_DICT: mom_hem = len(MOM_DICT['monoallelic_X_hem']) if 'monoallelic_X_het' in MOM_DICT: mom_het = len(MOM_DICT['monoallelic_X_het']) print "MOM (%s): number of unique G2P variants on canon transcript in the following OBS states" % (MOM_ID) print " monoallelic_autosomal: %s" % (mom_mono) print " biallelic_autosomal: %s" % (mom_bi) print " monoallelic_X_hem: %s" % (mom_hem) print " monoallelic_X_het: %s" % (mom_het) ### print out the number of unique G2P variants in DAD ### dad_mono = 0 dad_bi = 0 dad_hem = 0 dad_het = 0 if 'monoallelic_autosomal' in DAD_DICT: dad_mono = len(DAD_DICT['monoallelic_autosomal']) if 'biallelic_autosomal' in DAD_DICT: dad_bi = len(DAD_DICT['biallelic_autosomal']) if 'monoallelic_X_hem' in DAD_DICT: dad_hem = len(DAD_DICT['monoallelic_X_hem']) if 'monoallelic_X_het' in DAD_DICT: dad_het = len(DAD_DICT['monoallelic_X_het']) print "DAD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (DAD_ID) print " monoallelic_autosomal: %s" % (dad_mono) print " biallelic_autosomal: %s" % (dad_bi) print " monoallelic_X_hem: %s" % (dad_hem) print " monoallelic_X_het: %s" % (dad_het) sys.stdout.flush() ###################################################################################################### #### Dominant filtering #### #### if the gene has been considered under the dominant model (OBS == monoallelic_autosomal) #### #### exclude child variants seen in UNAFFECTED mother/father, regardless of GT #### ###################################################################################################### print "" print "=== monoallelic autosomal (DOMINANT) filtering ===" for key in CHILD_DICT['monoallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0] CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1] CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2] if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"): MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0] print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT) continue if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"): DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0] print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) continue # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = key.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=monoallelic) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() print "" ############################################################################################################## #### Recessive filtering #### #### under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene #### #### must all be HET in CHILD, GT in parent does not matter #### #### all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) #### #### and this parent must be unaffected #### #### if all these: then exclude all child variants in this gene #### ############################################################################################################## print "" print "=== biallelic autosomal (RECESSIVE) filtering ===" GENE_KEY_GT = defaultdict(dict) # for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans) # process all variants in biallelic genes in child for key in CHILD_DICT['biallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) b_GT = CHILD_DICT['biallelic_autosomal'][key][0] b_gene = CHILD_DICT['biallelic_autosomal'][key][1] b_trans = CHILD_DICT['biallelic_autosomal'][key][2] GENE_KEY_GT[b_gene][key] = (b_GT,b_trans) # iterate over genes in GENE_KEY_GT for g in GENE_KEY_GT: # this is the biallelic gene name all_HET = True # iterate over variants in this gene for kx in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt if GENE_KEY_GT[g][kx][0] == 'HOM': # there is a HOM variant in the child - NO filtering all_HET = False break if all_HET: # for this gene # all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent # if yes, filter out and write a message to the log file # if not, to be added to G2P_DICT and G2P_DATA for further processing all_from_one_parent = True # iterate again over the variants in this gene VAR_SOURCE_LIST = {} # key: chr:start:end:ref:alt in child; value: (NONE) or (MOM or DAD or BOTH and the parent is UNAFFECTED) for ky in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt this_var_status = 'NONE' if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"): this_var_status = 'MOM' if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"): if this_var_status == 'NONE': this_var_status = 'DAD' elif this_var_status == 'MOM': this_var_status = 'BOTH' VAR_SOURCE_LIST[ky] = this_var_status # have collected the parent source for all variants in this gene tot_num_vars = len(VAR_SOURCE_LIST) num_mom = 0 num_dad = 0 num_none = 0 for kt,v in VAR_SOURCE_LIST.iteritems(): if v == 'NONE': num_none += 1 elif v == 'MOM': num_mom += 1 elif v == 'DAD': num_dad += 1 elif v == 'BOTH': num_mom += 1 num_dad += 1 else: print "ERROR: cannot understand the source parent = %s" % (v) raise SystemExit if num_none > 0: all_from_one_parent = False elif num_mom < tot_num_vars and num_dad < tot_num_vars: all_from_one_parent = False # if all variants in the child in this gene are found in single unaffected parent - filter out if all_from_one_parent: for kz in GENE_KEY_GT[g]: print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz]) continue # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written # and gone to evaluating the next biallelic gene in the child # if here # - either not all CHILD variants in this gene are not HET, or # - not all of them can be traced to a single unaffected parent # --> add to be processed # here we are at gene level, must iterate over all variants in this gene # iterate over variants in this gene for kkk in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0] CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1] CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2] # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = kkk.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=biallelic) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() print "" #################################################################################################################### #### X-linked filtering #### #.# #### under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance) #### #### under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het) #### #### exclude child HET variants if seen as HOM in UNAFFECTED father #### #### #### #### Note 18/01/2022 #### #### This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het #### #### and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance #### #### the code below treats x-linked over-dominance as the others (i.e. filters, while it should not) #### #### Issue flagged to G2P plug-in team, awaiting their fix #### #### for now manually scan the output of G2P for the proband (both for boys and girls) #### #### to check if any variant has been called in PCDH19 and EFNB1 #### #### also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name #### #################################################################################################################### print "" print "=== X-linked filtering ===" ####################################### ### process monoallelic_X_hem genes ### ####################################### for key in CHILD_DICT['monoallelic_X_hem']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0] CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1] CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2] if CHILD_GT == 'HOM': # do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl) pass else: if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"): DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0] if DAD_GT == 'HOM': # i.e., hemizygous variant in unaffected father print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) continue # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = key.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass ####################################### ### process monoallelic_X_het genes ### ####################################### for key in CHILD_DICT['monoallelic_X_het']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0] CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1] CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2] if CHILD_GT == 'HOM': # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl) pass else: if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"): DAD_GT = DAD_DICT['monoallelic_X_het'][key][0] if DAD_GT == 'HOM': # i.e., x-linked dominant variant in unnafected father print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) continue # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = key.split(":") if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) # record the data for CHILD G2P variants (for OBS=monoallelic_X_het) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA if new_key not in G2P_DATA: G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) else: # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass #.# ######################################################################## #.# ### process x-linked over-dominance genes - no filtering to be done ### #.# ######################################################################## #.# for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) #.# CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0] #.# CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1] #.# CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2] #.# # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) #.# chr,start,end,ref,alt = key.split(":") #.# if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized #.# if len(ref) < len(alt): # an INS #.# orig_start = start #.# orig_ref = ref #.# orig_alt = alt #.# start = orig_start #.# ref = '-' #.# alt = orig_alt[len(orig_ref):] #.# print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) #.# else: # a DEL #.# print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" #.# print line #.# raise SystemExit #.# new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) #.# # record the data for CHILD G2P variants (for OBS=x-linked over-dominance) #.# if new_key not in G2P_DICT: #.# G2P_DICT[new_key] = 0 #.# else: #.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) #.# # raise SystemExit #.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req #.# pass #.# # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA #.# if new_key not in G2P_DATA: #.# G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) #.# else: #.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) #.# # raise SystemExit #.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req #.# pass NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() print "" print "" def read_ped(in_file): global CHILD_ID global CHILD_SEX global DEC_CHILD_SEX global MOM_ID global MOM_STAT global DAD_ID global DAD_STAT CHILD_ID = 0 CHILD_SEX = 0 MOM_ID = 0 MOM_STAT = 0 DAD_ID = 0 DAD_STAT = 0 in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] if data[2] != '0' and data[3] != '0': # this is the child in the trio if CHILD_ID == 0: CHILD_ID = data[1] else: # seen another child print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment" raise SystemExit if DAD_ID == 0: DAD_ID = data[2] else: if data[2] != DAD_ID: print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID) raise SystemExit if MOM_ID == 0: MOM_ID = data[3] else: if data[3] != MOM_ID: print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID) raise SystemExit CHILD_SEX = int(data[4]) if CHILD_SEX == 1: # boy DEC_CHILD_SEX = '46XY' elif CHILD_SEX == 2: # girl DEC_CHILD_SEX = '46XX' else: print "ERROR: proband sex unknown" print line raise SystemExit if int(data[5]) != 2: print "ERROR: child not affected" print line raise SystemExit elif int(data[2]) == 0 and int(data[3]) == 0: # this is a parent record if int(data[4]) == 1: # this is the dad if int(data[5]) == 1: DAD_STAT = "UNAFFECTED" elif int(data[5]) == 2: DAD_STAT = "AFFECTED" else: print "ERROR: cannot establish the dad's status" print line raise SystemExit if DAD_ID == 0: DAD_ID = data[1] else: if data[1] != DAD_ID: print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID) raise SystemExit if int(data[4]) == 2: # this is the mom if int(data[5]) == 1: MOM_STAT = "UNAFFECTED" elif int(data[5]) == 2: MOM_STAT = "AFFECTED" else: print "ERROR: cannot establish mom's status" print line raise SystemExit if MOM_ID == 0: MOM_ID = data[1] else: if data[1] != MOM_ID: print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID) raise SystemExit else: print "ERROR: problematic PED line" print line raise SystemExit #def read_map_file(in_file): # in_han = open(in_file,'r') # for line in in_han: # data = [x.strip() for x in line.strip().split('\t')] # dec_id = data[0] # int_id = data[1] # if dec_id not in MAP_DICT: # MAP_DICT[dec_id] = int_id # else: # print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id) # raise SystemExit # in_han.close() def read_trans_map(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] old_trans_id = data[0] new_trans_id = data[1] if old_trans_id not in TRANS_DICT: TRANS_DICT[old_trans_id] = new_trans_id else: print "ERROR: duplicate old transcript ID = %s" % (old_trans_id) raise SystemExit in_han.close() if __name__ == '__main__': if len(sys.argv) == 13: go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11],sys.argv[12]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \ dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir,indi_id_for_this_kid" raise SystemExit