# input: # the family PED file # G2P text output for the trio [${FAMILY_ID}.report.txt] # VASE output [${SAMPLE_ID}.clean.strict.denovo.vcf] # # # individual VCF file for the trio proband [${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz] # G2P text output for the trio [${FAMILY_ID}.report.txt] # VASE output [${SAMPLE_ID}.clean.strict.denovo.vcf] # # # output: # DECIPHER formated file for the proband # - all G2P variants # - denovo variants marked as such # # checks: # all G2P variants found in the individual VCF # all VASE denovo variants found in the individual VCF # # Author: MH # last modified: SEPT 26, 2019 import sys import os import csv import gzip ASSEMBLY = 'GRCh38' INTERGENIC = 'No' ACCESS = 'No' G2P_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID G2P_DATA = {} # key: chr:pos:ref:alt; value: (transcript,gene,GT) VASE_DICT = {} # key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID NUM_UNIQ_G2P_VARS = 0 NUM_UNIQ_VASE_VARS = 0 CHILD_ID = 0 CHILD_SEX = 0 DEC_CHILD_SEX = 'unknown' MOM_ID = 0 MOM_STAT = 0 # 1 = UNAFF, 2 = AFF DAD_ID = 0 DAD_STAT = 0 # 1 = UNAFF, 2 = AFF ALL_CHILD_DICT = {} # key: chr:pos:ref:alt; value: (num_ALT_reads,VAF) ALL_MOM_DICT = {} # key: chr:pos:ref:alt; value: irrelevant ALL_DAD_DICT = {} # key: chr:pos:ref:alt; value: irrelevant CHILD_INHER_DICT = {} # key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown' SNAP_FLANK = 25 MAP_DICT = {} # key: family_id (aka decipher_id); value: internal (decipher) ID TRANS_DICT = {} # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER ### call the python scrpit #time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV.py \ #${DEC_MAP} \ #${TRANS_MAP} \ #${PED_FILE} \ #${IN_G2P_FILE} \ #${IN_VASE_FILE} \ #${FAM_IGV_DIR} \ #${VCF_DIR} \ #${PLATE_ID} \ #${FAMILY_ID} \ #${DEC_DIR} \ #${FAM_BAM_DIR} def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir): # read the decipher to internal ID mapping file read_map_file(dec_map_file) # read the transcript mapping file read_trans_map(trans_map_file) # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID read_ped(ped_file) if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (MOM_STAT != 0): print "======================================" print "Analyzing:" print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX) print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT) print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT) print "======================================" sys.stdout.flush() else: print "ERROR: problems reading the PED file = %s" % (ped_file) raise SystemExit # read the G2P output for this family read_G2P(in_g2p_file) # read the VASE output for this family read_VASE(in_vase_file) # now read the individual VCFs and record all the variants child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID) dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID) read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT) print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID) sys.stdout.flush() read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT) print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID) sys.stdout.flush() read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT) print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID) sys.stdout.flush() # now go over all child variants and set the inheritance num_child_vars_assigned = 0 for key,v in ALL_CHILD_DICT.iteritems(): if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT): CHILD_INHER_DICT[key] = 'Biparental' num_child_vars_assigned += 1 elif key in ALL_MOM_DICT: CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother' num_child_vars_assigned += 1 elif key in ALL_DAD_DICT: CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father' num_child_vars_assigned += 1 else: CHILD_INHER_DICT[key] = 'Unknown' assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0 print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID) sys.stdout.flush() # setup the DECIPHER output file out_dec_file = '%s/%s_DEC.csv' % (dec_dir,CHILD_ID) out_han = open(out_dec_file,'w') out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n') # setup the IGV snapshot file out_igv_file = '%s/IGV/%s.snapshot.txt' % (dec_dir,CHILD_ID) out_igv_han = open(out_igv_file,'w') out_igv_han.write('new\n') out_igv_han.write('genome hg38\n') out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir)) out_igv_han.write('new\n') child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID) mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID) dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID) out_igv_han.write('load %s\n' % (child_bam)) out_igv_han.write('load %s\n' % (mom_bam)) out_igv_han.write('load %s\n' % (dad_bam)) out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir)) out_igv_han.write('\n') # now read the child VCF, check if the variant in the G2P/VASE output, if yes: # set the value in the dict to 1 # print out to to output file in_cntr = 0 out_cntr = 0 child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID) in_han = gzip.open(child_vcf_file,'r') for line in in_han: if line.startswith('#'): continue in_cntr += 1 data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] VCF_VAR = data[9] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) inher_stat = CHILD_INHER_DICT[key] ############################################################## # different processing depending on being a SNP, INS, or DEL # ############################################################## if len(ref) == len(alt): # SNP if len(ref) != 1: print "ERROR: MNPs are not supported!" print line raise SystemExit key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if key_to_match in VASE_DICT: VASE_DICT[key_to_match] = 1 is_denovo = True if key_to_match in G2P_DICT: G2P_DICT[key_to_match] = 1 trans = G2P_DATA[key_to_match][0] gene = G2P_DATA[key_to_match][1] GT = G2P_DATA[key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) > len(alt): # DEL if len(alt) != 1: print "ERROR with a deletion" print line raise SystemExit G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:]) VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if VASE_key_to_match in VASE_DICT: VASE_DICT[VASE_key_to_match] = 1 is_denovo = True if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') elif len(ref) < len(alt): # INS if len(ref) != 1: print "ERROR with an insertion" print line raise SystemExit G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:]) VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt) is_denovo = False if VASE_key_to_match in VASE_DICT: VASE_DICT[VASE_key_to_match] = 1 is_denovo = True if G2P_key_to_match in G2P_DICT: G2P_DICT[G2P_key_to_match] = 1 trans = G2P_DATA[G2P_key_to_match][0] gene = G2P_DATA[G2P_key_to_match][1] GT = G2P_DATA[G2P_key_to_match][2] if is_denovo: if inher_stat == 'Unknown': inher_stat = 'De novo constitutive' else: print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat) raise SystemExit if (chr != 'chrX') and (chr != 'chrY'): if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif (chr == 'chrX') or (chr == 'chrY'): if DEC_CHILD_SEX == '46XX': # a girl if GT == 'HET': genotype = 'Heterozygous' elif GT == 'HOM': genotype = 'Homozygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit elif DEC_CHILD_SEX == '46XY': # a boy if GT == 'HET': genotype = 'Heterozygous' print " WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR) elif GT == 'HOM': genotype = 'Hemizygous' else: print "ERROR: Cannot understand GT = %s" % (GT) raise SystemExit else: print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX) raise SystemExit else: print "ERROR: unknown chr" print line raise SystemExit # write to the DECIPHER file gene_id_idx = gene.find('(') if gene_id_idx == -1: gene_id_idx = len(gene) gene_id = gene[0:gene_id_idx] int_ID = MAP_DICT[fam_id] if trans in TRANS_DICT: # if the transcriptID is to be replaced safe_trans = TRANS_DICT[trans] else: safe_trans = trans to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype) out_cntr += 1 out_han.write(to_write) # write to the IGV file i_s = pos - SNAP_FLANK i_e = pos + SNAP_FLANK i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt) out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e)) out_igv_han.write('sort strand\n') out_igv_han.write('squish\n') out_igv_han.write('snapshot %s\n' % (i_name)) out_igv_han.write('\n') else: print "Cannot establish the type of this VCF variant" print line raise SystemExit in_han.close() out_han.close() out_igv_han.close() ### check if all G2P and VASE variants were found/matched in the proband's VCF found_all_G2P = True found_all_VASE = True for k,v in G2P_DICT.iteritems(): if int(v) == 0: print k found_all_G2P = False break for k,v in VASE_DICT.iteritems(): if int(v) == 0: print k found_all_VASE = False break if found_all_G2P: print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT)) else: print "ERROR: Could not find all G2P variants in the probands VCF file" raise SystemExit if found_all_VASE: print "OK: Found all %s VASE variants in the proband's VCF file" % (len(VASE_DICT)) else: print "ERROR: Could not find all VASE variants in the probands VCF file" raise SystemExit ### check if all G2P variants are written out if out_cntr == NUM_UNIQ_G2P_VARS: print "OK: All G2P vars are recorded in the output DECIPHER file" else: print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file" print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file) print "The batch snapshot file = %s" % (out_igv_file) sys.stdout.flush() def read_all_VCF_vars(in_vcf_file,THIS_DICT): in_han = gzip.open(in_vcf_file,'r') for line in in_han: if line.startswith('#'): continue data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = int(data[1]) ref = data[3] alt = data[4] # did the splitting and normalizing - should not have multiallelic variants if alt.find(',') != -1: print "ERROR: found multiallelic variant" print line raiseSystemExit key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key not in THIS_DICT: THIS_DICT[key] = 1 else: print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file) raise SystemExit in_han.close() def read_VASE(in_file): global NUM_UNIQ_VASE_VARS in_han = open(in_file,'r') for line in in_han: # ignore header lines if line.startswith('#'): continue data = [x.strip() for x in line.strip().split('\t')] chr = data[0] pos = data[1] ref = data[3] alt = data[4] key = '%s:%s:%s:%s' % (chr,pos,ref,alt) if key not in VASE_DICT: VASE_DICT[key] = 0 else: print "ERROR: duplicate VASE variant key = %s" % (key) raise SystemExit in_han.close() NUM_UNIQ_VASE_VARS = len(VASE_DICT) print "Found %s unique VASE denovo variants for CHILD (%s)" % (NUM_UNIQ_VASE_VARS,CHILD_ID) sys.stdout.flush() def read_G2P(in_file): global NUM_UNIQ_G2P_VARS # first, read the G2P variants on canonical transcripts for each of the parents in genes considered under the OBS=monoallelic scenario MOM_DICT = {} # key: chr:start:end:ref:alt ; value: ZYG DAD_DICT = {} # key: chr:start:end:ref:alt ; value: ZYG in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # ignore variants in the child sam_id = data[0] if sam_id == CHILD_ID: continue # ignore variants not on canonical transcripts is_canon = data[3] if is_canon != 'is_canonical': continue # select only variants in genes being considered under the dominant model inher_model = data[4] if inher_model != 'OBS=monoallelic': continue # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under the dominant model # and seen in one of the parents var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] GT = v_details[5] key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt) if sam_id == MOM_ID: if key not in MOM_DICT: MOM_DICT[key] = GT else: print "ERROR: a duplicate variant in dominant gene for MOM" print line raise SystemExit elif sam_id == DAD_ID: if key not in DAD_DICT: DAD_DICT[key] = GT else: print "ERROR: a duplicate variant in dominant gene for DAD" print line raise SystemExit else: print "ERROR: cannot identify the person for this variant" print line raise SystemExit in_han.close() print "Found %s unique G2P variants in MOM (%s) on canon transcript in genes being considered under the DOM model" % (len(MOM_DICT),MOM_ID) print "Found %s unique G2P variants in DAD (%s) on canon transcript in genes being considered under the DOM model" % (len(DAD_DICT),DAD_ID) sys.stdout.flush() # now, read the file again to collect child variants: under any inheritance model # if the gene is being considered under the dominant model, exclude variants seen in UNAFFECTED mother/father in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] # ignore variants in other memebers of the trio sam_id = data[0] if sam_id != CHILD_ID: continue # ignore variants not on canonical transcripts is_canon = data[3] if is_canon != 'is_canonical': continue inher_model = data[4] gene = data[1] transcript = data[2] var_list = [y.strip() for y in data[6].split(';')] for v in var_list: v_details = [z.strip() for z in v.split(':')] chr = v_details[0] start = int(v_details[1]) end = int(v_details[2]) ref = v_details[3] alt = v_details[4] GT = v_details[5] if inher_model == 'OBS=monoallelic': key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt) if (key in MOM_DICT) and (MOM_STAT == "UNAFFECTED"): MOM_GT = MOM_DICT[key] print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,GT,MOM_GT,MOM_STAT) continue if (key in DAD_DICT) and (DAD_STAT == "UNAFFECTED"): DAD_GT = DAD_DICT[key] print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,GT,DAD_GT,DAD_STAT) continue # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized if len(ref) < len(alt): # an INS orig_start = start orig_ref = ref orig_alt = alt start = orig_start ref = '-' alt = orig_alt[len(orig_ref):] print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) else: # a DEL print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" print line raise SystemExit # record the data for CHILD G2P variants (both DOM and REC model) key = '%s:%s:%s:%s' % (chr,start,ref,alt) if key not in G2P_DICT: G2P_DICT[key] = 0 else: # print "ERROR: duplicate G2P variant key = %s" % (key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass # and record the required data (transcript,gene,GT) in G2P_DATA if key not in G2P_DATA: G2P_DATA[key] = (transcript,gene,GT) else: # print "ERROR: duplicate G2P variant key = %s" % (key) # raise SystemExit # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req pass in_han.close() NUM_UNIQ_G2P_VARS = len(G2P_DICT) print "Found %s unique G2P variants for CHILD (%s) after DOM model filtering" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() def read_ped(in_file): global CHILD_ID global CHILD_SEX global DEC_CHILD_SEX global MOM_ID global MOM_STAT global DAD_ID global DAD_STAT CHILD_ID = 0 CHILD_SEX = 0 MOM_ID = 0 MOM_STAT = 0 DAD_ID = 0 DAD_STAT = 0 in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] if data[2] != '0' and data[3] != '0': # this is the child in the trio if CHILD_ID == 0: CHILD_ID = data[1] else: # seen another child print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment" raise SystemExit if DAD_ID == 0: DAD_ID = data[2] else: if data[2] != DAD_ID: print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID) raise SystemExit if MOM_ID == 0: MOM_ID = data[3] else: if data[3] != MOM_ID: print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID) raise SystemExit CHILD_SEX = int(data[4]) if CHILD_SEX == 1: # boy DEC_CHILD_SEX = '46XY' elif CHILD_SEX == 2: # girl DEC_CHILD_SEX = '46XX' else: print "ERROR: proband sex unknown" print line raise SystemExit if int(data[5]) != 2: print "ERROR: child not affected" print line raise SystemExit elif int(data[2]) == 0 and int(data[3]) == 0: # this is a parent record if int(data[4]) == 1: # this is the dad if int(data[5]) == 1: DAD_STAT = "UNAFFECTED" elif int(data[5]) == 2: DAD_STAT = "AFFECTED" else: print "ERROR: cannot establish the dad's status" print line raise SystemExit if DAD_ID == 0: DAD_ID = data[1] else: if data[1] != DAD_ID: print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID) raise SystemExit if int(data[4]) == 2: # this is the mom if int(data[5]) == 1: MOM_STAT = "UNAFFECTED" elif int(data[5]) == 2: MOM_STAT = "AFFECTED" else: print "ERROR: cannot establish mom's status" print line raise SystemExit if MOM_ID == 0: MOM_ID = data[1] else: if data[1] != MOM_ID: print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID) raise SystemExit else: print "ERROR: problematic PED line" print line raise SystemExit def read_map_file(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] dec_id = data[0] int_id = data[1] if dec_id not in MAP_DICT: MAP_DICT[dec_id] = int_id else: print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id) raise SystemExit in_han.close() def read_trans_map(in_file): in_han = open(in_file,'r') for line in in_han: data = [x.strip() for x in line.strip().split('\t')] old_trans_id = data[0] new_trans_id = data[1] if old_trans_id not in TRANS_DICT: TRANS_DICT[old_trans_id] = new_trans_id else: print "ERROR: duplicate old transcript ID = %s" % (old_trans_id) raise SystemExit in_han.close() if __name__ == '__main__': if len(sys.argv) == 12: go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11]) else: print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \ dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir" raise SystemExit