Skip to content
Snippets Groups Projects
NHS_WES_generate_DEC_IGV.py 52.4 KiB
Newer Older
#	input:
#		the family PED file				[${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped]
#		individual VCF file for the trio proband	[${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz]
#		G2P text output for the trio			[${FAMILY_ID}.report.txt]
#		VASE output					[${SAMPLE_ID}.clean.strict.denovo.vcf]
#
#
#	output:
#		DECIPHER formated file for the proband
#		- all G2P variants
#		- denovo variants marked as such
#
#	checks:
#		all G2P variants found in the individual VCF
#		all VASE denovo variants found in the individual VCF
#
#       Author: MH
#       last modified: DEC 05, 2019



import sys
import os
import csv
import gzip
from collections import defaultdict


ASSEMBLY = 'GRCh38'
INTERGENIC = 'No'
ACCESS = 'No'


G2P_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
G2P_DATA = {}		# key: chr:pos:ref:alt; value: (transcript,gene,GT)
VASE_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID


NUM_UNIQ_G2P_VARS = 0
NUM_UNIQ_VASE_VARS = 0


CHILD_ID = 0
CHILD_SEX = 0
DEC_CHILD_SEX = 'unknown'

MOM_ID = 0
MOM_STAT = 0	# 1 = UNAFF, 2 = AFF

DAD_ID = 0
DAD_STAT = 0	# 1 = UNAFF, 2 = AFF


ALL_CHILD_DICT = {}		# key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
ALL_MOM_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant
ALL_DAD_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant


CHILD_INHER_DICT = {}		# key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown'

SNAP_FLANK = 25


MAP_DICT = {}			# key: family_id (aka decipher_id); value: internal (decipher) ID
TRANS_DICT = {}			# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER






### call the python scrpit
#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV.py \
#${DEC_MAP} \
#${TRANS_MAP} \
#${PED_FILE} \
#${IN_G2P_FILE} \
#${IN_VASE_FILE} \
#${FAM_IGV_DIR} \
#${VCF_DIR} \
#${PLATE_ID} \
#${FAMILY_ID} \
#${DEC_DIR} \
#${FAM_BAM_DIR}







def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):

    # read the decipher to internal ID mapping file
    read_map_file(dec_map_file)


    # read the transcript mapping file
    read_trans_map(trans_map_file)


    # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
    read_ped(ped_file)

    if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (MOM_STAT != 0):
        print "======================================"
        print "Analyzing:"
        print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
        print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT)
        print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT)
        print "======================================"
        sys.stdout.flush() 
    else:
        print "ERROR: problems reading the PED file = %s" % (ped_file)
        raise SystemExit       


    # read the G2P output for this family
    read_G2P(in_g2p_file)


    # read the VASE output for this family
    read_VASE(in_vase_file)


    # now read the individual VCFs and record all the variants
    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
    mom_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,MOM_ID)
    dad_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,DAD_ID)


    read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
    print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
    sys.stdout.flush() 

    read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT)
    print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID)
    sys.stdout.flush()

    read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT)
    print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID)
    sys.stdout.flush()


    # now go over all child variants and set the inheritance
    num_child_vars_assigned = 0
    for key,v in ALL_CHILD_DICT.iteritems():
        if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT):
            CHILD_INHER_DICT[key] = 'Biparental'
            num_child_vars_assigned += 1
        elif key in ALL_MOM_DICT:
            CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother'
            num_child_vars_assigned += 1
        elif key in ALL_DAD_DICT:
            CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father'
            num_child_vars_assigned += 1
        else:
            CHILD_INHER_DICT[key] = 'Unknown'

    assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0

    print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
    sys.stdout.flush()






    # setup the DECIPHER output file
    out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID)		################################
    out_han = open(out_dec_file,'w')
    out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')


    # setup the IGV snapshot file
    out_igv_file = '%s/IGV/%s.snapshot.FLT.txt' % (dec_dir,CHILD_ID)	#################################
    out_igv_han = open(out_igv_file,'w')
    out_igv_han.write('new\n')
    out_igv_han.write('genome hg38\n')
    out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
    out_igv_han.write('new\n')

    child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
    mom_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,MOM_ID,MOM_ID)
    dad_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,DAD_ID,DAD_ID)
    out_igv_han.write('load %s\n' % (child_bam))
    out_igv_han.write('load %s\n' % (mom_bam))
    out_igv_han.write('load %s\n' % (dad_bam))

    out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
    out_igv_han.write('\n')
     


    # now read the child VCF, check if the variant in the G2P/VASE output, if yes:
    # set the value in the dict to 1
    # print out to to output file


Loading
Loading full blame...