Skip to content
Snippets Groups Projects
generate_DEC_IGV_solo_scripts.py 45.12 KiB
#	input:
#		the family PED file				[${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped]
#		the VCF folder containing the proband VCF	[${PROJECT_ID}/VCF/  ${PLATE_ID}_${FAMILY_ID}.ready.${CHILD_ID}_${FAMILY_ID}.vcf.gz]
#		G2P text output for the trio			[${FAMILY_ID}.report.txt]
#
#
#	output:
#		DECIPHER formated file for the proband
#		- all (filtered) G2P variants
#
#	checks:
#		all G2P variants found in the individual VCF
#
#       Author: MH
#       last modified: MARCH 04, 2022



import sys
import os
import csv
import gzip
from collections import defaultdict


ASSEMBLY = 'GRCh38'
INTERGENIC = 'No'
ACCESS = 'No'


G2P_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
G2P_DATA = {}		# key: chr:pos:ref:alt; value: (transcript,gene,GT)


NUM_UNIQ_G2P_VARS = 0


CHILD_ID = 0
CHILD_SEX = 0
DEC_CHILD_SEX = 'unknown'


ALL_CHILD_DICT = {}		# key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
CHILD_INHER_DICT = {}           # key: chr:pos:ref:alt; value: "Unknown" - all variants in a singleton are of unknown inheritance


SNAP_FLANK = 25


MAP_DICT = {}			# key: family_id (aka decipher_id); value: internal (decipher) ID
TRANS_DICT = {}			# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER


FS_THRESH = float(60)
SOR_THRESH = float(3)



def go(dec_map_file,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):

    # read the decipher to internal ID mapping file
    read_map_file(dec_map_file)


    # read the transcript mapping file
    read_trans_map(trans_map_file)

    # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
    read_ped(ped_file)

    if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown'):
        print "======================================"
        print "Analyzing singleton CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
        print "======================================"
        sys.stdout.flush()
    else:
        print "ERROR: problems reading the PED file = %s" % (ped_file)
        raise SystemExit


    # read the G2P output for this family
    read_G2P(in_g2p_file)


    # now read the proband VCFs and record all the variants
    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
    read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
    print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
    sys.stdout.flush()

    # now go over all child variants and set the inheritance to "Unknown"
    num_child_vars_assigned = 0
    for key,v in ALL_CHILD_DICT.iteritems():
        CHILD_INHER_DICT[key] = 'Unknown'
    assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0
    print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
    sys.stdout.flush()



    # setup the DECIPHER output file
    out_dec_file = '%s/%s_DEC_FLT.csv' % (dec_dir,CHILD_ID)		################################
    out_han = open(out_dec_file,'w')
    out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')


    # setup the IGV snapshot file
    out_igv_file = '%s/IGV/%s.solo.snapshot.FLT.txt' % (dec_dir,CHILD_ID)	#################################
    out_igv_han = open(out_igv_file,'w')
    out_igv_han.write('new\n')
    out_igv_han.write('genome hg38\n')
    out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
    out_igv_han.write('new\n')

    child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,CHILD_ID,CHILD_ID)
    out_igv_han.write('load %s\n' % (child_bam))

    out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
    out_igv_han.write('\n')


    # now read the child VCF, check if the variant in the G2P output, if yes:
    # set the value in the dict to 1
    # print out to to output file

    in_cntr = 0
    out_cntr = 0

    child_vcf_file = '%s/%s_%s.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,CHILD_ID)
    in_han = gzip.open(child_vcf_file,'r')

    for line in in_han:
        if line.startswith('#'):
            continue

        in_cntr += 1

        data = [x.strip() for x in line.strip().split('\t')]
        chr = data[0]
        pos = int(data[1])
        ref = data[3]
        alt = data[4]

        # extract FS and SOR
        FS = ''
        SOR = ''
        infos = [y.strip() for y in data[7].strip().split(';')]
        for info in infos:
            if info.startswith('FS='):
                tag,FS = info.split('=')
                FS = float(FS)
            elif info.startswith('SOR='):
                tag,SOR = info.split('=')
                SOR = float(SOR)

        VCF_VAR = data[9]

        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
        inher_stat = CHILD_INHER_DICT[key]




        ##############################################################
        # different processing depending on being a SNP, INS, or DEL #
        ##############################################################

        if len(ref) == len(alt):			# SNP
            if len(ref) != 1:
                print "ERROR: MNPs are not supported!"
                print line
                raise SystemExit

            key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
            if key_to_match in G2P_DICT:
                G2P_DICT[key_to_match] = 1
                trans = G2P_DATA[key_to_match][0]
                gene = G2P_DATA[key_to_match][1]
                GT = G2P_DATA[key_to_match][2]

                if (chr != 'chrX') and (chr != 'chrY'):
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                    elif GT == 'HOM':
                        genotype = 'Homozygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit

                elif (chr == 'chrX') or (chr == 'chrY'):
                    if DEC_CHILD_SEX == '46XX':			# a girl
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                        elif GT == 'HOM':
                            genotype = 'Homozygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    elif DEC_CHILD_SEX == '46XY':		# a boy
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
                        elif GT == 'HOM':
                            genotype = 'Hemizygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    else:
                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
                        raise SystemExit
                else:
                    print "ERROR: unknown chr"
                    print line
                    raise SystemExit

                # write to the DECIPHER file
                gene_id_idx = gene.find('(')
                if gene_id_idx == -1:
                    gene_id_idx = len(gene)
                gene_id = gene[0:gene_id_idx]
                int_ID = MAP_DICT[fam_id]

                if trans in TRANS_DICT:				# if the transcriptID is to be replaced
                    safe_trans = TRANS_DICT[trans]
                else:
                    safe_trans = trans

                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
                out_cntr += 1
                out_han.write(to_write)

                # write to the IGV file
                i_s = pos - SNAP_FLANK
                i_e = pos + SNAP_FLANK

                # check if above FS/SOR_THRESH to include in the snapshot name
                if (FS == '') or (SOR == ''):
                    flag = 'NA'
                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
                else:
                    flag = 'OK'
                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)

                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
                out_igv_han.write('sort strand\n')
                out_igv_han.write('squish\n')
                out_igv_han.write('snapshot %s\n' % (i_name))
                out_igv_han.write('\n')



        elif len(ref) > len(alt):			# DEL
            if len(alt) != 1:
                print "ERROR with a deletion"
                print line
                raise SystemExit

            G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
            if G2P_key_to_match in G2P_DICT:
                G2P_DICT[G2P_key_to_match] = 1
                trans = G2P_DATA[G2P_key_to_match][0]
                gene = G2P_DATA[G2P_key_to_match][1]
                GT = G2P_DATA[G2P_key_to_match][2]

                if (chr != 'chrX') and (chr != 'chrY'):
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                    elif GT == 'HOM':
                        genotype = 'Homozygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit
                elif (chr == 'chrX') or (chr == 'chrY'):
                    if DEC_CHILD_SEX == '46XX':                 # a girl
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                        elif GT == 'HOM':
                            genotype = 'Homozygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    elif DEC_CHILD_SEX == '46XY':               # a boy
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
                        elif GT == 'HOM':
                            genotype = 'Hemizygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    else:
                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
                        raise SystemExit
                else:
                    print "ERROR: unknown chr"
                    print line
                    raise SystemExit

                # write to the DECIPHER file
                gene_id_idx = gene.find('(')
                if gene_id_idx == -1:
                    gene_id_idx = len(gene)
                gene_id = gene[0:gene_id_idx]
                int_ID = MAP_DICT[fam_id]

                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
                    safe_trans = TRANS_DICT[trans]
                else:
                    safe_trans = trans

                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
                out_cntr += 1
                out_han.write(to_write)

                # write to the IGV file
                i_s = pos - SNAP_FLANK
                i_e = pos + SNAP_FLANK

                # check if above FS/SOR_THRESH to include in the snapshot name
                if (FS == '') or (SOR == ''):
                    flag = 'NA'
                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
                else:
                    flag = 'OK'
                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)

                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
                out_igv_han.write('sort strand\n')
                out_igv_han.write('squish\n')
                out_igv_han.write('snapshot %s\n' % (i_name))
                out_igv_han.write('\n')



        elif len(ref) < len(alt):                       # INS
            if len(ref) != 1:
                print "ERROR with an insertion"
                print line
                raise SystemExit

            G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
            if G2P_key_to_match in G2P_DICT:
                G2P_DICT[G2P_key_to_match] = 1
                trans = G2P_DATA[G2P_key_to_match][0]
                gene = G2P_DATA[G2P_key_to_match][1]
                GT = G2P_DATA[G2P_key_to_match][2]

                if (chr != 'chrX') and (chr != 'chrY'):
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                    elif GT == 'HOM':
                        genotype = 'Homozygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit
                elif (chr == 'chrX') or (chr == 'chrY'):
                    if DEC_CHILD_SEX == '46XX':                 # a girl
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                        elif GT == 'HOM':
                            genotype = 'Homozygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    elif DEC_CHILD_SEX == '46XY':               # a boy
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
                        elif GT == 'HOM':
                            genotype = 'Hemizygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    else:
                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
                        raise SystemExit
                else:
                    print "ERROR: unknown chr"
                    print line
                    raise SystemExit


                # write to the DECIPHER file
                gene_id_idx = gene.find('(')
                if gene_id_idx == -1:
                    gene_id_idx = len(gene)
                gene_id = gene[0:gene_id_idx]
                int_ID = MAP_DICT[fam_id]

                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
                    safe_trans = TRANS_DICT[trans]
                else:
                    safe_trans = trans

                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
                out_cntr += 1
                out_han.write(to_write)

                # write to the IGV file
                i_s = pos - SNAP_FLANK
                i_e = pos + SNAP_FLANK

                # check if above FS/SOR_THRESH to include in the snapshot name
                if (FS == '') or (SOR == ''):
                    flag = 'NA'
                elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
                    flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
                else:
                    flag = 'OK'
                i_name = '%s_%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt,flag)

                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
                out_igv_han.write('sort strand\n')
                out_igv_han.write('squish\n')
                out_igv_han.write('snapshot %s\n' % (i_name))
                out_igv_han.write('\n')


        else:
            print "Cannot establish the type of this VCF variant"
            print line
            raise SystemExit

    in_han.close()
    out_han.close()
    out_igv_han.close()






    ### check if all G2P and VASE variants were found/matched in the proband's VCF
    found_all_G2P = True
    for k,v in G2P_DICT.iteritems():
        if int(v) == 0:
            print k
            found_all_G2P = False
            break

    if found_all_G2P:
        print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT))
    else:
        print "ERROR: Could not find all G2P variants in the probands VCF file"
        raise SystemExit


    ### check if all G2P variants are written out
    if out_cntr == NUM_UNIQ_G2P_VARS:
        print "OK: All G2P vars are recorded in the output DECIPHER file"
    else:
        print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file"

    print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file)
    print "The batch snapshot file = %s" % (out_igv_file)
    sys.stdout.flush()










def read_all_VCF_vars(in_vcf_file,THIS_DICT):

    in_han = gzip.open(in_vcf_file,'r')
    for line in in_han:
        if line.startswith('#'):
            continue

        data = [x.strip() for x in line.strip().split('\t')]
        chr = data[0]
        pos = int(data[1])
        ref = data[3]
        alt = data[4]


        # did the splitting and normalizing - should not have multiallelic variants
        if alt.find(',') != -1:
            print "ERROR: found multiallelic variant"
            print line
            raiseSystemExit

        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
        if key not in THIS_DICT:
            THIS_DICT[key] = 1
        else:
            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
            raise SystemExit

    in_han.close()















def read_G2P(in_file):

    global NUM_UNIQ_G2P_VARS

    known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het']

    # first, read the G2P variants on canonical transcripts for the singleton
    CHILD_DICT = defaultdict(dict)	# 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)

    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]

        # get the individual_id
        sam_id = data[0]

        # ignore variants not on canonical transcripts
        is_canon = data[3]
        if is_canon != 'is_canonical':
            continue

        # split the variants based on the gene's OBS model of inheritance
        inher_model = data[4]
        aaa,OBS_state = inher_model.split('=')

        if OBS_state not in known_OBS_states:
            print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
            raise SystemExit

        # get the gene name in format ENSG00000165899(C12orf64,OTOGL) or gene-MYT1L(MYT1L)
        gene_name = data[1]

        # get the transcript name in format ENST00000238647 or gene-MYT1L(MYT1L)
        transcript = data[2]


        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
        var_list = [y.strip() for y in data[6].split(';')]
        for v in var_list:
            v_details = [z.strip() for z in v.split(':')]
            chr = v_details[0]
            start = int(v_details[1])
            end = int(v_details[2])
            ref = v_details[3]
            alt = v_details[4]
            GT = v_details[5]
            second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)


            if sam_id == CHILD_ID:
                # check for duplication
                if OBS_state not in CHILD_DICT:
                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
                elif second_key not in CHILD_DICT[OBS_state]:
                    CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript)
                else:		# already recorded this variant
                     		# if we have refseq recorded and this is ensembl --> replace
                    if not CHILD_DICT[OBS_state][second_key][1].startswith('ENSG'):		# recorded is refseq
                        if gene_name.startswith('ENSG'):					# this is ensembl
                            CHILD_DICT[OBS_state][second_key] = (GT,gene_name,transcript) 	# replace
                        else:									# this is refseq again, ignore
                            pass
                    else:									# recorded is ensembl, ignore
                        pass

            else:
                print "ERROR: cannot identify the person for this variant"
                print line
                raise SystemExit

    in_han.close()


    ### print out the number of unique G2P variants in CHILD ###
    child_mono = 0
    child_bi = 0
    child_hem = 0
    child_het = 0

    if 'monoallelic_autosomal' in CHILD_DICT:
        child_mono = len(CHILD_DICT['monoallelic_autosomal'])
    if 'biallelic_autosomal' in CHILD_DICT:
        child_bi = len(CHILD_DICT['biallelic_autosomal'])
    if 'monoallelic_X_hem' in CHILD_DICT:
        child_hem = len(CHILD_DICT['monoallelic_X_hem'])
    if 'monoallelic_X_het' in CHILD_DICT:
        child_het = len(CHILD_DICT['monoallelic_X_het'])

    print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID)
    print "    monoallelic_autosomal: %s" % (child_mono)
    print "    biallelic_autosomal: %s" % (child_bi)
    print "    monoallelic_X_hem: %s" % (child_hem)
    print "    monoallelic_X_het: %s" % (child_het)




    ######################################################################################################
    ####    Dominant filtering                                                                        ####
    ####    if the gene has been considered under the dominant model (OBS == monoallelic_autosomal)   ####
    ####    exclude child variants seen in UNAFFECTED mother/father, regardless of GT                 ####
    ######################################################################################################


    print ""
    print "===   monoallelic autosomal (DOMINANT) filtering   ==="


    for key in CHILD_DICT['monoallelic_autosomal']:	# this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)

        CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0]
        CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1]
        CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2]

#        if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"):
#            MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0]
#            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT)
#            continue
#
#        if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"):
#            DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0]
#            print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
#            continue


        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
        chr,start,end,ref,alt = key.split(":")
        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
            if len(ref) < len(alt):                                 # an INS
                orig_start = start
                orig_ref = ref
                orig_alt = alt
                start = orig_start
                ref = '-'
                alt = orig_alt[len(orig_ref):]
                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
            else:                                                   # a DEL
                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                print line
                raise SystemExit

        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)

        # record the data for CHILD G2P variants (for OBS=monoallelic)

        if new_key not in G2P_DICT:
            G2P_DICT[new_key] = 0
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass

        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
        if new_key not in G2P_DATA:
            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass


    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
    sys.stdout.flush()

    print ""





    ##############################################################################################################
    ####    Recessive filtering                                                                               ####
    ####    under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene           ####
    ####    must all be HET in CHILD, GT in parent does not matter                                            ####
    ####    all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) ####
    ####    and this parent must be unaffected                                                                ####
    ####    if all these: then exclude all child variants in this gene                                        ####
    ##############################################################################################################


    print ""
    print "===   biallelic autosomal (RECESSIVE) filtering   ==="


    GENE_KEY_GT = defaultdict(dict)		# for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans)

    # process all variants in biallelic genes in child
    for key in CHILD_DICT['biallelic_autosomal']:		# this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
        b_GT = CHILD_DICT['biallelic_autosomal'][key][0]
        b_gene = CHILD_DICT['biallelic_autosomal'][key][1]
        b_trans = CHILD_DICT['biallelic_autosomal'][key][2]
        GENE_KEY_GT[b_gene][key] = (b_GT,b_trans)

    # iterate over genes in GENE_KEY_GT
    for g in GENE_KEY_GT: 			# this is the biallelic gene name
#        all_HET = True
#
#        # iterate over variants in this gene
#        for kx in GENE_KEY_GT[g]:		# this the second key: chr:start:end:ref:alt
#            if GENE_KEY_GT[g][kx][0] == 'HOM':     # there is a HOM variant in the child - NO filtering
#                all_HET = False
#                break
#
#        if all_HET:				# for this gene
#        # all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent
#        # if yes, filter out and write a message to the log file
#        # if not, to be added to G2P_DICT and G2P_DATA for further processing
#
#            all_from_one_parent = True
#
#            # iterate again over the variants in this gene
#            VAR_SOURCE_LIST = {}		# key: chr:start:end:ref:alt in child; value: (NONE) or (MOM or DAD or BOTH and the parent is UNAFFECTED)
#
#            for ky in GENE_KEY_GT[g]:		# this the second key: chr:start:end:ref:alt
#
#                this_var_status = 'NONE'
#
#                if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"):
#                    this_var_status = 'MOM'
#                if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"):
#                    if this_var_status == 'NONE':
#                        this_var_status = 'DAD'
#                    elif this_var_status == 'MOM':
#                        this_var_status = 'BOTH'
#
#                VAR_SOURCE_LIST[ky] = this_var_status
#
#            # have collected the parent source for all variants in this gene
#            tot_num_vars = len(VAR_SOURCE_LIST)
#            num_mom = 0
#            num_dad = 0
#            num_none = 0
#            for kt,v in VAR_SOURCE_LIST.iteritems():
#                if v == 'NONE':
#                    num_none += 1
#                elif v == 'MOM':
#                    num_mom += 1
#                elif v == 'DAD':
#                    num_dad += 1
#                elif v == 'BOTH':
#                    num_mom += 1
#                    num_dad += 1
#                else:
#                    print "ERROR: cannot understand the source parent = %s" % (v)
#                    raise SystemExit
#
#            if num_none > 0:
#                all_from_one_parent = False
#            elif num_mom < tot_num_vars and num_dad < tot_num_vars:
#                all_from_one_parent = False
#
#            # if all variants in the child in this gene are found in single unaffected parent - filter out
#            if all_from_one_parent:
#                for kz in GENE_KEY_GT[g]:
#                    print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz])
#                continue
#
#        # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written
#        # and gone to evaluating the next biallelic gene in the child
#

#        # if here
#        # - either not all CHILD variants in this gene are not HET, or
#        # - not all of them can be traced to a single unaffected parent
#        # --> add to be processed
#
        # here we are at gene level, must iterate over all variants in this gene
        # iterate over variants in this gene
        for kkk in GENE_KEY_GT[g]:                # this the second key: chr:start:end:ref:alt

            CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0]
            CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1]
            CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2]

            # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
            chr,start,end,ref,alt = kkk.split(":")
            if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
                if len(ref) < len(alt):                                 # an INS
                    orig_start = start
                    orig_ref = ref
                    orig_alt = alt
                    start = orig_start
                    ref = '-'
                    alt = orig_alt[len(orig_ref):]
                    print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
                else:                                                   # a DEL
                    print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                    print line
                    raise SystemExit

            new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)

            # record the data for CHILD G2P variants (for OBS=biallelic)
            if new_key not in G2P_DICT:
                G2P_DICT[new_key] = 0
            else:
                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
                # raise SystemExit
                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
                pass

            # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
            if new_key not in G2P_DATA:
                G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
            else:
                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
                # raise SystemExit
                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
                pass

    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
    sys.stdout.flush()
    print ""









    ####################################################################################################################
    ####    X-linked filtering                                                                                      ####
#.#    ####    under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance)      ####
    ####    under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het)                                    ####
    ####    exclude child HET variants if seen as HOM in UNAFFECTED father                                          ####
    ####													    ####
    ####    Note 18/01/2022    									    		    ####
    ####    This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het  ####
    ####    and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance           ####
    ####    the code below treats x-linked over-dominance as the others (i.e. filters, while it should not)         ####
    ####    Issue flagged to G2P plug-in team, awaiting their fix						    ####
    ####    for now manually scan the output of G2P for the proband (both for boys and girls)                       ####
    ####        to check if any variant has been called in PCDH19 and EFNB1                                         ####
    ####    also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name    ####
    ####################################################################################################################


    print ""
    print "===   X-linked filtering   ==="

    #######################################
    ### process monoallelic_X_hem genes ###
    #######################################

    for key in CHILD_DICT['monoallelic_X_hem']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)

        CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0]
        CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1]
        CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2]

#        if CHILD_GT == 'HOM':							# do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl)
#            pass
#        else:
#            if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"):
#                DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0]
#                if DAD_GT == 'HOM':						# i.e., hemizygous variant in unaffected father
#                    print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
#                    continue
#
        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
        chr,start,end,ref,alt = key.split(":")
        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
            if len(ref) < len(alt):                                 # an INS
                orig_start = start
                orig_ref = ref
                orig_alt = alt
                start = orig_start
                ref = '-'
                alt = orig_alt[len(orig_ref):]
                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
            else:                                                   # a DEL
                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                print line
                raise SystemExit

        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)

        # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem)
        if new_key not in G2P_DICT:
            G2P_DICT[new_key] = 0
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass

        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
        if new_key not in G2P_DATA:
            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass



    #######################################
    ### process monoallelic_X_het genes ###
    #######################################

    for key in CHILD_DICT['monoallelic_X_het']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)

        CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0]
        CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1]
        CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2]

#        if CHILD_GT == 'HOM':                                                   # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl)
#            pass
#        else:
#            if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"):
#                DAD_GT = DAD_DICT['monoallelic_X_het'][key][0]
#                if DAD_GT == 'HOM':                                             # i.e., x-linked dominant variant in unnafected father
#                    print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
#                    continue
#
        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
        chr,start,end,ref,alt = key.split(":")
        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
            if len(ref) < len(alt):                                 # an INS
                orig_start = start
                orig_ref = ref
                orig_alt = alt
                start = orig_start
                ref = '-'
                alt = orig_alt[len(orig_ref):]
                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
            else:                                                   # a DEL
                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                print line
                raise SystemExit

        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)

        # record the data for CHILD G2P variants (for OBS=monoallelic_X_het)
        if new_key not in G2P_DICT:
            G2P_DICT[new_key] = 0
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass

        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
        if new_key not in G2P_DATA:
            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass



##.#    ########################################################################
##.#    ### process x-linked over-dominance  genes - no filtering to be done ###
##.#    ########################################################################
#
##.#    for key in CHILD_DICT['x-linked over-dominance']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
#
##.#        CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0]
##.#        CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1]
##.#        CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2]
#
##.#        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
##.#        chr,start,end,ref,alt = key.split(":")
##.#        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
##.#            if len(ref) < len(alt):                                 # an INS
##.#                orig_start = start
##.#                orig_ref = ref
##.#                orig_alt = alt
##.#                start = orig_start
##.#                ref = '-'
##.#                alt = orig_alt[len(orig_ref):]
##.#                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
##.#            else:                                                   # a DEL
##.#                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
##.#                print line
##.#                raise SystemExit
#
##.#        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
#
##.#        # record the data for CHILD G2P variants (for OBS=x-linked over-dominance)
##.#        if new_key not in G2P_DICT:
##.#            G2P_DICT[new_key] = 0
##.#        else:
##.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
##.#            # raise SystemExit
##.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
##.#            pass
#
##.#        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
##.#        if new_key not in G2P_DATA:
##.#            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
##.#        else:
##.#            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
##.#            # raise SystemExit
##.#            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
##.#            pass


    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
    sys.stdout.flush()

    print ""
    print ""

















def read_ped(in_file):

    global CHILD_ID
    global CHILD_SEX
    global DEC_CHILD_SEX

    CHILD_ID = 0
    CHILD_SEX = 0

    # no need to do PED checks, did them for singletons at trio_setup.sh
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        CHILD_ID = data[1]
        CHILD_SEX = int(data[4])
        if CHILD_SEX == 1:          # boy
            DEC_CHILD_SEX = '46XY'
        elif CHILD_SEX == 2:        # girl
            DEC_CHILD_SEX = '46XX'
        else:
            print "ERROR: proband sex unknown"
            print line
            raise SystemExit




def read_map_file(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        dec_id = data[0]
        int_id = data[1]
        if dec_id not in MAP_DICT:
            MAP_DICT[dec_id] = int_id
        else:
            print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id)
            raise SystemExit
    in_han.close()




def read_trans_map(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        old_trans_id = data[0]
        new_trans_id = data[1]
        if old_trans_id not in TRANS_DICT:
            TRANS_DICT[old_trans_id] = new_trans_id
        else:
            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
            raise SystemExit
    in_han.close()






if __name__ == '__main__':
    if len(sys.argv) == 11:
        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10])
    else:
        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \
        dec_map_file,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
        raise SystemExit