#	input:
#		the family PED file
#		G2P text output for the trio		[${FAMILY_ID}.report.txt]
#		VASE output				[${SAMPLE_ID}.clean.strict.denovo.vcf]
#
#
#		individual VCF file for the trio proband	[${FAMILY_ID}-gatk-haplotype-annotated.${SAMPLE_ID}.vcf.gz]
#		G2P text output for the trio			[${FAMILY_ID}.report.txt]
#		VASE output					[${SAMPLE_ID}.clean.strict.denovo.vcf]
#
#
#	output:
#		DECIPHER formated file for the proband
#		- all G2P variants
#		- denovo variants marked as such
#
#	checks:
#		all G2P variants found in the individual VCF
#		all VASE denovo variants found in the individual VCF
#
#       Author: MH
#       last modified: SEPT 27, 2019



import sys
import os
import csv
import gzip


ASSEMBLY = 'GRCh38'
INTERGENIC = 'No'
ACCESS = 'No'


G2P_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in G2P); 1 (if found in VCF) - for variants found in G2P output for this CHILD_ID
G2P_DATA = {}		# key: chr:pos:ref:alt; value: (transcript,gene,GT)
VASE_DICT = {}		# key: chr:pos:ref:alt; value: 0 (if found only in VASE); 1 (if found in VCF) - for variants found in VASE output for this CHILD_ID


NUM_UNIQ_G2P_VARS = 0
NUM_UNIQ_VASE_VARS = 0


CHILD_ID = 0
CHILD_SEX = 0
DEC_CHILD_SEX = 'unknown'

MOM_ID = 0
MOM_STAT = 0	# 1 = UNAFF, 2 = AFF

DAD_ID = 0
DAD_STAT = 0	# 1 = UNAFF, 2 = AFF


ALL_CHILD_DICT = {}		# key: chr:pos:ref:alt; value: (num_ALT_reads,VAF)
ALL_MOM_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant
ALL_DAD_DICT = {}		# key: chr:pos:ref:alt; value: irrelevant


CHILD_INHER_DICT = {}		# key: chr:pos:ref:alt; value: 'Paternally inherited, constitutive in father' | 'Maternally inherited, constitutive in mother' | 'Biparental' | 'De novo constitutive' | 'Unknown'

SNAP_FLANK = 25


MAP_DICT = {}			# key: family_id (aka decipher_id); value: internal (decipher) ID
TRANS_DICT = {}                 # key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER



def go(ped_file,in_g2p_file,in_vase_file,in_vcf,dec_dir,igv_dir,fam_igv_dir,data_source_dir,fam_id,map_dir,trans_map_file):

    # read the decipher to internal ID mapping file
    map_file = '%s/DECIPHER_INTERNAL_IDs.txt' % (map_dir)
    read_map_file(map_file)


    # read the transcript mapping file
    read_trans_map(trans_map_file)



    # read the ped file and establish CHILD_ID,CHILD_SEX,MOM_ID,DAD_ID
    read_ped(ped_file)

    if (CHILD_ID != 0) and (CHILD_SEX != 0) and (DEC_CHILD_SEX != 'unknown') and (MOM_ID != 0) and (MOM_STAT != 0) and (DAD_ID != 0) and (MOM_STAT != 0):
        print "======================================"
        print "Analyzing:"
        print "CHILD_ID = %s, CHILD_SEX = %s, DEC_CHILD_SEX = %s" % (CHILD_ID,CHILD_SEX,DEC_CHILD_SEX)
        print "MOM_ID = %s, MOM_STATUS = %s" % (MOM_ID,MOM_STAT)
        print "DAD_ID = %s, DAD_STATUS = %s" % (DAD_ID,DAD_STAT)
        print "======================================"
        sys.stdout.flush() 
    else:
        print "ERROR: problems reading the PED file = %s" % (ped_file)
        raise SystemExit       


    # read the G2P output for this family
    read_G2P(in_g2p_file)


    # read the VASE output for this family
    read_VASE(in_vase_file)


    # now read the individual VCFs and record all the variants
    child_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (CHILD_ID)
    mom_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (MOM_ID)
    dad_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (DAD_ID)


    read_all_VCF_vars(child_vcf_file,ALL_CHILD_DICT)
    print "Found %s unique VCF variants for CHILD (%s)" % (len(ALL_CHILD_DICT),CHILD_ID)
    sys.stdout.flush() 

    read_all_VCF_vars(mom_vcf_file,ALL_MOM_DICT)
    print "Found %s unique VCF variants for MOM (%s)" % (len(ALL_MOM_DICT),MOM_ID)
    sys.stdout.flush()

    read_all_VCF_vars(dad_vcf_file,ALL_DAD_DICT)
    print "Found %s unique VCF variants for DAD (%s)" % (len(ALL_DAD_DICT),DAD_ID)
    sys.stdout.flush()


    # now go over all child variants and set the inheritance
    num_child_vars_assigned = 0
    for key,v in ALL_CHILD_DICT.iteritems():
        if (key in ALL_MOM_DICT) and (key in ALL_DAD_DICT):
            CHILD_INHER_DICT[key] = 'Biparental'
            num_child_vars_assigned += 1
        elif key in ALL_MOM_DICT:
            CHILD_INHER_DICT[key] = 'Maternally inherited, constitutive in mother'
            num_child_vars_assigned += 1
        elif key in ALL_DAD_DICT:
            CHILD_INHER_DICT[key] = 'Paternally inherited, constitutive in father'
            num_child_vars_assigned += 1
        else:
            CHILD_INHER_DICT[key] = 'Unknown'

    assigned_ratio = (float(num_child_vars_assigned)/float(len(ALL_CHILD_DICT)))*100.0

    print "%s of the %s unique VCF variants (%.2f%%) for CHILD (%s) has been assigned to parents" % (num_child_vars_assigned,len(ALL_CHILD_DICT),assigned_ratio,CHILD_ID)
    sys.stdout.flush()





    # now read the child VCF, check if the variant in the G2P/VASE output, if yes:
    # set the value in the dict to 1
    # print out to to output file

    in_cntr = 0
    out_cntr = 0

    child_vcf_file = in_vcf + '.ready.%s.vcf.gz' % (CHILD_ID)
    in_han = gzip.open(child_vcf_file,'r')


    # setup the DECIPHER output file
    out_dec_file = '%s/%s_DEC.csv' % (dec_dir,CHILD_ID)
    out_han = open(out_dec_file,'w')
    out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')



    # setup the IGV snapshot file
    out_igv_file = '%s/%s.snapshot.txt' % (igv_dir,CHILD_ID)
    out_igv_han = open(out_igv_file,'w')
    out_igv_han.write('new\n')
    out_igv_han.write('genome hg38\n')
    out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
    out_igv_han.write('new\n')
    out_igv_han.write('load %s/%s/%s-ready.bam\n' % (data_source_dir,CHILD_ID,CHILD_ID))
    out_igv_han.write('load %s/%s/%s-ready.bam\n' % (data_source_dir,MOM_ID,MOM_ID))
    out_igv_han.write('load %s/%s/%s-ready.bam\n' % (data_source_dir,DAD_ID,DAD_ID))
    out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
    out_igv_han.write('\n')



    for line in in_han:
        if line.startswith('#'):
            continue

        in_cntr += 1

        data = [x.strip() for x in line.strip().split('\t')]
        chr = data[0]
        pos = int(data[1])
        ref = data[3]
        alt = data[4]
#        VCF_FORMAT = data[8]
#        VCF_FORMAT_SPLIT = [y.strip() for y in VCF_FORMAT.strip().split(':')]
#        if VCF_FORMAT_SPLIT[1] != 'AD':
#            print "ERROR: AD tag not found where expected"
#            print VCF_FORMAT
#            raise SystemExit

        VCF_VAR = data[9]
#        VCF_VAR_SPLIT = [z.strip() for z in VCF_VAR.strip().split(':')]
#        num_REF,num_ALT = [int(w.strip()) for w in VCF_VAR_SPLIT[1].strip().split(',')]  

#        if num_REF+num_ALT == 0:	# the result of spliting the family VCF to indivudual VCFs, non-variants are not excluded by bcftools
#            continue  
#        VAF = float(num_ALT)/float(num_REF+num_ALT)


        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)
        inher_stat = CHILD_INHER_DICT[key]



        ##############################################################
        # different processing depending on being a SNP, INS, or DEL #
        ##############################################################
         
        if len(ref) == len(alt):			# SNP
            if len(ref) != 1:
                print "ERROR: MNPs are not supported!"
                print line
                raise SystemExit

            key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
            is_denovo = False
            if key_to_match in VASE_DICT:
                VASE_DICT[key_to_match] = 1
                is_denovo = True
            if key_to_match in G2P_DICT:
                G2P_DICT[key_to_match] = 1
                trans = G2P_DATA[key_to_match][0]
                gene = G2P_DATA[key_to_match][1]
                GT = G2P_DATA[key_to_match][2]

                if is_denovo:
                    if inher_stat == 'Unknown': 
                        inher_stat = 'De novo constitutive'
                    else:
                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
                        raise SystemExit 

                if (chr != 'chrX') and (chr != 'chrY'):
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                    elif GT == 'HOM':
                        genotype = 'Homozygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit

                elif (chr == 'chrX') or (chr == 'chrY'):
                    if DEC_CHILD_SEX == '46XX':			# a girl
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                        elif GT == 'HOM':
                            genotype = 'Homozygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    elif DEC_CHILD_SEX == '46XY':		# a boy
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
                        elif GT == 'HOM':
                            genotype = 'Hemizygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    else:
                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
                        raise SystemExit
                else:
                    print "ERROR: unknown chr"
                    print line
                    raise SystemExit  

                # write to the DECIPHER file
                gene_id_idx = gene.find('(')
                if gene_id_idx == -1:
                    gene_id_idx = len(gene) 
                gene_id = gene[0:gene_id_idx]
                int_ID = MAP_DICT[fam_id]

                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
                    safe_trans = TRANS_DICT[trans]
                else:
                    safe_trans = trans

                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)

#                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)

                out_cntr += 1
                out_han.write(to_write)
###                print to_write[:-1]

                # write to the IGV file
                i_s = pos - SNAP_FLANK
                i_e = pos + SNAP_FLANK
                i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt)
                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
                out_igv_han.write('sort strand\n')
                out_igv_han.write('collapse\n')
                out_igv_han.write('snapshot %s\n' % (i_name))
                out_igv_han.write('\n')


            

        elif len(ref) > len(alt):			# DEL
            if len(alt) != 1:
                print "ERROR with a deletion"
                print line
                raise SystemExit 

            G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
            VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
            is_denovo = False
            if VASE_key_to_match in VASE_DICT:
                VASE_DICT[VASE_key_to_match] = 1
                is_denovo = True 
            if G2P_key_to_match in G2P_DICT:
                G2P_DICT[G2P_key_to_match] = 1
                trans = G2P_DATA[G2P_key_to_match][0]
                gene = G2P_DATA[G2P_key_to_match][1]
                GT = G2P_DATA[G2P_key_to_match][2]

                if is_denovo:
                    if inher_stat == 'Unknown':
                        inher_stat = 'De novo constitutive'
                    else:
                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
                        raise SystemExit

                if (chr != 'chrX') and (chr != 'chrY'):
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                    elif GT == 'HOM':
                        genotype = 'Homozygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit
                elif (chr == 'chrX') or (chr == 'chrY'):
                    if DEC_CHILD_SEX == '46XX':                 # a girl
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                        elif GT == 'HOM':
                            genotype = 'Homozygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    elif DEC_CHILD_SEX == '46XY':               # a boy
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
                        elif GT == 'HOM':
                            genotype = 'Hemizygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    else:
                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
                        raise SystemExit
                else:
                    print "ERROR: unknown chr"
                    print line
                    raise SystemExit

                # write to the DECIPHER file
                gene_id_idx = gene.find('(')
                if gene_id_idx == -1:
                    gene_id_idx = len(gene)
                gene_id = gene[0:gene_id_idx]
                int_ID = MAP_DICT[fam_id]

                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
                    safe_trans = TRANS_DICT[trans]
                else:
                    safe_trans = trans

                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)

#                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)

                out_cntr += 1
                out_han.write(to_write)
###                print to_write[:-1]

                # write to the IGV file
                i_s = pos - SNAP_FLANK
                i_e = pos + SNAP_FLANK
                i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt)
                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
                out_igv_han.write('sort strand\n')
                out_igv_han.write('collapse\n')
                out_igv_han.write('snapshot %s\n' % (i_name))
                out_igv_han.write('\n')



        elif len(ref) < len(alt):                       # INS
            if len(ref) != 1:
                print "ERROR with an insertion"
                print line
                raise SystemExit

            G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
            VASE_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
            is_denovo = False
            if VASE_key_to_match in VASE_DICT:
                VASE_DICT[VASE_key_to_match] = 1
                is_denovo = True
            if G2P_key_to_match in G2P_DICT:
                G2P_DICT[G2P_key_to_match] = 1
                trans = G2P_DATA[G2P_key_to_match][0]
                gene = G2P_DATA[G2P_key_to_match][1]
                GT = G2P_DATA[G2P_key_to_match][2]

                if is_denovo:
                    if inher_stat == 'Unknown':
                        inher_stat = 'De novo constitutive'
                    else:
                        print "ERROR: %s is both VASE denovo and %s from VCF" % (key,inher_stat)
                        raise SystemExit

                if (chr != 'chrX') and (chr != 'chrY'):
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                    elif GT == 'HOM':
                        genotype = 'Homozygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit
                elif (chr == 'chrX') or (chr == 'chrY'):
                    if DEC_CHILD_SEX == '46XX':                 # a girl
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                        elif GT == 'HOM':
                            genotype = 'Homozygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    elif DEC_CHILD_SEX == '46XY':               # a boy
                        if GT == 'HET':
                            genotype = 'Heterozygous'
                            print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (CHILD_ID,chr,pos,ref,alt,VCF_VAR)
                        elif GT == 'HOM':
                            genotype = 'Hemizygous'
                        else:
                            print "ERROR: Cannot understand GT = %s" % (GT)
                            raise SystemExit
                    else:
                        print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
                        raise SystemExit
                else:
                    print "ERROR: unknown chr"
                    print line
                    raise SystemExit


                # write to the DECIPHER file
                gene_id_idx = gene.find('(')
                if gene_id_idx == -1:
                    gene_id_idx = len(gene)
                gene_id = gene[0:gene_id_idx]
                int_ID = MAP_DICT[fam_id]

                if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
                    safe_trans = TRANS_DICT[trans]
                else:
                    safe_trans = trans

                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)

#                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (int_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene_id,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)
#                to_write = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,%s,,,,%s,\n" % (CHILD_ID,chr[3:],pos,ASSEMBLY,ref,alt,trans,gene,INTERGENIC,DEC_CHILD_SEX,ACCESS,inher_stat,genotype)

                out_cntr += 1
                out_han.write(to_write)
###                print to_write[:-1]

                # write to the IGV file
                i_s = pos - SNAP_FLANK
                i_e = pos + SNAP_FLANK
                i_name = '%s_%s_%s_%s_%s.png' % (CHILD_ID,chr,pos,ref,alt)
                out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
                out_igv_han.write('sort strand\n')
                out_igv_han.write('collapse\n')
                out_igv_han.write('snapshot %s\n' % (i_name))
                out_igv_han.write('\n')


        else:
            print "Cannot establish the type of this VCF variant"
            print line
            raise SystemExit 

    in_han.close()
    out_han.close()
    out_igv_han.close()








    ### check if all G2P and VASE variants were found/matched in the proband's VCF
    found_all_G2P = True
    found_all_VASE = True

    for k,v in G2P_DICT.iteritems():
        if int(v) == 0:
            print k 
            found_all_G2P = False
            break

    for k,v in VASE_DICT.iteritems():
        if int(v) == 0:
            print k
            found_all_VASE = False
            break   

    if found_all_G2P:
        print "OK: Found all %s G2P variants in the proband's VCF file" % (len(G2P_DICT))
    else:
        print "ERROR: Could not find all G2P variants in the probands VCF file"
        raise SystemExit

    if found_all_VASE:
        print "OK: Found all %s VASE variants in the proband's VCF file" % (len(VASE_DICT))
    else:
        print "ERROR: Could not find all VASE variants in the probands VCF file"
        raise SystemExit

    ### check if all G2P variants are written out - does not work at the moment, since we are excluding some G2P vars based on num_ALT_reads, VAF
    if out_cntr == NUM_UNIQ_G2P_VARS:
        print "OK: All G2P vars are recorded in the output DECIPHER file"
    else:
        print "ERROR: *NOT* all G2P vars are recorded in the G2P VCF file"


    print "Wrote %s variants in outfile = %s" % (out_cntr,out_dec_file)
    print "The batch snapshot file = %s" % (out_igv_file)
    sys.stdout.flush()








#########################################################################


 

def read_all_VCF_vars(in_vcf_file,THIS_DICT):

    in_han = gzip.open(in_vcf_file,'r')
    for line in in_han:
        if line.startswith('#'):
            continue

        data = [x.strip() for x in line.strip().split('\t')]
        chr = data[0]
        pos = int(data[1])
        ref = data[3]
        alt = data[4]


        # did the splitting and normalizing - should not have multiallelic variants
        if alt.find(',') != -1:
            print "ERROR: found multiallelic variant"
            print line
            raiseSystemExit

        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)  
        if key not in THIS_DICT:
            THIS_DICT[key] = 1
        else:
            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
            raise SystemExit

    in_han.close()   







def read_VASE(in_file):

    global NUM_UNIQ_VASE_VARS

    in_han = open(in_file,'r')
    for line in in_han:
        # ignore header lines
        if line.startswith('#'):
            continue

        data = [x.strip() for x in line.strip().split('\t')]
        chr = data[0]
        pos = data[1]
        ref = data[3]
        alt = data[4] 

        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)

        if key not in VASE_DICT:
            VASE_DICT[key] = 0
        else:
            print "ERROR: duplicate VASE variant key = %s" % (key)
            raise SystemExit

    in_han.close()
    NUM_UNIQ_VASE_VARS = len(VASE_DICT)
    print "Found %s unique VASE denovo variants for CHILD (%s)" % (NUM_UNIQ_VASE_VARS,CHILD_ID)
    sys.stdout.flush()





def read_G2P(in_file):

    global NUM_UNIQ_G2P_VARS

    # first, read the G2P variants on canonical transcripts for each of the parents in genes considered under the OBS=monoallelic scenario
    MOM_DICT = {}	# key: chr:start:end:ref:alt ; value: ZYG
    DAD_DICT = {}       # key: chr:start:end:ref:alt ; value: ZYG

    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]

        # ignore variants in the child
        sam_id = data[0]
        if sam_id == CHILD_ID:
            continue
        
        # ignore variants not on canonical transcripts
        is_canon = data[3]
        if is_canon != 'is_canonical':
            continue
        
        # select only variants in genes being considered under the dominant model
        inher_model = data[4]
        if inher_model != 'OBS=monoallelic':
            continue

        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under the dominant model
        # and seen in one of the parents
        var_list = [y.strip() for y in data[6].split(';')]
        for v in var_list:
            v_details = [z.strip() for z in v.split(':')]
            chr = v_details[0]
            start = int(v_details[1])
            end = int(v_details[2])
            ref = v_details[3]
            alt = v_details[4]
            GT = v_details[5]
            key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)

            if sam_id == MOM_ID:
                if key not in MOM_DICT:
                    MOM_DICT[key] = GT
                else:
                    print "ERROR: a duplicate variant in dominant gene for MOM"
                    print line
                    raise SystemExit

            elif sam_id == DAD_ID:
                if key not in DAD_DICT:
                    DAD_DICT[key] = GT
                else:
                    print "ERROR: a duplicate variant in dominant gene for DAD"
                    print line
                    raise SystemExit
            else:
                print "ERROR: cannot identify the person for this variant"
                print line
                raise SystemExit   

    in_han.close()
    print "Found %s unique G2P variants in MOM (%s) on canon transcript in genes being considered under the DOM model" % (len(MOM_DICT),MOM_ID)
    print "Found %s unique G2P variants in DAD (%s) on canon transcript in genes being considered under the DOM model" % (len(DAD_DICT),DAD_ID)
    sys.stdout.flush()





    # now, read the file again to collect child variants: under any inheritance model
    # if the gene is being considered under the dominant model, exclude variants seen in UNAFFECTED mother/father

    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]

        # ignore variants in other memebers of the trio 
        sam_id = data[0]
        if sam_id != CHILD_ID:
            continue

        # ignore variants not on canonical transcripts
        is_canon = data[3]
        if is_canon != 'is_canonical':
            continue  

        inher_model = data[4]
        gene = data[1] 
        transcript = data[2] 

        var_list = [y.strip() for y in data[6].split(';')]
        for v in var_list:
            v_details = [z.strip() for z in v.split(':')]
            chr = v_details[0]
            start = int(v_details[1])
            end = int(v_details[2])
            ref = v_details[3]
            alt = v_details[4]
            GT = v_details[5]

            if inher_model == 'OBS=monoallelic':
                key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)
                if (key in MOM_DICT) and (MOM_STAT == "UNAFFECTED"):
                    MOM_GT = MOM_DICT[key]
                    print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,GT,MOM_GT,MOM_STAT)
                    continue

#                    if GT == 'HET' and (MOM_GT == 'HET' or MOM_GT == 'HOM'):
#                        print "***Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s [DOM model]" % (key,GT,MOM_GT,MOM_STAT)
#                        continue
#                    else:
#                        pass

                if (key in DAD_DICT) and (DAD_STAT == "UNAFFECTED"):
                    DAD_GT = DAD_DICT[key]
                    print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,GT,DAD_GT,DAD_STAT)
                    continue

#                    if GT == 'HET' and (DAD_GT == 'HET' or DAD_GT == 'HOM'):
#                        print "***Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s [DOM model]" % (key,GT,DAD_GT,DAD_STAT)
#                        continue
#                    else:
#                        pass


            # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
            if len(ref) > 1 and len(alt) > 1:				# an INDEL - not normalized
                if len(ref) < len(alt):					# an INS
                    orig_start = start
                    orig_ref = ref
                    orig_alt = alt 
                    start = orig_start
                    ref = '-'
                    alt = orig_alt[len(orig_ref):]
                    print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)  
                else:							# a DEL
                    print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                    print line
                    raise SystemExit 

            
            # record the data for CHILD G2P variants (both DOM and REC model)
            key = '%s:%s:%s:%s' % (chr,start,ref,alt)
            
            if key not in G2P_DICT:
                G2P_DICT[key] = 0
            else:
                # print "ERROR: duplicate G2P variant key = %s" % (key)
                # raise SystemExit
                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
                pass

            # and record the required data (transcript,gene,GT) in G2P_DATA
            if key not in G2P_DATA:
                G2P_DATA[key] = (transcript,gene,GT)
            else:
                # print "ERROR: duplicate G2P variant key = %s" % (key)
                # raise SystemExit
                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
                pass
                          
    in_han.close()
    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
    print "Found %s unique G2P variants for CHILD (%s) after DOM model filtering" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
    sys.stdout.flush()





def read_ped(in_file):

    global CHILD_ID
    global CHILD_SEX
    global DEC_CHILD_SEX
    global MOM_ID
    global MOM_STAT 
    global DAD_ID
    global DAD_STAT

    CHILD_ID = 0
    CHILD_SEX = 0
    MOM_ID = 0
    MOM_STAT = 0
    DAD_ID = 0
    DAD_STAT = 0

    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        if data[2] != '0' and data[3] != '0':			# this is the child in the trio
            if CHILD_ID == 0:
                CHILD_ID = data[1]
            else:						# seen another child
                print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment"  
                raise SystemExit

            if DAD_ID == 0: 
                DAD_ID = data[2] 
            else:
                if data[2] != DAD_ID:
                    print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID)
                    raise SystemExit   
            if MOM_ID == 0:
                MOM_ID = data[3]
            else:
                if data[3] != MOM_ID:
                    print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID)
                    raise SystemExit

            CHILD_SEX = int(data[4])
            if CHILD_SEX == 1:		# boy
                DEC_CHILD_SEX = '46XY'
            elif CHILD_SEX == 2:	# girl
                DEC_CHILD_SEX = '46XX'
            else:
                print "ERROR: proband sex unknown"
                print line
                raise SystemExit

            if int(data[5]) != 2:
                print "ERROR: child not affected"
                print line
                raise SystemExit  


        elif int(data[2]) == 0 and int(data[3]) == 0:		# this is a parent record
            if int(data[4]) == 1:				# this is the dad
                if int(data[5]) == 1:
                    DAD_STAT = "UNAFFECTED"
                elif int(data[5]) == 2:
                    DAD_STAT = "AFFECTED" 
                else:  
                    print "ERROR: cannot establish the dad's status"
                    print line 
                    raise SystemExit

                if DAD_ID == 0:
                    DAD_ID = data[1]
                else:
                    if data[1] != DAD_ID:
                        print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID)
                        raise SystemExit
                
            if int(data[4]) == 2:                               # this is the mom
                if int(data[5]) == 1:
                    MOM_STAT = "UNAFFECTED"
                elif int(data[5]) == 2:
                    MOM_STAT = "AFFECTED"
                else:
                    print "ERROR: cannot establish mom's status"
                    print line
                    raise SystemExit

                if MOM_ID == 0:
                    MOM_ID = data[1]
                else:
                    if data[1] != MOM_ID:
                        print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID)
                        raise SystemExit
        else:
            print "ERROR: problematic PED line"
            print line
            raise SystemExit 






def read_map_file(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        dec_id = data[0]
        int_id = data[1]
        if dec_id not in MAP_DICT:
            MAP_DICT[dec_id] = int_id
        else:
            print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id)
            raise SystemExit
    in_han.close()  



def read_trans_map(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        old_trans_id = data[0]
        new_trans_id = data[1]
        if old_trans_id not in TRANS_DICT:
            TRANS_DICT[old_trans_id] = new_trans_id
        else:
            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
            raise SystemExit
    in_han.close()






if __name__ == '__main__':
    if len(sys.argv) == 12:
        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11])
    else:
        print "Suggested use: time python /home/u035/u035/shared/scripts/generate_DEC_IGV.py \
        2820-gatk-haplotype-annotated.2820_2820.vcf.gz \
        ../output_dd/2820_log_dir/2820.report.txt \
        /scratch/u035/u035/shared/analysis/wes_pilot/VASE/08042019/output/2820_2820.strict.denovo.vcf \
        2820_2820 \
        2820_2820.DEC.txt \
        DECIPHER_DIR \
        IGV_DIR \
        FAM_IGV_DIR \
        SOURCE_DIR  \
        fam_id \
        map_dir \
        trans_map_file"
        raise SystemExit