generate_DEC_IGV_aff_sib_scripts_from_quad.py

#	input:
#		the family PED file
#		G2P text output for the trio		[${FAMILY_ID}.report.txt]
#		the joint and individual VCFs
#
#
#	output (per affected proband_:
#		DECIPHER formated file (all shared G2P variants)
#		IGV snapshot script file
#
#	checks:
#		all G2P variants found in the individual VCF
#
#       Author: MH
#       last modified: DEC 07, 2021


import sys
import os
import csv
import gzip
from collections import defaultdict


ASSEMBLY = 'GRCh38'
INTERGENIC = 'No'
ACCESS = 'No'


TRANS_DICT = {}				# key: transcriptID not found in DECIPHER; value: the chosen replacement transcriptID from those available in DECIPHER
KIDS_SEX_DICT = {}			# key: <indi_fam_id>; value: sex (in the format 46XX/46XY)
KIDS_G2P_DICT = defaultdict(dict)	# 1st level key: <indi_fam_id>; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans)
KIDS_VCF_DICT = defaultdict(dict)	# 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: (FS,SOR)
SHARED_DICT = {}			# key: chr:start:ref:alt; value: (ZYG,gene,trans)
NUM_SHARED_G2P_VARS = 0
SNAP_FLANK = 25

FS_THRESH = float(60)
SOR_THRESH = float(3)


## create the names of the needed files
#PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped
#IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_shared.report.txt
#FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_shared
#FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}

### call the python scrpit
#time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV_sib_from_quad.py \
#${DECIPHER_ID} \
#${TRANS_MAP} \
#${PED_FILE} \
#${IN_G2P_FILE} \
#${FAM_IGV_DIR} \
#${VCF_DIR} \
#${PLATE_ID} \
#${FAMILY_ID} \
#${DEC_DIR} \
#${FAM_BAM_DIR}


def go(dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir):

    # read the transcript mapping file
    read_trans_map(trans_map_file)

    # read the ped file and establish KID_ID + KID_SEX
    read_ped(ped_file)

    # read the G2P output for this family
    read_G2P(in_g2p_file)

    # now read the individual VCFs and record all the variants
    # list of all ids
    proband_ids = KIDS_G2P_DICT.keys()
    for pro_id in proband_ids:
        vcf_file = '%s/%s_%s_shared.ready.%s.vcf.gz' % (vcf_dir,plate_id,fam_id,pro_id)
        read_all_VCF_vars(vcf_file,KIDS_VCF_DICT,pro_id)
    print ""
    for k,v in KIDS_VCF_DICT.iteritems():
        print "Found %s unique VCF variants for affected proband (%s)" % (len(v),k)
    print ""
    sys.stdout.flush()


    print "Going over the varaints in each affected proband and checking each if it is shared G2P variant (to keep)"
    # setup the DECIPHER and IGV snapshot output files - per each affected proband
    proband_ids = KIDS_G2P_DICT.keys()
    for pro_id in proband_ids:

        num_out_vars = 0	# must be == NUM_SHARED_G2P_VARS

        out_dec_file = '%s/%s_shared_DEC_FLT.csv' % (dec_dir,pro_id)
        out_han = open(out_dec_file,'w')
        out_han.write('Internal reference number or ID,Chromosome,Start,Genome assembly,Reference allele,Alternate allele,Transcript,Gene name,Intergenic,Chromosomal sex,Other rearrangements/aneuploidy,Open-access consent,Age at last clinical assessment,Prenatal age in weeks,Note,Inheritance,Pathogenicity,Phenotypes,HGVS code,Genotype,Responsible contact\n')

        # setup the IGV snapshot file
        out_igv_file = '%s/IGV/%s.shared.snapshot.FLT.txt' % (dec_dir,pro_id)
        out_igv_han = open(out_igv_file,'w')
        out_igv_han.write('new\n')
        out_igv_han.write('genome hg38\n')
        out_igv_han.write('mkdir -p "%s"\n' % (fam_igv_dir))
        out_igv_han.write('new\n')

        child_bam = '%s/%s/%s-ready.bam' % (fam_bam_dir,pro_id,pro_id)
        out_igv_han.write('load %s\n' % (child_bam))
        out_igv_han.write('snapshotDirectory "%s"\n' % (fam_igv_dir))
        out_igv_han.write('\n')

        # go over the individual's VCF variants, check if found in the shared G2P variants, if yes - output it (with VCF's coordinates)
        # KIDS_VCF_DICT = defaultdict(dict)       # 1st level key: <indi_fam_id>; 2nd level key: chr:pos:ref:alt; value: irrelevant
        # SHARED_DICT = {}                        # key: chr:start:ref:alt; value: (ZYG,gene,trans)

        pro_vcf_vars = KIDS_VCF_DICT[pro_id]
        for pro_vcf_var,fs_sor in pro_vcf_vars.iteritems():
            chr,pos,ref,alt = pro_vcf_var.split(':')
            pos = int(pos)
            FS = fs_sor[0]
            SOR = fs_sor[1]

            # adjust pro_vcf_var for indels to match G2P style of recording
            if len(ref) == len(alt):							# SNP
                if len(ref) != 1:
                    print "ERROR: MNPs are not supported!"
                    print line
                    raise SystemExit
                G2P_key_to_match = '%s:%s:%s:%s' % (chr,pos,ref,alt)
            elif len(ref) > len(alt):							# DEL
                if len(alt) != 1:
                    print "ERROR with a deletion"
                    print line
                    raise SystemExit
                G2P_key_to_match = '%s:%s:%s:-' % (chr,pos+1,ref[1:])
            elif len(ref) < len(alt):							# INS
                if len(ref) != 1:
                    print "ERROR with an insertion"
                    print line
                    raise SystemExit
                G2P_key_to_match = '%s:%s:-:%s' % (chr,pos+1,alt[1:])
            else:
                print "Cannot establish the type of this VCF variant"
                print line
                raise SystemExit

            if G2P_key_to_match not in SHARED_DICT:					# an individual variant which is not in the shared G2P output
                continue

            # if here, this variant is in the shared G2P output, write it out
            print "\t%s:\tfound %s (VCF) -> %s (shared G2P)" % (pro_id,pro_vcf_var,G2P_key_to_match)

            GT = SHARED_DICT[G2P_key_to_match][0]
            gene = SHARED_DICT[G2P_key_to_match][1]
            trans = SHARED_DICT[G2P_key_to_match][2]

            inher_stat = 'Unknown'

            if (chr != 'chrX') and (chr != 'chrY'):
                if GT == 'HET':
                    genotype = 'Heterozygous'
                elif GT == 'HOM':
                    genotype = 'Homozygous'
                else:
                    print "ERROR: Cannot understand GT = %s" % (GT)
                    raise SystemExit

            elif (chr == 'chrX') or (chr == 'chrY'):
                if KIDS_SEX_DICT[pro_id] == '46XX':                 # a girl
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                    elif GT == 'HOM':
                        genotype = 'Homozygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit
                elif KIDS_SEX_DICT[pro_id] == '46XY':               # a boy
                    if GT == 'HET':
                        genotype = 'Heterozygous'
                        print "   WARNING: HET variant on chrX/Y for a boy (%s): %s\t%s\t%s\t%s\t%s" % (pro_id,chr,pos,ref,alt,pro_vcf_var)
                    elif GT == 'HOM':
                        genotype = 'Hemizygous'
                    else:
                        print "ERROR: Cannot understand GT = %s" % (GT)
                        raise SystemExit
                else:
                    print "ERROR: unknown sex for this proband = %s" % (DEC_CHILD_SEX)
                    raise SystemExit
            else:
                print "ERROR: unknown chr"
                print line
                raise SystemExit

            # write to the DECIPHER file
            gene_id_idx = gene.find('(')
            if gene_id_idx == -1:
                gene_id_idx = len(gene)
            gene_id = gene[0:gene_id_idx]

            if trans in TRANS_DICT:                         # if the transcriptID is to be replaced
                safe_trans = TRANS_DICT[trans]
            else:
                safe_trans = trans

            to_write = '%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,,%s,,,,"%s",,,,%s,\n' % (dec_id,chr[3:],pos,ASSEMBLY,ref,alt,safe_trans,gene_id,INTERGENIC,KIDS_SEX_DICT[pro_id],ACCESS,inher_stat,genotype)
            out_han.write(to_write)

            # write to the IGV file
            i_s = pos - SNAP_FLANK
            i_e = pos + SNAP_FLANK

            # check if above FS/SOR_THRESH to include in the snapshot name
            if (FS == '') or (SOR == ''):
                flag = 'NA'
            elif (FS >= FS_THRESH) and (SOR >= SOR_THRESH):
                flag = 'FS_%.1f_SOR_%.1f' % (FS,SOR)
            else:
                flag = 'OK'
            i_name = '%s_%s_%s_%s_%s_%s.png' % (pro_id,chr,pos,ref,alt,flag)

            out_igv_han.write('goto %s:%s-%s\n' % (chr,i_s,i_e))
            out_igv_han.write('sort strand\n')
            out_igv_han.write('squish\n')
            out_igv_han.write('snapshot %s\n' % (i_name))
            out_igv_han.write('\n')

            num_out_vars += 1

        out_han.close()
        out_igv_han.close()
        if num_out_vars == NUM_SHARED_G2P_VARS:
            print "\t%s:\tNumber of output variants matches the number of shared variants: OK" % (pro_id)
        else:
            print "\t%s:\tERROR: number of output variants does NOT match the number of shared variants" % (pro_id)
        print "\t%s:\tdecipher file = %s" % (pro_id,out_dec_file)
        print "\t%s:\tigv snapshot file for %s" % (pro_id,out_igv_file)
        print "\t--------------------------------"


def read_all_VCF_vars(in_vcf_file,THIS_DICT,pro_id):

    cntr = 0
    in_han = gzip.open(in_vcf_file,'r')
    for line in in_han:
        if line.startswith('#'):
            continue

        cntr += 1
        data = [x.strip() for x in line.strip().split('\t')]
        chr = data[0]
        pos = int(data[1])
        ref = data[3]
        alt = data[4]

        # extract FS and SOR
        FS = ''
        SOR = ''
        infos = [y.strip() for y in data[7].strip().split(';')]
        for info in infos:
            if info.startswith('FS='):
                tag,FS = info.split('=')
                FS = float(FS)
            elif info.startswith('SOR='):
                tag,SOR = info.split('=')
                SOR = float(SOR)

        # did the splitting and normalizing - should not have multiallelic variants
        if alt.find(',') != -1:
            print "ERROR: found multiallelic variant"
            print line
            raiseSystemExit

        key = '%s:%s:%s:%s' % (chr,pos,ref,alt)

        if pro_id not in THIS_DICT:
            THIS_DICT[pro_id][key] = (FS,SOR)
        elif key not in THIS_DICT[pro_id]:
            THIS_DICT[pro_id][key] = (FS,SOR)
        else:
            print "ERROR: duplicate key = %s in %s" % (key,in_vcf_file)
            raise SystemExit

    in_han.close()


def read_G2P(in_file):

    global NUM_SHARED_G2P_VARS

    known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance']

    # to make sure no duplicate vars per indi
    CHECK_DICT =  defaultdict(dict)       # 1st level key: indi_fam_id:chr:start:end:ref:alt; 2nd level key: OBS_state; value: irrelevant

    # first, read the G2P variants on canonical transcripts for each of the affected probands
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]

        # get the individual_id
        sam_id = data[0]

        # if, in addition to the  affected siblings there is an unaffected parent, they would be in the family VCF
        # they would be G2P-ed, must be excluded by now!!!
        if sam_id not in KIDS_SEX_DICT:
            print "ERROR: In the G2P file found a sample which is not an affected kid = %s !!!" % (sam_id)
            raise SystemExit

        # ignore variants not on canonical transcripts
        is_canon = data[3]
        if is_canon != 'is_canonical':
            continue

        # split the variants based on the gene's OBS model of inheritance
        inher_model = data[4]
        aaa,OBS_state = inher_model.split('=')

        if OBS_state not in known_OBS_states:
            print "ERROR: unknown OBS state = %s in %s" % (OBS_state,in_file)
            raise SystemExit

        # get the gene name in format ENSG00000165899(C12orf64,OTOGL)
        gene_name = data[1]

        # get the transcript name in format ENST00000238647
        transcript = data[2]

        # this is a list of variants (n>=1) on a canonical transcript in a gene being considered under any OBS state
        var_list = [y.strip() for y in data[6].split(';')]
        for v in var_list:
            v_details = [z.strip() for z in v.split(':')]
            chr = v_details[0]
            start = int(v_details[1])
            end = int(v_details[2])
            ref = v_details[3]
            alt = v_details[4]
            GT = v_details[5]
            second_key = '%s:%s:%s:%s:%s' % (chr,start,end,ref,alt)

##########################################################################
#            check_key = '%s:%s' % (sam_id,second_key)
#            if check_key not in CHECK_DICT:
#                CHECK_DICT[check_key][OBS_state] = 1
#            elif OBS_state not in CHECK_DICT[check_key].keys():
#                CHECK_DICT[check_key][OBS_state] = 1
#            else:
#                print "ERROR: a duplicate variant %s in %s gene for CHILD = %s, OBS_state = %s" % (check_key,gene_name,sam_id,OBS_state)
#                raise SystemExit
#
#            if sam_id not in KIDS_G2P_DICT:
#                KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
#            elif second_key not in KIDS_G2P_DICT[sam_id]:
#                KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
#            else:
##                print "ERROR: a duplicate variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
##                raise SystemExit
#                pass	# the same variant in diff OBS_state   - see above !
############################################################################


            ##########################################
            ### to deal with the new output of G2P ###
            ##########################################

            check_key = '%s:%s' % (sam_id,second_key)
            if check_key not in CHECK_DICT:						# first time we see this var in this sample, any OBS_state
                CHECK_DICT[check_key][OBS_state] = 1
                if sam_id not in KIDS_G2P_DICT:
                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                elif second_key not in KIDS_G2P_DICT[sam_id]:
                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                else:									# sanity check
                    print "ERROR: first time var already seen?: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
                    raise SystemExit

            elif OBS_state not in CHECK_DICT[check_key].keys():				# first time we see this var in this sample with this OBS_state
                CHECK_DICT[check_key][OBS_state] = 1
                if sam_id not in KIDS_G2P_DICT:
                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                elif second_key not in KIDS_G2P_DICT[sam_id]:
                    KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                elif KIDS_G2P_DICT[sam_id][second_key] == (GT,gene_name,transcript):	# diff OBS_state, but must have same (GT,gene_name,transcript)
                    pass
                else:
                    print "ERROR: diff (GT,gene_name,transcript) for variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
                    raise SystemExit

            else:       # same individual, same variant, known OBS_state
                        # due to the new output of G2P we may have the same variant but with different gene names - ensembl/refseq
                        # check the gene name in KIDS_G2P_DICT[sam_id][second_key]
                if not KIDS_G2P_DICT[sam_id][second_key][1].startswith('ENSG'):             # recorded is refseq
                    if gene_name.startswith('ENSG'):                                        # this is ensembl
                        KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)       # replace
                    else:                                                                   # this is refseq again, ignore
                        pass
                else:                                                                       # recorded is ensembl, ignore
                    pass


    in_han.close()
    print ""
    print ""
    print "Found the following variants on canonical transcripts in the G2P output for these affected probands"

    for id,val in KIDS_G2P_DICT.iteritems():
        print "--------------------------"
        for k,v in val.iteritems():
            print "    %s\t%s\t%s" % (id,k,v)
    print ""
    print ""


    ###################################################################################
    ####    SHARED variant filtering                                               ####
    ####    select only variants seen in all affected probands with the same GT    ####
    ###################################################################################

    print ""
    print "===   SHARED variant filtering   ==="

    # list of all ids
    proband_ids = KIDS_G2P_DICT.keys()
    print "All affected probands = %s" % (proband_ids)

    # for each proband, go thru all of their variants, check if seen in all probands excl this one, iff yes, record in SHARED_DICT
    for pro_id in proband_ids:
        other_pro_ids = []
        for aaa in proband_ids:
            other_pro_ids.append(aaa)
        other_pro_ids.remove(pro_id)
        print "Analyzing variants in %s, to be compared against the variants in all other affected probands %s" % (pro_id,other_pro_ids)

        # go thru all of their variants
        pro_vars = KIDS_G2P_DICT[pro_id]            # a dict with keys: chr,start,end,ref,alt and values: (GT,gene_name,transcript)
        for var_loc,var_info in pro_vars.iteritems():
            found_in_all = True

            # check if seen in all probands excl this one
            for o_id in other_pro_ids:
                if var_loc not in KIDS_G2P_DICT[o_id]:
                    print "  Excluding variant %s in %s, since not seen in %s" % (var_loc,pro_id,o_id)
                    found_in_all = False
                    break

                # if variant found, check if GT matches
                else:
                    o_info = KIDS_G2P_DICT[o_id][var_loc]
                    if var_info[0] != o_info[0]:
                        print "  Excluding variant %s in %s (GT = %s); it is seen in %s but GT does not match (ST = %s)" % (var_loc,pro_id,var_info[0],o_id,o_info[0])
                        found_in_all = False
                        break

            if found_in_all:	# this variant has been found in all affected probands with matching GT, keep it
                if var_loc not in SHARED_DICT:		# it has not been recorded previously when considering another proband

                    # for consistency with the standard trio-based processing
                    # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
                    chr,start,end,ref,alt = var_loc.split(":")
                    if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
                        if len(ref) < len(alt):                                 # an INS
                            orig_start = start
                            orig_ref = ref
                            orig_alt = alt
                            start = orig_start
                            ref = '-'
                            alt = orig_alt[len(orig_ref):]
                            print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
                        else:                                                   # a DEL
                            print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                            print line
                            raise SystemExit

                    new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)
                    SHARED_DICT[new_key] = var_info
                    print "  Keeping %s found in all affected probands, same GT" % (new_key)


        print "---------------------"

    NUM_SHARED_G2P_VARS = len(SHARED_DICT)
    print "Found %s unique and canonical G2P variants SHARED between all %s affected probands in this family" % (NUM_SHARED_G2P_VARS,len(proband_ids))
    sys.stdout.flush()
    print ""


def read_ped(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        kid_id = data[1]
        kid_se = int(data[4])
        if kid_se == 1:		# boy
            kid_sex =  '46XY'
        elif kid_se == 2:	# girl
            kid_sex =  '46XX'
        else:
            print "ERROR: proband sex unknown"
            print line
            raise SystemExit
        if kid_id not in KIDS_SEX_DICT:
            KIDS_SEX_DICT[kid_id] = kid_sex
        else:
            print "ERROR: proband sex unknown"
            print line
            raise SystemExit
    in_han.close()
    print "Found the following affected probands"
    for k,v in KIDS_SEX_DICT.iteritems():
        print "    %s: %s" % (k,v)
    sys.stdout.flush()


def read_trans_map(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        old_trans_id = data[0]
        new_trans_id = data[1]
        if old_trans_id not in TRANS_DICT:
            TRANS_DICT[old_trans_id] = new_trans_id
        else:
            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
            raise SystemExit
    in_han.close()


if __name__ == '__main__':
    if len(sys.argv) == 11:
        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10])
    else:
        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV_aff_probands.py \
        dec_id,trans_map_file,ped_file,in_g2p_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
        raise SystemExit