Skip to content
Snippets Groups Projects
generate_DEC_IGV_scripts.py 53.8 KiB
Newer Older
        # here we are at gene level, must iterate over all variants in this gene
        # iterate over variants in this gene
        for kkk in GENE_KEY_GT[g]:                # this the second key: chr:start:end:ref:alt

            CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0]
            CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1]
            CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2]

            # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
            chr,start,end,ref,alt = kkk.split(":")
            if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
                if len(ref) < len(alt):                                 # an INS
                    orig_start = start
                    orig_ref = ref
                    orig_alt = alt
                    start = orig_start
                    ref = '-'
                    alt = orig_alt[len(orig_ref):]
                    print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
                else:                                                   # a DEL
                    print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                    print line
                    raise SystemExit

            new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)

            # record the data for CHILD G2P variants (for OBS=biallelic)
            if new_key not in G2P_DICT:
                G2P_DICT[new_key] = 0
            else:
                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
                # raise SystemExit
                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
                pass

            # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
            if new_key not in G2P_DATA:
                G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
            else:
                # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
                # raise SystemExit
                # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
                pass

    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
    sys.stdout.flush()
    print ""









    ####################################################################################################################
    ####    X-linked filtering                                                                                      ####
#.#    ####    under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance)      ####
    ####    under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het)                                    ####
    ####    exclude child HET variants if seen as HOM in UNAFFECTED father                                          ####
    ####													    ####
    ####    Note 18/01/2022    									    		    ####
    ####    This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het  ####
    ####    and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance           ####
    ####    the code below treats x-linked over-dominance as the others (i.e. filters, while it should not)         ####
    ####    Issue flagged to G2P plug-in team, awaiting their fix						    ####
    ####    for now manually scan the output of G2P for the proband (both for boys and girls)                       ####
    ####        to check if any variant has been called in PCDH19 and EFNB1                                         ####
    ####    also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name    ####
    ####################################################################################################################


    print ""
    print "===   X-linked filtering   ==="

    #######################################
    ### process monoallelic_X_hem genes ###
    #######################################

    for key in CHILD_DICT['monoallelic_X_hem']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)

        CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0]
        CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1]
        CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2]

        if CHILD_GT == 'HOM':							# do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl)
            pass
        else:
            if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"):
                DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0]
                if DAD_GT == 'HOM':						# i.e., hemizygous variant in unaffected father
                    print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
                    continue

        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
        chr,start,end,ref,alt = key.split(":")
        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
            if len(ref) < len(alt):                                 # an INS
                orig_start = start
                orig_ref = ref
                orig_alt = alt
                start = orig_start
                ref = '-'
                alt = orig_alt[len(orig_ref):]
                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
            else:                                                   # a DEL
                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                print line
                raise SystemExit

        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)

        # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem)
        if new_key not in G2P_DICT:
            G2P_DICT[new_key] = 0
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass

        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
        if new_key not in G2P_DATA:
            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass



    #######################################
    ### process monoallelic_X_het genes ###
    #######################################

    for key in CHILD_DICT['monoallelic_X_het']:       # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans)

        CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0]
        CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1]
        CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2]

        if CHILD_GT == 'HOM':                                                   # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl)
            pass
        else:
            if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"):
                DAD_GT = DAD_DICT['monoallelic_X_het'][key][0]
                if DAD_GT == 'HOM':                                             # i.e., x-linked dominant variant in unnafected father
                    print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT)
                    continue

        # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P)
        chr,start,end,ref,alt = key.split(":")
        if len(ref) > 1 and len(alt) > 1:                           # an INDEL - not normalized
            if len(ref) < len(alt):                                 # an INS
                orig_start = start
                orig_ref = ref
                orig_alt = alt
                start = orig_start
                ref = '-'
                alt = orig_alt[len(orig_ref):]
                print "    WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt)
            else:                                                   # a DEL
                print "ERROR: At the momemnt, cannot deal with this non-normalized deletion"
                print line
                raise SystemExit

        new_key = '%s:%s:%s:%s' % (chr,start,ref,alt)

        # record the data for CHILD G2P variants (for OBS=monoallelic_X_het)
        if new_key not in G2P_DICT:
            G2P_DICT[new_key] = 0
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass

        # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA
        if new_key not in G2P_DATA:
            G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT)
        else:
            # print "ERROR: duplicate G2P variant new_key = %s" % (new_key)
            # raise SystemExit
            # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req
            pass


    NUM_UNIQ_G2P_VARS = len(G2P_DICT)
    print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID)
    print ""
    print ""








def read_ped(in_file):

    global CHILD_ID
    global CHILD_SEX
    global DEC_CHILD_SEX
    global MOM_ID
    global MOM_STAT
    global DAD_ID
    global DAD_STAT

    CHILD_ID = 0
    CHILD_SEX = 0
    MOM_ID = 0
    MOM_STAT = 0
    DAD_ID = 0
    DAD_STAT = 0

    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        if data[2] != '0' and data[3] != '0':			# this is the child in the trio
            if CHILD_ID == 0:
                CHILD_ID = data[1]
            else:						# seen another child
                print "ERROR: already have seen a child (possibly a quad) - cannot handle at the moment"
                raise SystemExit

            if DAD_ID == 0:
                DAD_ID = data[2]
            else:
                if data[2] != DAD_ID:
                    print "ERROR: DAD_ID mismatch - from child line dad_id = %s, from dad line dad_id = %s" % (data[2],DAD_ID)
                    raise SystemExit
            if MOM_ID == 0:
                MOM_ID = data[3]
            else:
                if data[3] != MOM_ID:
                    print "ERROR: MOM_ID mismatch - from child line mom_id = %s, from mom line mom_id = %s" % (data[3],MOM_ID)
                    raise SystemExit

            CHILD_SEX = int(data[4])
            if CHILD_SEX == 1:		# boy
                DEC_CHILD_SEX = '46XY'
            elif CHILD_SEX == 2:	# girl
                DEC_CHILD_SEX = '46XX'
            else:
                print "ERROR: proband sex unknown"
                print line
                raise SystemExit

            if int(data[5]) != 2:
                print "ERROR: child not affected"
                print line
                raise SystemExit


        elif int(data[2]) == 0 and int(data[3]) == 0:		# this is a parent record
            if int(data[4]) == 1:				# this is the dad
                if int(data[5]) == 1:
                    DAD_STAT = "UNAFFECTED"
                elif int(data[5]) == 2:
                    DAD_STAT = "AFFECTED"
                else:
                    print "ERROR: cannot establish the dad's status"
                    print line
                    raise SystemExit

                if DAD_ID == 0:
                    DAD_ID = data[1]
                else:
                    if data[1] != DAD_ID:
                        print "ERROR: DAD_ID mismatch - from dad line dad_id = %s, from child line dad_id = %s" % (data[1],DAD_ID)
                        raise SystemExit

            if int(data[4]) == 2:                               # this is the mom
                if int(data[5]) == 1:
                    MOM_STAT = "UNAFFECTED"
                elif int(data[5]) == 2:
                    MOM_STAT = "AFFECTED"
                else:
                    print "ERROR: cannot establish mom's status"
                    print line
                    raise SystemExit

                if MOM_ID == 0:
                    MOM_ID = data[1]
                else:
                    if data[1] != MOM_ID:
                        print "ERROR: MOM_ID mismatch - from mom line mom_id = %s, from child line mom_id = %s" % (data[1],MOM_ID)
                        raise SystemExit
        else:
            print "ERROR: problematic PED line"
            print line
            raise SystemExit






def read_map_file(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        dec_id = data[0]
        int_id = data[1]
        if dec_id not in MAP_DICT:
            MAP_DICT[dec_id] = int_id
        else:
            print "ERROR: duplicate DECIPHER/family ID = %s" % (dec_id)
            raise SystemExit
    in_han.close()




def read_trans_map(in_file):
    in_han = open(in_file,'r')
    for line in in_han:
        data = [x.strip() for x in line.strip().split('\t')]
        old_trans_id = data[0]
        new_trans_id = data[1]
        if old_trans_id not in TRANS_DICT:
            TRANS_DICT[old_trans_id] = new_trans_id
        else:
            print "ERROR: duplicate old transcript ID = %s" % (old_trans_id)
            raise SystemExit
    in_han.close()






if __name__ == '__main__':
    if len(sys.argv) == 12:
        go(sys.argv[1],sys.argv[2],sys.argv[3],sys.argv[4],sys.argv[5],sys.argv[6],sys.argv[7],sys.argv[8],sys.argv[9],sys.argv[10],sys.argv[11])
    else:
        print "Suggested use: time python /home/u035/u035/shared/scripts/NHS_WES_generate_DEC_IGV.py \
        dec_map_file,trans_map_file,ped_file,in_g2p_file,in_vase_file,fam_igv_dir,vcf_dir,plate_id,fam_id,dec_dir,fam_bam_dir"
        raise SystemExit