diff --git a/gather_quad_results.sh b/gather_quad_results.sh index 0cb3af4415bc2d57701cbbe6e9302fda3946d025..c20a8b6b58929b0a9210a5a317079c3e379c6055 100755 --- a/gather_quad_results.sh +++ b/gather_quad_results.sh @@ -79,6 +79,8 @@ cp -r ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_shared ${IGV_SNAP_DIR} # copy proband coverage files cp ${WORK_DIR}/COV/${KID_1_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR} cp ${WORK_DIR}/COV/${KID_2_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR} +cp ${WORK_DIR}/COV/${KID_1_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR} +cp ${WORK_DIR}/COV/${KID_2_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR} diff --git a/gather_shared_results.sh b/gather_shared_results.sh index f7a05d9542dfcee7fc3de871bd1eb8071bd1a386..084b66bfb9febd6a2a103bc57ffa1e1a0fba4a01 100755 --- a/gather_shared_results.sh +++ b/gather_shared_results.sh @@ -59,6 +59,7 @@ cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR} # copy the coverage files cp ${WORK_DIR}/COV/*_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR} +cp ${WORK_DIR}/COV/*_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR} echo "" diff --git a/gather_trio_results.sh b/gather_trio_results.sh index 13535fe2af0acd3bb58ae61b1ab95e0282e3ed09..7325bffbcb4d76771d147504145f2a6fdf30adbe 100755 --- a/gather_trio_results.sh +++ b/gather_trio_results.sh @@ -80,9 +80,9 @@ cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR} cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR} -# copy proband coverage file +# copy proband coverage files cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt ${FAM_DIR} - +cp ${WORK_DIR}/COV/${PROBAND_ID}_${FAMILY_ID}.REC_SNP_COV.txt ${FAM_DIR} echo "OK: Results for ${FAMILY_ID} are stored in ${FAM_DIR}" diff --git a/generate_DEC_IGV_aff_sib_scripts_from_quad.py b/generate_DEC_IGV_aff_sib_scripts_from_quad.py index e5c889dd0adc21cbed5a82e48169c3a428f81bdb..29bfaa22a5ddaf9e6a049a9b6793d62945c0894d 100755 --- a/generate_DEC_IGV_aff_sib_scripts_from_quad.py +++ b/generate_DEC_IGV_aff_sib_scripts_from_quad.py @@ -12,7 +12,7 @@ # all G2P variants found in the individual VCF # # Author: MH -# last modified: DEC 07, 2021 +# last modified: JAN 21, 2022 @@ -303,7 +303,8 @@ def read_G2P(in_file): global NUM_SHARED_G2P_VARS - known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] +#.# known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] + known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het'] # to make sure no duplicate vars per indi CHECK_DICT = defaultdict(dict) # 1st level key: indi_fam_id:chr:start:end:ref:alt; 2nd level key: OBS_state; value: irrelevant diff --git a/generate_DEC_IGV_scripts.py b/generate_DEC_IGV_scripts.py index 3c3ced5a886a19895491486d6a2f2085b055d588..81892871a58e221e5c1928e1d8d73028bff0393b 100755 --- a/generate_DEC_IGV_scripts.py +++ b/generate_DEC_IGV_scripts.py @@ -15,7 +15,7 @@ # all VASE denovo variants found in the individual VCF # # Author: MH -# last modified: DEC 03, 2021 +# last modified: JAN 18, 2022 @@ -650,8 +650,8 @@ def read_G2P(in_file): global NUM_UNIQ_G2P_VARS - known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] - +#.# known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] + known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het'] # first, read the G2P variants on canonical transcripts for each of the family members CHILD_DICT = defaultdict(dict) # 1st level key: OBS state; 2nd level key: chr:start:end:ref:alt; value: (ZYG,gene,trans) @@ -757,113 +757,102 @@ def read_G2P(in_file): ### print out the number of unique G2P variants in CHILD ### child_mono = 0 child_bi = 0 - child_hemi = 0 - child_xld = 0 - child_xlod = 0 - - if 'monoallelic' in CHILD_DICT: - child_mono = len(CHILD_DICT['monoallelic']) - if 'biallelic' in CHILD_DICT: - child_bi = len(CHILD_DICT['biallelic']) - if 'hemizygous' in CHILD_DICT: - child_hemi = len(CHILD_DICT['hemizygous']) - if 'x-linked dominant' in CHILD_DICT: - child_xld = len(CHILD_DICT['x-linked dominant']) - if 'x-linked over-dominance' in CHILD_DICT: - child_xlod = len(CHILD_DICT['x-linked over-dominance']) + child_hem = 0 + child_het = 0 + + if 'monoallelic_autosomal' in CHILD_DICT: + child_mono = len(CHILD_DICT['monoallelic_autosomal']) + if 'biallelic_autosomal' in CHILD_DICT: + child_bi = len(CHILD_DICT['biallelic_autosomal']) + if 'monoallelic_X_hem' in CHILD_DICT: + child_hem = len(CHILD_DICT['monoallelic_X_hem']) + if 'monoallelic_X_het' in CHILD_DICT: + child_het = len(CHILD_DICT['monoallelic_X_het']) print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID) - print " monoallelic: %s" % (child_mono) - print " biallelic: %s" % (child_bi) - print " hemizygous: %s" % (child_hemi) - print " x-linked dominant: %s" % (child_xld) - print " x-linked over-dominance: %s" % (child_xlod) + print " monoallelic_autosomal: %s" % (child_mono) + print " biallelic_autosomal: %s" % (child_bi) + print " monoallelic_X_hem: %s" % (child_hem) + print " monoallelic_X_het: %s" % (child_het) + ### print out the number of unique G2P variants in MOM ### mom_mono = 0 mom_bi = 0 - mom_hemi = 0 - mom_xld = 0 - mom_xlod = 0 - - if 'monoallelic' in MOM_DICT: - mom_mono = len(MOM_DICT['monoallelic']) - if 'biallelic' in MOM_DICT: - mom_bi = len(MOM_DICT['biallelic']) - if 'hemizygous' in MOM_DICT: - mom_hemi = len(MOM_DICT['hemizygous']) - if 'x-linked dominant' in MOM_DICT: - mom_xld = len(MOM_DICT['x-linked dominant']) - if 'x-linked over-dominance' in MOM_DICT: - mom_xlod = len(MOM_DICT['x-linked over-dominance']) + mom_hem = 0 + mom_het = 0 + + if 'monoallelic_autosomal' in MOM_DICT: + mom_mono = len(MOM_DICT['monoallelic_autosomal']) + if 'biallelic_autosomal' in MOM_DICT: + mom_bi = len(MOM_DICT['biallelic_autosomal']) + if 'monoallelic_X_hem' in MOM_DICT: + mom_hem = len(MOM_DICT['monoallelic_X_hem']) + if 'monoallelic_X_het' in MOM_DICT: + mom_het = len(MOM_DICT['monoallelic_X_het']) print "MOM (%s): number of unique G2P variants on canon transcript in the following OBS states" % (MOM_ID) - print " monoallelic: %s" % (mom_mono) - print " biallelic: %s" % (mom_bi) - print " hemizygous: %s" % (mom_hemi) - print " x-linked dominant: %s" % (mom_xld) - print " x-linked over-dominance: %s" % (mom_xlod) + print " monoallelic_autosomal: %s" % (mom_mono) + print " biallelic_autosomal: %s" % (mom_bi) + print " monoallelic_X_hem: %s" % (mom_hem) + print " monoallelic_X_het: %s" % (mom_het) + ### print out the number of unique G2P variants in DAD ### dad_mono = 0 dad_bi = 0 - dad_hemi = 0 - dad_xld = 0 - dad_xlod = 0 - - if 'monoallelic' in DAD_DICT: - dad_mono = len(DAD_DICT['monoallelic']) - if 'biallelic' in DAD_DICT: - dad_bi = len(DAD_DICT['biallelic']) - if 'hemizygous' in DAD_DICT: - dad_hemi = len(DAD_DICT['hemizygous']) - if 'x-linked dominant' in DAD_DICT: - dad_xld = len(DAD_DICT['x-linked dominant']) - if 'x-linked over-dominance' in DAD_DICT: - dad_xlod = len(DAD_DICT['x-linked over-dominance']) + dad_hem = 0 + dad_het = 0 + + if 'monoallelic_autosomal' in DAD_DICT: + dad_mono = len(DAD_DICT['monoallelic_autosomal']) + if 'biallelic_autosomal' in DAD_DICT: + dad_bi = len(DAD_DICT['biallelic_autosomal']) + if 'monoallelic_X_hem' in DAD_DICT: + dad_hem = len(DAD_DICT['monoallelic_X_hem']) + if 'monoallelic_X_het' in DAD_DICT: + dad_het = len(DAD_DICT['monoallelic_X_het']) print "DAD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (DAD_ID) - print " monoallelic: %s" % (dad_mono) - print " biallelic: %s" % (dad_bi) - print " hemizygous: %s" % (dad_hemi) - print " x-linked dominant: %s" % (dad_xld) - print " x-linked over-dominance: %s" % (dad_xlod) + print " monoallelic_autosomal: %s" % (dad_mono) + print " biallelic_autosomal: %s" % (dad_bi) + print " monoallelic_X_hem: %s" % (dad_hem) + print " monoallelic_X_hem: %s" % (dad_het) sys.stdout.flush() - - ############################################################################################### - #### DOM filtering #### - #### if the gene has been considered under the dominant model (OBS == monoallelic) #### - #### exclude child variants seen in UNAFFECTED mother/father, regardless of GT #### - ############################################################################################### + ###################################################################################################### + #### Dominant filtering #### + #### if the gene has been considered under the dominant model (OBS == monoallelic_autosomal) #### + #### exclude child variants seen in UNAFFECTED mother/father, regardless of GT #### + ###################################################################################################### print "" - print "=== DOM filtering ===" + print "=== monoallelic autosomal (DOMINANT) filtering ===" - for key in CHILD_DICT['monoallelic']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + for key in CHILD_DICT['monoallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - CHILD_GT = CHILD_DICT['monoallelic'][key][0] - CHILD_GENE = CHILD_DICT['monoallelic'][key][1] - CHILD_TRANS = CHILD_DICT['monoallelic'][key][2] + CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0] + CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1] + CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2] - if (key in MOM_DICT['monoallelic']) and (MOM_STAT == "UNAFFECTED"): - MOM_GT = MOM_DICT['monoallelic'][key][0] - print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GT,MOM_GT,MOM_STAT) + if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"): + MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0] + print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT) continue - if (key in DAD_DICT['monoallelic']) and (DAD_STAT == "UNAFFECTED"): - DAD_GT = DAD_DICT['monoallelic'][key][0] - print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT) + if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"): + DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0] + print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) continue @@ -915,196 +904,9 @@ def read_G2P(in_file): - - - - - - - - #################################################################################################################### - #### X-linked filtering #### - #### under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance) #### - #### exclude child HET variants if seen as HOM in UNAFFECTED father #### - #################################################################################################################### - - - print "" - print "=== X-linked filtering ===" - - ################################ - ### process hemizygous genes ### - ################################ - - for key in CHILD_DICT['hemizygous']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - - CHILD_GT = CHILD_DICT['hemizygous'][key][0] - CHILD_GENE = CHILD_DICT['hemizygous'][key][1] - CHILD_TRANS = CHILD_DICT['hemizygous'][key][2] - - if CHILD_GT == 'HOM': # do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl) - pass - else: - if (key in DAD_DICT['hemizygous']) and (DAD_STAT == "UNAFFECTED"): - DAD_GT = DAD_DICT['hemizygous'][key][0] - if DAD_GT == 'HOM': # i.e., hemizygous variant in unaffected father - print "***[X-linked model (hemizygous)]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT) - continue - - # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) - chr,start,end,ref,alt = key.split(":") - if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized - if len(ref) < len(alt): # an INS - orig_start = start - orig_ref = ref - orig_alt = alt - start = orig_start - ref = '-' - alt = orig_alt[len(orig_ref):] - print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) - else: # a DEL - print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" - print line - raise SystemExit - - new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) - - # record the data for CHILD G2P variants (for OBS=hemizygous) - if new_key not in G2P_DICT: - G2P_DICT[new_key] = 0 - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - - # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA - if new_key not in G2P_DATA: - G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - - - - ####################################### - ### process x-linked dominant genes ### - ####################################### - - for key in CHILD_DICT['x-linked dominant']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - - CHILD_GT = CHILD_DICT['x-linked dominant'][key][0] - CHILD_GENE = CHILD_DICT['x-linked dominant'][key][1] - CHILD_TRANS = CHILD_DICT['x-linked dominant'][key][2] - - if CHILD_GT == 'HOM': # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl) - pass - else: - if (key in DAD_DICT['x-linked dominant']) and (DAD_STAT == "UNAFFECTED"): - DAD_GT = DAD_DICT['x-linked dominant'][key][0] - if DAD_GT == 'HOM': # i.e., x-linked dominant variant in unnafected father - print "***[X-linked model (x-linked dominant)]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT) - continue - - # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) - chr,start,end,ref,alt = key.split(":") - if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized - if len(ref) < len(alt): # an INS - orig_start = start - orig_ref = ref - orig_alt = alt - start = orig_start - ref = '-' - alt = orig_alt[len(orig_ref):] - print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) - else: # a DEL - print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" - print line - raise SystemExit - - new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) - - # record the data for CHILD G2P variants (for OBS=x-linked dominant) - if new_key not in G2P_DICT: - G2P_DICT[new_key] = 0 - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - - # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA - if new_key not in G2P_DATA: - G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - - - ######################################################################## - ### process x-linked over-dominance genes - no filtering to be done ### - ######################################################################## - - for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - - CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0] - CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1] - CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2] - - # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) - chr,start,end,ref,alt = key.split(":") - if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized - if len(ref) < len(alt): # an INS - orig_start = start - orig_ref = ref - orig_alt = alt - start = orig_start - ref = '-' - alt = orig_alt[len(orig_ref):] - print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) - else: # a DEL - print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" - print line - raise SystemExit - - new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) - - # record the data for CHILD G2P variants (for OBS=x-linked over-dominance) - if new_key not in G2P_DICT: - G2P_DICT[new_key] = 0 - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - - # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA - if new_key not in G2P_DATA: - G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - - - NUM_UNIQ_G2P_VARS = len(G2P_DICT) - print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) - sys.stdout.flush() - - print "" - - - - - ############################################################################################################## - #### BIALLELIC filtering #### - #### under the biallelic model (OBS == biallelic) - consider ALL variants per gene #### + #### Recessive filtering #### + #### under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene #### #### must all be HET in CHILD, GT in parent does not matter #### #### all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) #### #### and this parent must be unaffected #### @@ -1113,16 +915,16 @@ def read_G2P(in_file): print "" - print "=== BIALLELIC filtering ===" + print "=== biallelic autosomal (RECESSIVE) filtering ===" GENE_KEY_GT = defaultdict(dict) # for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans) # process all variants in biallelic genes in child - for key in CHILD_DICT['biallelic']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - b_GT = CHILD_DICT['biallelic'][key][0] - b_gene = CHILD_DICT['biallelic'][key][1] - b_trans = CHILD_DICT['biallelic'][key][2] + for key in CHILD_DICT['biallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + b_GT = CHILD_DICT['biallelic_autosomal'][key][0] + b_gene = CHILD_DICT['biallelic_autosomal'][key][1] + b_trans = CHILD_DICT['biallelic_autosomal'][key][2] GENE_KEY_GT[b_gene][key] = (b_GT,b_trans) # iterate over genes in GENE_KEY_GT @@ -1131,7 +933,6 @@ def read_G2P(in_file): # iterate over variants in this gene for kx in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt -### if GENE_KEY_GT[g][kx] == 'HOM': # there is a HOM variant in the child - NO filtering if GENE_KEY_GT[g][kx][0] == 'HOM': # there is a HOM variant in the child - NO filtering all_HET = False break @@ -1150,9 +951,9 @@ def read_G2P(in_file): this_var_status = 'NONE' - if ((ky in MOM_DICT['biallelic']) or (ky in MOM_DICT['monoallelic'])) and (MOM_STAT == "UNAFFECTED"): + if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"): this_var_status = 'MOM' - if ((ky in DAD_DICT['biallelic']) or (ky in DAD_DICT['monoallelic'])) and (DAD_STAT == "UNAFFECTED"): + if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"): if this_var_status == 'NONE': this_var_status = 'DAD' elif this_var_status == 'MOM': @@ -1187,7 +988,7 @@ def read_G2P(in_file): # if all variants in the child in this gene are found in single unaffected parent - filter out if all_from_one_parent: for kz in GENE_KEY_GT[g]: - print "***[Biallelic model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz]) + print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz]) continue # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written @@ -1202,9 +1003,9 @@ def read_G2P(in_file): # iterate over variants in this gene for kkk in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt - CHILD_GT = CHILD_DICT['biallelic'][kkk][0] - CHILD_GENE = CHILD_DICT['biallelic'][kkk][1] - CHILD_TRANS = CHILD_DICT['biallelic'][kkk][2] + CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0] + CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1] + CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2] # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) chr,start,end,ref,alt = kkk.split(":") @@ -1243,8 +1044,203 @@ def read_G2P(in_file): pass NUM_UNIQ_G2P_VARS = len(G2P_DICT) - print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, X-LINKED and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) + print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) + sys.stdout.flush() + print "" + + + + + + + + + + #################################################################################################################### + #### X-linked filtering #### +#.# #### under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance) #### + #### under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het) #### + #### exclude child HET variants if seen as HOM in UNAFFECTED father #### + #### #### + #### Note 18/01/2022 #### + #### This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het #### + #### and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance #### + #### the code below treats x-linked over-dominance as the others (i.e. filters, while it should not) #### + #### Issue flagged to G2P plug-in team, awaiting their fix #### + #### for now manually scan the output of G2P for the proband (both for boys and girls) #### + #### to check if any variant has been called in PCDH19 and EFNB1 #### + #### also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name #### + #################################################################################################################### + + + print "" + print "=== X-linked filtering ===" + + ####################################### + ### process monoallelic_X_hem genes ### + ####################################### + + for key in CHILD_DICT['monoallelic_X_hem']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + + CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0] + CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1] + CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2] + + if CHILD_GT == 'HOM': # do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl) + pass + else: + if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"): + DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0] + if DAD_GT == 'HOM': # i.e., hemizygous variant in unaffected father + print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) + continue + + # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) + chr,start,end,ref,alt = key.split(":") + if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized + if len(ref) < len(alt): # an INS + orig_start = start + orig_ref = ref + orig_alt = alt + start = orig_start + ref = '-' + alt = orig_alt[len(orig_ref):] + print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) + else: # a DEL + print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" + print line + raise SystemExit + + new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) + + # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem) + if new_key not in G2P_DICT: + G2P_DICT[new_key] = 0 + else: + # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) + # raise SystemExit + # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req + pass + + # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA + if new_key not in G2P_DATA: + G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) + else: + # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) + # raise SystemExit + # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req + pass + + + + ####################################### + ### process monoallelic_X_het genes ### + ####################################### + + for key in CHILD_DICT['monoallelic_X_het']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + + CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0] + CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1] + CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2] + + if CHILD_GT == 'HOM': # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl) + pass + else: + if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"): + DAD_GT = DAD_DICT['monoallelic_X_het'][key][0] + if DAD_GT == 'HOM': # i.e., x-linked dominant variant in unnafected father + print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) + continue + + # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) + chr,start,end,ref,alt = key.split(":") + if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized + if len(ref) < len(alt): # an INS + orig_start = start + orig_ref = ref + orig_alt = alt + start = orig_start + ref = '-' + alt = orig_alt[len(orig_ref):] + print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) + else: # a DEL + print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" + print line + raise SystemExit + + new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) + + # record the data for CHILD G2P variants (for OBS=monoallelic_X_het) + if new_key not in G2P_DICT: + G2P_DICT[new_key] = 0 + else: + # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) + # raise SystemExit + # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req + pass + + # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA + if new_key not in G2P_DATA: + G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) + else: + # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) + # raise SystemExit + # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req + pass + + + +#.# ######################################################################## +#.# ### process x-linked over-dominance genes - no filtering to be done ### +#.# ######################################################################## + +#.# for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + +#.# CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0] +#.# CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1] +#.# CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2] + +#.# # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) +#.# chr,start,end,ref,alt = key.split(":") +#.# if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized +#.# if len(ref) < len(alt): # an INS +#.# orig_start = start +#.# orig_ref = ref +#.# orig_alt = alt +#.# start = orig_start +#.# ref = '-' +#.# alt = orig_alt[len(orig_ref):] +#.# print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) +#.# else: # a DEL +#.# print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" +#.# print line +#.# raise SystemExit + +#.# new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) + +#.# # record the data for CHILD G2P variants (for OBS=x-linked over-dominance) +#.# if new_key not in G2P_DICT: +#.# G2P_DICT[new_key] = 0 +#.# else: +#.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) +#.# # raise SystemExit +#.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req +#.# pass + +#.# # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA +#.# if new_key not in G2P_DATA: +#.# G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) +#.# else: +#.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) +#.# # raise SystemExit +#.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req +#.# pass + + + NUM_UNIQ_G2P_VARS = len(G2P_DICT) + print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) sys.stdout.flush() + print "" print "" @@ -1260,6 +1256,10 @@ def read_G2P(in_file): + + + + def read_ped(in_file): global CHILD_ID diff --git a/generate_DEC_IGV_shared_scripts.py b/generate_DEC_IGV_shared_scripts.py index 78421450dfd382f6077d67827b57b3d0ff227213..687a3138f1685d9706e56a0c8729a80bf11310d3 100755 --- a/generate_DEC_IGV_shared_scripts.py +++ b/generate_DEC_IGV_shared_scripts.py @@ -12,7 +12,7 @@ # all G2P variants found in the individual VCF # # Author: MH -# last modified: DEC 07, 2021 +# last modified: JAN 20, 2022 @@ -299,7 +299,8 @@ def read_G2P(in_file): global NUM_SHARED_G2P_VARS - known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] +#.# known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] + known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het'] # to make sure no duplicate vars per indi CHECK_DICT = defaultdict(dict) # 1st level key: indi_fam_id:chr:start:end:ref:alt; 2nd level key: OBS_state; value: irrelevant diff --git a/generate_DEC_IGV_trio_scripts_from_quad.py b/generate_DEC_IGV_trio_scripts_from_quad.py index 3afdd794f3e4a95a9abc32baa77d6a22f83133c0..408a1240d6af518a960abef44f9e80227445f128 100755 --- a/generate_DEC_IGV_trio_scripts_from_quad.py +++ b/generate_DEC_IGV_trio_scripts_from_quad.py @@ -15,7 +15,7 @@ # all VASE denovo variants found in the individual VCF # # Author: MH -# last modified: DEC 07, 2021 +# last modified: JAN 21, 2022 @@ -686,7 +686,8 @@ def read_G2P(in_file): global NUM_UNIQ_G2P_VARS - known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] +#.# known_OBS_states = ['monoallelic','biallelic','hemizygous','x-linked dominant','x-linked over-dominance'] + known_OBS_states = ['monoallelic_autosomal','biallelic_autosomal','monoallelic_X_hem','monoallelic_X_het'] # first, read the G2P variants on canonical transcripts for each of the family members @@ -795,81 +796,68 @@ def read_G2P(in_file): ### print out the number of unique G2P variants in CHILD ### child_mono = 0 child_bi = 0 - child_hemi = 0 - child_xld = 0 - child_xlod = 0 - - if 'monoallelic' in CHILD_DICT: - child_mono = len(CHILD_DICT['monoallelic']) - if 'biallelic' in CHILD_DICT: - child_bi = len(CHILD_DICT['biallelic']) - if 'hemizygous' in CHILD_DICT: - child_hemi = len(CHILD_DICT['hemizygous']) - if 'x-linked dominant' in CHILD_DICT: - child_xld = len(CHILD_DICT['x-linked dominant']) - if 'x-linked over-dominance' in CHILD_DICT: - child_xlod = len(CHILD_DICT['x-linked over-dominance']) + child_hem = 0 + child_het = 0 + + if 'monoallelic_autosomal' in CHILD_DICT: + child_mono = len(CHILD_DICT['monoallelic_autosomal']) + if 'biallelic_autosomal' in CHILD_DICT: + child_bi = len(CHILD_DICT['biallelic_autosomal']) + if 'monoallelic_X_hem' in CHILD_DICT: + child_hem = len(CHILD_DICT['monoallelic_X_hem']) + if 'monoallelic_X_het' in CHILD_DICT: + child_het = len(CHILD_DICT['monoallelic_X_het']) print "CHILD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (CHILD_ID) - print " monoallelic: %s" % (child_mono) - print " biallelic: %s" % (child_bi) - print " hemizygous: %s" % (child_hemi) - print " x-linked dominant: %s" % (child_xld) - print " x-linked over-dominance: %s" % (child_xlod) - + print " monoallelic_autosomal: %s" % (child_mono) + print " biallelic_autosomal: %s" % (child_bi) + print " monoallelic_X_hem: %s" % (child_hem) + print " monoallelic_X_het: %s" % (child_het) ### print out the number of unique G2P variants in MOM ### mom_mono = 0 mom_bi = 0 - mom_hemi = 0 - mom_xld = 0 - mom_xlod = 0 - - if 'monoallelic' in MOM_DICT: - mom_mono = len(MOM_DICT['monoallelic']) - if 'biallelic' in MOM_DICT: - mom_bi = len(MOM_DICT['biallelic']) - if 'hemizygous' in MOM_DICT: - mom_hemi = len(MOM_DICT['hemizygous']) - if 'x-linked dominant' in MOM_DICT: - mom_xld = len(MOM_DICT['x-linked dominant']) - if 'x-linked over-dominance' in MOM_DICT: - mom_xlod = len(MOM_DICT['x-linked over-dominance']) + mom_hem = 0 + mom_het = 0 + + if 'monoallelic_autosomal' in MOM_DICT: + mom_mono = len(MOM_DICT['monoallelic_autosomal']) + if 'biallelic_autosomal' in MOM_DICT: + mom_bi = len(MOM_DICT['biallelic_autosomal']) + if 'monoallelic_X_hem' in MOM_DICT: + mom_hem = len(MOM_DICT['monoallelic_X_hem']) + if 'monoallelic_X_het' in MOM_DICT: + mom_het = len(MOM_DICT['monoallelic_X_het']) print "MOM (%s): number of unique G2P variants on canon transcript in the following OBS states" % (MOM_ID) - print " monoallelic: %s" % (mom_mono) - print " biallelic: %s" % (mom_bi) - print " hemizygous: %s" % (mom_hemi) - print " x-linked dominant: %s" % (mom_xld) - print " x-linked over-dominance: %s" % (mom_xlod) + print " monoallelic_autosomal: %s" % (mom_mono) + print " biallelic_autosomal: %s" % (mom_bi) + print " monoallelic_X_hem: %s" % (mom_hem) + print " monoallelic_X_het: %s" % (mom_het) ### print out the number of unique G2P variants in DAD ### dad_mono = 0 dad_bi = 0 - dad_hemi = 0 - dad_xld = 0 - dad_xlod = 0 - - if 'monoallelic' in DAD_DICT: - dad_mono = len(DAD_DICT['monoallelic']) - if 'biallelic' in DAD_DICT: - dad_bi = len(DAD_DICT['biallelic']) - if 'hemizygous' in DAD_DICT: - dad_hemi = len(DAD_DICT['hemizygous']) - if 'x-linked dominant' in DAD_DICT: - dad_xld = len(DAD_DICT['x-linked dominant']) - if 'x-linked over-dominance' in DAD_DICT: - dad_xlod = len(DAD_DICT['x-linked over-dominance']) + dad_hem = 0 + dad_het = 0 + + if 'monoallelic_autosomal' in DAD_DICT: + dad_mono = len(DAD_DICT['monoallelic_autosomal']) + if 'biallelic_autosomal' in DAD_DICT: + dad_bi = len(DAD_DICT['biallelic_autosomal']) + if 'monoallelic_X_hem' in DAD_DICT: + dad_hem = len(DAD_DICT['monoallelic_X_hem']) + if 'monoallelic_X_het' in DAD_DICT: + dad_het = len(DAD_DICT['monoallelic_X_het']) print "DAD (%s): number of unique G2P variants on canon transcript in the following OBS states" % (DAD_ID) - print " monoallelic: %s" % (dad_mono) - print " biallelic: %s" % (dad_bi) - print " hemizygous: %s" % (dad_hemi) - print " x-linked dominant: %s" % (dad_xld) - print " x-linked over-dominance: %s" % (dad_xlod) + print " monoallelic_autosomal: %s" % (dad_mono) + print " biallelic_autosomal: %s" % (dad_bi) + print " monoallelic_X_hem: %s" % (dad_hem) + print " monoallelic_X_het: %s" % (dad_het) sys.stdout.flush() @@ -877,31 +865,31 @@ def read_G2P(in_file): - ############################################################################################### - #### DOM filtering #### - #### if the gene has been considered under the dominant model (OBS == monoallelic) #### - #### exclude child variants seen in UNAFFECTED mother/father, regardless of GT #### - ############################################################################################### + ###################################################################################################### + #### Dominant filtering #### + #### if the gene has been considered under the dominant model (OBS == monoallelic_autosomal) #### + #### exclude child variants seen in UNAFFECTED mother/father, regardless of GT #### + ###################################################################################################### print "" - print "=== DOM filtering ===" + print "=== monoallelic autosomal (DOMINANT) filtering ===" - for key in CHILD_DICT['monoallelic']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + for key in CHILD_DICT['monoallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - CHILD_GT = CHILD_DICT['monoallelic'][key][0] - CHILD_GENE = CHILD_DICT['monoallelic'][key][1] - CHILD_TRANS = CHILD_DICT['monoallelic'][key][2] + CHILD_GT = CHILD_DICT['monoallelic_autosomal'][key][0] + CHILD_GENE = CHILD_DICT['monoallelic_autosomal'][key][1] + CHILD_TRANS = CHILD_DICT['monoallelic_autosomal'][key][2] - if (key in MOM_DICT['monoallelic']) and (MOM_STAT == "UNAFFECTED"): - MOM_GT = MOM_DICT['monoallelic'][key][0] - print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GT,MOM_GT,MOM_STAT) + if (key in MOM_DICT['monoallelic_autosomal']) and (MOM_STAT == "UNAFFECTED"): + MOM_GT = MOM_DICT['monoallelic_autosomal'][key][0] + print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, MOM_GT = %s, MOM_STAT = %s" % (key,CHILD_GENE,CHILD_GT,MOM_GT,MOM_STAT) continue - if (key in DAD_DICT['monoallelic']) and (DAD_STAT == "UNAFFECTED"): - DAD_GT = DAD_DICT['monoallelic'][key][0] - print "***[DOM model]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT) + if (key in DAD_DICT['monoallelic_autosomal']) and (DAD_STAT == "UNAFFECTED"): + DAD_GT = DAD_DICT['monoallelic_autosomal'][key][0] + print "***[DOMINANT model]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) continue @@ -952,9 +940,149 @@ def read_G2P(in_file): + ############################################################################################################## + #### Recessive filtering #### + #### under the recessive model (OBS == biallelic_autosomal) - consider ALL variants per gene #### + #### must all be HET in CHILD, GT in parent does not matter #### + #### all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) #### + #### and this parent must be unaffected #### + #### if all these: then exclude all child variants in this gene #### + ############################################################################################################## + + print "" + print "=== biallelic autosomal (RECESSIVE) filtering ===" + + + GENE_KEY_GT = defaultdict(dict) # for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans) + + # process all variants in biallelic genes in child + for key in CHILD_DICT['biallelic_autosomal']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + b_GT = CHILD_DICT['biallelic_autosomal'][key][0] + b_gene = CHILD_DICT['biallelic_autosomal'][key][1] + b_trans = CHILD_DICT['biallelic_autosomal'][key][2] + GENE_KEY_GT[b_gene][key] = (b_GT,b_trans) + + # iterate over genes in GENE_KEY_GT + for g in GENE_KEY_GT: # this is the biallelic gene name + all_HET = True + + # iterate over variants in this gene + for kx in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt + if GENE_KEY_GT[g][kx][0] == 'HOM': # there is a HOM variant in the child - NO filtering + all_HET = False + break + if all_HET: # for this gene + # all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent + # if yes, filter out and write a message to the log file + # if not, to be added to G2P_DICT and G2P_DATA for further processing + all_from_one_parent = True + # iterate again over the variants in this gene + VAR_SOURCE_LIST = {} # key: chr:start:end:ref:alt in child; value: (NONE) or (MOM or DAD or BOTH and the parent is UNAFFECTED) + + for ky in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt + + this_var_status = 'NONE' + + if ((ky in MOM_DICT['biallelic_autosomal']) or (ky in MOM_DICT['monoallelic_autosomal'])) and (MOM_STAT == "UNAFFECTED"): + this_var_status = 'MOM' + if ((ky in DAD_DICT['biallelic_autosomal']) or (ky in DAD_DICT['monoallelic_autosomal'])) and (DAD_STAT == "UNAFFECTED"): + if this_var_status == 'NONE': + this_var_status = 'DAD' + elif this_var_status == 'MOM': + this_var_status = 'BOTH' + + VAR_SOURCE_LIST[ky] = this_var_status + + # have collected the parent source for all variants in this gene + tot_num_vars = len(VAR_SOURCE_LIST) + num_mom = 0 + num_dad = 0 + num_none = 0 + for kt,v in VAR_SOURCE_LIST.iteritems(): + if v == 'NONE': + num_none += 1 + elif v == 'MOM': + num_mom += 1 + elif v == 'DAD': + num_dad += 1 + elif v == 'BOTH': + num_mom += 1 + num_dad += 1 + else: + print "ERROR: cannot understand the source parent = %s" % (v) + raise SystemExit + + if num_none > 0: + all_from_one_parent = False + elif num_mom < tot_num_vars and num_dad < tot_num_vars: + all_from_one_parent = False + + + # if all variants in the child in this gene are found in single unaffected parent - filter out + if all_from_one_parent: + for kz in GENE_KEY_GT[g]: + print "***[RECESSIVE model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz]) + continue + + # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written + # and gone to evaluating the next biallelic gene in the child + + # if here + # - either not all CHILD variants in this gene are not HET, or + # - not all of them can be traced to a single unaffected parent + # --> add to be processed + + # here we are at gene level, must iterate over all variants in this gene + # iterate over variants in this gene + for kkk in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt + + CHILD_GT = CHILD_DICT['biallelic_autosomal'][kkk][0] + CHILD_GENE = CHILD_DICT['biallelic_autosomal'][kkk][1] + CHILD_TRANS = CHILD_DICT['biallelic_autosomal'][kkk][2] + + # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) + chr,start,end,ref,alt = kkk.split(":") + if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized + if len(ref) < len(alt): # an INS + orig_start = start + orig_ref = ref + orig_alt = alt + start = orig_start + ref = '-' + alt = orig_alt[len(orig_ref):] + print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) + else: # a DEL + print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" + print line + raise SystemExit + + new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) + + # record the data for CHILD G2P variants (for OBS=biallelic) + if new_key not in G2P_DICT: + G2P_DICT[new_key] = 0 + else: + # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) + # raise SystemExit + # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req + pass + + # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA + if new_key not in G2P_DATA: + G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) + else: + # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) + # raise SystemExit + # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req + pass + + NUM_UNIQ_G2P_VARS = len(G2P_DICT) + print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) + sys.stdout.flush() + print "" @@ -962,31 +1090,41 @@ def read_G2P(in_file): #################################################################################################################### #### X-linked filtering #### - #### under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance) #### +#.# #### under the x-linked model (OBS == hemizygous or x-linked dominant, but NOT x-linked over-dominance) #### + #### under the chrX model (OBS == monoallelic_X_hem or monoallelic_X_het) #### #### exclude child HET variants if seen as HOM in UNAFFECTED father #### + #### #### + #### Note 18/01/2022 #### + #### This is a temporary solution, since x-linked dominant and x-linked over-dominance -> monoallelic_X_het #### + #### and we should filter x-linked dominant and monoallelic_X_hem, but not x-linked over-dominance #### + #### the code below treats x-linked over-dominance as the others (i.e. filters, while it should not) #### + #### Issue flagged to G2P plug-in team, awaiting their fix #### + #### for now manually scan the output of G2P for the proband (both for boys and girls) #### + #### to check if any variant has been called in PCDH19 and EFNB1 #### + #### also for all the variants filtered out from monoallelic_X_het we will print in the log the gene name #### #################################################################################################################### print "" print "=== X-linked filtering ===" - ################################ - ### process hemizygous genes ### - ################################ + ####################################### + ### process monoallelic_X_hem genes ### + ####################################### - for key in CHILD_DICT['hemizygous']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + for key in CHILD_DICT['monoallelic_X_hem']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - CHILD_GT = CHILD_DICT['hemizygous'][key][0] - CHILD_GENE = CHILD_DICT['hemizygous'][key][1] - CHILD_TRANS = CHILD_DICT['hemizygous'][key][2] + CHILD_GT = CHILD_DICT['monoallelic_X_hem'][key][0] + CHILD_GENE = CHILD_DICT['monoallelic_X_hem'][key][1] + CHILD_TRANS = CHILD_DICT['monoallelic_X_hem'][key][2] - if CHILD_GT == 'HOM': # do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl) + if CHILD_GT == 'HOM': # do NOT filter HOM variants in proband (i.e., hemizygous in boy or HOM in girl) pass else: - if (key in DAD_DICT['hemizygous']) and (DAD_STAT == "UNAFFECTED"): - DAD_GT = DAD_DICT['hemizygous'][key][0] - if DAD_GT == 'HOM': # i.e., hemizygous variant in unaffected father - print "***[X-linked model (hemizygous)]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT) + if (key in DAD_DICT['monoallelic_X_hem']) and (DAD_STAT == "UNAFFECTED"): + DAD_GT = DAD_DICT['monoallelic_X_hem'][key][0] + if DAD_GT == 'HOM': # i.e., hemizygous variant in unaffected father + print "***[monoallelic_X_hem]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) continue # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) @@ -1007,7 +1145,7 @@ def read_G2P(in_file): new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) - # record the data for CHILD G2P variants (for OBS=hemizygous) + # record the data for CHILD G2P variants (for OBS=monoallelic_X_hem) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: @@ -1028,22 +1166,22 @@ def read_G2P(in_file): ####################################### - ### process x-linked dominant genes ### + ### process monoallelic_X_het genes ### ####################################### - for key in CHILD_DICT['x-linked dominant']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) + for key in CHILD_DICT['monoallelic_X_het']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - CHILD_GT = CHILD_DICT['x-linked dominant'][key][0] - CHILD_GENE = CHILD_DICT['x-linked dominant'][key][1] - CHILD_TRANS = CHILD_DICT['x-linked dominant'][key][2] + CHILD_GT = CHILD_DICT['monoallelic_X_het'][key][0] + CHILD_GENE = CHILD_DICT['monoallelic_X_het'][key][1] + CHILD_TRANS = CHILD_DICT['monoallelic_X_het'][key][2] if CHILD_GT == 'HOM': # do NOT filter HOM variants (i.e., hemizygous in boy or HOM in girl) pass else: - if (key in DAD_DICT['x-linked dominant']) and (DAD_STAT == "UNAFFECTED"): - DAD_GT = DAD_DICT['x-linked dominant'][key][0] + if (key in DAD_DICT['monoallelic_X_het']) and (DAD_STAT == "UNAFFECTED"): + DAD_GT = DAD_DICT['monoallelic_X_het'][key][0] if DAD_GT == 'HOM': # i.e., x-linked dominant variant in unnafected father - print "***[X-linked model (x-linked dominant)]*** Excluded CHILD var %s CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GT,DAD_GT,DAD_STAT) + print "***[monoallelic_X_het]*** Excluded CHILD var %s in gene = %s, CHILD_GT = %s, DAD_GT = %s, DAD_STAT = %s" % (key,CHILD_GENE,CHILD_GT,DAD_GT,DAD_STAT) continue # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) @@ -1064,7 +1202,7 @@ def read_G2P(in_file): new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) - # record the data for CHILD G2P variants (for OBS=x-linked dominant) + # record the data for CHILD G2P variants (for OBS=monoallelic_X_het) if new_key not in G2P_DICT: G2P_DICT[new_key] = 0 else: @@ -1083,208 +1221,66 @@ def read_G2P(in_file): pass - ######################################################################## - ### process x-linked over-dominance genes - no filtering to be done ### - ######################################################################## - for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0] - CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1] - CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2] - # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) - chr,start,end,ref,alt = key.split(":") - if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized - if len(ref) < len(alt): # an INS - orig_start = start - orig_ref = ref - orig_alt = alt - start = orig_start - ref = '-' - alt = orig_alt[len(orig_ref):] - print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) - else: # a DEL - print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" - print line - raise SystemExit +#.# ######################################################################## +#.# ### process x-linked over-dominance genes - no filtering to be done ### +#.# ######################################################################## - new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) +#.# for key in CHILD_DICT['x-linked over-dominance']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - # record the data for CHILD G2P variants (for OBS=x-linked over-dominance) - if new_key not in G2P_DICT: - G2P_DICT[new_key] = 0 - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - - # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA - if new_key not in G2P_DATA: - G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass +#.# CHILD_GT = CHILD_DICT['x-linked over-dominance'][key][0] +#.# CHILD_GENE = CHILD_DICT['x-linked over-dominance'][key][1] +#.# CHILD_TRANS = CHILD_DICT['x-linked over-dominance'][key][2] +#.# # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) +#.# chr,start,end,ref,alt = key.split(":") +#.# if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized +#.# if len(ref) < len(alt): # an INS +#.# orig_start = start +#.# orig_ref = ref +#.# orig_alt = alt +#.# start = orig_start +#.# ref = '-' +#.# alt = orig_alt[len(orig_ref):] +#.# print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) +#.# else: # a DEL +#.# print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" +#.# print line +#.# raise SystemExit - NUM_UNIQ_G2P_VARS = len(G2P_DICT) - print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) - sys.stdout.flush() - - print "" +#.# new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) +#.# # record the data for CHILD G2P variants (for OBS=x-linked over-dominance) +#.# if new_key not in G2P_DICT: +#.# G2P_DICT[new_key] = 0 +#.# else: +#.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) +#.# # raise SystemExit +#.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req +#.# pass +#.# # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA +#.# if new_key not in G2P_DATA: +#.# G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) +#.# else: +#.# # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) +#.# # raise SystemExit +#.# # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req +#.# pass - - ############################################################################################################## - #### BIALLELIC filtering #### - #### under the biallelic model (OBS == biallelic) - consider ALL variants per gene #### - #### must all be HET in CHILD, GT in parent does not matter #### - #### all of them must *clearly* come from only one of the parents (maternally/paternally + biparental) #### - #### and this parent must be unaffected #### - #### if all these: then exclude all child variants in this gene #### - ############################################################################################################## - + NUM_UNIQ_G2P_VARS = len(G2P_DICT) + print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, BIALLELIC and X-LINKED genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) + sys.stdout.flush() print "" - print "=== BIALLELIC filtering ===" - - - GENE_KEY_GT = defaultdict(dict) # for child - 1st level key: gene_name; 2nd level key: chr:start:end:ref:alt; value: (GT,trans) - - # process all variants in biallelic genes in child - for key in CHILD_DICT['biallelic']: # this the second key: chr:start:end:ref:alt; value: (ZYG,gene,trans) - b_GT = CHILD_DICT['biallelic'][key][0] - b_gene = CHILD_DICT['biallelic'][key][1] - b_trans = CHILD_DICT['biallelic'][key][2] - GENE_KEY_GT[b_gene][key] = (b_GT,b_trans) - - # iterate over genes in GENE_KEY_GT - for g in GENE_KEY_GT: # this is the biallelic gene name - all_HET = True - - # iterate over variants in this gene - for kx in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt -### if GENE_KEY_GT[g][kx] == 'HOM': # there is a HOM variant in the child - NO filtering - if GENE_KEY_GT[g][kx][0] == 'HOM': # there is a HOM variant in the child - NO filtering - all_HET = False - break - - if all_HET: # for this gene - # all variants in this gene in the CHILD are HET, check if all come from a single unaffected parent - # if yes, filter out and write a message to the log file - # if not, to be added to G2P_DICT and G2P_DATA for further processing - - all_from_one_parent = True - - # iterate again over the variants in this gene - VAR_SOURCE_LIST = {} # key: chr:start:end:ref:alt in child; value: (NONE) or (MOM or DAD or BOTH and the parent is UNAFFECTED) - - for ky in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt - - this_var_status = 'NONE' - - if ((ky in MOM_DICT['biallelic']) or (ky in MOM_DICT['monoallelic'])) and (MOM_STAT == "UNAFFECTED"): - this_var_status = 'MOM' - if ((ky in DAD_DICT['biallelic']) or (ky in DAD_DICT['monoallelic'])) and (DAD_STAT == "UNAFFECTED"): - if this_var_status == 'NONE': - this_var_status = 'DAD' - elif this_var_status == 'MOM': - this_var_status = 'BOTH' - - VAR_SOURCE_LIST[ky] = this_var_status - - # have collected the parent source for all variants in this gene - tot_num_vars = len(VAR_SOURCE_LIST) - num_mom = 0 - num_dad = 0 - num_none = 0 - for kt,v in VAR_SOURCE_LIST.iteritems(): - if v == 'NONE': - num_none += 1 - elif v == 'MOM': - num_mom += 1 - elif v == 'DAD': - num_dad += 1 - elif v == 'BOTH': - num_mom += 1 - num_dad += 1 - else: - print "ERROR: cannot understand the source parent = %s" % (v) - raise SystemExit - - if num_none > 0: - all_from_one_parent = False - elif num_mom < tot_num_vars and num_dad < tot_num_vars: - all_from_one_parent = False - - # if all variants in the child in this gene are found in single unaffected parent - filter out - if all_from_one_parent: - for kz in GENE_KEY_GT[g]: - print "***[Biallelic model]*** Excluded CHILD HET var %s in gene = %s, found in = %s, PARENT_STAT = UNAFFECTED" % (kz,g,VAR_SOURCE_LIST[kz]) - continue - - # end processing all HET variants in the proband - if all from single unaffected parent they have been excluded, message to the log written - # and gone to evaluating the next biallelic gene in the child - - # if here - # - either not all CHILD variants in this gene are not HET, or - # - not all of them can be traced to a single unaffected parent - # --> add to be processed - - # here we are at gene level, must iterate over all variants in this gene - # iterate over variants in this gene - for kkk in GENE_KEY_GT[g]: # this the second key: chr:start:end:ref:alt - - CHILD_GT = CHILD_DICT['biallelic'][kkk][0] - CHILD_GENE = CHILD_DICT['biallelic'][kkk][1] - CHILD_TRANS = CHILD_DICT['biallelic'][kkk][2] - - # if a non-normalized INDEL in child G2P - must adjust (should not happen really, we split, normalized and left-aligned the family VCF before sending it to VEP+G2P) - chr,start,end,ref,alt = kkk.split(":") - if len(ref) > 1 and len(alt) > 1: # an INDEL - not normalized - if len(ref) < len(alt): # an INS - orig_start = start - orig_ref = ref - orig_alt = alt - start = orig_start - ref = '-' - alt = orig_alt[len(orig_ref):] - print " WARNING: original INS = %s:%s:%s:%s:%s --> replaced with INS = %s:%s:%s:%s" % (chr,orig_start,end,orig_ref,orig_alt,chr,start,ref,alt) - else: # a DEL - print "ERROR: At the momemnt, cannot deal with this non-normalized deletion" - print line - raise SystemExit + print "" - new_key = '%s:%s:%s:%s' % (chr,start,ref,alt) - # record the data for CHILD G2P variants (for OBS=biallelic) - if new_key not in G2P_DICT: - G2P_DICT[new_key] = 0 - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - # and record the required data (CHILD_TRANS,CHILD_GENE,CHILD_GT) in G2P_DATA - if new_key not in G2P_DATA: - G2P_DATA[new_key] = (CHILD_TRANS,CHILD_GENE,CHILD_GT) - else: - # print "ERROR: duplicate G2P variant new_key = %s" % (new_key) - # raise SystemExit - # this will happen if a gene is e.g. hemizygous,x-linked dominant - there will be two separate lines in the output for each req - pass - NUM_UNIQ_G2P_VARS = len(G2P_DICT) - print "Found %s unique G2P variants in CHILD (%s) after considering MONOALLELIC, X-LINKED and BIALLELIC genes" % (NUM_UNIQ_G2P_VARS,CHILD_ID) - sys.stdout.flush() - print "" - print "" diff --git a/process_quad.sh b/process_quad.sh index 2aa1cead9ae8306dba5453c3db3a611012258cd0..29cd8931f2b7a6130863eb2649e042055580b9f4 100755 --- a/process_quad.sh +++ b/process_quad.sh @@ -30,13 +30,16 @@ SCRIPTS_DIR=/home/u035/u035/shared/scripts # other files to be used -TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.plus15bp.merged.bed # OK -CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.clinvar.20210626.plus15bp.txt # OK +TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed # OK +CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt # OK BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt # OK TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt # OK +REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1 + ### TOOLS ### +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix @@ -227,9 +230,9 @@ for KID_ID in ${KID_IDS[@]}; do --cache --cache_version 100 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ - --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20210706.txt" \ + --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ - --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20210706.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} + --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" echo "" @@ -355,6 +358,51 @@ for KID_ID in ${KID_IDS[@]}; do + ################################################################################################ + # check the coverage per each of the reccurent de novo SNPs (padded with 15bp both directions) # + ################################################################################################ + echo "Performing recurrent coverage analysis for PROBAND_ID = ${KID_ID}_${FAMILY_ID} ..." + + # we have identified the name of the proband's BAM file above (BAM_FILE), reuse it + # set the name of the file containing info about the coverage of the recurrent SNPs + REC_OUT_FILE=${COV_DIR}/${KID_ID}_${FAMILY_ID}.REC_SNP_COV.txt + + while IFS=$'\t' read -ra var; do + gene="${var[0]}" + chr="${var[1]}" + pos="${var[2]}" + lo=$(expr $pos - 15) + hi=$(expr $pos + 15) + reg="$lo-$hi" + echo "=============================================" + echo "$gene : recurrent variant at $chr:$pos" + echo "exploring coverage at $chr:$reg" + + echo "---------------------------------------------" + echo "precisely at the position" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos" + + echo "---------------------------------------------" + echo "average in the +/- 15bp region" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}' + + echo "---------------------------------------------" + echo "detailed in the +/- 15bp region" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} + done < ${REC_SNP} > ${REC_OUT_FILE} + + + echo "" + echo "" + echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done " + echo " Coverage file = ${REC_OUT_FILE}" + echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + echo "" + echo "" + + + ############################################################################################# @@ -684,9 +732,9 @@ time ${VEP} \ --cache --cache_version 100 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ - --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20210706.txt" \ + --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ - --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20210706.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} + --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" @@ -700,7 +748,8 @@ echo "" ######################################################## -### run coverage for each proband (DD genes) ### +### run coverage for each proband (DD genes) ### +### run reccurent SNP coverage for each proband ### ### already did it as part of the trio analysis ### ######################################################## diff --git a/process_shared.sh b/process_shared.sh index 0ac17aa790276a63427dc312cbc194c9277c215a..c3e10666114d645fc06a8eadb2e04d14d89c5598 100755 --- a/process_shared.sh +++ b/process_shared.sh @@ -29,16 +29,18 @@ SCRIPTS_DIR=/home/u035/u035/shared/scripts # other files to be used -TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.plus15bp.merged.bed # OK -CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.clinvar.20210626.plus15bp.txt # OK +TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed # OK +CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt # OK BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt # OK TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt # OK +REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1 ### TOOLS ### +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix @@ -179,9 +181,9 @@ time ${VEP} \ --cache --cache_version 100 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ - --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20210706.txt" \ + --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ - --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20210706.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} + --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" @@ -248,6 +250,49 @@ for INDI_ID in `${BCFTOOLS} query -l $file`; do echo "" echo "" + + + + # check the coverage per each of the reccurent de novo SNPs (padded with 15bp both directions) # + echo "Performing recurrent coverage analysis for PROBAND_ID = ${INDI_ID} ..." + + # we have identified the name of the proband's BAM file above (BAM_FILE), reuse it + # set the name of the file containing info about the coverage of the recurrent SNPs + REC_OUT_FILE=${COV_DIR}/${INDI_ID}.REC_SNP_COV.txt + + while IFS=$'\t' read -ra var; do + gene="${var[0]}" + chr="${var[1]}" + pos="${var[2]}" + lo=$(expr $pos - 15) + hi=$(expr $pos + 15) + reg="$lo-$hi" + echo "=============================================" + echo "$gene : recurrent variant at $chr:$pos" + echo "exploring coverage at $chr:$reg" + + echo "---------------------------------------------" + echo "precisely at the position" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos" + + echo "---------------------------------------------" + echo "average in the +/- 15bp region" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}' + + echo "---------------------------------------------" + echo "detailed in the +/- 15bp region" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} + done < ${REC_SNP} > ${REC_OUT_FILE} + + echo "" + echo "" + echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${INDI_ID}: done " + echo " Coverage file = ${REC_OUT_FILE}" + echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + echo "" + echo "" + done diff --git a/process_trio.sh b/process_trio.sh index 5cbe9b40d3f86bc2cdfbc07e8bcba1f9333797cf..b0ae363af34b001a7da780fdbfda5ef9ba586093 100755 --- a/process_trio.sh +++ b/process_trio.sh @@ -33,14 +33,16 @@ SCRIPTS_DIR=/home/u035/u035/shared/scripts # other files to be used FAMILY_IDS=${WORK_DIR}/FAM_IDs.txt # created by trio_setup.sh CHILD_IDS=${WORK_DIR}/PRO_IDs.txt # created by trio_setup.sh -TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.plus15bp.merged.bed # OK -CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20210706.clinvar.20210626.plus15bp.txt # OK +TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed # OK +CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt # OK BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt # OK TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt # OK +REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1 ### TOOLS ### +SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix @@ -161,9 +163,9 @@ time ${VEP} \ --cache --cache_version 100 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ - --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20210706.txt" \ + --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ - --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20210706.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} + --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" @@ -329,6 +331,53 @@ echo "" +################################################################################################ +# check the coverage per each of the reccurent de novo SNPs (padded with 15bp both directions) # +################################################################################################ +echo "Performing recurrent coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..." + +# we have identified the name of the proband's BAM file above (BAM_FILE), reuse it +# set the name of the file containing info about the coverage of the recurrent SNPs +REC_OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.REC_SNP_COV.txt + +while IFS=$'\t' read -ra var; do + gene="${var[0]}" + chr="${var[1]}" + pos="${var[2]}" + lo=$(expr $pos - 15) + hi=$(expr $pos + 15) + reg="$lo-$hi" + echo "=============================================" + echo "$gene : recurrent variant at $chr:$pos" + echo "exploring coverage at $chr:$reg" + + echo "---------------------------------------------" + echo "precisely at the position" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos" + + echo "---------------------------------------------" + echo "average in the +/- 15bp region" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}' + + echo "---------------------------------------------" + echo "detailed in the +/- 15bp region" + ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} +done < ${REC_SNP} > ${REC_OUT_FILE} + + +echo "" +echo "" +echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" +echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done " +echo " Coverage file = ${REC_OUT_FILE}" +echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" +echo "" +echo "" + + + + + ################################################################################### ### for each proband generate the DECIPHER file ### @@ -367,7 +416,7 @@ FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_ ## call the python scrpit -#~~#time ${PYTHON2} /home/u035/u035/shared/temp/generate_DEC_IGV_scripts.py \ +##time ${PYTHON2} /home/u035/u035/shared/temp/generate_DEC_IGV_scripts.py \ time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_scripts.py \ ${DEC_MAP} \ ${TRANS_MAP} \