From e19e28e5108702fb97836dbcc3dc60cb9d863c20 Mon Sep 17 00:00:00 2001
From: ameyner2 <alison.meynert@igmm.ed.ac.uk>
Date: Fri, 18 Jun 2021 11:39:12 +0100
Subject: [PATCH] Update to handle different G2P output

---
 NHS_WES_generate_DEC_IGV_aff_probands.py  | 14 ++++++++------
 NHS_WES_generate_DEC_IGV_sib_from_quad.py | 14 ++++++++------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/NHS_WES_generate_DEC_IGV_aff_probands.py b/NHS_WES_generate_DEC_IGV_aff_probands.py
index 29023d3..25a965b 100755
--- a/NHS_WES_generate_DEC_IGV_aff_probands.py
+++ b/NHS_WES_generate_DEC_IGV_aff_probands.py
@@ -349,27 +349,29 @@ def read_G2P(in_file):
             ##########################################
 
             check_key = '%s:%s' % (sam_id,second_key)
-            if check_key not in CHECK_DICT:
+            if check_key not in CHECK_DICT:						# first time we see this var in this sample, any OBS_state
                 CHECK_DICT[check_key][OBS_state] = 1
                 if sam_id not in KIDS_G2P_DICT:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                 elif second_key not in KIDS_G2P_DICT[sam_id]:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
-                else:
-                    print "ERROR: should not be here: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                else:									# sanity check
+                    print "ERROR: first time var already seen?: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
                     raise SystemExit
 
-            elif OBS_state not in CHECK_DICT[check_key].keys():
+            elif OBS_state not in CHECK_DICT[check_key].keys():				# first time we see this var in this sample with this OBS_state
                 CHECK_DICT[check_key][OBS_state] = 1
                 if sam_id not in KIDS_G2P_DICT:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                 elif second_key not in KIDS_G2P_DICT[sam_id]:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif KIDS_G2P_DICT[sam_id][second_key] == (GT,gene_name,transcript):    # different OBS_state, but must have the same (GT,gene_name,transcript)
+                    pass
                 else:
-                    print "ERROR: should not be here: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                    print "ERROR: diff (GT,gene_name,transcript) for variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
                     raise SystemExit
 
-            else: 	# same individual, same variant, same OBS_state
+            else: 	# same individual, same variant, known OBS_state
                         # due to the new output of G2P we may have the same variant but with different gene names - ensembl/refseq
                         # check the gene name in KIDS_G2P_DICT[sam_id][second_key]
                 if not KIDS_G2P_DICT[sam_id][second_key][1].startswith('ENSG'):             # recorded is refseq
diff --git a/NHS_WES_generate_DEC_IGV_sib_from_quad.py b/NHS_WES_generate_DEC_IGV_sib_from_quad.py
index 304ceb7..eb978f2 100755
--- a/NHS_WES_generate_DEC_IGV_sib_from_quad.py
+++ b/NHS_WES_generate_DEC_IGV_sib_from_quad.py
@@ -354,27 +354,29 @@ def read_G2P(in_file):
             ##########################################
 
             check_key = '%s:%s' % (sam_id,second_key)
-            if check_key not in CHECK_DICT:
+            if check_key not in CHECK_DICT:						# first time we see this var in this sample, any OBS_state
                 CHECK_DICT[check_key][OBS_state] = 1
                 if sam_id not in KIDS_G2P_DICT:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                 elif second_key not in KIDS_G2P_DICT[sam_id]:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
-                else:
-                    print "ERROR: should not be here: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                else:									# sanity check
+                    print "ERROR: first time var already seen?: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
                     raise SystemExit
 
-            elif OBS_state not in CHECK_DICT[check_key].keys():
+            elif OBS_state not in CHECK_DICT[check_key].keys():				# first time we see this var in this sample with this OBS_state
                 CHECK_DICT[check_key][OBS_state] = 1
                 if sam_id not in KIDS_G2P_DICT:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
                 elif second_key not in KIDS_G2P_DICT[sam_id]:
                     KIDS_G2P_DICT[sam_id][second_key] = (GT,gene_name,transcript)
+                elif KIDS_G2P_DICT[sam_id][second_key] == (GT,gene_name,transcript):	# different OBS_state, but must have the same (GT,gene_name,transcript)
+                    pass
                 else:
-                    print "ERROR: should not be here: variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
+                    print "ERROR: diff (GT,gene_name,transcript) for variant %s in %s gene for CHILD = %s" % (second_key,gene_name,sam_id)
                     raise SystemExit
 
-            else:       # same individual, same variant, same OBS_state
+            else:       # same individual, same variant, known OBS_state
                         # due to the new output of G2P we may have the same variant but with different gene names - ensembl/refseq
                         # check the gene name in KIDS_G2P_DICT[sam_id][second_key]
                 if not KIDS_G2P_DICT[sam_id][second_key][1].startswith('ENSG'):             # recorded is refseq
-- 
GitLab