Mike updates

3d2e76b8 · ameyner2 · 9e11a751 · 3d2e76b8 · 3d2e76b8 · 3d2e76b8
Commit 3d2e76b8 authored 4 years ago by ameyner2
--- a/compare_indi_vars_by_version.py
+++ b/compare_indi_vars_by_version.py
+#
+#       Author: MH
+#       last modified: SEPT 25, 2020
+import sys
+import os
+import fnmatch
+from collections import defaultdict
+V1_VARIANTS = defaultdict(list)		# key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
+V2_VARIANTS = defaultdict(list)		# key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
+def go(v1_dir,v2_dir):
+    v1_files = []
+    for root, dirnames, filenames in os.walk(v1_dir):
+        for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
+            v1_files.append(os.path.join(root, filename))
+    print "Found %s *_DEC_FLT.csv files in %s" % (len(v1_files),v1_dir)
+    for v1_file in v1_files:
+        parsed = [x.strip() for x in v1_file.strip().split('/')]
+        dname = parsed[8]
+        fname = parsed[9]
+        fname_parsed = [y.strip() for y in fname.strip().split('_')]
+        indi_id = fname_parsed[0]
+        fam_id = fname_parsed[1]
+        key = "%s_%s" % (indi_id,fam_id)
+        V1_VARIANTS[key] = []  
+#        print "Processing V1: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)   
+        # now open the file and read the variants
+        in_han = open(v1_file,'r')
+        for line in in_han:
+            if line.startswith("Internal reference number or ID"):
+                continue
+            data = [z.strip() for z in line.strip().split(',')]
+            internal_id = data[0]
+            chr = data[1]
+            pos = data[2]
+            ref = data[4]
+            alt = data[5]
+            value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
+#            print value
+            V1_VARIANTS[key].append(value)
+#        print "    Found %s variants for %s" % (len(V1_VARIANTS[key]),key)              
+    # now process V2
+    v2_files = []
+    for root, dirnames, filenames in os.walk(v2_dir):
+        for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
+            v2_files.append(os.path.join(root, filename))
+    print "Found %s *_DEC_FLT.csv files in %s" % (len(v2_files),v2_dir)
+    for v2_file in v2_files:
+        parsed = [x.strip() for x in v2_file.strip().split('/')]
+        dname = parsed[8]
+        fname = parsed[9]
+        fname_parsed = [y.strip() for y in fname.strip().split('_')]
+        indi_id = fname_parsed[0]
+        fam_id = fname_parsed[1]
+        key = "%s_%s" % (indi_id,fam_id)
+        V2_VARIANTS[key] = []
+#        print "Processing V2: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
+        # now open the file and read the variants
+        in_han = open(v2_file,'r')
+        for line in in_han:
+            if line.startswith("Internal reference number or ID"):
+                continue
+            data = [z.strip() for z in line.strip().split(',')]
+            internal_id = data[0]
+            chr = data[1]
+            pos = data[2]
+            ref = data[4]
+            alt = data[5]
+            value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
+#            print value
+            V2_VARIANTS[key].append(value)
+#        print "    Found %s variants for %s" % (len(V2_VARIANTS[key]),key)
+    # check that in both folders we have the same individuals (i.e. keys)
+    v1_keys = V1_VARIANTS.keys()
+    v2_keys = V2_VARIANTS.keys()
+    for v1_key in v1_keys:
+        if v1_key not in v2_keys:  
+            print "v1_key = %s not found in v2_keys" % (v1_key)
+            raise SystemExit
+    for v2_key in v2_keys:
+        if v2_key not in v1_keys:
+            print "v2_key = %s not found in v1_keys" % (v2_key)
+            raise SystemExit
+    print "Match individuals between the two vesrions: OK"
+    sys.stdout.flush()
+    # now compare the variants per individual
+    for key in v1_keys:
+        print ""
+        print "Processing individual = %s" % (key)
+        v1_variants = V1_VARIANTS[key]
+        v2_variants = V2_VARIANTS[key]
+        for v1_var in v1_variants:
+            if v1_var in v2_variants:
+                print "%s: %s found in both versions" % (key,v1_var)
+                v2_variants.remove(v1_var)
+            else:
+                print "  <<<< %s: %s found only in V1" % (key,v1_var)    
+        # in v2_variants, remaining ar only those for which no match in V1 was found
+        for v2_var in v2_variants:
+            print "  >>>> %s: %s found only in V2" % (key,v2_var)
+        print "-----------------------------"
+if __name__ == '__main__':
+    if len(sys.argv) == 3:
+        go(sys.argv[1],sys.argv[2])
+    else:
+        print ("Suggested use: time python compare_indi_vars_by_version.py results_folder_v1 results_folder_v2")
+        raise SystemExit
--- a/gather_NHS_WES_trio_results.sh
+++ b/gather_NHS_WES_trio_results.sh
@@ -10,7 +10,7 @@
 ### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh ###
 BASE=/scratch/u035/project/trio_whole_exome/analysis
 WORK_DIR=${BASE}/${PROJECT_ID}
-NHS_DIR=${WORK_DIR}/${PLATE_ID}_results
+NHS_DIR=${WORK_DIR}/${PLATE_ID}_${VERSION_N}_results
 # other files to be used
@@ -20,6 +20,7 @@ CHILD_IDS=${WORK_DIR}/PRO_IDs.txt
 echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 16862
 echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
 # check if ${NHS_DIR} already exists - if not, exit and ask to be created

--- a/process_NHS_WES_trio.sh
+++ b/process_NHS_WES_trio.sh
 #!/bin/bash
 #PBS -l walltime=24:00:00
 #PBS -l ncpus=1,mem=16gb
-#PBS -q uv2000
+#PBS -q sgp
 #PBS -N process_trio
 #PBS -j oe
@@ -57,7 +57,7 @@ echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source BAM fil
 echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   	e.g. 11870_Germain_Lorna
 echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 16862
 echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
+echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
@@ -266,10 +266,10 @@ echo ""
-#################################################################################################################################################
+##############################################################################################################################################################
-###          run coverage for each proband (DD genes)                                                                                         ###
+###          run coverage for each proband (DD genes)                                                                                                      ###
-###   format: ${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam   ###
+###   format: ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam   ###
-#################################################################################################################################################
+##############################################################################################################################################################
 #################################
@@ -282,7 +282,7 @@ echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} .
 # make sure we are reading the data from the exact batch & plate ID
-BAM_FILE=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam
+BAM_FILE=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam
 OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15
@@ -353,7 +353,7 @@ DEC_MAP=${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt
 IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt
 IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf
 FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}
-FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}
+FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}
 ## call the python scrpit
@@ -432,9 +432,9 @@ done
 echo "...kid_id = ${kid_id}, par_1_id = ${par_1_id}, par_2_id = ${par_2_id} "
 # gather the trio BAM files
-kid_bam=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam
+kid_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam
-par_1_bam=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam
+par_1_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam
-par_2_bam=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam
+par_2_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam
 echo "...kid_bam = ${kid_bam}..."
 echo "...par_1_bam = ${par_1_bam}..."