Skip to content
Snippets Groups Projects
Commit 3d2e76b8 authored by ameyner2's avatar ameyner2
Browse files

Mike updates

parent 9e11a751
No related branches found
No related tags found
No related merge requests found
#
# Author: MH
# last modified: SEPT 25, 2020
import sys
import os
import fnmatch
from collections import defaultdict
V1_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
V2_VARIANTS = defaultdict(list) # key: <indi_id>_<fam_id>; value: list of variants in the decipher upload file in the form internal_id:chr:pos:ref:alt
def go(v1_dir,v2_dir):
v1_files = []
for root, dirnames, filenames in os.walk(v1_dir):
for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
v1_files.append(os.path.join(root, filename))
print "Found %s *_DEC_FLT.csv files in %s" % (len(v1_files),v1_dir)
for v1_file in v1_files:
parsed = [x.strip() for x in v1_file.strip().split('/')]
dname = parsed[8]
fname = parsed[9]
fname_parsed = [y.strip() for y in fname.strip().split('_')]
indi_id = fname_parsed[0]
fam_id = fname_parsed[1]
key = "%s_%s" % (indi_id,fam_id)
V1_VARIANTS[key] = []
# print "Processing V1: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
# now open the file and read the variants
in_han = open(v1_file,'r')
for line in in_han:
if line.startswith("Internal reference number or ID"):
continue
data = [z.strip() for z in line.strip().split(',')]
internal_id = data[0]
chr = data[1]
pos = data[2]
ref = data[4]
alt = data[5]
value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
# print value
V1_VARIANTS[key].append(value)
# print " Found %s variants for %s" % (len(V1_VARIANTS[key]),key)
# now process V2
v2_files = []
for root, dirnames, filenames in os.walk(v2_dir):
for filename in fnmatch.filter(filenames, '*_DEC_FLT.csv'):
v2_files.append(os.path.join(root, filename))
print "Found %s *_DEC_FLT.csv files in %s" % (len(v2_files),v2_dir)
for v2_file in v2_files:
parsed = [x.strip() for x in v2_file.strip().split('/')]
dname = parsed[8]
fname = parsed[9]
fname_parsed = [y.strip() for y in fname.strip().split('_')]
indi_id = fname_parsed[0]
fam_id = fname_parsed[1]
key = "%s_%s" % (indi_id,fam_id)
V2_VARIANTS[key] = []
# print "Processing V2: folder = %s, file = %s, indi_id = %s, family_id = %s, key = %s" % (dname,fname,indi_id,fam_id,key)
# now open the file and read the variants
in_han = open(v2_file,'r')
for line in in_han:
if line.startswith("Internal reference number or ID"):
continue
data = [z.strip() for z in line.strip().split(',')]
internal_id = data[0]
chr = data[1]
pos = data[2]
ref = data[4]
alt = data[5]
value = '%s:%s:%s:%s:%s' % (internal_id,chr,pos,ref,alt)
# print value
V2_VARIANTS[key].append(value)
# print " Found %s variants for %s" % (len(V2_VARIANTS[key]),key)
# check that in both folders we have the same individuals (i.e. keys)
v1_keys = V1_VARIANTS.keys()
v2_keys = V2_VARIANTS.keys()
for v1_key in v1_keys:
if v1_key not in v2_keys:
print "v1_key = %s not found in v2_keys" % (v1_key)
raise SystemExit
for v2_key in v2_keys:
if v2_key not in v1_keys:
print "v2_key = %s not found in v1_keys" % (v2_key)
raise SystemExit
print "Match individuals between the two vesrions: OK"
sys.stdout.flush()
# now compare the variants per individual
for key in v1_keys:
print ""
print "Processing individual = %s" % (key)
v1_variants = V1_VARIANTS[key]
v2_variants = V2_VARIANTS[key]
for v1_var in v1_variants:
if v1_var in v2_variants:
print "%s: %s found in both versions" % (key,v1_var)
v2_variants.remove(v1_var)
else:
print " <<<< %s: %s found only in V1" % (key,v1_var)
# in v2_variants, remaining ar only those for which no match in V1 was found
for v2_var in v2_variants:
print " >>>> %s: %s found only in V2" % (key,v2_var)
print "-----------------------------"
if __name__ == '__main__':
if len(sys.argv) == 3:
go(sys.argv[1],sys.argv[2])
else:
print ("Suggested use: time python compare_indi_vars_by_version.py results_folder_v1 results_folder_v2")
raise SystemExit
...@@ -10,7 +10,7 @@ ...@@ -10,7 +10,7 @@
### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh ### ### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh ###
BASE=/scratch/u035/project/trio_whole_exome/analysis BASE=/scratch/u035/project/trio_whole_exome/analysis
WORK_DIR=${BASE}/${PROJECT_ID} WORK_DIR=${BASE}/${PROJECT_ID}
NHS_DIR=${WORK_DIR}/${PLATE_ID}_results NHS_DIR=${WORK_DIR}/${PLATE_ID}_${VERSION_N}_results
# other files to be used # other files to be used
...@@ -20,6 +20,7 @@ CHILD_IDS=${WORK_DIR}/PRO_IDs.txt ...@@ -20,6 +20,7 @@ CHILD_IDS=${WORK_DIR}/PRO_IDs.txt
echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862 echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862
echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "VERSION_N = ${VERSION_N}" # the version of the alignment and genotyping analysis
# check if ${NHS_DIR} already exists - if not, exit and ask to be created # check if ${NHS_DIR} already exists - if not, exit and ask to be created
......
#!/bin/bash #!/bin/bash
#PBS -l walltime=24:00:00 #PBS -l walltime=24:00:00
#PBS -l ncpus=1,mem=16gb #PBS -l ncpus=1,mem=16gb
#PBS -q uv2000 #PBS -q sgp
#PBS -N process_trio #PBS -N process_trio
#PBS -j oe #PBS -j oe
...@@ -57,7 +57,7 @@ echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source BAM fil ...@@ -57,7 +57,7 @@ echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source BAM fil
echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 11870_Germain_Lorna echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 11870_Germain_Lorna
echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862 echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862
echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "VERSION_N = ${VERSION_N}" # the version of the alignment and genotyping analysis
...@@ -266,10 +266,10 @@ echo "" ...@@ -266,10 +266,10 @@ echo ""
################################################################################################################################################# ##############################################################################################################################################################
### run coverage for each proband (DD genes) ### ### run coverage for each proband (DD genes) ###
### format: ${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam ### ### format: ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam ###
################################################################################################################################################# ##############################################################################################################################################################
################################# #################################
...@@ -282,7 +282,7 @@ echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} . ...@@ -282,7 +282,7 @@ echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} .
# make sure we are reading the data from the exact batch & plate ID # make sure we are reading the data from the exact batch & plate ID
BAM_FILE=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam BAM_FILE=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam
OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15 OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15
...@@ -353,7 +353,7 @@ DEC_MAP=${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt ...@@ -353,7 +353,7 @@ DEC_MAP=${WORK_DIR}/DECIPHER_INTERNAL_IDs.txt
IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt
IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf
FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID} FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}
FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID} FAM_BAM_DIR=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}
## call the python scrpit ## call the python scrpit
...@@ -432,9 +432,9 @@ done ...@@ -432,9 +432,9 @@ done
echo "...kid_id = ${kid_id}, par_1_id = ${par_1_id}, par_2_id = ${par_2_id} " echo "...kid_id = ${kid_id}, par_1_id = ${par_1_id}, par_2_id = ${par_2_id} "
# gather the trio BAM files # gather the trio BAM files
kid_bam=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam kid_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${kid_id}/${kid_id}-ready.bam
par_1_bam=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam par_1_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_1_id}/${par_1_id}-ready.bam
par_2_bam=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam par_2_bam=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${par_2_id}/${par_2_id}-ready.bam
echo "...kid_bam = ${kid_bam}..." echo "...kid_bam = ${kid_bam}..."
echo "...par_1_bam = ${par_1_bam}..." echo "...par_1_bam = ${par_1_bam}..."
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment