From 523c39ee7474cac7715d7a0bdaf8fbfe9e0ff788 Mon Sep 17 00:00:00 2001
From: ameyner2 <alison.meynert@igmm.ed.ac.uk>
Date: Wed, 9 Sep 2020 10:37:44 +0100
Subject: [PATCH] Added Mike's new scripts for prioritization

---
 NHS_WES_trio_setup.sh                  |  2 +-
 convert_DEC_to_v10.py                  | 82 ++++++++++++++++++++++++++
 gather_NHS_WES_aff_probands_results.sh |  2 +
 gather_NHS_WES_trio_results.sh         |  1 +
 process_NHS_WES_aff_probands.sh        | 38 +++++++++---
 process_NHS_WES_trio.sh                | 33 +++++++----
 run_processing.sh                      | 15 ++---
 7 files changed, 145 insertions(+), 28 deletions(-)
 create mode 100644 convert_DEC_to_v10.py

diff --git a/NHS_WES_trio_setup.sh b/NHS_WES_trio_setup.sh
index 904f2fb..b608cec 100755
--- a/NHS_WES_trio_setup.sh
+++ b/NHS_WES_trio_setup.sh
@@ -36,7 +36,7 @@ fi
 
 
 
-echo "SOURCE_DIR = ${SOURCE_DIR}"	# the general path to the source VCF, BAM and PED files			i.e. /scratch/u035/project/trio_whole_exome/analysis/output/
+echo "SOURCE_DIR = ${SOURCE_DIR}"	# the general path to the source VCF, BAM and PED files			i.e. /scratch/u035/project/trio_whole_exome/analysis/output
 echo "BATCH_ID = ${BATCH_ID}"		# the ID of the batch being processed 					e.g. 11870_Germain_Lorna
 echo "PLATE_ID = ${PLATE_ID}" 		# the PCR plate ID of the batch being currently processed, 		e.g. 16862
 echo "PROJECT_ID = ${PROJECT_ID}"	# this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
diff --git a/convert_DEC_to_v10.py b/convert_DEC_to_v10.py
new file mode 100644
index 0000000..061f8b9
--- /dev/null
+++ b/convert_DEC_to_v10.py
@@ -0,0 +1,82 @@
+#	given a DECIPHER bulk upload file v9
+#	convert it to v10
+#
+#       Author: MH
+#       last modified: AUG 05, 2020
+
+
+
+import sys
+import os
+import csv
+import xlsxwriter
+
+
+
+
+def go(inout_dir,id):		# the folder where the bulk upload files are strored; id is in the format <indiv_id>_<family_id>
+
+    in_file = '%s/%s_DEC_FLT.csv' % (inout_dir,id)
+    out_file = '%s/%s_DECIPHER_v10.xlsx' % (inout_dir,id)
+
+    # create the workbook
+    workbook = xlsxwriter.Workbook(out_file)
+    
+    # create the worksheet
+    worksheet = workbook.add_worksheet('Sequence Variants')
+
+
+    # write the header row
+    header = ('Patient internal reference number or ID','Shared','Assembly','HGVS code','Chromosome','Genomic start','Ref sequence','Alt sequence','Gene name','Transcript','Is intergenic','Genotype','Inheritance','Pathogenicity','Pathogenicity evidence','Contribution','Genotype groups')
+    worksheet.write_row(0,0,header)
+
+
+    # now, open and read the old file, for each variant collecting the information required for v10 and writing it in the v10 file
+    cntr = 0
+
+    with open(in_file,'r') as tsvfile:
+        reader = csv.reader(tsvfile, delimiter=',', quotechar='"')
+        for row in reader:
+            if row[0] == 'Internal reference number or ID':      # ignore the header line
+                continue
+
+            cntr += 1 
+            id = str(row[0])
+            shared = 'NHS-SCE'
+            assembly = 'GRCh38'
+            HGVS = ''
+            chr = str(row[1])
+            start = str(row[2])
+            ref = str(row[4])
+            alt = str(row[5])
+            gene = str(row[7])
+            trans = str(row[6])
+            inter = str(row[8])
+            genotype = str(row[19])
+            inher = str(row[15])
+            if inher == 'Maternally inherited, constitutive in mother':
+                inher = 'Maternally inherited'
+            elif inher == 'Paternally inherited, constitutive in father':
+                inher = 'Paternally inherited'
+            patho = ''
+            evid = ''
+            cont = ''
+            gt_groups = ''
+            data = (id,shared,assembly,HGVS,chr,start,ref,alt,gene,trans,inter,genotype,inher,patho,evid,cont,gt_groups)   
+
+            # write it
+            worksheet.write_row(cntr,0,data)
+
+
+    # close the workbook
+    workbook.close()
+
+
+
+if __name__ == '__main__':
+    if len(sys.argv) == 3:
+        go(sys.argv[1],sys.argv[2])
+    else:
+        print ("Suggested use: time python convert_DEC_to_v10.py decipher_dir <indiv_id>_<family_id>")
+        raise SystemExit
+
diff --git a/gather_NHS_WES_aff_probands_results.sh b/gather_NHS_WES_aff_probands_results.sh
index 23f66f3..a70e579 100755
--- a/gather_NHS_WES_aff_probands_results.sh
+++ b/gather_NHS_WES_aff_probands_results.sh
@@ -39,6 +39,8 @@ cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.rep
 
 # copy all the DECIPHER files for bulk upload
 cp ${WORK_DIR}/DECIPHER/*_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/*_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR}
+
 
 # copy the variant snapshots
 cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR}
diff --git a/gather_NHS_WES_trio_results.sh b/gather_NHS_WES_trio_results.sh
index d42d93f..1697445 100755
--- a/gather_NHS_WES_trio_results.sh
+++ b/gather_NHS_WES_trio_results.sh
@@ -58,6 +58,7 @@ cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.rep
 
 # copy the DECIPHER file for bulk upload
 cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR}
+cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR}
 
 
 # copy the variant snapshots
diff --git a/process_NHS_WES_aff_probands.sh b/process_NHS_WES_aff_probands.sh
index 281748d..e9406ba 100755
--- a/process_NHS_WES_aff_probands.sh
+++ b/process_NHS_WES_aff_probands.sh
@@ -11,6 +11,7 @@ export PATH=$PATH:/home/u035/project/software/bcbio/anaconda/envs/python2/bin:/h
 export PERL5LIB=$PERL5LIB:/home/u035/project/software/bcbio/anaconda/lib/site_perl/5.26.2
 
 
+
 ### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh, done previously by the stanard trio-based pipeline ###
 BASE=/scratch/u035/project/trio_whole_exome/analysis
 WORK_DIR=$BASE/${PROJECT_ID}
@@ -36,17 +37,20 @@ TRANS_MAP=/home/u035/project/resources/current_trans_map.txt				# OK
 
 
 
+
 ### TOOLS ###
 BCFTOOLS=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/bcftools
 BGZIP=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/bgzip
 TABIX=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/tabix
 VT=/home/u035/project/software/bcbio/anaconda/bin/vt
 VASE=/home/u035/project/software/bcbio/anaconda/bin/vase
-GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk
+GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk							# points to ../share/gatk4-4.1.8.1-0/gatk
 GATK3=/home/u035/project/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
+PYTHON3=/home/u035/project/software/bcbio/anaconda/bin/python3							# points to python3.6
 PYTHON2=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/python2.7
-VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-97.3-0/vep
-REFERENCE_GENOME=/home/u035/project/data/reference/hg38.fa
+VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-100.4-0/vep
+REFERENCE_GENOME=/home/u035/project/resources/hg38.fa
+
 
 
 
@@ -159,7 +163,8 @@ G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR
 mkdir ${G2P_LOG_DIR}
 TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt
 HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html
-VCF_KEYS='gnomADe|gnomADg'
+#VCF_KEYS='gnomADe|gnomADg'     # old VEP version 97
+VCF_KEYS='gnomADe_GRCh38|gnomADg_r3.0_GRCh38'
 
 
 time ${VEP} \
@@ -171,11 +176,11 @@ time ${VEP} \
     --offline \
     --merged \
     --use_given_ref \
-    --cache --cache_version 97 \
+    --cache --cache_version 100 \
     --dir_cache /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/vep \
     --individual all \
     --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.01062020.txt" \
-    --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-97.3-0 \
+    --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
     --plugin G2P,file='/home/u035/project/resources/DDG2P.01062020.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
 
 
@@ -296,6 +301,23 @@ ${DEC_DIR} \
 ${FAM_BAM_DIR}
 
 
+
+#############################################################################################################################
+## using the DECIPHER bulk upload file v9 for each proband --> generate the DECIPHER bulk upload file v10 for each proband ##
+#############################################################################################################################
+# from the VCF, get all IDs (in the format probad_family_id), they are all affected probands (already checked at the start)
+# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
+
+file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
+for INDI_ID in `${BCFTOOLS} query -l $file`; do
+    #################################
+    #####    for each proband    ####
+    #################################
+    echo "...Generating v10 Decipher bulk upload file for proband = ${INDI_ID} ...."
+    time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${INDI_ID}
+done
+
+
 echo ""
 echo ""
 echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
@@ -400,10 +422,10 @@ do
 
   # generate the bamout file
   echo "...doing the bamout"
-  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME}" ${INPUT_BAM_LINE}" -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
+  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME}" ${INPUT_BAM_LINE}" -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
 
   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE} -L chr${chr}:${pos} --interval-padding 500 \
-  --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 \
+  --active-probability-threshold 0.000 -ploidy 2 \
   --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
 
 done
diff --git a/process_NHS_WES_trio.sh b/process_NHS_WES_trio.sh
index 8757cb1..2c28003 100755
--- a/process_NHS_WES_trio.sh
+++ b/process_NHS_WES_trio.sh
@@ -44,12 +44,12 @@ BGZIP=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/bgzip
 TABIX=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/tabix
 VT=/home/u035/project/software/bcbio/anaconda/bin/vt
 VASE=/home/u035/project/software/bcbio/anaconda/bin/vase
-GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk
+GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk							# points to ../share/gatk4-4.1.8.1-0/gatk
 GATK3=/home/u035/project/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
+PYTHON3=/home/u035/project/software/bcbio/anaconda/bin/python3							# points to python3.6
 PYTHON2=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/python2.7
-VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-97.3-0/vep
-REFERENCE_GENOME=/home/u035/project/data/reference/hg38.fa
-
+VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-100.4-0/vep
+REFERENCE_GENOME=/home/u035/project/resources/hg38.fa
 
 
 
@@ -155,8 +155,8 @@ G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR
 mkdir ${G2P_LOG_DIR}
 TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt
 HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html
-VCF_KEYS='gnomADe|gnomADg'
-
+#VCF_KEYS='gnomADe|gnomADg'	# old VEP version 97
+VCF_KEYS='gnomADe_GRCh38|gnomADg_r3.0_GRCh38'
 
 time ${VEP} \
     -i ${IN_FILE} \
@@ -167,11 +167,11 @@ time ${VEP} \
     --offline \
     --merged \
     --use_given_ref \
-    --cache --cache_version 97 \
+    --cache --cache_version 100 \
     --dir_cache /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/vep \
     --individual all \
     --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.01062020.txt" \
-    --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-97.3-0 \
+    --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
     --plugin G2P,file='/home/u035/project/resources/DDG2P.01062020.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
 
 
@@ -225,7 +225,7 @@ time ${VASE} \
 cd ${VASE_DIR}
 
 # index the denovo VCF
-time ${GATK4} IndexFeatureFile -F ${OUT_FILE}
+time ${GATK4} IndexFeatureFile -I ${OUT_FILE}
 
 # select only variants on the 24 chromosomes
 time ${GATK4} SelectVariants -R ${REFERENCE_GENOME} -V ${OUT_FILE} -O ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf -L /home/u035/project/resources/24_chr.list --exclude-non-variants
@@ -236,7 +236,7 @@ grep '^#' ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf > ${PLATE_ID}_${FAMIL
 && grep -v '^#' ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n >> ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf
 
 # index the sorted VCF
-time ${GATK4} IndexFeatureFile -F ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf
+time ${GATK4} IndexFeatureFile -I ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf
 
 # split multi-allelic sites [by -m -any]
 # left-alignment and normalization [by adding the -f]
@@ -371,6 +371,15 @@ ${DEC_DIR} \
 ${FAM_BAM_DIR}
 
 
+
+
+
+## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10
+echo "...Generating v10 Decipher bulk upload file for proband = ${PROBAND_ID}, family_id = ${FAMILY_ID} ..."
+time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${PROBAND_ID}_${FAMILY_ID}
+
+
+
 echo ""
 echo ""
 echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
@@ -462,11 +471,11 @@ do
   # generate the bamout file
   echo "...doing the bamout"
   echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \"
-  echo "   --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 \"
+  echo "   --active-probability-threshold 0.000 -ploidy 2 \"
   echo "   --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"
 
   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \
-  --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 \
+  --active-probability-threshold 0.000 -ploidy 2 \
   --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
 
 done
diff --git a/run_processing.sh b/run_processing.sh
index 82718d1..f6208eb 100755
--- a/run_processing.sh
+++ b/run_processing.sh
@@ -4,7 +4,7 @@
 #PBS -q uv2000
 #PBS -N run_processing
 #PBS -j oe
-
+ 
 
 # setup PATH
 export PATH=$PATH:/home/u035/project/software/bcbio/anaconda/envs/python2/bin:/home/u035/project/software/bcbio/anaconda/bin
@@ -47,7 +47,7 @@ GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk
 GATK3=/home/u035/project/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
 PYTHON2=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/python2.7
 VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-97.3-0/vep
-REFERENCE_GENOME=/home/u035/project/data/reference/hg38.fa
+REFERENCE_GENOME=/home/u035/project/resources/hg38.fa
 
 
 
@@ -149,7 +149,8 @@ G2P_LOG_DIR=${G2P_DIR}/${FAMILY_ID}_LOG_DIR
 mkdir ${G2P_LOG_DIR}
 TXT_OUT=${G2P_LOG_DIR}/${FAMILY_ID}.report.txt
 HTML_OUT=${G2P_LOG_DIR}/${FAMILY_ID}.report.html
-VCF_KEYS='gnomADe|gnomADg'
+#VCF_KEYS='gnomADe|gnomADg'	# old VEP
+VCF_KEYS='gnomADe_GRCh38|gnomADg_r3.0_GRCh38'
 
 
 time ${VEP} \
@@ -161,12 +162,12 @@ time ${VEP} \
     --offline \
     --merged \
     --use_given_ref \
-    --cache --cache_version 97 \
+    --cache --cache_version 100 \
     --dir_cache /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/vep \
     --individual all \
-    --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.19092019.txt" \
-    --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-97.3-0 \
-    --plugin G2P,file='/home/u035/project/resources/DDG2P.19092019.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
+    --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.01062020.txt" \
+    --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
+    --plugin G2P,file='/home/u035/project/resources/DDG2P.01062020.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}
 
 echo ""
 echo ""
-- 
GitLab