From 523c39ee7474cac7715d7a0bdaf8fbfe9e0ff788 Mon Sep 17 00:00:00 2001 From: ameyner2 <alison.meynert@igmm.ed.ac.uk> Date: Wed, 9 Sep 2020 10:37:44 +0100 Subject: [PATCH] Added Mike's new scripts for prioritization --- NHS_WES_trio_setup.sh | 2 +- convert_DEC_to_v10.py | 82 ++++++++++++++++++++++++++ gather_NHS_WES_aff_probands_results.sh | 2 + gather_NHS_WES_trio_results.sh | 1 + process_NHS_WES_aff_probands.sh | 38 +++++++++--- process_NHS_WES_trio.sh | 33 +++++++---- run_processing.sh | 15 ++--- 7 files changed, 145 insertions(+), 28 deletions(-) create mode 100644 convert_DEC_to_v10.py diff --git a/NHS_WES_trio_setup.sh b/NHS_WES_trio_setup.sh index 904f2fb..b608cec 100755 --- a/NHS_WES_trio_setup.sh +++ b/NHS_WES_trio_setup.sh @@ -36,7 +36,7 @@ fi -echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source VCF, BAM and PED files i.e. /scratch/u035/project/trio_whole_exome/analysis/output/ +echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source VCF, BAM and PED files i.e. /scratch/u035/project/trio_whole_exome/analysis/output echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 11870_Germain_Lorna echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862 echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done diff --git a/convert_DEC_to_v10.py b/convert_DEC_to_v10.py new file mode 100644 index 0000000..061f8b9 --- /dev/null +++ b/convert_DEC_to_v10.py @@ -0,0 +1,82 @@ +# given a DECIPHER bulk upload file v9 +# convert it to v10 +# +# Author: MH +# last modified: AUG 05, 2020 + + + +import sys +import os +import csv +import xlsxwriter + + + + +def go(inout_dir,id): # the folder where the bulk upload files are strored; id is in the format <indiv_id>_<family_id> + + in_file = '%s/%s_DEC_FLT.csv' % (inout_dir,id) + out_file = '%s/%s_DECIPHER_v10.xlsx' % (inout_dir,id) + + # create the workbook + workbook = xlsxwriter.Workbook(out_file) + + # create the worksheet + worksheet = workbook.add_worksheet('Sequence Variants') + + + # write the header row + header = ('Patient internal reference number or ID','Shared','Assembly','HGVS code','Chromosome','Genomic start','Ref sequence','Alt sequence','Gene name','Transcript','Is intergenic','Genotype','Inheritance','Pathogenicity','Pathogenicity evidence','Contribution','Genotype groups') + worksheet.write_row(0,0,header) + + + # now, open and read the old file, for each variant collecting the information required for v10 and writing it in the v10 file + cntr = 0 + + with open(in_file,'r') as tsvfile: + reader = csv.reader(tsvfile, delimiter=',', quotechar='"') + for row in reader: + if row[0] == 'Internal reference number or ID': # ignore the header line + continue + + cntr += 1 + id = str(row[0]) + shared = 'NHS-SCE' + assembly = 'GRCh38' + HGVS = '' + chr = str(row[1]) + start = str(row[2]) + ref = str(row[4]) + alt = str(row[5]) + gene = str(row[7]) + trans = str(row[6]) + inter = str(row[8]) + genotype = str(row[19]) + inher = str(row[15]) + if inher == 'Maternally inherited, constitutive in mother': + inher = 'Maternally inherited' + elif inher == 'Paternally inherited, constitutive in father': + inher = 'Paternally inherited' + patho = '' + evid = '' + cont = '' + gt_groups = '' + data = (id,shared,assembly,HGVS,chr,start,ref,alt,gene,trans,inter,genotype,inher,patho,evid,cont,gt_groups) + + # write it + worksheet.write_row(cntr,0,data) + + + # close the workbook + workbook.close() + + + +if __name__ == '__main__': + if len(sys.argv) == 3: + go(sys.argv[1],sys.argv[2]) + else: + print ("Suggested use: time python convert_DEC_to_v10.py decipher_dir <indiv_id>_<family_id>") + raise SystemExit + diff --git a/gather_NHS_WES_aff_probands_results.sh b/gather_NHS_WES_aff_probands_results.sh index 23f66f3..a70e579 100755 --- a/gather_NHS_WES_aff_probands_results.sh +++ b/gather_NHS_WES_aff_probands_results.sh @@ -39,6 +39,8 @@ cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.rep # copy all the DECIPHER files for bulk upload cp ${WORK_DIR}/DECIPHER/*_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR} +cp ${WORK_DIR}/DECIPHER/*_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR} + # copy the variant snapshots cp ${WORK_DIR}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}/*.png ${FAM_DIR} diff --git a/gather_NHS_WES_trio_results.sh b/gather_NHS_WES_trio_results.sh index d42d93f..1697445 100755 --- a/gather_NHS_WES_trio_results.sh +++ b/gather_NHS_WES_trio_results.sh @@ -58,6 +58,7 @@ cp ${WORK_DIR}/G2P/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.rep # copy the DECIPHER file for bulk upload cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DEC_FLT.csv ${FAM_DIR} +cp ${WORK_DIR}/DECIPHER/${PROBAND_ID}_${FAMILY_ID}_DECIPHER_v10.xlsx ${FAM_DIR} # copy the variant snapshots diff --git a/process_NHS_WES_aff_probands.sh b/process_NHS_WES_aff_probands.sh index 281748d..e9406ba 100755 --- a/process_NHS_WES_aff_probands.sh +++ b/process_NHS_WES_aff_probands.sh @@ -11,6 +11,7 @@ export PATH=$PATH:/home/u035/project/software/bcbio/anaconda/envs/python2/bin:/h export PERL5LIB=$PERL5LIB:/home/u035/project/software/bcbio/anaconda/lib/site_perl/5.26.2 + ### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh, done previously by the stanard trio-based pipeline ### BASE=/scratch/u035/project/trio_whole_exome/analysis WORK_DIR=$BASE/${PROJECT_ID} @@ -36,17 +37,20 @@ TRANS_MAP=/home/u035/project/resources/current_trans_map.txt # OK + ### TOOLS ### BCFTOOLS=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/bcftools BGZIP=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/bgzip TABIX=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/tabix VT=/home/u035/project/software/bcbio/anaconda/bin/vt VASE=/home/u035/project/software/bcbio/anaconda/bin/vase -GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk +GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk # points to ../share/gatk4-4.1.8.1-0/gatk GATK3=/home/u035/project/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar +PYTHON3=/home/u035/project/software/bcbio/anaconda/bin/python3 # points to python3.6 PYTHON2=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/python2.7 -VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep" # points to ../share/ensembl-vep-97.3-0/vep -REFERENCE_GENOME=/home/u035/project/data/reference/hg38.fa +VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep" # points to ../share/ensembl-vep-100.4-0/vep +REFERENCE_GENOME=/home/u035/project/resources/hg38.fa + @@ -159,7 +163,8 @@ G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR mkdir ${G2P_LOG_DIR} TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html -VCF_KEYS='gnomADe|gnomADg' +#VCF_KEYS='gnomADe|gnomADg' # old VEP version 97 +VCF_KEYS='gnomADe_GRCh38|gnomADg_r3.0_GRCh38' time ${VEP} \ @@ -171,11 +176,11 @@ time ${VEP} \ --offline \ --merged \ --use_given_ref \ - --cache --cache_version 97 \ + --cache --cache_version 100 \ --dir_cache /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.01062020.txt" \ - --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-97.3-0 \ + --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ --plugin G2P,file='/home/u035/project/resources/DDG2P.01062020.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} @@ -296,6 +301,23 @@ ${DEC_DIR} \ ${FAM_BAM_DIR} + +############################################################################################################################# +## using the DECIPHER bulk upload file v9 for each proband --> generate the DECIPHER bulk upload file v10 for each proband ## +############################################################################################################################# +# from the VCF, get all IDs (in the format probad_family_id), they are all affected probands (already checked at the start) +# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID} + +file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz +for INDI_ID in `${BCFTOOLS} query -l $file`; do + ################################# + ##### for each proband #### + ################################# + echo "...Generating v10 Decipher bulk upload file for proband = ${INDI_ID} ...." + time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${INDI_ID} +done + + echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" @@ -400,10 +422,10 @@ do # generate the bamout file echo "...doing the bamout" - echo " time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME}" ${INPUT_BAM_LINE}" -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" + echo " time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME}" ${INPUT_BAM_LINE}" -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE} -L chr${chr}:${pos} --interval-padding 500 \ - --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 \ + --active-probability-threshold 0.000 -ploidy 2 \ --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam done diff --git a/process_NHS_WES_trio.sh b/process_NHS_WES_trio.sh index 8757cb1..2c28003 100755 --- a/process_NHS_WES_trio.sh +++ b/process_NHS_WES_trio.sh @@ -44,12 +44,12 @@ BGZIP=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/bgzip TABIX=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/tabix VT=/home/u035/project/software/bcbio/anaconda/bin/vt VASE=/home/u035/project/software/bcbio/anaconda/bin/vase -GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk +GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk # points to ../share/gatk4-4.1.8.1-0/gatk GATK3=/home/u035/project/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar +PYTHON3=/home/u035/project/software/bcbio/anaconda/bin/python3 # points to python3.6 PYTHON2=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/python2.7 -VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep" # points to ../share/ensembl-vep-97.3-0/vep -REFERENCE_GENOME=/home/u035/project/data/reference/hg38.fa - +VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep" # points to ../share/ensembl-vep-100.4-0/vep +REFERENCE_GENOME=/home/u035/project/resources/hg38.fa @@ -155,8 +155,8 @@ G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR mkdir ${G2P_LOG_DIR} TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html -VCF_KEYS='gnomADe|gnomADg' - +#VCF_KEYS='gnomADe|gnomADg' # old VEP version 97 +VCF_KEYS='gnomADe_GRCh38|gnomADg_r3.0_GRCh38' time ${VEP} \ -i ${IN_FILE} \ @@ -167,11 +167,11 @@ time ${VEP} \ --offline \ --merged \ --use_given_ref \ - --cache --cache_version 97 \ + --cache --cache_version 100 \ --dir_cache /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.01062020.txt" \ - --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-97.3-0 \ + --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ --plugin G2P,file='/home/u035/project/resources/DDG2P.01062020.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} @@ -225,7 +225,7 @@ time ${VASE} \ cd ${VASE_DIR} # index the denovo VCF -time ${GATK4} IndexFeatureFile -F ${OUT_FILE} +time ${GATK4} IndexFeatureFile -I ${OUT_FILE} # select only variants on the 24 chromosomes time ${GATK4} SelectVariants -R ${REFERENCE_GENOME} -V ${OUT_FILE} -O ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf -L /home/u035/project/resources/24_chr.list --exclude-non-variants @@ -236,7 +236,7 @@ grep '^#' ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf > ${PLATE_ID}_${FAMIL && grep -v '^#' ${PLATE_ID}_${FAMILY_ID}.strict.24chr.denovo.vcf | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n >> ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf # index the sorted VCF -time ${GATK4} IndexFeatureFile -F ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf +time ${GATK4} IndexFeatureFile -I ${PLATE_ID}_${FAMILY_ID}.strict.24chr.sort.denovo.vcf # split multi-allelic sites [by -m -any] # left-alignment and normalization [by adding the -f] @@ -371,6 +371,15 @@ ${DEC_DIR} \ ${FAM_BAM_DIR} + + + +## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10 +echo "...Generating v10 Decipher bulk upload file for proband = ${PROBAND_ID}, family_id = ${FAMILY_ID} ..." +time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${PROBAND_ID}_${FAMILY_ID} + + + echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" @@ -462,11 +471,11 @@ do # generate the bamout file echo "...doing the bamout" echo " time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \" - echo " --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 \" + echo " --active-probability-threshold 0.000 -ploidy 2 \" echo " --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \ - --active-probability-threshold 0.000 --disable-optimizations True -ploidy 2 \ + --active-probability-threshold 0.000 -ploidy 2 \ --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam done diff --git a/run_processing.sh b/run_processing.sh index 82718d1..f6208eb 100755 --- a/run_processing.sh +++ b/run_processing.sh @@ -4,7 +4,7 @@ #PBS -q uv2000 #PBS -N run_processing #PBS -j oe - + # setup PATH export PATH=$PATH:/home/u035/project/software/bcbio/anaconda/envs/python2/bin:/home/u035/project/software/bcbio/anaconda/bin @@ -47,7 +47,7 @@ GATK4=/home/u035/project/software/bcbio/anaconda/bin/gatk GATK3=/home/u035/project/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar PYTHON2=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/python2.7 VEP="/home/u035/project/software/bcbio/anaconda/bin/perl /home/u035/project/software/bcbio/anaconda/bin/vep" # points to ../share/ensembl-vep-97.3-0/vep -REFERENCE_GENOME=/home/u035/project/data/reference/hg38.fa +REFERENCE_GENOME=/home/u035/project/resources/hg38.fa @@ -149,7 +149,8 @@ G2P_LOG_DIR=${G2P_DIR}/${FAMILY_ID}_LOG_DIR mkdir ${G2P_LOG_DIR} TXT_OUT=${G2P_LOG_DIR}/${FAMILY_ID}.report.txt HTML_OUT=${G2P_LOG_DIR}/${FAMILY_ID}.report.html -VCF_KEYS='gnomADe|gnomADg' +#VCF_KEYS='gnomADe|gnomADg' # old VEP +VCF_KEYS='gnomADe_GRCh38|gnomADg_r3.0_GRCh38' time ${VEP} \ @@ -161,12 +162,12 @@ time ${VEP} \ --offline \ --merged \ --use_given_ref \ - --cache --cache_version 97 \ + --cache --cache_version 100 \ --dir_cache /home/u035/project/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ - --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.19092019.txt" \ - --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-97.3-0 \ - --plugin G2P,file='/home/u035/project/resources/DDG2P.19092019.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} + --transcript_filter "gene_symbol in /home/u035/project/resources/genes_in_DDG2P.01062020.txt" \ + --dir_plugins /home/u035/project/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ + --plugin G2P,file='/home/u035/project/resources/DDG2P.01062020.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" echo "" -- GitLab