#!/bin/bash #PBS -l walltime=24:00:00 #PBS -l ncpus=1,mem=16gb #PBS -q uv2000 #PBS -N process_trio #PBS -j oe # setup PATH export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2 ### folder structure for the downstream analysis - created by NHS_WES_trio_setup.sh ### #BASE=/scratch/u035/u035/shared/analysis/wes_pilot BASE=/scratch/u035/u035/shared/trio_whole_exome/analysis WORK_DIR=$BASE/${PROJECT_ID} VCF_DIR=${WORK_DIR}/VCF PED_DIR=${WORK_DIR}/PED LOG_DIR=${WORK_DIR}/LOG G2P_DIR=${WORK_DIR}/G2P VASE_DIR=${WORK_DIR}/VASE COV_DIR=${WORK_DIR}/COV DEC_DIR=${WORK_DIR}/DECIPHER IGV_DIR=${DEC_DIR}/IGV CNV_DIR=${WORK_DIR}/CNV SCRIPTS_DIR=/home/u035/u035/shared/scripts # other files to be used FAMILY_IDS=${WORK_DIR}/FAM_IDs.txt # created by NHS_WES_trio_setup.sh CHILD_IDS=${WORK_DIR}/PRO_IDs.txt # created by NHS_WES_trio_setup.sh TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20190919.plus15bp.merged.bed # OK CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20190919.clinvar.20190916.plus15bp.txt # OK BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt # OK TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt # OK ### TOOLS ### BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7 VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep" # points to ../share/ensembl-vep-97.3-0/vep REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source BAM files (VCF and PED already copied) i.e. /scratch/u035/u035/shared/trio_whole_exome/analysis/output/ echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 11870_Germain_Lorna echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862 echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done # enable running singletons if [ -z $PBS_ARRAY_INDEX ] then if [ -z $INDEX ] then export PBS_ARRAY_INDEX=1 else export PBS_ARRAY_INDEX=$INDEX fi fi # change to the LOG folder cd ${LOG_DIR} ################################ ##### for each family #### ################################ FAMILY_ID=`head -n ${PBS_ARRAY_INDEX} ${FAMILY_IDS} | tail -n 1` ######################################################################## ### DNU and clean the each family VCF ############ ### ### format: ${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz ### ######################################################################## echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}'s VCF file..." time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz -o ${VCF_DIR}/$${PLATE_ID}_{FAMILY_ID}.DNU.vcf.gz # remove sites with AC=0 time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf # reset GT to no-call if num_ALT < num_ALT_THERSH or VAF < VAF_THRESH and GT != 0/0 # exlude variants from the blacklist (matching on chr,pos,ref,alt) time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf # bgzip and tabix it time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz # delete intermediate files rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf # to avoid bgzip pipe broken annoying, but not problematic message - skip the next step, will be deleted later ##############rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}'s VCF file: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ########################################################### ### run G2P for each family VCF (DD genes) ### ### format: ${FAMILY_ID}.ready.vcf.gz ### ########################################################### echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${FAMILY_ID}..." ######################################################################################IN_FILE=${VCF_DIR}/${FAMILY_ID}.ready.vcf.gz IN_FILE=${VCF_DIR}/${FAMILY_ID}.clean.vcf ################################################################################# G2P_LOG_DIR=${G2P_DIR}/${FAMILY_ID}_LOG_DIR mkdir ${G2P_LOG_DIR} TXT_OUT=${G2P_LOG_DIR}/${FAMILY_ID}.report.txt HTML_OUT=${G2P_LOG_DIR}/${FAMILY_ID}.report.html VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38' time ${VEP} \ -i ${IN_FILE} \ --output_file ${G2P_LOG_DIR}/${FAMILY_ID}_inter_out.txt \ --force_overwrite \ --assembly GRCh38 \ --fasta ${REFERENCE_GENOME} \ --offline \ --merged \ --use_given_ref \ --cache --cache_version 97 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.19092019.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-97.3-0 \ --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.19092019.csv',af_from_vcf=1,confidence_levels='confirmed&probable&both RD and IF', af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "G2P analysis of FAMILY_ID = ${FAMILY_ID}: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ########################################################### ### run VASE for each family VCF (de novo) ### ### format: ${FAMILY_ID}.ready.vcf.gz ### ########################################################### echo "Performing de novo analysis with VASE for FAMILY_ID = ${FAMILY_ID} ..." IN_FILE=${VCF_DIR}/${FAMILY_ID}.ready.vcf.gz OUT_FILE=${VASE_DIR}/${FAMILY_ID}.strict.denovo.vcf # PED_FILE=${PED_DIR}/${BATCH}_${FAMILY_ID}.ped PED_FILE=${PED_DIR}/*_${FAMILY_ID}.ped time ${VASE} \ -i ${IN_FILE} \ -o ${OUT_FILE} \ --log_progress \ --prog_interval 100000 \ --freq 0.0001 \ --gq 30 --dp 10 \ --het_ab 0.3 \ --max_alt_alleles 1 \ --csq all \ --biotypes all \ --control_gq 15 --control_dp 5 \ --control_het_ab 0.01 \ --control_max_ref_ab 0.05 \ --de_novo \ --ped ${PED_FILE} # do some filtering on the denovo VCFs - exclude variants not on the 24 chr, as well as variants in LCR and telomere/centromere regions ### actually, ignore the filtering of variants in LCR and telomere/centromere regions --> more variants with “Unknown” status may be classified as “denovo” if enough support cd ${VASE_DIR} # index the denovo VCF time ${GATK4} IndexFeatureFile -F ${OUT_FILE} # select only variants on the 24 chromosomes time ${GATK4} SelectVariants -R ${REFERENCE_GENOME} -V ${OUT_FILE} -O ${FAMILY_ID}.strict.24chr.denovo.vcf -L /home/u035/u035/shared/resources/24_chr.list --exclude-non-variants # sort the VCF (maybe not needed?, but just in case, and it is quick) rm ${FAMILY_ID}.strict.24chr.sort.denovo.vcf grep '^#' ${FAMILY_ID}.strict.24chr.denovo.vcf > ${FAMILY_ID}.strict.24chr.sort.denovo.vcf \ && grep -v '^#' ${FAMILY_ID}.strict.24chr.denovo.vcf | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n >> ${FAMILY_ID}.strict.24chr.sort.denovo.vcf # index the sorted VCF time ${GATK4} IndexFeatureFile -F ${FAMILY_ID}.strict.24chr.sort.denovo.vcf # split multi-allelic sites [by -m -any] # left-alignment and normalization [by adding the -f] file=${FAMILY_ID}.strict.24chr.sort.denovo.vcf echo "$file" ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Ov -o ${file/.strict.24chr.sort.denovo.vcf/.ready.denovo.vcf} $file # clean intermediate denovo files rm ${FAMILY_ID}.strict.denovo.vcf* rm ${FAMILY_ID}.strict.24chr.denovo.vcf* rm ${FAMILY_ID}.strict.24chr.sort.denovo.vcf* ###rm ${FAMILY_ID}.clean.denovo.vcf* # change back to the LOG folder cd ${LOG_DIR} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "De novo analysis of FAMILY_ID = ${FAMILY_ID}: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ################################################################################################################################################# ### run coverage for each proband (DD genes) ### ### format: ${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam ### ################################################################################################################################################# ################################# ##### for each proband #### ################################# PROBAND_ID=`head -n ${PBS_ARRAY_INDEX} ${CHILD_IDS} | tail -n 1` # contains only the proband IDs (e.g. 107060) echo "Performing coverage analysis for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..." # make sure we are reading the data from the exact batch & plate ID BAM_FILE=${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam OUT_FILE=${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15 time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \ --omitDepthOutputAtEachBase \ --minBaseQuality 20 \ --minMappingQuality 20 \ -ct 20 \ -jdk_deflater \ -jdk_inflater \ --allow_potentially_misencoded_quality_scores echo "" echo "" echo "----------------------------------------------------------------------------------------------------" echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..." cat ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.sample_summary | awk '{print $7}' echo "----------------------------------------------------------------------------------------------------" # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval time ${PYTHON2} ${SCRIPTS_DIR}/generate_coverage_result_file.py ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "Coverage analysis of PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID}: done " echo " Coverage file = ${COV_DIR}/${PROBAND_ID}_${FAMILY_ID}.DD15.COV.txt" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ......................... ################################################################################### ### for each proband generate the DECIPHER file ### ### ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz - the cleaned family VCF ### ### ${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf - the VASE file ### ### ${TRANS_MAP} - the current transcript mapping file ### ################################################################################### echo "Generating the DECIPHER file for PROBAND_ID = ${PROBAND_ID}_${FAMILY_ID} ..." # first, split the family VCF to individual VCFs # -c1: minimum allele count (INFO/AC) of sites to be printed # split multi-allelic sites (by -m -any) # left-alignment and normalization (by adding the -f) file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz echo "splitting $file" for indi in `${BCFTOOLS} query -l $file`; do ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz} rm ${file/.vcf*/.$indi.rough.vcf.gz} done # VASE file - already split, left-aligned and normalized # get the ped file PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped # call the python scrpit time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_generate_DEC_IGV.py ${PED_FILE} ${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt \ ${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.denovo.vcf \ ${VCF_DIR}/${FAMILY_ID} ${DEC_DIR} ${IGV_DIR} ${IGV_DIR}/${FAMILY_ID} ${SOURCE_DIR} ${FAMILY_ID} \ ${WORK_DIR} ${TRANS_MAP} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "DECIPHER analysis of PROBAND_ID = ${PROBAND_ID}: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" rm ${VCF_DIR}/${FAMILY_ID}.clean.vcf ##################################################