#!/bin/bash #SBATCH --cpus-per-task=1 #SBATCH --mem=16GB #SBATCH --time=24:00:00 #SBATCH --job-name=process_quad #SBATCH --output=process_quad.%A_%a.out #SBATCH --error=process_quad.%A_%a.err # setup PATH export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2 ### folder structure for the downstream analysis - created by trio_setup.sh, done previously by the stanard trio-based pipeline ### BASE=/home/u035/u035/shared/analysis/work WORK_DIR=$BASE/${PROJECT_ID} VCF_DIR=${WORK_DIR}/VCF PED_DIR=${WORK_DIR}/PED LOG_DIR=${WORK_DIR}/LOG G2P_DIR=${WORK_DIR}/G2P VASE_DIR=${WORK_DIR}/VASE COV_DIR=${WORK_DIR}/COV DEC_DIR=${WORK_DIR}/DECIPHER IGV_DIR=${DEC_DIR}/IGV CNV_DIR=${WORK_DIR}/CNV BAMOUT_DIR=${WORK_DIR}/BAMOUT SCRIPTS_DIR=/home/u035/u035/shared/scripts # other files to be used TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed # OK CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt # OK BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt # OK TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt # OK REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1 ### TOOLS ### SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk # points to ../share/gatk4-4.2.1.0-0/gatk GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar PYTHON3=/home/u035/u035/shared/software/bcbio/anaconda/bin/python3 # points to python3.6 PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7 VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep" # points to ../share/ensembl-vep-100.4-0/vep REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source BAM files (VCF and PED already copied) i.e. /home/u035/u035/shared/results echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 19650_Ansari_Morad echo "BATCH_NUM = ${BATCH_NUM}" # the numerical part of the BATCH_ID e.g. 19650 echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 19285 echo "PROJECT_ID = ${PROJECT_ID}" # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done echo "VERSION_N = ${VERSION_N}" # the version of the alignment and genotyping analysis echo "FAMILY_ID = ${FAMILY_ID}" # the family ID of this family with affected probands echo "KID_1_ID = ${KID_1_ID}" # the ID of the first affected proband, in the format INDI_ID echo "KID_2_ID = ${KID_2_ID}" # the ID of the second affected proband echo "PAR_1_ID = ${PAR_1_ID}" # the ID of the first unaffected parent, in the format INDI_ID echo "PAR_2_ID = ${PAR_2_ID}" # the ID of the second unaffected parent echo "DECIPHER_ID = ${DECIPHER_ID}" # the DECIPHER_ID for this family # change to the LOG folder cd ${LOG_DIR} ######################################################################################################### ### check the PED file to make sure exactly 2 affected probands with exactly 2 unaffected parents ### ######################################################################################################### echo "" echo "" echo "checking the ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped file..." time ${PYTHON2} ${SCRIPTS_DIR}/check_quad_PED.py ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped # check if the PED file checks were successful (python exit code = 0), if not exit the bash script ret=$? if [ $ret -ne 0 ]; then echo "...it appears that the PED file does not corresponds to a quad family !" echo "ERROR: Aborting the analysis" exit fi echo "" echo "" ################################################################################################## ################################################################################################## ### split the quad to two trios, one for each child and process as a standard TRIO family ### ################################################################################################## ################################################################################################## echo "" echo "" echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "+++ Analysing QUAD family ${FAMILY_ID} as two trios +++" echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "" echo "" # Create an array of kid ids to loop KID_IDS=() KID_IDS+=(${KID_1_ID}) KID_IDS+=(${KID_2_ID}) for KID_ID in ${KID_IDS[@]}; do echo "" echo "++++++++++++++++++++++++++++++++++++" echo "processing trio for child = $KID_ID" echo "++++++++++++++++++++++++++++++++++++" echo "" ############################################################# # generate this trio's PED file # # named: ${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped # ############################################################# quad_ped_file=${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped time ${PYTHON2} ${SCRIPTS_DIR}/generate_trio_PED_from_quad.py ${PED_DIR} ${quad_ped_file} ${KID_ID} ${PAR_1_ID} ${PAR_2_ID} ############################################################################# # generate this trio's VCF file # # named: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz # ############################################################################# quad_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz trio_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz # tabix index the quad file just in case time ${TABIX} -p vcf ${quad_vcf_file} # extract a trio VCF for this kid time java -Xmx24g -jar ${GATK3} -T SelectVariants \ -R ${REFERENCE_GENOME} \ -V ${quad_vcf_file} \ -sn ${KID_ID}_${FAMILY_ID} \ -sn ${PAR_1_ID}_${FAMILY_ID} \ -sn ${PAR_2_ID}_${FAMILY_ID} \ -jdk_deflater \ -jdk_inflater \ -o ${trio_vcf_file} \ -env ################################################################################## ### DNU and clean the the family VCF ### ### format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz ### ################################################################################## echo "" echo "" echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}_${KID_ID}'s VCF file..." time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz # remove sites with AC=0 time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf # reset GT to no-call if num_ALT < num_ALT_THERSH or VAF < VAF_THRESH and GT != 0/0 # exlude variants from the blacklist (matching on chr,pos,ref,alt) time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf # bgzip and tabix it time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz # delete intermediate files - keep them for now for debugging reasons rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf # to avoid bgzip pipe broken annoying, but not problematic message - skip the next step, the file will be used by G2P as IN_FILE and will be deleted last # rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}_${KID_ID}'s VCF file: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ################################################################### ### run G2P for this trio VCF (DD genes) ### ### format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf ### ################################################################### echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}..." echo "Using ${TARGETS}" IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR mkdir ${G2P_LOG_DIR} TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.html VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38' time ${VEP} \ -i ${IN_FILE} \ --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_inter_out.txt \ --force_overwrite \ --assembly GRCh38 \ --fasta ${REFERENCE_GENOME} \ --offline \ --merged \ --use_given_ref \ --cache --cache_version 100 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" ################################################################### ### run VASE for this trio VCF (de novo) ### ### format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz ### ################################################################### echo "Performing de novo analysis with VASE for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID} ..." IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz OUT_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.denovo.vcf PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped time ${VASE} \ -i ${IN_FILE} \ -o ${OUT_FILE} \ --log_progress \ --prog_interval 100000 \ --freq 0.0001 \ --gq 30 --dp 10 \ --het_ab 0.3 \ --max_alt_alleles 1 \ --csq all \ --biotypes all \ --control_gq 15 --control_dp 5 \ --control_het_ab 0.01 \ --control_max_ref_ab 0.05 \ --de_novo \ --ped ${PED_FILE} # do some filtering on the denovo VCFs - exclude variants not on the 24 chr, as well as variants in LCR and telomere/centromere regions ### actually, ignore the filtering of variants in LCR and telomere/centromere regions --> more variants with “Unknown” status may be classified as “denovo” if enough support cd ${VASE_DIR} # index the denovo VCF time ${GATK4} IndexFeatureFile -I ${OUT_FILE} # select only variants on the 24 chromosomes time ${GATK4} SelectVariants -R ${REFERENCE_GENOME} -V ${OUT_FILE} -O ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf -L /home/u035/u035/shared/resources/24_chr.list --exclude-non-variants # sort the VCF (maybe not needed?, but just in case, and it is quick) rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf grep '^#' ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf > ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf \ && grep -v '^#' ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n >> ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf # index the sorted VCF time ${GATK4} IndexFeatureFile -I ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf # split multi-allelic sites [by -m -any] # left-alignment and normalization [by adding the -f] file=${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf echo "$file" ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Ov -o ${file/.strict.24chr.sort.denovo.vcf/.ready.denovo.vcf} $file # clean intermediate denovo files rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.denovo.vcf* rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf* rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf* # change back to the LOG folder cd ${LOG_DIR} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "De novo analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ################################################################################################################################################# ### run coverage for this proband (DD genes) ### ### format: ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam ### ################################################################################################################################################# echo "Performing coverage analysis for PROBAND_ID = ${KID_ID} ...." # make sure we are reading the data from the exact batch & plate ID & version N BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}-ready.bam OUT_FILE=${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15 time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \ --omitDepthOutputAtEachBase \ --minBaseQuality 20 \ --minMappingQuality 20 \ -ct 20 \ -jdk_deflater \ -jdk_inflater \ --allow_potentially_misencoded_quality_scores echo "" echo "" echo "----------------------------------------------------------------------------------------------------" echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${KID_ID} ..." cat ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.sample_summary | awk '{print $7}' echo "----------------------------------------------------------------------------------------------------" # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval time ${PYTHON2} ${SCRIPTS_DIR}/get_cov_output.py ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.COV.txt echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "Coverage analysis of PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done " echo " Coverage file = ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.COV.txt" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ################################################################################################ # check the coverage per each of the reccurent de novo SNPs (padded with 15bp both directions) # ################################################################################################ echo "Performing recurrent coverage analysis for PROBAND_ID = ${KID_ID}_${FAMILY_ID} ..." # we have identified the name of the proband's BAM file above (BAM_FILE), reuse it # set the name of the file containing info about the coverage of the recurrent SNPs REC_OUT_FILE=${COV_DIR}/${KID_ID}_${FAMILY_ID}.REC_SNP_COV.txt while IFS=$'\t' read -ra var; do gene="${var[0]}" chr="${var[1]}" pos="${var[2]}" lo=$(expr $pos - 15) hi=$(expr $pos + 15) reg="$lo-$hi" echo "=============================================" echo "$gene : recurrent variant at $chr:$pos" echo "exploring coverage at $chr:$reg" echo "---------------------------------------------" echo "precisely at the position" ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos" echo "---------------------------------------------" echo "average in the +/- 15bp region" ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}' echo "---------------------------------------------" echo "detailed in the +/- 15bp region" ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} done < ${REC_SNP} > ${REC_OUT_FILE} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done " echo " Coverage file = ${REC_OUT_FILE}" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ############################################################################################# ### generate the DECIPHER file for this proband ### ### ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz - the cleaned family VCF ### ### ${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf - the VASE file ### ### ${TRANS_MAP} - the current transcript mapping file ### ############################################################################################# echo "Generating the DECIPHER file for PROBAND_ID = ${KID_ID}_${FAMILY_ID} ..." # first, split the family VCF to individual VCFs # -c1: minimum allele count (INFO/AC) of sites to be printed # split multi-allelic sites (by -m -any) # left-alignment and normalization (by adding the -f) file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz echo "splitting $file" for indi in `${BCFTOOLS} query -l $file`; do ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz} rm ${file/.vcf*/.$indi.rough.vcf.gz} done # VASE file - already split, left-aligned and normalized # create the names of the needed files PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID} FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID} ## call the python scrpit time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_trio_scripts_from_quad.py \ ${DECIPHER_ID} \ ${TRANS_MAP} \ ${PED_FILE} \ ${IN_G2P_FILE} \ ${IN_VASE_FILE} \ ${FAM_IGV_DIR} \ ${VCF_DIR} \ ${PLATE_ID} \ ${FAMILY_ID} \ ${DEC_DIR} \ ${FAM_BAM_DIR} \ ${KID_ID} ## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10 echo "...Generating v10 Decipher bulk upload file for proband = ${KID_ID}, family_id = ${FAMILY_ID} ..." time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${KID_ID}_${FAMILY_ID} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "DECIPHER analysis of PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" #rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf ############################################################################################## ### for each variant in the DECIPHER upload file ### ### generate a IGV snapshot based on the realigned BAM used by GATK for calling variants ### ### first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder) ### ### then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs ### ############################################################################################## # we have so far: FAMILY_ID and KID_ID echo "...Generating BAMOUT files for the ${FAMILY_ID} family, proband = ${KID_ID} ..." echo "...KID_ID = ${KID_ID}, PAR_1_ID = ${PAR_1_ID}, PAR_2_ID = ${PAR_2_ID} " # gather the trio BAM files kid_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}-ready.bam par_1_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PAR_1_ID}_${FAMILY_ID}/${PAR_1_ID}_${FAMILY_ID}-ready.bam par_2_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PAR_2_ID}_${FAMILY_ID}/${PAR_2_ID}_${FAMILY_ID}-ready.bam echo "...kid_bam = ${kid_bam}..." echo "...par_1_bam = ${par_1_bam}..." echo "...par_2_bam = ${par_2_bam}..." # gather the variants in the DECIPHER file for which to generate bamouts # chr is the second column - need to add the 'chr' prefix # pos is the third column # the first line is a header line, starting with 'Internal reference number or ID' # file called: ${DEC_DIR}/<proband_id>_<fam_id>_DEC_FLT.csv # and for each run GATK to generate the bamout files # to be stored in ${BAMOUT_DIR}/${FAMILY_ID} mkdir ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID} var_file=${DEC_DIR}/${KID_ID}_${FAMILY_ID}_DEC_FLT.csv echo "... reading ${var_file} to generate the bamouts..." grep -v '^Internal' ${var_file} | while IFS= read -r line do echo "$line" IFS=, read -ra ary <<<"$line" chr=${ary[1]} pos=${ary[2]} ref=${ary[4]} alt=${ary[5]} echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}" # generate the bamout file echo "...doing the bamout" echo " time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500" echo " --active-probability-threshold 0.000 -ploidy 2" echo " --output ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \ --active-probability-threshold 0.000 -ploidy 2 \ --output ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam done ################################################################# # write the IGV batch file for this family based on the bamouts # # to be stored as /scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}/DECIPHER/IGV/bamout_${KID_ID}_${FAMILY_ID}.snapshot.txt # ################################################################# snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${KID_ID}_${FAMILY_ID}.snapshot.txt # check if previous version exist, if so - delete it if [ -f "${snap_file}" ]; then echo "previous version of ${snap_file} exist --> deleted" rm ${snap_file} fi # write the header for the IGV batch file echo "new" >> ${snap_file} echo "genome hg38" >> ${snap_file} echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_${KID_ID}\"" >> ${snap_file} echo "" >> ${snap_file} # now, go again over the variants in the DECIPHER file and generate one snapshot file for all the variants var_file=${DEC_DIR}/${KID_ID}_${FAMILY_ID}_DEC_FLT.csv echo "... reading ${var_file} to generate the IGV batch file using the bamouts..." grep -v '^Internal' ${var_file} | while IFS= read -r line do IFS=, read -ra ary <<<"$line" chr=${ary[1]} pos=${ary[2]} ref=${ary[4]} alt=${ary[5]} left=$((${pos}-25)) right=$((${pos}+25)) echo "new" >> ${snap_file} echo "load ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file} echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file} echo "goto chr${chr}:${left}-${right}" >> ${snap_file} echo "group SAMPLE" >> ${snap_file} echo "sort base" >> ${snap_file} echo "squish" >> ${snap_file} echo "snapshot bamout_${KID_ID}_${FAMILY_ID}_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file} echo "" >> ${snap_file} echo "" >> ${snap_file} done echo "Generating of the IGV batch files based on bamouts - done!" echo "snap_file = ${snap_file}" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "+++ Variant prioritization of trio ${FAMILY_ID} with ${KID_ID} completed +++" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" done echo "" echo "" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "+++ Analysing QUAD family ${FAMILY_ID} as two trios: DONE! +++" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "" echo "" ####################################################################################### ####################################################################################### ### analyze only the two affected siblings and identify shared variants in them ### ####################################################################################### ####################################################################################### echo "" echo "" echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "+++ Analysing QUAD family ${FAMILY_ID} for shared variants in the two affected siblings +++" echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "" echo "" ########################################################## # generate the affacted siblings PED file # # named: ${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped # ########################################################## quad_ped_file=${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped time ${PYTHON2} ${SCRIPTS_DIR}/generate_aff_sib_PED_from_quad.py ${PED_DIR} ${quad_ped_file} ${KID_1_ID} ${KID_2_ID} ########################################################################## # generate the affected siblings VCF file # # named: ${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz # ########################################################################## quad_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz shared_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz # tabix index the quad file just in case time ${TABIX} -p vcf ${quad_vcf_file} # extract a VCF for these two kids time java -Xmx24g -jar ${GATK3} -T SelectVariants \ -R ${REFERENCE_GENOME} \ -V ${quad_vcf_file} \ -sn ${KID_1_ID}_${FAMILY_ID} \ -sn ${KID_2_ID}_${FAMILY_ID} \ -jdk_deflater \ -jdk_inflater \ -o ${shared_vcf_file} \ -env ############################################################################### ### DNU and clean the the siblings VCF ### ### format: ${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz ### ############################################################################### echo "" echo "" echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}_shared's VCF file..." time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz # remove sites with AC=0 time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf # reset GT to no-call if num_ALT < num_ALT_THERSH or VAF < VAF_THRESH and GT != 0/0 # exlude variants from the blacklist (matching on chr,pos,ref,alt) time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf # bgzip and tabix it time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz # delete intermediate files rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz* rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf # to avoid bgzip pipe broken annoying, but not problematic message - skip the next step, the file will be used by G2P as IN_FILE and will be deleted last # rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}_shared's VCF file: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" ################################################################ ### run G2P for the two affected siblings (DD genes) ### ### format: ${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf ### ################################################################ echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_shared..." echo "Using ${TARGETS}" IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR mkdir ${G2P_LOG_DIR} TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.report.txt HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.report.html VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38' time ${VEP} \ -i ${IN_FILE} \ --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_inter_out.txt \ --force_overwrite \ --assembly GRCh38 \ --fasta ${REFERENCE_GENOME} \ --offline \ --merged \ --use_given_ref \ --cache --cache_version 100 \ --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \ --individual all \ --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \ --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \ --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT} echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_shared: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" ######################################################## ### run coverage for each proband (DD genes) ### ### run reccurent SNP coverage for each proband ### ### already did it as part of the trio analysis ### ######################################################## ################################################################################### ### for each proband generate the DECIPHER file ### ### ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz - the sibling VCF ### ### ${TRANS_MAP} - the current transcript mapping file ### ################################################################################### echo "Generating the DECIPHER file for all probands in ${FAMILY_ID} ..." # first, split the family VCF to individual VCFs # -c1: minimum allele count (INFO/AC) of sites to be printed # split multi-allelic sites (by -m -any) # left-alignment and normalization (by adding the -f) file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz echo "splitting $file" for indi in `${BCFTOOLS} query -l $file`; do ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz} rm ${file/.vcf*/.$indi.rough.vcf.gz} done # create the names of the needed files PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_shared.report.txt FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_shared FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID} ## call the python scrpit time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_aff_sib_scripts_from_quad.py \ ${DECIPHER_ID} \ ${TRANS_MAP} \ ${PED_FILE} \ ${IN_G2P_FILE} \ ${FAM_IGV_DIR} \ ${VCF_DIR} \ ${PLATE_ID} \ ${FAMILY_ID} \ ${DEC_DIR} \ ${FAM_BAM_DIR} ############################################################################################################################# ## using the DECIPHER bulk upload file v9 for each proband --> generate the DECIPHER bulk upload file v10 for each proband ## ############################################################################################################################# # from the VCF, get all IDs (in the format probad_family_id), they are all affected probands (already checked at the start) # INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID} file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz for INDI_ID in `${BCFTOOLS} query -l $file`; do ################################# ##### for each proband #### ################################# echo "...Generating v10 Decipher bulk upload file for proband = ${INDI_ID} ...." time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${INDI_ID}_shared done echo "" echo "" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "DECIPHER analysis of all probands in ${FAMILY_ID}: done" echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" echo "" echo "" #rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf ############################################################################################## ### for each of the affected probands ### ### for each variant in the DECIPHER upload file (only the shared variants) ### ### generate a IGV snapshot based on the realigned BAM used by GATK for calling variants ### ### first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder) ### ### then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs ### ############################################################################################## echo "" echo "" echo "Generating the BAMOUT files for the ${FAMILY_ID} family ...." # get the ids of the affected probands to generate the part of the command line which refers to the BAM files to by analysed file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz # INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID} aff_pro_arr=() for INDI_ID in `${BCFTOOLS} query -l $file`; do aff_pro_arr+=(${INDI_ID}) done echo " Found ${#aff_pro_arr[@]} affected probands in ${FAMILY_ID} for which BAMOUTs to be generated" for key in "${!aff_pro_arr[@]}"; do echo " ${aff_pro_arr[$key]}"; #~# INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam" INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam" done # now go over the shared G2P variants in the for which to generate bamouts # the variants should be identical in the DECIPHER files for all affected individuals, pick the first individual # chr is the second column - need to add the 'chr' prefix # pos is the third column # the first line is a header line, starting with 'Internal reference number or ID' # file called: ${DEC_DIR}/<proband_id>_<fam_id>_shared_DEC_FLT.csv # and for each run GATK to generate the bamout files # to be stored in ${BAMOUT_DIR}/${FAMILY_ID}_shared mkdir ${BAMOUT_DIR}/${FAMILY_ID}_shared var_file=${DEC_DIR}/${aff_pro_arr[0]}_shared_DEC_FLT.csv echo "... reading the shared variants in ${var_file} to generate the bamouts for each ..." grep -v '^Internal' ${var_file} | while IFS= read -r line do echo "" echo "" echo "" echo "$line" IFS=, read -ra ary <<<"$line" chr=${ary[1]} pos=${ary[2]} ref=${ary[4]} alt=${ary[5]} echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}" # generate the bamout file echo "...doing the bamout" echo " time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE}" echo " -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2" echo " --output ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE} -L chr${chr}:${pos} --interval-padding 500 \ --active-probability-threshold 0.000 -ploidy 2 \ --output ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam done ############################################################################################## ## write the IGV batch file for each affected individual in this family based on the bamouts # ## to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.shared.snapshot.txt # ## ${PROBAND_ID}_${FAMILY_ID} == ${aff_pro_arr[$key] # ################################################################## for key in "${!aff_pro_arr[@]}"; do echo "" echo "" echo "Generating the IGV batch file for ${aff_pro_arr[$key]}"; #~# snap_file=/scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.shared.snapshot.txt snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.shared.snapshot.txt # check if previous version exist, if so - delete it if [ -f "${snap_file}" ]; then echo "previous version of ${snap_file} exist --> deleted" rm ${snap_file} fi # write the header for the IGV batch file echo "new" >> ${snap_file} echo "genome hg38" >> ${snap_file} echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_shared\"" >> ${snap_file} echo "" >> ${snap_file} # now, go again over the variants in this individual's DECIPHER file and generate one snapshot file for all the variants var_file=${DEC_DIR}/${aff_pro_arr[$key]}_shared_DEC_FLT.csv echo "... reading ${var_file} to generate the IGV batch file using the bamouts..." grep -v '^Internal' ${var_file} | while IFS= read -r line do IFS=, read -ra ary <<<"$line" chr=${ary[1]} pos=${ary[2]} ref=${ary[4]} alt=${ary[5]} left=$((${pos}-25)) right=$((${pos}+25)) echo "new" >> ${snap_file} echo "load ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file} echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file} echo "goto chr${chr}:${left}-${right}" >> ${snap_file} echo "group SAMPLE" >> ${snap_file} echo "sort base" >> ${snap_file} echo "squish" >> ${snap_file} echo "snapshot bamout_${aff_pro_arr[$key]}_shared_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file} echo "" >> ${snap_file} echo "" >> ${snap_file} done echo "Generating of the IGV batch file based on bamouts for ${aff_pro_arr[$key]}- done!" echo "snap_file = ${snap_file}" done echo "" echo "" echo "" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "+++ Variant prioritization of family ${FAMILY_ID} as affected sib-pair completed +++" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "" echo "" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "+++ Analysing QUAD family ${FAMILY_ID} for shared variants in the two affected siblings: DONE! +++" echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++" echo "" echo ""