Skip to content
Snippets Groups Projects
process_quad.sh 43.5 KiB
Newer Older
#!/bin/bash
#SBATCH --cpus-per-task=1
#SBATCH --mem=16GB
#SBATCH --time=24:00:00
#SBATCH --job-name=process_quad
#SBATCH --output=process_quad.%A_%a.out
#SBATCH --error=process_quad.%A_%a.err



# setup PATH
export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin
export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2


### folder structure for the downstream analysis - created by trio_setup.sh, done previously by the stanard trio-based pipeline ###
BASE=/home/u035/u035/shared/analysis/work
WORK_DIR=$BASE/${PROJECT_ID}
VCF_DIR=${WORK_DIR}/VCF
PED_DIR=${WORK_DIR}/PED
LOG_DIR=${WORK_DIR}/LOG
G2P_DIR=${WORK_DIR}/G2P
VASE_DIR=${WORK_DIR}/VASE
COV_DIR=${WORK_DIR}/COV
DEC_DIR=${WORK_DIR}/DECIPHER
IGV_DIR=${DEC_DIR}/IGV
CNV_DIR=${WORK_DIR}/CNV
BAMOUT_DIR=${WORK_DIR}/BAMOUT
SCRIPTS_DIR=/home/u035/u035/shared/scripts


# other files to be used
TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed			# OK
CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt	# OK
BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt			# OK
TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt			# OK
REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed                        # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1

SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools
BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip
TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix
VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt
VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase
GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk							# points to ../share/gatk4-4.2.1.0-0/gatk
GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
PYTHON3=/home/u035/u035/shared/software/bcbio/anaconda/bin/python3							# points to python3.6
PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep"	# points to ../share/ensembl-vep-100.4-0/vep
REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa



echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source BAM files (VCF and PED already copied)		i.e. /home/u035/u035/shared/results
echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   	e.g. 19650_Ansari_Morad
echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID                                            e.g. 19650
echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 19285
echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
echo "FAMILY_ID = ${FAMILY_ID}"		# the family ID of this family with affected probands
echo "KID_1_ID = ${KID_1_ID}"		# the ID of the first affected proband, in the format INDI_ID
echo "KID_2_ID = ${KID_2_ID}"           # the ID of the second affected proband
echo "PAR_1_ID = ${PAR_1_ID}"           # the ID of the first unaffected parent, in the format INDI_ID
echo "PAR_2_ID = ${PAR_2_ID}"           # the ID of the second unaffected parent
echo "DECIPHER_ID = ${DECIPHER_ID}"	# the DECIPHER_ID for this family




# change to the LOG folder
cd ${LOG_DIR}



#########################################################################################################
###   check the PED file to make sure exactly 2 affected probands with exactly 2 unaffected parents   ###
#########################################################################################################

echo ""
echo ""
echo "checking the ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped file..."

time ${PYTHON2} ${SCRIPTS_DIR}/check_quad_PED.py ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped

# check if the PED file checks were successful (python exit code = 0), if not exit the bash script
ret=$?
if [ $ret -ne 0 ]; then
     echo "...it appears that the PED file does not corresponds to a quad family !"
     echo "ERROR: Aborting the analysis"
     exit
fi
echo ""
echo ""




##################################################################################################
##################################################################################################
###    split the quad to two trios, one for each child and process as a standard TRIO family   ###
##################################################################################################
##################################################################################################

echo ""
echo ""
echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo "+++ Analysing QUAD family ${FAMILY_ID} as two trios +++"
echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo ""
echo ""


# Create an array of kid ids to loop
KID_IDS=()
KID_IDS+=(${KID_1_ID})
KID_IDS+=(${KID_2_ID})


for KID_ID in ${KID_IDS[@]}; do
    echo ""
    echo "++++++++++++++++++++++++++++++++++++"
    echo "processing trio for child =  $KID_ID"
    echo "++++++++++++++++++++++++++++++++++++"
    echo ""

    #############################################################
    # generate this trio's PED file                             #
    # named: ${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped #
    #############################################################
    quad_ped_file=${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
    time ${PYTHON2} ${SCRIPTS_DIR}/generate_trio_PED_from_quad.py ${PED_DIR} ${quad_ped_file} ${KID_ID} ${PAR_1_ID} ${PAR_2_ID}


    #############################################################################
    # generate this trio's VCF file                                             #
    # named: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz #
    #############################################################################
    quad_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
    trio_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz

    # tabix index the quad file just in case
    time ${TABIX} -p vcf ${quad_vcf_file}

    # extract a trio VCF for this kid
    time java -Xmx24g -jar ${GATK3} -T SelectVariants \
        -R ${REFERENCE_GENOME} \
        -V ${quad_vcf_file} \
        -sn ${KID_ID}_${FAMILY_ID} \
        -sn ${PAR_1_ID}_${FAMILY_ID} \
        -sn ${PAR_2_ID}_${FAMILY_ID} \
        -jdk_deflater \
        -jdk_inflater \
        -o ${trio_vcf_file} \
        -env


    ##################################################################################
    ###        DNU and clean the the family VCF                                    ###
    ### format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz ###
    ##################################################################################

    echo ""
    echo ""
    echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}_${KID_ID}'s VCF file..."

    time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz
    time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz
    time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz

    # remove sites with AC=0
    time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf

    # reset GT to no-call if num_ALT < num_ALT_THERSH or VAF < VAF_THRESH and GT != 0/0
    # exlude variants from the blacklist (matching on chr,pos,ref,alt)
    time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf

    # bgzip and tabix it
    time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz
    time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz


    # delete intermediate files - keep them for now for debugging reasons
    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}-gatk-haplotype-annotated.vcf.gz
    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.decomp.vcf.gz*
    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.norm.vcf.gz*
    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.DNU.vcf.gz*
    rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.AC0.vcf


    # to avoid bgzip pipe broken annoying, but not problematic message - skip the next step, the file will be used by G2P as IN_FILE and will be deleted last
    # rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf

    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}_${KID_ID}'s VCF file: done"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""
    echo ""



    ###################################################################
    ###     run G2P for this trio VCF (DD genes)                    ###
    ###     format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf    ###
    ###################################################################

    echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}..."
    echo "Using ${TARGETS}"

    IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.clean.vcf
    G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR
    mkdir ${G2P_LOG_DIR}
    TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt
    HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.html
    VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'

    time ${VEP} \
        -i ${IN_FILE} \
        --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_inter_out.txt \
        --force_overwrite \
        --assembly GRCh38 \
        --fasta ${REFERENCE_GENOME} \
        --offline \
        --merged \
        --use_given_ref \
        --cache --cache_version 100 \
        --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
        --individual all \
        --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
        --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
        --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}

    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}: done"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""



    ###################################################################
    ###     run VASE for this trio VCF (de novo)                    ###
    ###   format: ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz   ###
    ###################################################################

    echo "Performing de novo analysis with VASE for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID} ..."

    IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz
    OUT_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.denovo.vcf
    PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped

    time ${VASE} \
        -i ${IN_FILE} \
        -o ${OUT_FILE} \
        --log_progress \
        --prog_interval 100000 \
        --freq 0.0001 \
        --gq 30 --dp 10 \
        --het_ab 0.3 \
        --max_alt_alleles 1 \
        --csq all \
        --biotypes all \
        --control_gq 15 --control_dp 5 \
        --control_het_ab 0.01 \
        --control_max_ref_ab 0.05 \
        --de_novo \
        --ped ${PED_FILE}



    # do some filtering on the denovo VCFs - exclude variants not on the 24 chr, as well as variants in LCR and telomere/centromere regions
    ### actually, ignore the filtering of variants in LCR and telomere/centromere regions --> more variants with  “Unknown” status may be classified as “denovo” if enough support

    cd ${VASE_DIR}

    # index the denovo VCF
    time ${GATK4} IndexFeatureFile -I ${OUT_FILE}

    # select only variants on the 24 chromosomes
    time ${GATK4} SelectVariants -R ${REFERENCE_GENOME} -V ${OUT_FILE} -O ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf -L /home/u035/u035/shared/resources/24_chr.list --exclude-non-variants

    # sort the VCF (maybe not needed?, but just in case, and it is quick)
    rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf
    grep '^#' ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf > ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf \
    && grep -v '^#' ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n >> ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf

    # index the sorted VCF
    time ${GATK4} IndexFeatureFile -I ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf

    # split multi-allelic sites [by -m -any]
    # left-alignment and normalization [by adding the -f]
    file=${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf
    echo "$file"
    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Ov -o ${file/.strict.24chr.sort.denovo.vcf/.ready.denovo.vcf} $file

    # clean intermediate denovo files
    rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.denovo.vcf*
    rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.denovo.vcf*
    rm ${PLATE_ID}_${FAMILY_ID}_${KID_ID}.strict.24chr.sort.denovo.vcf*

    # change back to the LOG folder
    cd ${LOG_DIR}

    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "De novo analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_${KID_ID}: done"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""
    echo ""




    #################################################################################################################################################
    ###          run coverage for this proband (DD genes)                                                                                         ###
    ###   format: ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam   ###
    #################################################################################################################################################

    echo "Performing coverage analysis for PROBAND_ID = ${KID_ID} ...."

    # make sure we are reading the data from the exact batch & plate ID & version N
    BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}-ready.bam
    OUT_FILE=${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15

    time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \
        --omitDepthOutputAtEachBase \
        --minBaseQuality 20 \
        --minMappingQuality 20 \
        -ct 20 \
        -jdk_deflater \
        -jdk_inflater \
        --allow_potentially_misencoded_quality_scores

    echo ""
    echo ""
    echo "----------------------------------------------------------------------------------------------------"
    echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${KID_ID} ..."
    cat ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.sample_summary | awk '{print $7}'
    echo "----------------------------------------------------------------------------------------------------"

    # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval
    time ${PYTHON2} ${SCRIPTS_DIR}/get_cov_output.py ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.COV.txt

    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "Coverage analysis of PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done    "
    echo "    Coverage file = ${COV_DIR}/${KID_ID}_${FAMILY_ID}.DD15.COV.txt"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""
    echo ""



    ################################################################################################
    # check the coverage per each of the reccurent de novo SNPs (padded with 15bp both directions) #
    ################################################################################################
    echo "Performing recurrent coverage analysis for PROBAND_ID = ${KID_ID}_${FAMILY_ID} ..."

    # we have identified the name of the proband's BAM file above (BAM_FILE), reuse it
    # set the name of the file containing info about the coverage of the recurrent SNPs
    REC_OUT_FILE=${COV_DIR}/${KID_ID}_${FAMILY_ID}.REC_SNP_COV.txt

    while IFS=$'\t' read -ra var; do
      gene="${var[0]}"
      chr="${var[1]}"
      pos="${var[2]}"
      lo=$(expr $pos - 15)
      hi=$(expr $pos + 15)
      reg="$lo-$hi"
      echo "============================================="
      echo "$gene : recurrent variant at $chr:$pos"
      echo "exploring coverage at $chr:$reg"

      echo "---------------------------------------------"
      echo "precisely at the position"
      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos"

      echo "---------------------------------------------"
      echo "average in the +/- 15bp region"
      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}'

      echo "---------------------------------------------"
      echo "detailed in the +/- 15bp region"
      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE}
    done < ${REC_SNP} > ${REC_OUT_FILE}


    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done    "
    echo "    Coverage file = ${REC_OUT_FILE}"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""
    echo ""





    #############################################################################################
    ###      generate the DECIPHER file for this proband                                      ###
    ###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz - the cleaned family VCF  ###
    ###  ${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf - the VASE file      ###
    ###  ${TRANS_MAP} - the current transcript mapping file                                   ###
    #############################################################################################

    echo "Generating the DECIPHER file for PROBAND_ID = ${KID_ID}_${FAMILY_ID} ..."

    # first, split the family VCF to individual VCFs
    # -c1:  minimum allele count (INFO/AC) of sites to be printed
    # split multi-allelic sites (by -m -any)
    # left-alignment and normalization (by adding the -f)

    file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.vcf.gz
    echo "splitting $file"
    for indi in `${BCFTOOLS} query -l $file`; do
        ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
        ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
        rm ${file/.vcf*/.$indi.rough.vcf.gz}
    done

    # VASE file - already split, left-aligned and normalized

    # create the names of the needed files
    PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ped
    IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.report.txt
    IN_VASE_FILE=${VASE_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}.ready.denovo.vcf
    FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_${KID_ID}
    FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}



    ## call the python scrpit
    time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_trio_scripts_from_quad.py \
    ${DECIPHER_ID} \
    ${TRANS_MAP} \
    ${PED_FILE} \
    ${IN_G2P_FILE} \
    ${IN_VASE_FILE} \
    ${FAM_IGV_DIR} \
    ${VCF_DIR} \
    ${PLATE_ID} \
    ${FAMILY_ID} \
    ${DEC_DIR} \
    ${FAM_BAM_DIR} \
    ${KID_ID}

    ## using the DECIPHER bulk upload file v9 --> generate the DECIPHER bulk upload file v10
    echo "...Generating v10 Decipher bulk upload file for proband = ${KID_ID}, family_id = ${FAMILY_ID} ..."
    time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${KID_ID}_${FAMILY_ID}

    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "DECIPHER analysis of PROBAND_ID = ${KID_ID}_${FAMILY_ID}: done"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""
    echo ""

    #rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf





    ##############################################################################################
    ###  for each variant in the DECIPHER upload file                                          ###
    ###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
    ###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
    ###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
    ##############################################################################################

    # we have so far: FAMILY_ID and KID_ID
    echo "...Generating BAMOUT files for the ${FAMILY_ID} family, proband = ${KID_ID} ..."
    echo "...KID_ID = ${KID_ID}, PAR_1_ID = ${PAR_1_ID}, PAR_2_ID = ${PAR_2_ID} "

    # gather the trio BAM files
    kid_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}/${KID_ID}_${FAMILY_ID}-ready.bam
    par_1_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PAR_1_ID}_${FAMILY_ID}/${PAR_1_ID}_${FAMILY_ID}-ready.bam
    par_2_bam=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PAR_2_ID}_${FAMILY_ID}/${PAR_2_ID}_${FAMILY_ID}-ready.bam

    echo "...kid_bam = ${kid_bam}..."
    echo "...par_1_bam = ${par_1_bam}..."
    echo "...par_2_bam = ${par_2_bam}..."

    # gather the variants in the DECIPHER file for which to generate bamouts
    # chr is the second column - need to add the 'chr' prefix
    # pos is the third column
    # the first line is a header line, starting with 'Internal reference number or ID'
    # file called: ${DEC_DIR}/<proband_id>_<fam_id>_DEC_FLT.csv
    # and for each run GATK to generate the bamout files
    # to be stored in ${BAMOUT_DIR}/${FAMILY_ID}

    mkdir ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}

    var_file=${DEC_DIR}/${KID_ID}_${FAMILY_ID}_DEC_FLT.csv
    echo "... reading ${var_file} to generate the bamouts..."

    grep -v '^Internal' ${var_file} |
    while IFS= read -r line
    do
      echo "$line"
      IFS=, read -ra ary <<<"$line"
      chr=${ary[1]}
      pos=${ary[2]}
      ref=${ary[4]}
      alt=${ary[5]}
      echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"

      # generate the bamout file
      echo "...doing the bamout"
      echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500"
      echo "   --active-probability-threshold 0.000 -ploidy 2"
      echo "   --output ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"

      time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} --input ${kid_bam} --input ${par_1_bam} --input ${par_2_bam} -L chr${chr}:${pos} --interval-padding 500 \
      --active-probability-threshold 0.000 -ploidy 2 \
      --output ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam
    done


    #################################################################
    # write the IGV batch file for this family based on the bamouts #
    # to be stored as /scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}/DECIPHER/IGV/bamout_${KID_ID}_${FAMILY_ID}.snapshot.txt #
    #################################################################

    snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${KID_ID}_${FAMILY_ID}.snapshot.txt

    # check if previous version exist, if so - delete it
    if [ -f "${snap_file}" ]; then
        echo "previous version of ${snap_file} exist --> deleted"
        rm ${snap_file}
    fi

    # write the header for the IGV batch file
    echo "new" >> ${snap_file}
    echo "genome hg38" >> ${snap_file}
    echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_${KID_ID}\"" >> ${snap_file}
    echo "" >> ${snap_file}

    # now, go again over the variants in the DECIPHER file and generate one snapshot file for all the variants
    var_file=${DEC_DIR}/${KID_ID}_${FAMILY_ID}_DEC_FLT.csv
    echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."

    grep -v '^Internal' ${var_file} |
    while IFS= read -r line
    do
      IFS=, read -ra ary <<<"$line"
      chr=${ary[1]}
      pos=${ary[2]}
      ref=${ary[4]}
      alt=${ary[5]}
      left=$((${pos}-25))
      right=$((${pos}+25))

      echo "new" >> ${snap_file}
      echo "load ${BAMOUT_DIR}/${FAMILY_ID}_${KID_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
      echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}

      echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
      echo "group SAMPLE" >> ${snap_file}
      echo "sort base" >> ${snap_file}
      echo "squish" >> ${snap_file}
      echo "snapshot bamout_${KID_ID}_${FAMILY_ID}_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
      echo "" >> ${snap_file}
      echo "" >> ${snap_file}

    done

    echo "Generating of the IGV batch files based on bamouts - done!"
    echo "snap_file = ${snap_file}"

    echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    echo "+++   Variant prioritization of trio ${FAMILY_ID} with ${KID_ID} completed   +++"
    echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"


done

echo ""
echo ""
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo "+++ Analysing QUAD family ${FAMILY_ID} as two trios: DONE! +++"
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo ""
echo ""
















#######################################################################################
#######################################################################################
###    analyze only the two affected siblings and identify shared variants in them  ###
#######################################################################################
#######################################################################################


echo ""
echo ""
echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo "+++ Analysing QUAD family ${FAMILY_ID} for shared variants in the two affected siblings +++"
echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo ""
echo ""


##########################################################
# generate the affacted siblings PED file                #
# named: ${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped #
##########################################################
quad_ped_file=${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
time ${PYTHON2} ${SCRIPTS_DIR}/generate_aff_sib_PED_from_quad.py ${PED_DIR} ${quad_ped_file} ${KID_1_ID} ${KID_2_ID}




##########################################################################
# generate the affected siblings VCF file                                #
# named: ${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz #
##########################################################################
quad_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
shared_vcf_file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz

# tabix index the quad file just in case
time ${TABIX} -p vcf ${quad_vcf_file}

# extract a VCF for these two kids
time java -Xmx24g -jar ${GATK3} -T SelectVariants \
    -R ${REFERENCE_GENOME} \
    -V ${quad_vcf_file} \
    -sn ${KID_1_ID}_${FAMILY_ID} \
    -sn ${KID_2_ID}_${FAMILY_ID} \
    -jdk_deflater \
    -jdk_inflater \
    -o ${shared_vcf_file} \
    -env





###############################################################################
###        DNU and clean the the siblings VCF                               ###
### format: ${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz ###
###############################################################################

echo ""
echo ""
echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}_shared's VCF file..."


time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz
time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz
time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz


# remove sites with AC=0
time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf

# reset GT to no-call if num_ALT < num_ALT_THERSH or VAF < VAF_THRESH and GT != 0/0
# exlude variants from the blacklist (matching on chr,pos,ref,alt)
time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf

# bgzip and tabix it
time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz

# delete intermediate files
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared-gatk-haplotype-annotated.vcf.gz
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.decomp.vcf.gz*
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.norm.vcf.gz*
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.DNU.vcf.gz*
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.AC0.vcf


# to avoid bgzip pipe broken annoying, but not problematic message - skip the next step, the file will be used by G2P as IN_FILE and will be deleted last
# rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf


echo ""
echo ""
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}_shared's VCF file: done"
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo ""
echo ""


################################################################
###     run G2P for the two affected siblings (DD genes)     ###
###     format: ${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf    ###
################################################################
echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_shared..."
echo "Using ${TARGETS}"

IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf
G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR
mkdir ${G2P_LOG_DIR}
TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.report.txt
HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.report.html
VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'

time ${VEP} \
    -i ${IN_FILE} \
    --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_inter_out.txt \
    --force_overwrite \
    --assembly GRCh38 \
    --fasta ${REFERENCE_GENOME} \
    --offline \
    --merged \
    --use_given_ref \
    --cache --cache_version 100 \
    --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
    --individual all \
    --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
    --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
    --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}


echo ""
echo ""
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}_shared: done"
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo ""




########################################################
###    run coverage for each proband (DD genes)      ###
###    run reccurent SNP coverage for each proband   ###
###    already did it as part of the trio analysis   ###
########################################################







###################################################################################
###      for each proband generate the DECIPHER file                            ###
###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz - the sibling VCF  ###
###  ${TRANS_MAP} - the current transcript mapping file                         ###
###################################################################################


echo "Generating the DECIPHER file for all probands in ${FAMILY_ID} ..."


# first, split the family VCF to individual VCFs
# -c1:  minimum allele count (INFO/AC) of sites to be printed
# split multi-allelic sites (by -m -any)
# left-alignment and normalization (by adding the -f)

file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
echo "splitting $file"
for indi in `${BCFTOOLS} query -l $file`; do
    ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
    rm ${file/.vcf*/.$indi.rough.vcf.gz}
done




# create the names of the needed files
PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}_shared.ped
IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_shared_LOG_DIR/${PLATE_ID}_${FAMILY_ID}_shared.report.txt
FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}_shared
FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}



## call the python scrpit
time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_aff_sib_scripts_from_quad.py \
${DECIPHER_ID} \
${TRANS_MAP} \
${PED_FILE} \
${IN_G2P_FILE} \
${FAM_IGV_DIR} \
${VCF_DIR} \
${PLATE_ID} \
${FAMILY_ID} \
${DEC_DIR} \
${FAM_BAM_DIR}





#############################################################################################################################
## using the DECIPHER bulk upload file v9 for each proband --> generate the DECIPHER bulk upload file v10 for each proband ##
#############################################################################################################################
# from the VCF, get all IDs (in the format probad_family_id), they are all affected probands (already checked at the start)
# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}

file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
for INDI_ID in `${BCFTOOLS} query -l $file`; do
    #################################
    #####    for each proband    ####
    #################################
    echo "...Generating v10 Decipher bulk upload file for proband = ${INDI_ID} ...."
    time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${INDI_ID}_shared
done

echo ""
echo ""
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo "DECIPHER analysis of all probands in ${FAMILY_ID}: done"
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo ""
echo ""


#rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.clean.vcf





##############################################################################################
###  for each of the affected probands                                                     ###
###  for each variant in the DECIPHER upload file (only the shared variants)               ###
###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
##############################################################################################

echo ""
echo ""
echo "Generating the BAMOUT files for the ${FAMILY_ID} family ...."


# get the ids of the affected probands to generate the part of the command line which refers to the BAM files to by analysed
file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}_shared.ready.vcf.gz
# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
aff_pro_arr=()
for INDI_ID in `${BCFTOOLS} query -l $file`; do
    aff_pro_arr+=(${INDI_ID})
done
echo "  Found ${#aff_pro_arr[@]} affected probands in ${FAMILY_ID} for which BAMOUTs to be generated"

for key in "${!aff_pro_arr[@]}"; do
  echo "    ${aff_pro_arr[$key]}";
#~#  INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam"
  INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam"

done


# now go over the shared G2P variants in the for which to generate bamouts
# the variants should be identical in the DECIPHER files for all affected individuals, pick the first individual
# chr is the second column - need to add the 'chr' prefix
# pos is the third column
# the first line is a header line, starting with 'Internal reference number or ID'
# file called: ${DEC_DIR}/<proband_id>_<fam_id>_shared_DEC_FLT.csv
# and for each run GATK to generate the bamout files
# to be stored in ${BAMOUT_DIR}/${FAMILY_ID}_shared


mkdir ${BAMOUT_DIR}/${FAMILY_ID}_shared


var_file=${DEC_DIR}/${aff_pro_arr[0]}_shared_DEC_FLT.csv
echo "... reading the shared variants in ${var_file} to generate the bamouts for each ..."


grep -v '^Internal' ${var_file} |
while IFS= read -r line
do
  echo ""
  echo ""
  echo ""
  echo "$line"
  IFS=, read -ra ary <<<"$line"
  chr=${ary[1]}
  pos=${ary[2]}
  ref=${ary[4]}
  alt=${ary[5]}
  echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"

  # generate the bamout file
  echo "...doing the bamout"
  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE}"
  echo "   -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2"
  echo "   --output ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"

  time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE} -L chr${chr}:${pos} --interval-padding 500 \
  --active-probability-threshold 0.000 -ploidy 2 \
  --output ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam

done



##############################################################################################
## write the IGV batch file for each affected individual in this family based on the bamouts #
## to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.shared.snapshot.txt #
## ${PROBAND_ID}_${FAMILY_ID} == ${aff_pro_arr[$key] #
##################################################################

for key in "${!aff_pro_arr[@]}"; do
  echo ""
  echo ""
  echo "Generating the IGV batch file for ${aff_pro_arr[$key]}";

#~#  snap_file=/scratch/u035/u035/shared/trio_whole_exome/analysis/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.shared.snapshot.txt
  snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.shared.snapshot.txt


  # check if previous version exist, if so - delete it
  if [ -f "${snap_file}" ]; then
    echo "previous version of ${snap_file} exist --> deleted"
    rm ${snap_file}
  fi


  # write the header for the IGV batch file
  echo "new" >> ${snap_file}
  echo "genome hg38" >> ${snap_file}
  echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}_shared\"" >> ${snap_file}
  echo "" >> ${snap_file}


  # now, go again over the variants in this individual's DECIPHER file and generate one snapshot file for all the variants
  var_file=${DEC_DIR}/${aff_pro_arr[$key]}_shared_DEC_FLT.csv

  echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."



  grep -v '^Internal' ${var_file} |
  while IFS= read -r line
  do
    IFS=, read -ra ary <<<"$line"
    chr=${ary[1]}
    pos=${ary[2]}
    ref=${ary[4]}
    alt=${ary[5]}
    left=$((${pos}-25))
    right=$((${pos}+25))

    echo "new" >> ${snap_file}
    echo "load ${BAMOUT_DIR}/${FAMILY_ID}_shared/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
    echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}

    echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
    echo "group SAMPLE" >> ${snap_file}
    echo "sort base" >> ${snap_file}
    echo "squish" >> ${snap_file}
    echo "snapshot bamout_${aff_pro_arr[$key]}_shared_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
    echo "" >> ${snap_file}
    echo "" >> ${snap_file}

  done

  echo "Generating of the IGV batch file based on bamouts for ${aff_pro_arr[$key]}- done!"
  echo "snap_file = ${snap_file}"

done

echo ""
echo ""
echo ""
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo "+++   Variant prioritization of family ${FAMILY_ID} as affected sib-pair completed   +++"
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"








echo ""
echo ""
echo "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"