process_shared.sh

#!/bin/bash
#SBATCH --cpus-per-task=1
#SBATCH --mem=16GB
#SBATCH --time=24:00:00
#SBATCH --job-name=process_shared
#SBATCH --output=process_shared.%A_%a.out
#SBATCH --error=process_shared.%A_%a.err


# setup PATH
export PATH=$PATH:/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin:/home/u035/u035/shared/software/bcbio/anaconda/bin
export PERL5LIB=$PERL5LIB:/home/u035/u035/shared/software/bcbio/anaconda/lib/site_perl/5.26.2


### folder structure for the downstream analysis - created by trio_setup.sh, done previously by the stanard trio-based pipeline ###
BASE=/home/u035/u035/shared/analysis/work
WORK_DIR=$BASE/${PROJECT_ID}
VCF_DIR=${WORK_DIR}/VCF
PED_DIR=${WORK_DIR}/PED
LOG_DIR=${WORK_DIR}/LOG
G2P_DIR=${WORK_DIR}/G2P
VASE_DIR=${WORK_DIR}/VASE
COV_DIR=${WORK_DIR}/COV
DEC_DIR=${WORK_DIR}/DECIPHER
IGV_DIR=${DEC_DIR}/IGV
CNV_DIR=${WORK_DIR}/CNV
BAMOUT_DIR=${WORK_DIR}/BAMOUT
SCRIPTS_DIR=/home/u035/u035/shared/scripts


# other files to be used
TARGETS=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.plus15bp.merged.bed			# OK
CLINVAR=/home/u035/u035/shared/resources/G2P/DDG2P.20220113.clinvar.20220109.plus15bp.txt	# OK
BLACKLIST=/home/u035/u035/shared/resources/blacklist/current_blacklist.txt			# OK
TRANS_MAP=/home/u035/u035/shared/resources/trans_map/current_trans_map.txt			# OK
REC_SNP=/home/u035/u035/shared/resources/reccurent/current_reccurent.bed                        # OK, https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7116826/, Extended Data Table 1


### TOOLS ###
SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
BCFTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bcftools
BGZIP=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/bgzip
TABIX=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/tabix
VT=/home/u035/u035/shared/software/bcbio/anaconda/bin/vt
VASE=/home/u035/u035/shared/software/bcbio/anaconda/bin/vase
GATK4=/home/u035/u035/shared/software/bcbio/anaconda/bin/gatk                                                           # points to ../share/gatk4-4.2.1.0-0/gatk
GATK3=/home/u035/u035/shared/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar
PYTHON3=/home/u035/u035/shared/software/bcbio/anaconda/bin/python3                                                      # points to python3.6
PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
VEP="/home/u035/u035/shared/software/bcbio/anaconda/bin/perl /home/u035/u035/shared/software/bcbio/anaconda/bin/vep"    # points to ../share/ensembl-vep-100.4-0/vep
REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa


echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source BAM files (VCF and PED already copied)		i.e. /home/u035/u035/shared/results
echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   	e.g. 19650_Ansari_Morad
echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID						e.g. 19650
echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              	e.g. 19285
echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis
echo "FAMILY_ID = ${FAMILY_ID}"		# the family ID of this family with affected probands
echo "DECIPHER_ID = ${DECIPHER_ID}"	# the DECIPHER_ID for this family


# change to the LOG folder
cd ${LOG_DIR}


##########################################################################################
###   check the PED file to make sure only affected individuals with missing parents   ###
###   write a file with all affected individuals with no parents                       ###
###   write a file with all unaffected individuals - should be none                    ###
##########################################################################################

echo ""
echo ""
echo "checking the ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped file..."

time ${PYTHON2} ${SCRIPTS_DIR}/check_shared_PED.py ${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped

# check if the PED file checks were successful (python exit code = 0), if not exit the bash script
ret=$?
if [ $ret -ne 0 ]; then
     echo "...it appears that the PED file does not corresponds to a family consisting of affected probands only!"
     echo "ERROR: Aborting the analysis"
     exit
fi
echo ""
echo ""


########################################################################
###        DNU and clean the the family VCF   			     ###
### format: ${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz ###
########################################################################

echo ""
echo ""
echo "Performing DNU and cleaning of the ${PLATE_ID}_${FAMILY_ID}'s VCF file..."


time ${VT} decompose -s ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz
time ${VT} normalize ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz -r ${REFERENCE_GENOME} -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz
time ${VT} uniq ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz -o ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz


# remove sites with AC=0
time ${BCFTOOLS} view --min-ac=1 --no-update ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf

# reset GT to no-call if num_ALT < num_ALT_THERSH or VAF < VAF_THRESH and GT != 0/0
# exlude variants from the blacklist (matching on chr,pos,ref,alt)
time ${PYTHON2} ${SCRIPTS_DIR}/filter_LQ_GT.py ${BLACKLIST} ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf

# bgzip and tabix it
time cat ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf | ${BGZIP} > ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
time ${TABIX} -p vcf ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz

# delete intermediate files
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}-gatk-haplotype-annotated.vcf.gz
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.decomp.vcf.gz*
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.norm.vcf.gz*
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.DNU.vcf.gz*
rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.AC0.vcf


# to avoid bgzip pipe broken annoying, but not problematic message - skip the next step, the file will be used by G2P as IN_FILE and will be deleted last
# rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf


echo ""
echo ""
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo "DNU, AC=0, num_ALT & VAF & blacklist cleaning and of the ${PLATE_ID}_${FAMILY_ID}'s VCF file: done"
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo ""
echo ""


###########################################################
###     run G2P for each family VCF (DD genes)          ###
###     format: ${PLATE_ID}_${FAMILY_ID}.clean.vcf      ###
###########################################################


echo "Performing G2P analysis (DD genes)for FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}..."
echo "Using ${TARGETS}"


IN_FILE=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf
G2P_LOG_DIR=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR
mkdir ${G2P_LOG_DIR}
TXT_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.txt
HTML_OUT=${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}.report.html
VCF_KEYS='gnomADe_r2.1.1_GRCh38|gnomADg_r3.1.1_GRCh38'


time ${VEP} \
    -i ${IN_FILE} \
    --output_file ${G2P_LOG_DIR}/${PLATE_ID}_${FAMILY_ID}_inter_out.txt \
    --force_overwrite \
    --assembly GRCh38 \
    --fasta ${REFERENCE_GENOME} \
    --offline \
    --merged \
    --use_given_ref \
    --cache --cache_version 100 \
    --dir_cache /home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/vep \
    --individual all \
    --transcript_filter "gene_symbol in /home/u035/u035/shared/resources/G2P/genes_in_DDG2P.20220113.txt" \
    --dir_plugins /home/u035/u035/shared/software/bcbio/anaconda/share/ensembl-vep-100.4-0 \
    --plugin G2P,file='/home/u035/u035/shared/resources/G2P/DDG2P.20220113.csv',af_from_vcf=1,confidence_levels='definitive&strong',af_from_vcf_keys=${VCF_KEYS},log_dir=${G2P_LOG_DIR},txt_report=${TXT_OUT},html_report=${HTML_OUT}


echo ""
echo ""
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo "G2P analysis of FAMILY_ID = ${PLATE_ID}_${FAMILY_ID}: done"
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo ""
echo ""


#################################################################################################################################################
###          run coverage for each proband (DD genes)                                                                                         ###
###   format: ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}/${PROBAND_ID}_${FAMILY_ID}-ready.bam   ###
#################################################################################################################################################


# from the VCF, get all IDs (in the format probad_family_id), they are all affected probands (already checked at the start)

file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz

# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}

for INDI_ID in `${BCFTOOLS} query -l $file`; do

    #################################
    #####    for each proband    ####
    #################################

    echo "Performing coverage analysis for PROBAND_ID = ${INDI_ID} ...."

    # make sure we are reading the data from the exact batch & plate ID
    BAM_FILE=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${INDI_ID}/${INDI_ID}-ready.bam
    OUT_FILE=${COV_DIR}/${INDI_ID}.DD15

    time java -Xmx8g -jar ${GATK3} -T DepthOfCoverage -R ${REFERENCE_GENOME} -o ${OUT_FILE} -I ${BAM_FILE} -L ${TARGETS} \
        --omitDepthOutputAtEachBase \
        --minBaseQuality 20 \
        --minMappingQuality 20 \
        -ct 20 \
        -jdk_deflater \
        -jdk_inflater \
        --allow_potentially_misencoded_quality_scores

    echo ""
    echo ""
    echo "----------------------------------------------------------------------------------------------------"
    echo "percentage of DD exons (+/-15bp) covered at least 20x in PROBAND_ID = ${INDI_ID} ..."
    cat ${COV_DIR}/${INDI_ID}.DD15.sample_summary | awk '{print $7}'
    echo "----------------------------------------------------------------------------------------------------"

    # now compute the coverage per DD exon (+/-15bp) interval, adding the number of P/LP ClinVar variants (assertion criteria provided) in each interval
    time ${PYTHON2} ${SCRIPTS_DIR}/get_cov_output.py ${COV_DIR}/${INDI_ID}.DD15.sample_interval_summary ${CLINVAR} ${COV_DIR}/${INDI_ID}.DD15.COV.txt

    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "Coverage analysis of PROBAND_ID = ${INDI_ID}: done    "
    echo "    Coverage file = ${COV_DIR}/${INDI_ID}.DD15.COV.txt"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""
    echo ""


    # check the coverage per each of the reccurent de novo SNPs (padded with 15bp both directions) #
    echo "Performing recurrent coverage analysis for PROBAND_ID = ${INDI_ID} ..."

    # we have identified the name of the proband's BAM file above (BAM_FILE), reuse it
    # set the name of the file containing info about the coverage of the recurrent SNPs
    REC_OUT_FILE=${COV_DIR}/${INDI_ID}.REC_SNP_COV.txt

    while IFS=$'\t' read -ra var; do
      gene="${var[0]}"
      chr="${var[1]}"
      pos="${var[2]}"
      lo=$(expr $pos - 15)
      hi=$(expr $pos + 15)
      reg="$lo-$hi"
      echo "============================================="
      echo "$gene : recurrent variant at $chr:$pos"
      echo "exploring coverage at $chr:$reg"

      echo "---------------------------------------------"
      echo "precisely at the position"
      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | grep "$pos"

      echo "---------------------------------------------"
      echo "average in the +/- 15bp region"
      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE} | awk '{sum+=$3} END { print "Average = ",sum/NR}'

      echo "---------------------------------------------"
      echo "detailed in the +/- 15bp region"
      ${SAMTOOLS} depth -aa -Q 20 -r $chr:$reg ${BAM_FILE}
    done < ${REC_SNP} > ${REC_OUT_FILE}

    echo ""
    echo ""
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo "Coverage analysis of recurring SNPs for PROBAND_ID = ${INDI_ID}: done    "
    echo "    Coverage file = ${REC_OUT_FILE}"
    echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
    echo ""
    echo ""

done


###################################################################################
###      for each proband generate the DECIPHER file                            ###
###  ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz - the cleaned family VCF  ###
###  ${TRANS_MAP} - the current transcript mapping file                         ###
###################################################################################


echo "Generating the DECIPHER file for all probands in ${FAMILY_ID} ..."


# first, split the family VCF to individual VCFs
# -c1:  minimum allele count (INFO/AC) of sites to be printed
# split multi-allelic sites (by -m -any)
# left-alignment and normalization (by adding the -f)

file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
echo "splitting $file"
for indi in `${BCFTOOLS} query -l $file`; do
    ${BCFTOOLS} view -c1 -Oz -s $indi -o ${file/.vcf*/.$indi.rough.vcf.gz} $file
    ${BCFTOOLS} norm -f ${REFERENCE_GENOME} -m -any -Oz -o ${file/.vcf*/.$indi.vcf.gz} ${file/.vcf*/.$indi.rough.vcf.gz}
    rm ${file/.vcf*/.$indi.rough.vcf.gz}
done


# create the names of the needed files
PED_FILE=${PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAMILY_ID}.ped
IN_G2P_FILE=${G2P_DIR}/${PLATE_ID}_${FAMILY_ID}_LOG_DIR/${PLATE_ID}_${FAMILY_ID}.report.txt
FAM_IGV_DIR=${IGV_DIR}/${PLATE_ID}_${FAMILY_ID}
FAM_BAM_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}


## call the python scrpit
time ${PYTHON2} ${SCRIPTS_DIR}/generate_DEC_IGV_shared_scripts.py \
${DECIPHER_ID} \
${TRANS_MAP} \
${PED_FILE} \
${IN_G2P_FILE} \
${FAM_IGV_DIR} \
${VCF_DIR} \
${PLATE_ID} \
${FAMILY_ID} \
${DEC_DIR} \
${FAM_BAM_DIR}


#############################################################################################################################
## using the DECIPHER bulk upload file v9 for each proband --> generate the DECIPHER bulk upload file v10 for each proband ##
#############################################################################################################################
# from the VCF, get all IDs (in the format probad_family_id), they are all affected probands (already checked at the start)
# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}

file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
for INDI_ID in `${BCFTOOLS} query -l $file`; do
    #################################
    #####    for each proband    ####
    #################################
    echo "...Generating v10 Decipher bulk upload file for proband = ${INDI_ID} ...."
    time ${PYTHON3} ${SCRIPTS_DIR}/convert_DEC_to_v10.py ${DEC_DIR} ${INDI_ID}
done


echo ""
echo ""
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo "DECIPHER analysis of all probands in ${FAMILY_ID}: done"
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo ""
echo ""


rm ${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.clean.vcf


##############################################################################################
###  for each of the affected probands                                                     ###
###  for each variant in the DECIPHER upload file (only the shared variants)               ###
###  generate a IGV snapshot based on the realigned BAM used by GATK for calling variants  ###
###  first, generate BAMOUTs for each variant (to be stored in the BAMOUT folder)          ###
###  then, generate a batch script for IGV to produce the snapshots based on the BAMOUTs   ###
##############################################################################################

echo ""
echo ""
echo "Generating the BAMOUT files for the ${FAMILY_ID} family ...."


# get the ids of the affected probands to generate the part of the command line which refers to the BAM files to by analysed
file=${VCF_DIR}/${PLATE_ID}_${FAMILY_ID}.ready.vcf.gz
# INDI_ID is in format ${PROBAND_ID}_${FAMILY_ID}
aff_pro_arr=()
for INDI_ID in `${BCFTOOLS} query -l $file`; do
    aff_pro_arr+=(${INDI_ID})
done
echo "  Found ${#aff_pro_arr[@]} affected probands in ${FAMILY_ID} for which BAMOUTs to be generated"

for key in "${!aff_pro_arr[@]}"; do
  echo "    ${aff_pro_arr[$key]}";
  INPUT_BAM_LINE=${INPUT_BAM_LINE}" --input ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAMILY_ID}/${aff_pro_arr[$key]}/${aff_pro_arr[$key]}-ready.bam"

done


# now go over the shared G2P variants in the for which to generate bamouts
# the variants should be identical in the DECIPHER files for all affected individuals, pick the first individual
# chr is the second column - need to add the 'chr' prefix
# pos is the third column
# the first line is a header line, starting with 'Internal reference number or ID'
# file called: ${DEC_DIR}/<proband_id>_<fam_id>_DEC_FLT.csv
# and for each run GATK to generate the bamout files
# to be stored in ${BAMOUT_DIR}/${FAMILY_ID}

mkdir ${BAMOUT_DIR}/${FAMILY_ID}

var_file=${DEC_DIR}/${aff_pro_arr[0]}_DEC_FLT.csv
echo "... reading the shared variants in ${var_file} to generate the bamouts for each ..."


grep -v '^Internal' ${var_file} |
while IFS= read -r line
do
  echo ""
  echo ""
  echo ""
  echo "$line"
  IFS=, read -ra ary <<<"$line"
#  for key in "${!ary[@]}"; do echo "$key ${ary[$key]}"; done
  chr=${ary[1]}
  pos=${ary[2]}
  ref=${ary[4]}
  alt=${ary[5]}
  echo " --> chr = $chr, pos = $pos, ref = ${ref}, alt = ${alt}"

  # generate the bamout file
  echo "...doing the bamout"
  echo "   time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME}" ${INPUT_BAM_LINE}" -L chr${chr}:${pos} --interval-padding 500 --active-probability-threshold 0.000 -ploidy 2 --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam"

  time ${GATK4} HaplotypeCaller --reference ${REFERENCE_GENOME} ${INPUT_BAM_LINE} -L chr${chr}:${pos} --interval-padding 500 \
  --active-probability-threshold 0.000 -ploidy 2 \
  --output ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.vcf -bamout ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam

done


##############################################################################################
## write the IGV batch file for each affected individual in this family based on the bamouts #
## to be stored as /home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${PROBAND_ID}_${FAMILY_ID}.snapshot.txt #
## ${PROBAND_ID}_${FAMILY_ID} == ${aff_pro_arr[$key] #
##################################################################

for key in "${!aff_pro_arr[@]}"; do
  echo ""
  echo ""
  echo "Generating the IGV batch file for ${aff_pro_arr[$key]}";

  snap_file=/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/bamout_${aff_pro_arr[$key]}.snapshot.txt

  # check if previous version exist, if so - delete it
  if [ -f "${snap_file}" ]; then
    echo "previous version of ${snap_file} exist --> deleted"
    rm ${snap_file}
  fi

  # write the header for the IGV batch file
  echo "new" >> ${snap_file}
  echo "genome hg38" >> ${snap_file}
  echo "snapshotDirectory \"/home/u035/u035/shared/analysis/work/${PROJECT_ID}/DECIPHER/IGV/${PLATE_ID}_${FAMILY_ID}\"" >> ${snap_file}
  echo "" >> ${snap_file}


  # now, go again over the variants in this individual's DECIPHER file and generate one snapshot file for all the variants
  var_file=${DEC_DIR}/${aff_pro_arr[$key]}_DEC_FLT.csv
  echo "... reading ${var_file} to generate the IGV batch file using the bamouts..."

  grep -v '^Internal' ${var_file} |
  while IFS= read -r line
  do
    IFS=, read -ra ary <<<"$line"
    chr=${ary[1]}
    pos=${ary[2]}
    ref=${ary[4]}
    alt=${ary[5]}
    left=$((${pos}-25))
    right=$((${pos}+25))

    echo "new" >> ${snap_file}
    echo "load ${BAMOUT_DIR}/${FAMILY_ID}/${FAMILY_ID}_chr${chr}_${pos}.bamout.bam" >> ${snap_file}
    echo "preference SAM.SHADE_BASE_QUALITY true" >> ${snap_file}

    echo "goto chr${chr}:${left}-${right}" >> ${snap_file}
    echo "group SAMPLE" >> ${snap_file}
    echo "sort base" >> ${snap_file}
    echo "squish" >> ${snap_file}
    echo "snapshot bamout_${aff_pro_arr[$key]}_chr${chr}_${pos}_${ref}_${alt}.png" >> ${snap_file}
    echo "" >> ${snap_file}
    echo "" >> ${snap_file}

  done

  echo "Generating of the IGV batch file based on bamouts for ${aff_pro_arr[$key]}- done!"
  echo "snap_file = ${snap_file}"

done

echo ""
echo ""
echo ""
echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
echo "+++   Variant prioritization of family ${FAMILY_ID} completed   +++"
echo "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"