Skip to content
Snippets Groups Projects
trio_cram_setup.sh 4.47 KiB
Newer Older
#!/bin/bash
#SBATCH --cpus-per-task=16
#SBATCH --mem=2GB
#SBATCH --time=24:00:00
#SBATCH --job-name=trio_cram_setup
#SBATCH --output=trio_cram_setup.%A_%a.out
#SBATCH --error=trio_cram_setup.%A_%a.err


### Setup the folder structure for the downstream analysis###
BASE=/home/u035/u035/shared/analysis/work
WORK_DIR=$BASE/${PROJECT_ID}
VCF_DIR=${WORK_DIR}/VCF
PED_DIR=${WORK_DIR}/PED
LOG_DIR=${WORK_DIR}/LOG
G2P_DIR=${WORK_DIR}/G2P
VASE_DIR=${WORK_DIR}/VASE
COV_DIR=${WORK_DIR}/COV
DEC_DIR=${WORK_DIR}/DECIPHER
IGV_DIR=${DEC_DIR}/IGV
CNV_DIR=${WORK_DIR}/CNV
BAMOUT_DIR=${WORK_DIR}/BAMOUT
SCRIPTS_DIR=/home/u035/u035/shared/scripts



### Tools
PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
PICARD=/home/u035/u035/shared/software/bcbio/anaconda/bin/picard
REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa



#~## check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting
#~#if [ -d "${WORK_DIR}" ]; then
#~#  echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!"
#~#  exit
#~#fi




echo "SOURCE_DIR = ${SOURCE_DIR}"       # the general path to the source VCF, BAM and PED files                 i.e. /home/u035/u035/shared/results
echo "BATCH_ID = ${BATCH_ID}"           # the ID of the batch being processed                                   e.g. 19650_Ansari_Morad
echo "BATCH_NUM = ${BATCH_NUM}"         # the numerical part of the BATCH_ID                                    e.g. 19650
echo "PLATE_ID = ${PLATE_ID}"           # the PCR plate ID of the batch being currently processed,              e.g. 19285
echo "PROJECT_ID = ${PROJECT_ID}"       # this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis

#~#S_PED_DIR=${SOURCE_DIR}/../../params		# requires that the family PED files are in this folder




# create the working dir and the required subfolders
mkdir ${WORK_DIR}
mkdir ${VCF_DIR}
mkdir ${PED_DIR}
mkdir ${LOG_DIR}
mkdir ${G2P_DIR}
mkdir ${VASE_DIR}
mkdir ${COV_DIR}
mkdir ${DEC_DIR}
mkdir ${IGV_DIR}
mkdir ${CNV_DIR}
mkdir ${BAMOUT_DIR}
echo "Created ${WORK_DIR} for this batch and all the required subfolders"



######################################################
###   Copy the VCF and PED file per each family    ###
######################################################

SOURCE_VCF_DIRS=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_*
SOURCE_PED_DIR=${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/params

for S_VCF_DIR in ${SOURCE_VCF_DIRS}
do
#  echo "  ${S_VCF_DIR}"
  VCF_DIR_NAME="${S_VCF_DIR##*/}"
#  echo "    ${VCF_DIR_NAME}"
  IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}"
  FAM_ID=${my_arr[-1]}
#  echo "      BATCH = ${BATCH_ID}, PLATE = ${PLATE_ID}, FAM_ID = ${FAM_ID}"
  echo "  FAM_ID = ${FAM_ID}"

  # construct the VCF and PED file names for this family
  S_VCF_FILE=${S_VCF_DIR}/${PLATE_ID}_${FAM_ID}-gatk-haplotype-annotated.vcf.gz
  S_PED_FILE=${SOURCE_PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAM_ID}.ped


  # copy the trio VCF and PED files
  cp ${S_VCF_FILE} ${VCF_DIR}
  cp ${S_PED_FILE} ${PED_DIR}
  echo "    copied ${S_VCF_FILE} --> ${VCF_DIR}"
  echo "    copied ${S_PED_FILE} --> ${PED_DIR}"


  # identify all folders (one for each individual) for this family containing cram/bam files (format: <INDI_ID>_<FAM_ID>)
  cd ${SOURCE_DIR}/${BATCH_NUM}_${VERSION_N}/families/????-??-??_${BATCH_NUM}_${VERSION_N}_${PLATE_ID}_${FAM_ID}
  for ITEM in `ls -l`
  do
      if test -d $ITEM && [[ "$ITEM" == *"_"* ]]
      then
        echo "    $ITEM is a CRAM/BAM folder..."
        BAM=${ITEM}/${ITEM}-ready.bam
        CRAM=${ITEM}/${ITEM}-ready.cram
        ${SAMTOOLS} view -@ 16 -T ${REFERENCE_GENOME} -hb -o ${BAM} ${CRAM}
        ${PICARD} BuildBamIndex -I ${BAM} -O ${BAM}.bai -USE_JDK_DEFLATER true -USE_JDK_INFLATER true -VERBOSITY ERROR -QUIET true
        echo "      Generated ${BAM} and ${BAM}.bai from ${CRAM}"
      fi
  done

done



######################################################################################
### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ###
######################################################################################

time ${PYTHON2} ${SCRIPTS_DIR}/extract_trio_FAM_PRO_ID.py ${WORK_DIR}

echo ""
echo ""
echo "OK: Setup for PROJECT_ID = $PROJECT_ID successful"