Skip to content
Snippets Groups Projects
NHS_WES_trio_cram_setup.sh 4.22 KiB
Newer Older
#!/bin/bash
#PBS -l walltime=24:00:00
#PBS -l ncpus=16,mem=2gb
#PBS -q uv2000
#PBS -N NHS_WES_cram_setup
#PBS -j oe


### Setup the folder structure for the downstream analysis###
BASE=/scratch/u035/u035/shared/trio_whole_exome/analysis
WORK_DIR=$BASE/${PROJECT_ID}
VCF_DIR=${WORK_DIR}/VCF
PED_DIR=${WORK_DIR}/PED
LOG_DIR=${WORK_DIR}/LOG
G2P_DIR=${WORK_DIR}/G2P
VASE_DIR=${WORK_DIR}/VASE 
COV_DIR=${WORK_DIR}/COV
DEC_DIR=${WORK_DIR}/DECIPHER
IGV_DIR=${DEC_DIR}/IGV
CNV_DIR=${WORK_DIR}/CNV
BAMOUT_DIR=${WORK_DIR}/BAMOUT
SCRIPTS_DIR=/home/u035/u035/shared/scripts
PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
PICARD=/home/u035/u035/shared/software/bcbio/anaconda/bin/picard
REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa


# check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting
if [ -d "${WORK_DIR}" ]; then
  echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!"
  exit
fi




echo "SOURCE_DIR = ${SOURCE_DIR}"	# the general path to the source VCF, BAM and PED files			i.e. /scratch/u035/u035/shared/trio_whole_exome/analysis/output
echo "BATCH_ID = ${BATCH_ID}"		# the ID of the batch being processed 					e.g. 11870_Germain_Lorna
echo "PLATE_ID = ${PLATE_ID}" 		# the PCR plate ID of the batch being currently processed, 		e.g. 16862
echo "PROJECT_ID = ${PROJECT_ID}"	# this the the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "VERSION_N = ${VERSION_N}"         # the version of the alignment and genotyping analysis


S_PED_DIR=${SOURCE_DIR}/../../params	# requires that the family PED files are in this folder




# create the working dir and the required subfolders
mkdir ${WORK_DIR}
mkdir ${VCF_DIR}
mkdir ${PED_DIR}
mkdir ${LOG_DIR}
mkdir ${G2P_DIR}
mkdir ${VASE_DIR}
mkdir ${COV_DIR}
mkdir ${DEC_DIR}
mkdir ${IGV_DIR}
mkdir ${CNV_DIR}
mkdir ${BAMOUT_DIR}
echo "Created ${WORK_DIR} for this batch and all the required subfolders"




######################################################
###   Copy the VCF and PED file per each family    ###
######################################################

# make sure we are reading the data from the exact version, batch & plate ID
SOURCE_VCF_DIRS=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_*	
#echo "Found the following source VCF folders"

for S_VCF_DIR in ${SOURCE_VCF_DIRS}
do
  # echo "  ${S_VCF_DIR}"
  VCF_DIR_NAME="${S_VCF_DIR##*/}" 
  # echo "    ${VCF_DIR_NAME}" 
  IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}"
  FAM_ID=${my_arr[-1]}  
  # echo "      BATCH = ${BATCH_ID}, PLATE = ${PLATE_ID}, FAM_ID = ${FAM_ID}" 
  echo "  FAM_ID = ${FAM_ID}"

  # construct the VCF and PED file names for this family
  S_VCF_FILE=${S_VCF_DIR}/${PLATE_ID}_${FAM_ID}-gatk-haplotype-annotated.vcf.gz
  S_PED_FILE=${S_PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAM_ID}.ped

  # copy the trio VCF and PED files
  cp ${S_VCF_FILE} ${VCF_DIR}
  cp ${S_PED_FILE} ${PED_DIR} 
  echo "    copied ${S_VCF_FILE} --> ${VCF_DIR}" 
  echo "    copied ${S_PED_FILE} --> ${PED_DIR}"



  # identify all folders (one for each individual) for this family containing cram/bam files (format: <INDI_ID>_<FAM_ID>)
  cd ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAM_ID}
  for ITEM in `ls -l`
  do
      if test -d $ITEM && [[ "$ITEM" == *"_"* ]]
      then
        echo "    $ITEM is a CRAM/BAM folder..."
        BAM=${ITEM}/${ITEM}-ready.bam
        CRAM=${ITEM}/${ITEM}-ready.cram
        ${SAMTOOLS} view -@ 16 -T ${REFERENCE_GENOME} -hb -o ${BAM} ${CRAM}
ameyner2's avatar
ameyner2 committed
        ${PICARD} BuildBamIndex -I ${BAM} -O ${BAM}.bai -USE_JDK_DEFLATER true -USE_JDK_INFLATER true -VERBOSITY ERROR -QUIET true
        echo "      Generated ${BAM} and ${BAM}.bai from ${CRAM}"  
      fi
  done

done




######################################################################################
### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ###
######################################################################################

time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_extract_trio_FAM_PRO_ID.py ${WORK_DIR}

echo ""
echo ""
echo "OK: Setup for PROJECT_ID = $PROJECT_ID successful"