#!/bin/bash
#PBS -l walltime=24:00:00
#PBS -l ncpus=16,mem=2gb
#PBS -q uv2000
#PBS -N NHS_WES_cram_setup
#PBS -j oe

### Setup the folder structure for the downstream analysis ###

BASE=/scratch/u035/u035/shared/trio_whole_exome/analysis
WORK_DIR=${BASE}/${PROJECT_ID}
VCF_DIR=${WORK_DIR}/VCF
PED_DIR=${WORK_DIR}/PED
LOG_DIR=${WORK_DIR}/LOG
G2P_DIR=${WORK_DIR}/G2P
VASE_DIR=${WORK_DIR}/VASE
COV_DIR=${WORK_DIR}/COV
DEC_DIR=${WORK_DIR}/DECIPHER
IGV_DIR=${DEC_DIR}/IGV
CNV_DIR=${WORK_DIR}/CNV
BAMOUT_DIR=${WORK_DIR}/BAMOUT
SCRIPTS_DIR=/home/u035/u035/shared/scripts

### Tools ###
PYTHON2=/home/u035/u035/shared/software/bcbio/anaconda/envs/python2/bin/python2.7
SAMTOOLS=/home/u035/u035/shared/software/bcbio/anaconda/bin/samtools
PICARD=/home/u035/u035/shared/software/bcbio/anaconda/bin/picard
REFERENCE_GENOME=/home/u035/u035/shared/software/bcbio/genomes/Hsapiens/hg38/seq/hg38.fa

# check if ${WORK_DIR} already exists - if so, exit to prevent accidental overwriting
if [ -d "${WORK_DIR}" ]; then
    echo "${WORK_DIR} already exists - EXIT! If really intended, delete it manually first."
    exit 1
fi

echo "SOURCE_DIR = ${SOURCE_DIR}"    # the general path to the source VCF, BAM and PED files, e.g. /scratch/u035/u035/shared/trio_whole_exome/analysis/output
echo "BATCH_ID = ${BATCH_ID}"        # the ID of the batch being processed, e.g. 11870_Germain_Lorna
echo "PLATE_ID = ${PLATE_ID}"        # the PCR plate ID of the batch currently being processed, e.g. 16862
echo "PROJECT_ID = ${PROJECT_ID}"    # the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
echo "VERSION_N = ${VERSION_N}"      # the version of the alignment and genotyping analysis

S_PED_DIR=${SOURCE_DIR}/../../params    # requires that the family PED files are in this folder

# create the working dir and the required subfolders
mkdir ${WORK_DIR}
mkdir ${VCF_DIR}
mkdir ${PED_DIR}
mkdir ${LOG_DIR}
mkdir ${G2P_DIR}
mkdir ${VASE_DIR}
mkdir ${COV_DIR}
mkdir ${DEC_DIR}
mkdir ${IGV_DIR}
mkdir ${CNV_DIR}
mkdir ${BAMOUT_DIR}
echo "Created ${WORK_DIR} for this batch and all the required subfolders"

######################################################
###   Copy the VCF and PED file for each family    ###
######################################################

# make sure we are reading the data from the exact version, batch & plate ID
SOURCE_VCF_DIRS=${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_*

#echo "Found the following source VCF folders"
for S_VCF_DIR in ${SOURCE_VCF_DIRS}
do
#    echo "    ${S_VCF_DIR}"
    # the family ID is the last underscore-separated field of the source folder name
    VCF_DIR_NAME="${S_VCF_DIR##*/}"
#    echo "    ${VCF_DIR_NAME}"
    IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}"
    FAM_ID=${my_arr[-1]}
#    echo "    BATCH = ${BATCH_ID}, PLATE = ${PLATE_ID}, FAM_ID = ${FAM_ID}"
    echo "    FAM_ID = ${FAM_ID}"

    # construct the VCF and PED file names for this family
    S_VCF_FILE=${S_VCF_DIR}/${PLATE_ID}_${FAM_ID}-gatk-haplotype-annotated.vcf.gz
    S_PED_FILE=${S_PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAM_ID}.ped

    # copy the trio VCF and PED files
    cp ${S_VCF_FILE} ${VCF_DIR}
    cp ${S_PED_FILE} ${PED_DIR}
    echo "    copied ${S_VCF_FILE} --> ${VCF_DIR}"
    echo "    copied ${S_PED_FILE} --> ${PED_DIR}"

    # identify all folders (one per individual) for this family containing CRAM/BAM files (format: <INDI_ID>_<FAM_ID>)
    cd ${SOURCE_DIR}/????-??-??_${VERSION_N}_${BATCH_ID}_${PLATE_ID}_${FAM_ID}
    for ITEM in *
    do
        if test -d "${ITEM}" && [[ "${ITEM}" == *"_"* ]]
        then
            echo "    ${ITEM} is a CRAM/BAM folder..."
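            # Convert the CRAM back to an indexed BAM (presumably for downstream steps that
            # need BAM input): samtools decodes the CRAM against the same hg38 reference it
            # was encoded with (-T), using 16 extra threads in line with the ncpus=16 request
            # above, and Picard BuildBamIndex then writes the matching .bai index next to it.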
            BAM=${ITEM}/${ITEM}-ready.bam
            CRAM=${ITEM}/${ITEM}-ready.cram
            ${SAMTOOLS} view -@ 16 -T ${REFERENCE_GENOME} -hb -o ${BAM} ${CRAM}
            ${PICARD} BuildBamIndex -I ${BAM} -O ${BAM}.bai -USE_JDK_DEFLATER true -USE_JDK_INFLATER true -VERBOSITY ERROR -QUIET true
            echo "    Generated ${BAM} and ${BAM}.bai from ${CRAM}"
        fi
    done
done

#######################################################################################
###  Generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ###
#######################################################################################
time ${PYTHON2} ${SCRIPTS_DIR}/NHS_WES_extract_trio_FAM_PRO_ID.py ${WORK_DIR}

echo ""
echo ""
echo "OK: Setup for PROJECT_ID = ${PROJECT_ID} successful"
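
# Example submission (a sketch, not part of the pipeline itself): SOURCE_DIR, BATCH_ID,
# PLATE_ID, PROJECT_ID and VERSION_N are expected to arrive via the qsub environment.
# The path and IDs below reuse the examples from the comments above; PROJECT_ID,
# VERSION_N and the script filename are illustrative placeholders only.
#
#   qsub -v SOURCE_DIR=/scratch/u035/u035/shared/trio_whole_exome/analysis/output,BATCH_ID=11870_Germain_Lorna,PLATE_ID=16862,PROJECT_ID=<project_folder>,VERSION_N=<version> \
#        NHS_WES_cram_setup.sh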