#!/bin/bash
# test_run_processing.sh
#PBS -l walltime=06:00:00
#PBS -l ncpus=1,mem=16gb
#PBS -q uv2000
#PBS -N run_processing
#PBS -j oe
# ---------------------------------------------------------------------------
# Environment and configuration
# ---------------------------------------------------------------------------
# Scratch names used only to avoid repeating long prefixes below; they are
# unset again at the end of this section.
_proj_home=/home/u035/project
_conda_root=${_proj_home}/software/bcbio/anaconda
_conda_bin=${_conda_root}/bin
_py2_bin=${_conda_root}/envs/python2/bin

# setup PATH: append the python2 env bin and the anaconda bin folders,
# and append the anaconda site_perl folder to the Perl module search path
export PATH="${PATH}:${_py2_bin}:${_conda_bin}"
export PERL5LIB="${PERL5LIB}:${_conda_root}/lib/site_perl/5.26.2"

### folder structure for the downstream analysis - created by processing_setup.sh ###
# Everything lives below ${BASE}/${PROJECT_ID}; PROJECT_ID is not set in this script.
BASE="/scratch/u035/project/analysis/wes_pilot"
WORK_DIR="${BASE}/${PROJECT_ID}"
VCF_DIR="${WORK_DIR}/VCF"
PED_DIR="${WORK_DIR}/PED"
LOG_DIR="${WORK_DIR}/LOG"
G2P_DIR="${WORK_DIR}/G2P"
VASE_DIR="${WORK_DIR}/VASE"
COV_DIR="${WORK_DIR}/COV"
DEC_DIR="${WORK_DIR}/DECIPHER"
IGV_DIR="${DEC_DIR}/IGV"
CNV_DIR="${WORK_DIR}/CNV"
SCRIPTS_DIR="${_proj_home}/scripts"

# other files to be used
FAMILY_IDS="${WORK_DIR}/FAM_IDs.txt"   # created by processing_setup.sh
CHILD_IDS="${WORK_DIR}/PRO_IDs.txt"    # created by processing_setup.sh
TARGETS="${_proj_home}/resources/DDG2P.20190613.plus15bp.merged.bed"                # OK
CLINVAR="${_proj_home}/resources/DDG2P.20190613.clinvar.20190902.plus15bp.txt"      # OK

### TOOLS ###
# executables from the python2 conda environment
BCFTOOLS="${_py2_bin}/bcftools"
BGZIP="${_py2_bin}/bgzip"
TABIX="${_py2_bin}/tabix"
PYTHON2="${_py2_bin}/python2.7"
# executables from the anaconda bin folder
VT="${_conda_bin}/vt"
VASE="${_conda_bin}/vase"
GATK4="${_conda_bin}/gatk"
# VEP holds two words (perl interpreter + vep script); vep points to ../share/ensembl-vep-97.3-0/vep
VEP="${_conda_bin}/perl ${_conda_bin}/vep"
# stand-alone installs and reference data
GATK3="${_proj_home}/software/GenomeAnalysisTK-3.8/GenomeAnalysisTK.jar"
REFERENCE_GENOME="${_proj_home}/data/reference/hg38.fa"

# drop the scratch names so only the variables above remain defined
unset -v _proj_home _conda_root _conda_bin _py2_bin
# Report the two inputs this script reads but does not set itself:
#   PROJECT_ID - this is the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done
#   SOURCE_DIR - the command-line argument SOURCE_DIR is the general path to the
#                source BAM files (VCF and PED already copied)
echo "PROJECT_ID = ${PROJECT_ID}"
echo "SOURCE_DIR = ${SOURCE_DIR}"

# enable running singletons:
# when PBS_ARRAY_INDEX is empty, take the index from INDEX if that is set,
# otherwise default to 1; a non-empty PBS_ARRAY_INDEX is left untouched
if [ -z "${PBS_ARRAY_INDEX}" ] && [ -n "${INDEX}" ]; then
  export PBS_ARRAY_INDEX="${INDEX}"
elif [ -z "${PBS_ARRAY_INDEX}" ]; then
  export PBS_ARRAY_INDEX=1
fi
# change to the LOG folder; stop here if that is not possible instead of
# carrying on in whatever directory the job happened to start in
cd "${LOG_DIR}" || { echo "ERROR: cannot change to LOG_DIR '${LOG_DIR}'" >&2; exit 1; }

################################
#####    for each family    ####
################################
# FAMILY_ID is the line of ${FAMILY_IDS} whose number equals PBS_ARRAY_INDEX.
if [ ! -r "${FAMILY_IDS}" ]; then
  echo "ERROR: family ID list '${FAMILY_IDS}' is missing or not readable" >&2
  exit 1
fi
case "${PBS_ARRAY_INDEX}" in
  '' | *[!0-9]*)
    echo "ERROR: PBS_ARRAY_INDEX='${PBS_ARRAY_INDEX}' is not a line number" >&2
    exit 1
    ;;
esac
# sed prints nothing when the requested line does not exist, whereas
# 'head -n N | tail -n 1' would silently hand back the LAST family for any N
# beyond the end of the list and so process that family a second time.
FAMILY_ID=$(sed -n "${PBS_ARRAY_INDEX}p" "${FAMILY_IDS}")
if [ -z "${FAMILY_ID}" ]; then
  echo "ERROR: no family ID on line ${PBS_ARRAY_INDEX} of ${FAMILY_IDS}" >&2
  exit 1
fi
#######################################################################
###        for each proband generate the DECIPHER file              ###
###  ${VCF_DIR}/${FAMILY_ID}.ready.vcf.gz - the cleaned family VCF  ###
###  ${VASE_DIR}/${FAMILY_ID}.ready.denovo.vcf - the VASE file      ###
#######################################################################
# NOTE(review): PROBAND_ID is not assigned anywhere in the visible part of this
# script; unless it is supplied through the job environment this message shows
# an empty value - confirm whether FAMILY_ID was intended.
echo "Generating the DECIPHER file for PROBAND_ID = ${PROBAND_ID} ..."

# first, split the family VCF to individual VCFs
#   view -c1     : minimum allele count (INFO/AC) of sites to be printed
#   norm -m -any : split multi-allelic sites
#   norm -f      : left-alignment and normalization
file=${VCF_DIR}/${FAMILY_ID}.ready.vcf.gz
echo "splitting $file"

# Per-individual outputs are named <family VCF without .vcf.gz>.<sample>.vcf.gz.
# Strip the suffix only; a '.vcf' occurring earlier in the path must not be touched.
vcf_prefix=${file%.vcf.gz}

# Fetch the sample names up front so a failing bcftools stops the job.
sample_list=$("${BCFTOOLS}" query -l "$file") || {
  echo "ERROR: could not list the samples in $file" >&2
  exit 1
}

# One sample name per line; the list is fed through descriptor 3 so that the
# commands inside the loop keep the script's own stdin.
while IFS= read -r -u 3 indi; do
  [ -n "$indi" ] || continue
  rough_vcf=${vcf_prefix}.${indi}.rough.vcf.gz
  indi_vcf=${vcf_prefix}.${indi}.vcf.gz

  "${BCFTOOLS}" view -c1 -Oz -s "$indi" -o "$rough_vcf" "$file" || {
    echo "ERROR: bcftools view failed for sample $indi" >&2
    exit 1
  }
  "${BCFTOOLS}" norm -f "${REFERENCE_GENOME}" -m -any -Oz -o "$indi_vcf" "$rough_vcf" || {
    echo "ERROR: bcftools norm failed for sample $indi" >&2
    exit 1
  }
  # the un-normalized intermediate is no longer needed
  rm -- "$rough_vcf"
done 3<<< "$sample_list"
# VASE file - already split, left-aligned and normalized

# Resolve the family's PED file. The name is only known up to a prefix, so it
# is looked up with a glob; expand it here, once, and insist on exactly one hit.
# Otherwise zero or several matches would shift the positional arguments of
# the python script below.
ped_matches=( "${PED_DIR}"/*_"${FAMILY_ID}".ped )
if [ "${#ped_matches[@]}" -ne 1 ] || [ ! -e "${ped_matches[0]}" ]; then
  echo "ERROR: expected exactly one PED file matching ${PED_DIR}/*_${FAMILY_ID}.ped, got: ${ped_matches[*]}" >&2
  exit 1
fi
PED_FILE=${ped_matches[0]}

# call the py script
# Arguments, in order: PED file, G2P report, VASE de novo VCF, prefix of the
# per-individual VCFs, DECIPHER folder, IGV folder, IGV prefix for this family,
# source folder. SOURCE_DIR is passed only when non-empty, as before.
time "${PYTHON2}" "${SCRIPTS_DIR}/generate_DEC_IGV.py" \
  "${PED_FILE}" \
  "${G2P_DIR}/${FAMILY_ID}_LOG_DIR/${FAMILY_ID}.report.txt" \
  "${VASE_DIR}/${FAMILY_ID}.ready.denovo.vcf" \
  "${VCF_DIR}/${FAMILY_ID}" \
  "${DEC_DIR}" \
  "${IGV_DIR}" \
  "${IGV_DIR}/${FAMILY_ID}" \
  ${SOURCE_DIR:+"${SOURCE_DIR}"}
dec_status=$?
if [ "${dec_status}" -ne 0 ]; then
  # do not announce success (or clean up) after a failed run
  echo "ERROR: generate_DEC_IGV.py exited with status ${dec_status} for family ${FAMILY_ID}" >&2
  exit "${dec_status}"
fi

echo ""
echo ""
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo "DECIPHER analysis of PROBAND_ID = ${PROBAND_ID}: done"
echo "^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
echo ""
echo ""

# NOTE(review): nothing in the visible script creates this .clean.vcf file;
# presumably it is left behind by an earlier pipeline step - confirm. If it is
# absent, rm fails and that becomes the exit status of the job.
rm -- "${VCF_DIR}/${FAMILY_ID}.clean.vcf"