Newer
Older
#!/bin/bash
#PBS -l walltime=01:00:00
#PBS -l ncpus=1,mem=2gb
#PBS -q uv2000
#PBS -N NHS_WES_setup
#PBS -j oe
### Setup the folder structure for the downstream analysis###
#
# Variables read from the job environment by this script:
#   BASE, PROJECT_ID  - the working folder is ${BASE}/${PROJECT_ID}
#   SOURCE_DIR        - general path to the source VCF, BAM and PED files
#   BATCH_ID, PLATE_ID - identify the source folders/files of the batch
#   SCRIPTS_DIR       - folder holding NHS_WES_extract_trio_FAM_PRO_ID.py

# Refuse to run with a missing variable: an empty BASE or PROJECT_ID would
# silently turn WORK_DIR into an unintended path, and failing *after* WORK_DIR
# has been created leaves a folder that must be deleted manually before a rerun.
for required_var in BASE PROJECT_ID SOURCE_DIR BATCH_ID PLATE_ID SCRIPTS_DIR
do
  if [ -z "${!required_var}" ]; then
    echo "ERROR: required variable ${required_var} is not set or empty - EXIT!" >&2
    exit 1
  fi
done

# Working folder and its subfolders (created further down in the script)
WORK_DIR="${BASE}/${PROJECT_ID}"
VCF_DIR="${WORK_DIR}/VCF"
PED_DIR="${WORK_DIR}/PED"
LOG_DIR="${WORK_DIR}/LOG"
G2P_DIR="${WORK_DIR}/G2P"
VASE_DIR="${WORK_DIR}/VASE"
COV_DIR="${WORK_DIR}/COV"
DEC_DIR="${WORK_DIR}/DECIPHER"
IGV_DIR="${DEC_DIR}/IGV"
CNV_DIR="${WORK_DIR}/CNV"
BAMOUT_DIR="${WORK_DIR}/BAMOUT"

# Interpreter used to run the Python helper script
PYTHON2=/home/u035/project/software/bcbio/anaconda/envs/python2/bin/python2.7

# check if ${WORK_DIR} already exists - if so, exit - to prevent accidental overwriting
if [ -d "${WORK_DIR}" ]; then
  echo "${WORK_DIR} already exists - EXIT! If really intended, delete manually!!!!" >&2
  # explicit non-zero status: a bare `exit` would return the status of the echo (0)
  exit 1
fi

echo "SOURCE_DIR = ${SOURCE_DIR}" # the general path to the source VCF, BAM and PED files i.e. /scratch/u035/project/trio_whole_exome/analysis/output/
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
echo "BATCH_ID = ${BATCH_ID}" # the ID of the batch being processed e.g. 11870_Germain_Lorna
echo "PLATE_ID = ${PLATE_ID}" # the PCR plate ID of the batch being currently processed, e.g. 16862
echo "PROJECT_ID = ${PROJECT_ID}" # this is the folder (${BASE}/${PROJECT_ID}) where the downstream analysis will be done

S_PED_DIR="${SOURCE_DIR}/../params" # requires that the family PED files are in this folder

# create the working dir and the required subfolders
# Order matters: WORK_DIR must come first and DEC_DIR before IGV_DIR (its child).
# Plain mkdir (no -p) is kept on purpose so that an already existing folder is an error.
for new_dir in \
  "${WORK_DIR}" \
  "${VCF_DIR}" \
  "${PED_DIR}" \
  "${LOG_DIR}" \
  "${G2P_DIR}" \
  "${VASE_DIR}" \
  "${COV_DIR}" \
  "${DEC_DIR}" \
  "${IGV_DIR}" \
  "${CNV_DIR}" \
  "${BAMOUT_DIR}"
do
  # stop immediately if a folder cannot be created instead of reporting success below
  if ! mkdir -- "${new_dir}"; then
    echo "ERROR: could not create ${new_dir} - EXIT!" >&2
    exit 1
  fi
done

echo "Created ${WORK_DIR} for this batch and all the required subfolders"
######################################################
###  Copy the VCF and PED file per each family     ###
######################################################

# Expand the glob once into an array so that paths containing spaces survive intact.
# The variable parts are quoted (taken literally); only the date wildcard and the
# trailing family wildcard are left to the shell to match.
SOURCE_VCF_DIRS=( "${SOURCE_DIR}"/????-??-??_"${BATCH_ID}"_"${PLATE_ID}"_* ) # make sure we are reading the data from the exact batch & plate ID

n_families=0   # number of source folders processed
n_failed=0     # number of files that could not be copied

#echo "Found the following source VCF folders"
for S_VCF_DIR in "${SOURCE_VCF_DIRS[@]}"
do
  # When nothing matches, bash leaves the literal pattern in place; skip it (and
  # anything else that is not a folder) rather than deriving a bogus family ID.
  if [ ! -d "${S_VCF_DIR}" ]; then
    echo "WARNING: ${S_VCF_DIR} is not a folder - skipped" >&2
    continue
  fi
  n_families=$((n_families + 1))

  # folder name without the leading path
  VCF_DIR_NAME="${S_VCF_DIR##*/}"

  # the family ID is the last underscore-separated field of the folder name
  IFS=_ read -ra my_arr <<< "${VCF_DIR_NAME}"
  FAM_ID=${my_arr[-1]}
  echo "  FAM_ID = ${FAM_ID}"

  # construct the VCF and PED file names for this family
  S_VCF_FILE="${S_VCF_DIR}/${PLATE_ID}_${FAM_ID}-gatk-haplotype-annotated.vcf.gz"
  S_PED_FILE="${S_PED_DIR}/${BATCH_ID}_${PLATE_ID}_${FAM_ID}.ped"

  # copy the trio VCF and PED files; only report a copy that actually happened.
  # The trailing slash makes cp fail if the destination folder is missing
  # instead of creating a regular file with the folder's name.
  if cp -- "${S_VCF_FILE}" "${VCF_DIR}/"; then
    echo "    copied ${S_VCF_FILE} --> ${VCF_DIR}"
  else
    echo "ERROR: failed to copy ${S_VCF_FILE} --> ${VCF_DIR}" >&2
    n_failed=$((n_failed + 1))
  fi

  if cp -- "${S_PED_FILE}" "${PED_DIR}/"; then
    echo "    copied ${S_PED_FILE} --> ${PED_DIR}"
  else
    echo "ERROR: failed to copy ${S_PED_FILE} --> ${PED_DIR}" >&2
    n_failed=$((n_failed + 1))
  fi
done

# No matching source folder means wrong SOURCE_DIR/BATCH_ID/PLATE_ID: nothing to analyse
if [ "${n_families}" -eq 0 ]; then
  echo "ERROR: no source folders found matching ${SOURCE_DIR}/????-??-??_${BATCH_ID}_${PLATE_ID}_* - EXIT!" >&2
  exit 1
fi

# All copies are attempted first so the log lists every problem, then the job fails
if [ "${n_failed}" -gt 0 ]; then
  echo "ERROR: ${n_failed} file(s) could not be copied - EXIT!" >&2
  exit 1
fi
######################################################################################
### generate the FAM_IDs.txt, PRO_IDs.txt and FAM_PRO.txt *only for trio* families ###
######################################################################################

# `time` reports the run time; $? afterwards is the exit status of the timed command
time "${PYTHON2}" "${SCRIPTS_DIR}/NHS_WES_extract_trio_FAM_PRO_ID.py" "${WORK_DIR}"
extract_status=$?

# do not claim success (and do not return 0 to the scheduler) if the helper failed
if [ "${extract_status}" -ne 0 ]; then
  echo "ERROR: NHS_WES_extract_trio_FAM_PRO_ID.py failed with exit status ${extract_status} - EXIT!" >&2
  exit "${extract_status}"
fi

echo ""
echo ""
echo "OK: Setup for PROJECT_ID = $PROJECT_ID successful"